In [1]:
import regex as re

from pandas import DataFrame, read_csv, option_context, Series, crosstab, concat

import helpers
import config_keys

import plotly.express as px

# Overall statistic of the dataset.

In [176]:
df_authors_full_fin.to_csv('data/df_authors_full_final_genderized_20230326.csv')

In [4]:
df_authors_full_fin = read_csv('data/df_authors_full_final_genderized.csv', index_col=[0])
df_fields_single = read_csv('data/df_fields_single.csv', index_col=[0])
print(len(df_authors_full_fin), len(df_fields_single))

34639 35576


In [None]:
df_authors_full_fin['Gender'][df_authors_full_fin.Gender.isna()] = 'female'

In [None]:
df_authors_full_fin.isna().sum()

Title                  0
Year                  13
Author                 0
PublicationType    10439
CitationCount         13
URL                   13
FirstName              0
LastName               0
MiddleName         29050
PaperID               13
Journal             5533
Gender                 1
Author_ID              0
FieldsOfStudy       5241
Fields_ext          2174
Fields_s2           2174
dtype: int64

In [None]:
df_authors_full_fin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34639 entries, 0 to 34638
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Title            34639 non-null  object 
 1   Year             34626 non-null  float64
 2   Author           34639 non-null  object 
 3   PublicationType  24200 non-null  object 
 4   CitationCount    34626 non-null  float64
 5   URL              34626 non-null  object 
 6   FirstName        34639 non-null  object 
 7   LastName         34639 non-null  object 
 8   MiddleName       5589 non-null   object 
 9   PaperID          34626 non-null  object 
 10  Journal          29106 non-null  object 
 11  Gender           34638 non-null  object 
 12  Author_ID        34639 non-null  object 
 13  FieldsOfStudy    29398 non-null  object 
 14  Fields_ext       32465 non-null  object 
 15  Fields_s2        32465 non-null  object 
dtypes: float64(2), object(14)
memory usage: 4.2+ MB


In [24]:
df_authors_full_fin.Year = df_authors_full_fin.Year.astype('Int64')

Create a column for author identification.

In [147]:
df_authors_full_fin['FullName'] = df_authors_full_fin['FirstName'] + ' ' + df_authors_full_fin['LastName']

In [151]:
print(
    len(df_authors_full_fin[['Author', 'FirstName', 'LastName']].value_counts()),
    len(df_authors_full_fin['FullName'].value_counts()))

29293 28878


Number of titles and authors per year.

In [169]:
df_authors_full_fin.groupby(['Year'])[['FullName', 'Title']]\
    .agg({'Title':['count', 'nunique'],'FullName':['nunique']})

Unnamed: 0_level_0,Title,Title,FullName
Unnamed: 0_level_1,count,nunique,nunique
Year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
2012,3233,1094,3080
2013,3063,1000,2944
2014,3008,1008,2899
2015,3126,1016,2997
2016,3313,1105,3185
2017,3401,1051,3218
2018,3264,1063,3086
2019,3325,1012,3156
2020,1163,379,1102
2021,3716,1024,3490


In [160]:
df_authors_full_fin.groupby(['Year', 'Gender'])[['FullName']].agg('nunique')

Unnamed: 0_level_0,Unnamed: 1_level_0,FullName
Year,Gender,Unnamed: 2_level_1
2012,female,1555
2012,male,1525
2013,female,1411
2013,male,1534
2014,female,1521
2014,male,1374
2015,female,1512
2015,male,1486
2016,female,1688
2016,male,1497


### Overall statistics visualized

In [185]:
fig = px.histogram(
    df_authors_full_fin,
    x='Year',
    color='Gender',
    #histnorm='percent', 
    text_auto='.2f',
    barnorm='fraction',
    labels={'Gender': '', 'percent': ''},
    color_discrete_map = {'female': px.colors.qualitative.Pastel[9], 'male':px.colors.qualitative.Pastel[8]},
    #category_orders={'Fields_ext': fields_most},
    title="Yearly distribution of authors' gender",
    #width=1000, height=1200,
).update_layout(
    yaxis_title="Proportion of respondents in %", 
    xaxis_title='Gender', 
    bargap=0.1,
    xaxis = dict(
    tickmode = 'linear',
        tick0 = 1,
        #dtick = 0.75
    )
)            
fig.update_traces(xbins_size = 1)
for a in fig.layout.annotations:
    a.text = a.text.split("=")[1]

fig.show()

# Add Fields of research

In [186]:
df_add_info = read_csv('data/info_id_journal_filds.csv', index_col=[0])

In [187]:
import string 

def fields_split(group):
    """
    Removes punctuation in each group of the list
    """
    #for group in fields_col:
    #print(group)
    for character in str(group):
        if character in string.punctuation:
            group = group.replace(character,"")
#new_list.append(group)
    return group

def fields2_split_source(group):   
    """
    Extracts categories in two lists depending on the source of category and returns two lists
    """
    fields2_ext = []
    fields2_s2 = []
    for d in group:
        if d['source'] == 'external':
            #print(d['source'] , d['category'])
            fields2_ext.append(d['category'])
        elif d['source'] == 's2-fos-model':
            #print(d['source'] , d['category'])
            fields2_s2.append(d['category'])
    return fields2_ext, fields2_s2

def fields2_split(group):   
    """
    Extracts categories in one list independently to source of category and returns a list
    """
    fields2_set = set()
    fields2_list = []
    for d in group:
        fields2_set.add(d['category'])
    fields2_list = list(fields2_set)
    return fields2_list

Initially FieldsOfStudy_s2 column has only strings, the function below casts them into lists of dictionaries. Apply a function to extract fields of research into two columns according to the source.

In [188]:
df_add_info['Fields'] = df_add_info.FieldsOfStudy_s2.apply(eval)
df_add_info['Fields'] = df_add_info.apply(lambda x: fields2_split(x['Fields']), axis=1)

Create a set of categories, that are available in the dataframe.

In [None]:
fields_set = [set(x) for x in df_add_info.Fields.to_list()]
fields_set = set().union(*fields_set)
fields_set

Create two dataframes depending on the number of fileds of research.

In [194]:
df_authors_full_fin['Fields'] = df_authors_full_fin.PaperID.map(Series(df_add_info.Fields.values, index = df_add_info.PaperID).to_dict())
df_fields_mult = df_authors_full_fin[df_authors_full_fin.Fields.str.len() > 1].copy()
df_fields_mult['fields_nr'] = df_fields_mult.Fields.str.len()
df_fields_single = df_authors_full_fin[df_authors_full_fin.Fields.str.len() == 1].copy()
print('Publications with one field: ', len(df_fields_single),'\n',
      'Publication with more than one field: ', len(df_fields_mult),'\n',
      'Expected number of rows after multiplication of fields: ',df_fields_mult.fields_nr.sum()+ len(df_fields_single),
      sep='')

Publications with one field: 12938
Publication with more than one field: 19323
Expected number of rows after multiplication of fields: 55068


Turn lists with a single value into a string.

In [200]:
df_fields_single['Fields'] = df_fields_single['Fields'].apply(lambda x: ''.join(x))
#df_fields_single['Fields_ext'].value_counts()

In [205]:
df_fields_single.groupby(['Fields'])[['FullName', 'Title']]\
    .agg({'Title':['count', 'nunique'],'FullName':['nunique']}).sort_values((   'Title', 'nunique'), ascending=False)

Unnamed: 0_level_0,Title,Title,FullName
Unnamed: 0_level_1,count,nunique,nunique
Fields,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Medicine,6692,1410,6052
Psychology,2119,832,2008
Economics,1766,782,1550
Sociology,576,301,559
Computer Science,670,203,561
Education,403,160,388
Business,309,128,297
Political Science,154,87,144
Biology,125,31,117
Art,20,14,20


Create a row for each combination title-author-field of research.

In [None]:
for index, row in df_fields_mult.iterrows():
    #print(type(row))
    for elem in row['Fields']:
        new_row = row.copy()
        new_row['Fields'] = elem
        df_fields_single = df_fields_single.append(new_row)

In [207]:
len(df_fields_single)

55068

In [213]:
fields_most = df_fields_single.groupby(['Fields'])[['FullName', 'Title']]\
    .agg({'Title':['count', 'nunique'],'FullName':['nunique']}).sort_values((   'Title', 'nunique'), ascending=False)
print(fields_most)
fields_list = fields_most[fields_most[(   'Title', 'nunique')]>= 500].index.to_list()

                                Title         FullName
                                count nunique  nunique
Fields                                                
Medicine                        18909    4425    16401
Psychology                      13714    4327    12167
Economics                        5011    2219     4202
Sociology                        3528    1588     3260
Education                        3140    1215     2931
Political Science                1707     752     1598
Biology                          3221     717     2982
Business                         1719     703     1604
Computer Science                 2222     695     1874
Geography                         362     136      359
Law                               302     130      291
Physics                           259      92      250
Mathematics                       202      64      194
Linguistics                       159      59      146
Art                               111      57      105
Engineerin

# Visualisations

In [214]:
line_heights = {'Medicine': 0.39,
                'Psychology': 0.49, 
                'Economics': 0.31, 
                'Sociology': 0.47,
                'Education': 0.5,
                'Political Science': 0.43,
                'Biology': 0.38,
                'Business': 0.39,
                'Computer Science': 0.26}

In [262]:
fig = px.histogram(
    df_fields_single[df_fields_single.Fields.isin(fields_list)],
    x='Fields',
    color='Gender',
    barnorm='fraction',
    text_auto='.2f',
    color_discrete_map = {'female': px.colors.qualitative.Pastel[9], 'male':px.colors.qualitative.Pastel[8]}, #Pastel1[3],'male':px.colors.qualitative.Pastel1[2]}, #
    category_orders={'Fields': fields_list, 'Gender': ['female', 'male']},
    title="Yearly distribution of authors' gender for main fields of research",
    height=600, width=1000
).update_layout(yaxis_title="Proportion of authors' genders in %", xaxis_title='Fields of research' )
fig.update_traces(textposition='inside', textfont_size=12)
for nr in range(len(fields_list)):
    fig.add_shape(type='line',
                    x0=-0.4 + nr,
                    y0=line_heights[fields_list[nr]],
                    x1=0.4 + nr,
                    y1=line_heights[fields_list[nr]],
                    line=dict(color='Blue',),
                    line_dash="dot",
                    #text=line_heights[fields_most[nr]],
                    xref='x',
                    yref='y'
    )
    fig.add_annotation(x=nr, y=line_heights[fields_list[nr]]-0.04,
            text=line_heights[fields_list[nr]],
            showarrow=False,
            font=dict(
                size=12,
                color='blue'
            )
            #arrowhead= 2
             )
    fig.add_annotation(x=nr, y='Female',
            text=line_heights[fields_list[nr]],
            showarrow=False,
            #arrowhead= 2
             )
fig.show()

In [359]:
fig = px.histogram(
    df_fields_single[df_fields_single.Fields.isin(fields_list)],
    x='Year',
    color='Gender',
    #barmode='overlay',
    histnorm='percent', 
    barnorm='fraction',
    text_auto='.2f',
    facet_row='Fields',
    labels={'Gender': '', 'percent': ''},
    color_discrete_map = {'female': px.colors.qualitative.Pastel[9], 'male':px.colors.qualitative.Pastel[8]},
    category_orders={'Fields': fields_list},
    title="Yearly distribution of authors' gender for main fields of research",
    height=1500,
    width=800
)

for a in fig.layout.annotations:
    a.text = a.text.split("=")[1] + ': ' + str(line_heights[a.text.split("=")[1]])

fig.add_hline(y=0.39, row=0,col='all', line_dash="dot", #Medicine
                  #annotation='Average proportion of female scientists',
                  fillcolor="red", opacity=0.75)

fig.add_hline(y=0.26, row=1,col='all', line_dash="dot", #Computer Science
                  #annotation='Average proportion of female scientists',
                  fillcolor="red", opacity=0.75)

fig.add_hline(y=0.39, row=2,col='all', line_dash="dot", #Business
                  #annotation='Average proportion of female scientists',
                  fillcolor="red", opacity=0.75)
"""
fig.add_hline(y=0.47, row=3,col='all', line_dash="dot", # Sociology
                  #annotation='Average proportion of female scientists',
                  fillcolor="red", opacity=0.75)
fig.add_hline(y=0.31, row=4,col='all', line_dash="dot", #Economics
                  #annotation='Average proportion of female scientists',
                  fillcolor="red", opacity=0.75)
fig.add_hline(y=0.26, row=5,col='all', line_dash="dot", #Computer science
                  #annotation='Average proportion of female scientists',
                  fillcolor="red", opacity=0.75)
fig.add_hline(y=0.49, row=6,col='all', line_dash="dot", #Psychology
                  #annotation='Average proportion of female scientists',
                  fillcolor="red", opacity=0.75)
"""

#for nr in range(2,len(fields_most)):
#    fig.add_hline(y=line_heights[fields_most[nr]], row=nr,col='all', line_dash="dot",
#                  #annotation='Average proportion of female scientists',
#                  fillcolor="red", opacity=0.75)

fig.update_layout(
    bargap=0.1)
fig.show()

## Calculate correlation between proportion of female authors in dataset and in the field in general

In [302]:
prop = df_fields_single.groupby(['Fields', 'Gender'])['Gender'].count()
prop = prop / prop.groupby(level=0).sum()
prop = prop.reset_index(level=[0])
prop = prop[(prop.index == 'female') & (prop.Fields.isin(fields_list))]
prop['Proportion_lit'] = prop.Fields.map(line_heights)
prop


Unnamed: 0_level_0,Fields,Gender,Proportion_lit
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,Biology,0.45638,0.38
female,Business,0.517161,0.39
female,Computer Science,0.450495,0.26
female,Economics,0.517462,0.31
female,Education,0.555414,0.5
female,Medicine,0.515257,0.39
female,Political Science,0.606327,0.43
female,Psychology,0.56672,0.49
female,Sociology,0.595522,0.47


Correlation coefficients whose magnitude are between 0.7 and 0.9 indicate variables which can be considered highly correlated. Correlation coefficients whose magnitude are between 0.5 and 0.7 indicate variables which can be considered moderately correlated.

In [319]:
fig = px.imshow(
    prop[['Gender', 'Proportion_lit']].corr(),
    text_auto=f'.2f', 
    x = ['in authors dataset','For the field'],
    y = ['in authors dataset','For the field'],
    color_continuous_scale='Purp',
    title='Proportion of female authors',
    width=500,
    height=400
)
#fig.write_image('C:/Git_repositories/MSc_gender_bias/fig/corr_female_datasetVsreality.png')
fig.show()

Create a crosstab for fields and authors' genders per year. Combine with total number of authors and write fields into separate field instead of index.

In [322]:
ctab_fields = crosstab(df_fields_single['Fields'], 
         [df_fields_single['Year'], df_fields_single['Gender']], margins=True, margins_name='Total')
ctab_fields_norm = crosstab(df_fields_single['Fields'], 
         [df_fields_single['Year'], df_fields_single['Gender']], normalize='index')
ctab_fields_norm['Field'] = ctab_fields[:-1].index
ctab_fields_norm['Total'] = ctab_fields[:-1].Total

Filter to fields, that have more than 500 publications.

In [None]:
crosstab(df_fields_single['Fields'], 
         df_fields_single['Year'], margins=True, margins_name='Total')

Year,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,Total
Fields_ext,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Art,6,2,0,0,1,3,2,1,0,1,1,0,17
Biology,144,89,79,91,129,128,45,57,15,36,17,0,830
Business,43,18,40,44,47,46,69,78,14,47,0,0,446
Chemistry,28,5,0,12,12,17,5,2,0,0,0,0,81
Computer Science,106,42,31,96,136,146,222,282,250,366,340,40,2057
Economics,208,125,131,236,259,245,224,191,28,58,5,0,1710
Engineering,16,0,12,12,16,1,13,1,2,0,0,0,73
Environmental Science,0,0,0,0,2,0,7,0,0,0,0,0,9
Geography,49,15,44,52,58,23,40,43,5,33,0,0,362
History,0,0,2,0,3,0,3,16,7,2,0,0,33


In [325]:
ctab_fields_norm[ctab_fields_norm.Field.isin(fields_list)]

Year,2012,2012,2013,2013,2014,2014,2015,2015,2016,2016,...,2020,2020,2021,2021,2022,2022,2023,2023,Field,Total
Gender,female,male,female,male,female,male,female,male,female,male,...,female,male,female,male,female,male,female,male,Unnamed: 20_level_1,Unnamed: 21_level_1
Fields,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Biology,0.052779,0.083204,0.030115,0.048743,0.036324,0.054331,0.06023,0.064576,0.059919,0.06706,...,0.01366,0.007762,0.039739,0.037876,0.025458,0.023595,0.001242,0.002794,Biology,3221
Business,0.054101,0.055846,0.021524,0.018034,0.033159,0.032577,0.054101,0.038976,0.052356,0.045957,...,0.027923,0.022106,0.055265,0.049447,0.037231,0.043048,0.00349,0.002909,Business,1719
Computer Science,0.021602,0.026103,0.007651,0.011251,0.008101,0.007201,0.013951,0.029253,0.023852,0.039604,...,0.049955,0.066607,0.090909,0.09856,0.088659,0.089559,0.009451,0.013051,Computer Science,2222
Economics,0.039314,0.039114,0.024746,0.025344,0.027939,0.02734,0.050289,0.054879,0.056675,0.056077,...,0.018559,0.011974,0.061066,0.050489,0.049092,0.050489,0.020156,0.012373,Economics,5011
Education,0.057325,0.051274,0.027707,0.024204,0.030573,0.027389,0.055732,0.047452,0.057325,0.049682,...,0.032484,0.014013,0.056688,0.049363,0.051274,0.044586,0.006051,0.005732,Education,3140
Medicine,0.049236,0.054048,0.02713,0.033317,0.036914,0.039664,0.047438,0.052462,0.054313,0.04913,...,0.013962,0.010577,0.063462,0.051986,0.058173,0.043577,0.009255,0.009625,Medicine,18909
Political Science,0.059754,0.049209,0.039836,0.026362,0.036907,0.029877,0.073228,0.044523,0.076157,0.048623,...,0.029291,0.012302,0.055067,0.026362,0.02519,0.007616,0.001757,0.001172,Political Science,1707
Psychology,0.06147,0.048855,0.028803,0.025886,0.035876,0.03048,0.051553,0.046668,0.063074,0.049584,...,0.021219,0.016261,0.059647,0.03923,0.038647,0.026907,0.005104,0.005469,Psychology,13714
Sociology,0.043084,0.026644,0.026927,0.022392,0.03458,0.026927,0.053571,0.039399,0.052721,0.042517,...,0.030045,0.017857,0.071429,0.047619,0.060941,0.04195,0.00822,0.002268,Sociology,3528


In [327]:
df_fields_single.groupby(['Fields'])[['Title', 'FullName', ]]\
    .agg({'Title':['count', 'nunique'], 'FullName':['nunique']})\
    .sort_values(('Title',   'count'), ascending=False)

Unnamed: 0_level_0,Title,Title,FullName
Unnamed: 0_level_1,count,nunique,nunique
Fields,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Medicine,18909,4425,16401
Psychology,13714,4327,12167
Economics,5011,2219,4202
Sociology,3528,1588,3260
Biology,3221,717,2982
Education,3140,1215,2931
Computer Science,2222,695,1874
Business,1719,703,1604
Political Science,1707,752,1598
Geography,362,136,359


In [343]:
df_fields_single[df_fields_single.Fields.isin(fields_list)].groupby(['Fields', 'Year', 'Gender'])[['Title', 'FullName']] \
    .agg({'Title':['count', 'nunique'], 'FullName':['nunique']})\
    .reset_index(level=[0,1,2]).sort_values(['Fields', 'Year', 'Gender'], ascending=False)

Unnamed: 0_level_0,Fields,Year,Gender,Title,Title,FullName
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,count,nunique,nunique
215,Sociology,2023,male,8,7,8
214,Sociology,2023,female,29,10,29
213,Sociology,2022,male,148,93,148
212,Sociology,2022,female,215,114,214
211,Sociology,2021,male,168,94,164
...,...,...,...,...,...,...
4,Biology,2014,female,117,53,114
3,Biology,2013,male,157,53,155
2,Biology,2013,female,97,50,96
1,Biology,2012,male,268,88,263


# Clear data comparing first Names with initials and correct potential duplicates

In [None]:
df = df_authors_full_fin[df_authors_full_fin.Year == 2015][['Author', 'FirstName', 'LastName']].value_counts().to_frame('counts')
df = df.reset_index(level=[0,1,2])
helpers.print_all(df[df.duplicated(['LastName', 'FirstName'], keep=False)].sort_values('LastName'))
#df_authors_full_fin[df_authors_full_fin.Author == 'Y. Kwon']

In [142]:
df_authors_full_fin[['FirstName','Author', 'PaperID', 'Gender']].loc[df_authors_full_fin.Author == 'M. Wiklund']

Unnamed: 0,FirstName,Author,PaperID,Gender
30513,Anncristine,M. Wiklund,76045e91019069127948c6d367cb5ac27c0cc56b,female


In [159]:
#2015
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'T. Andren'] = 'Thomas'
df_authors_full_fin['Gender'].loc[df_authors_full_fin.Author == 'T. Andren'] = 'male'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'K. de Bruin'] = 'Karina'
df_authors_full_fin['Gender'].loc[df_authors_full_fin.Author == 'K. de Bruin'] = 'female'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'M. Cho'] = 'Myeong-Chan'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'C. Choi'] = 'Cheol'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'M. Gómez-Sánchez'] = 'Manuel'
df_authors_full_fin['Gender'].loc[df_authors_full_fin.Author == 'M. Gómez-Sánchez'] = 'male'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'M. Gyamfi'] = 'Maxwell'
df_authors_full_fin['Gender'].loc[df_authors_full_fin.Author == 'M. Gyamfi'] = 'male'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'C. Cruz'] = 'Charlie'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author =='A. Hejase'] = 'Ale'
df_authors_full_fin['Gender'].loc[df_authors_full_fin.Author =='A. Hejase'] = 'female'
#df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author ==
#df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author ==
#df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author ==
#df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author ==

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'T. Andren'] = 'Thomas'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_authors_full_fin['Gender'].loc[df_authors_full_fin.Author == 'T. Andren'] = 'male'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'K. de Bruin'] = 'Karina'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: h

In [None]:
#2014
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'N. Ahmadi'] = 'Nahid'
df_authors_full_fin['Gender'].loc[df_authors_full_fin.Author == 'N. Ahmadi'] = 'female'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'A. Behera'] = 'Anuguya'
df_authors_full_fin['FirstName'].loc[(df_authors_full_fin.Author == 'K. Lee') & (df_authors_full_fin.FirstName == 'Soo')] = 'Kyung'
df_authors_full_fin['FirstName'].loc[(df_authors_full_fin.Author == 'J. Lee') & (df_authors_full_fin.FirstName == 'Soo')] = 'Jungeon'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'J. Makaryus'] = 'John'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author =="E. O'Loughlin"] = 'Erin'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author =="A. Oberoi"] = 'Avneet' #8f662d8001aa8ff53156de311190ea813a657edc
df_authors_full_fin['Gender'].loc[df_authors_full_fin.Author =="A. Oberoi"] = 'female'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'O. Poveshchenko'] = 'Olga'
df_authors_full_fin['Gender'].loc[df_authors_full_fin.Author == 'O. Poveshchenko'] = 'female'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'L. Simons'] = 'Leslie' #931c0641fed2798712e0738da7f351e49c5e3046
df_authors_full_fin['Gender'].loc[df_authors_full_fin.Author == 'L. Simons'] = 'female'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author =='S. Voyer'] = 'Susan'
df_authors_full_fin['Gender'].loc[df_authors_full_fin.Author == 'S. Voyer'] = 'female'

In [None]:
#2012, 2013, 2022 df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'J. C. Cabré Vila'] = 'Juan'

df_authors_full_fin['Gender'][df_authors_full_fin.Author == 'J. C. Cabré Vila'] = 'male'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'N. K. Valk'] = 'Niek'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'F. Manfredini'] = 'Fabio'
df_authors_full_fin['FirstName'].loc[(df_authors_full_fin.PaperID == 'cdfa55b05c225a7d08c355ca80d8bf27ff68dd4a') 
                                     & (df_authors_full_fin.Author == 'R. Rossi')] = 'Rodolfo'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'L. Andrade'] = 'Laura'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'S. K. Stephen Huang'] = 'Stepen'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'P. Knechtle'] = 'Patrizia'
df_authors_full_fin['Gender'].loc[df_authors_full_fin.Author == 'P. Knechtle'] = 'female'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'S. Lehto'] = 'Seppo'
df_authors_full_fin['Gender'].loc[df_authors_full_fin.Author == 'S. Lehto'] = 'male'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'W. Liang'] = 'Wen-Miin'
df_authors_full_fin['Gender'].loc[df_authors_full_fin.Author == 'W. Liang'] = 'female'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'A. Wanke'] = 'Alice'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'M. Shin'] = 'Min-Ho'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'F. Moradi'] = 'Fatemeh'
df_authors_full_fin['Gender'].loc[df_authors_full_fin.Author == 'F. Moradi'] = 'female'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'N. Moradi'] = 'Neda'
df_authors_full_fin['Gender'].loc[df_authors_full_fin.Author == 'N. Moradi'] = 'female'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'F. Martinez'] = 'Fernando'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'J. Mette'] = 'Jan'
df_authors_full_fin['Gender'].loc[df_authors_full_fin.Author == 'J. Mette'] = 'male'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'J. Agesa'] = 'Jacqueline'
df_authors_full_fin['Gender'].loc[df_authors_full_fin.Author == 'J. Agesa'] = 'female'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'H. Akiskal'] = 'Hagop'
df_authors_full_fin['Gender'].loc[df_authors_full_fin.Author == 'H. Akiskal'] = 'male'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'J. Chung'] = 'Jae-Wook'
df_authors_full_fin['Gender'].loc[df_authors_full_fin.Author == 'J. Chung'] = 'male'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'T. Holmen'] = 'Turid'
df_authors_full_fin['Gender'].loc[df_authors_full_fin.Author == 'T. Holmen'] = 'female'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'S. Igissinov'] = 'Saginbek'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'K. Morgan'] = 'Kevin'
df_authors_full_fin['FirstName'].loc[(df_authors_full_fin.FirstName == 'Raymond') 
                                     & (df_authors_full_fin.Author == 'W. Lim')] = 'Wen'
df_authors_full_fin['Gender'].loc[(df_authors_full_fin.FirstName == 'Raymond') 
                                     & (df_authors_full_fin.Author == 'W. Lim')] = 'female'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'F. Müller'] = 'Franz'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'K. R. Reddy'] = 'Kindinti'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'P. K. Shukla'] = 'Pradip'
df_authors_full_fin['Gender'].loc[df_authors_full_fin.Author == 'P. K. Shukla'] = 'male'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'K. Steyn'] = 'Krisela'
df_authors_full_fin['FirstName'].loc[(df_authors_full_fin.PaperID == '0445dddd1f6f166fd6c18858a2eef4c3bea04965') 
                                     & (df_authors_full_fin.Author == 'M. Wong')] = 'Mee-Lian' 
df_authors_full_fin['Gender'].loc[(df_authors_full_fin.PaperID == '0445dddd1f6f166fd6c18858a2eef4c3bea04965') 
                                     & (df_authors_full_fin.Author == 'M. Wong')] = 'female'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'V. Wyller'] = 'Vegard'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'B. Xu'] = 'Bo'
df_authors_full_fin['FirstName'].loc[df_authors_full_fin.Author == 'L. Yang'] = 'Lee-Wen'

Manually add missed years.

In [None]:
index = df_authors_full_fin[df_authors_full_fin.Title.isin(\
    ['Gender differences in level and change in cognitive functioning : Results from the Longitudinal Aging Study',
    'Eighteen Million Cracks: Genders Role in the 2008 Presidential Campaign'])].index
df_authors_full_fin = df_authors_full_fin.drop(index)
df_authors_full_fin['Year'][df_authors_full_fin.Title == 'Gender, Age and Body Mass Differences Influencing the Motivation for Physical Activity among Polish Youths'] = 2014
df_authors_full_fin['Year'][df_authors_full_fin.Title == 'Gender Differences in Motivation and Learning Preferences of Pharmacists in Lifelong Learning'] = 2014
df_authors_full_fin['Year'][df_authors_full_fin.Title == 'Gender and Racial Bias in VisualQuestion Answering Datasets'] = 2022
df_authors_full_fin['Year'][df_authors_full_fin.Title == 'If the Shoe Fits: A Historical Exploration of Gender Bias in the U.S. Sneaker Industry'] = 2019
df_authors_full_fin['Year'][df_authors_full_fin.Title == 'Gender differences in students’ attitudes toward science: An analysis of students’ science process skill using testlet instrument'] = 2017
df_authors_full_fin['Year'][df_authors_full_fin.Title == 'Analyzing Gender Bias in Student Evaluations Acknowledgments'] = 2016
df_authors_full_fin['Year'][df_authors_full_fin.Title == 'Gender Bias in Availability of School Education in Villages - A Study of Kalisindh Thermal Power Project'] = 2018
df_authors_full_fin['Year'][df_authors_full_fin.Title == 'Gender Differences in Perceived Illness, Stress, and Coping in Undergraduates'] = 2021
                                                                                    
