##### Downsamples the English and Swedish theses separately to obtain an even gender distribution in each department, then restricts the included departments to the topics humanities, natural science, architecture and economics.
##### Saves the downsampled datasets to "df_english_downsampled.csv" and "df_swedish_downsampled.csv"

In [6]:
import pandas as pd

import json
import pandas as pd

# Locations of the GU and CTH metadata JSON files read below.
# NOTE: these are absolute paths on the original author's machine; adjust them before running elsewhere.
info_file_gu = "/Users/annl/Desktop/GenieProjekt/Exjobb/GU-metadata-210310.json"
info_file_ch = "/Users/annl/Desktop/GenieProjekt/Exjobb/CTH-metadata-210310.json"


In [7]:
# Load the CTH and GU metadata files into one dataframe each (CTH first, then GU)
df_cth, df_gu = (pd.read_json(path) for path in (info_file_ch, info_file_gu))

In [8]:
# Create column for gender and filter out non-male/female
# The gender label is read from the 'gender' key of each row's 'inferred' entry.
genders_gu  = [val['gender'] for val in df_gu['inferred'].values]
genders_cth = [val['gender'] for val in df_cth['inferred'].values]

df_cth['gender_composition'] = genders_cth
df_gu['gender_composition']  = genders_gu

# Keep only rows labelled 'male' or 'female'. The explicit .copy() makes the
# filtered frames independent of the originals: a 'language' column is added
# to them afterwards, and assigning a column to a filtered slice would
# otherwise raise pandas' SettingWithCopyWarning.
df_cth = df_cth[df_cth['gender_composition'].isin(['male','female'])].copy()
df_gu = df_gu[df_gu['gender_composition'].isin(['male','female'])].copy()

In [10]:
# Separate data by language
# The language code is read from the 'language' key of each row's 'inferred' entry.
df_gu['language'] = [meta['language'] for meta in df_gu['inferred'].values]
df_cth['language'] = [meta['language'] for meta in df_cth['inferred'].values]

# 'sv' rows go to the Swedish frames, 'en' rows to the English frames;
# rows with any other language code end up in neither.
df_cth_swe = df_cth[df_cth['language'].eq('sv')]
df_cth_eng = df_cth[df_cth['language'].eq('en')]

df_gu_swe = df_gu[df_gu['language'].eq('sv')]
df_gu_eng = df_gu[df_gu['language'].eq('en')]


In [23]:
# Get the number of female/male theses in the given department and dataframe
def get_sizes(df,dep) :
    """Count the female and male rows of one department.

    Parameters:
        df:  dataframe with the columns 'department' and 'gender_composition'.
        dep: department name to count within.

    Returns:
        Tuple ``(size_female, size_male)`` of row counts; both are 0 when the
        department does not occur in ``df``.
    """
    genders_in_dep = df.loc[df['department'] == dep, 'gender_composition']
    size_female = int((genders_in_dep == 'female').sum())
    size_male = int((genders_in_dep == 'male').sum())
    return (size_female,size_male)

In [24]:

# Randomly downsamples data by reducing the majority class
# resulting in an equal number of male and female authors
# for each department.

def decide_fraction(df,dep) :
    """Decide which gender to thin out in a department, and by how much.

    Parameters:
        df:  dataframe with the columns 'department' and 'gender_composition'.
        dep: department name to inspect.

    Returns:
        Tuple ``(gender, fraction)``: the majority gender and the share of its
        rows to drop so that both genders end up with the same count.
        ``('', 1)`` is returned when either gender has no rows in the
        department; since no row has the gender '', nothing is dropped for
        such a department.
    """
    (n_female,n_male) = get_sizes(df,dep)
    # A department with only one gender cannot be balanced by downsampling.
    if 0 in (n_female,n_male) :
        return ('',1)
    if n_female > n_male :
        return ('female',1  - (n_male / n_female))
    # Also covers equal counts, where the fraction is 0.0 and nothing is dropped.
    return ('male', 1 - (n_female / n_male))
    

def drop_fraction(df,dep,gender,fraction,random_state=None) :
    """Return ``df`` without a random fraction of one department/gender group.

    Parameters:
        df:           dataframe with the columns 'department' and
                      'gender_composition'; it is not modified.
        dep:          department whose rows are candidates for removal.
        gender:       value of 'gender_composition' to remove rows for.
        fraction:     share (0 to 1) of the matching rows to drop.
        random_state: optional seed forwarded to ``DataFrame.sample`` to make
                      the selection reproducible; ``None`` keeps the sampling
                      unseeded.

    Returns:
        A new dataframe with the sampled rows removed. Rows are removed by
        index label.
    """
    # Build one mask aligned with ``df``. Chaining ``df[mask_a][mask_b]``
    # applies a full-length mask to an already filtered frame, which makes
    # pandas warn "Boolean Series key will be reindexed to match DataFrame index".
    in_group = df['department'].eq(dep) & df['gender_composition'].eq(gender)
    to_drop = df[in_group].sample(frac=fraction, random_state=random_state)
    return df.drop(to_drop.index)

def gender_equal(df) :
    """Downsample every department that occurs in ``df`` towards gender balance.

    Parameters:
        df: dataframe with the columns 'department' and 'gender_composition'.

    Returns:
        The dataframe produced by ``gender_equal_deps`` for all distinct
        values of the 'department' column.
    """
    return gender_equal_deps(df,df['department'].unique())
    

def gender_equal_deps(df,deps) :
    """Downsample the majority gender in each of the listed departments.

    Parameters:
        df:   dataframe with the columns 'department' and 'gender_composition'.
        deps: sequence of department names to process, in order.

    Returns:
        The dataframe after every listed department has been processed;
        ``df`` itself when ``deps`` is empty.
    """
    balanced = df
    for dep in deps :
        # Each department is evaluated on the frame left by the previous steps.
        (gen,frac) = decide_fraction(balanced,dep)
        balanced   = drop_fraction(balanced,dep,gen,frac)
    return balanced

# Balance each university/language subset on its own (same order as before:
# GU Swedish, GU English, CTH Swedish, CTH English).
df_equal_gu_swe, df_equal_gu_eng, df_equal_cth_swe, df_equal_cth_eng = map(
    gender_equal, (df_gu_swe, df_gu_eng, df_cth_swe, df_cth_eng)
)

  del sys.path[0]


In [25]:
# Report the remaining female/male counts for every downsampled subset
for heading, frame in [
    ("Gender counts CTH Swe", df_equal_cth_swe),
    ("Gender counts GU Swe", df_equal_gu_swe),
    ("Gender counts CTH Eng", df_equal_cth_eng),
    ("Gender counts GU Eng", df_equal_gu_eng),
]:
    print(heading)
    print(frame['gender_composition'].value_counts())

Gender counts CTH Swe
male      638
female    636
Name: gender_composition, dtype: int64
Gender counts GU Swe
female    4205
male      4082
Name: gender_composition, dtype: int64
Gender counts CTH Eng
male      2428
female    2427
Name: gender_composition, dtype: int64
Gender counts GU Eng
female    2054
male      2054
Name: gender_composition, dtype: int64


In [None]:
# Restrict to these departments

# Humanities
humaniora      = [
    'Institutionen för filosofi, lingvistik och vetenskapsteori',
    'Institutionen för litteratur, idéhistoria och religion',
    'Institutionen för språk och litteraturer',
    'Institutionen för svenska språket',
]
# Architecture
arkitektur     = ['Institutionen för arkitektur och samhällsbyggnadsteknik']
# Economics
ekonomi        = ['Företagsekonomiska institutionen']
# Natural science
naturvetenskap = [
    'Institutionen för biologi och bioteknik',
    'Institutionen för fysik',
    'Institutionen för kemi och kemiteknik',
    'Institutionen för rymd- och geovetenskap',
]

# The GU frames are filtered on humanities + economics, the CTH frames on
# natural science + architecture.
gu_departments  = humaniora + ekonomi
cth_departments = naturvetenskap + arkitektur

# For each language, stack the GU selection on top of the CTH selection.
df_eng = pd.concat([
    df_equal_gu_eng.loc[df_equal_gu_eng['department'].isin(gu_departments)],
    df_equal_cth_eng.loc[df_equal_cth_eng['department'].isin(cth_departments)],
])

df_swe = pd.concat([
    df_equal_gu_swe.loc[df_equal_gu_swe['department'].isin(gu_departments)],
    df_equal_cth_swe.loc[df_equal_cth_swe['department'].isin(cth_departments)],
])

In [19]:
# Save the downsampled, department-restricted English theses as CSV (path is relative to the working directory)
df_eng.to_csv("df_english_downsampled.csv")

In [20]:
# Save the downsampled, department-restricted Swedish theses as CSV (path is relative to the working directory)
df_swe.to_csv("df_swedish_downsampled.csv")