In [20]:
import os

platforms = ['preply', 'italki', 'verbling']

In [21]:
import pandas as pd
dfs = {}

for platform in platforms:
    path = "../data/{}/gender/".format(platform)
    languages = set([file for file in os.listdir(path) if not file.startswith('.')])
    df = None
    for language in languages:
        file = [file for file in os.listdir(path+language) if not file.startswith('.')][0]
        if df is None:
            df = pd.read_csv(os.path.join(path+language, file), index_col=0)
        else:
            temp = pd.read_csv(os.path.join(path+language, file), index_col=0)
            df = df.append(temp)
            
    dfs[platform] = df

In [22]:
preply = dfs['preply']
verbling = dfs['verbling']
italki = dfs['italki']

### Preply

In [None]:
preply.head(2)

In [24]:
preply.gender.value_counts()

female     4131
male       2402
Unknown      37
unknown      20
Name: gender, dtype: int64

In [25]:
print("Un {}% son nulos para preply".format(100*46/len(preply)))

Un 0.6980273141122914% son nulos para preply


In [26]:
import numpy as np
preply.gender = preply.gender.replace('Unknown', np.nan)
preply.gender = preply.gender.replace('unknown', np.nan)
preply.gender.value_counts()

female    4131
male      2402
Name: gender, dtype: int64

In [27]:
null_preply = preply.gender.isnull().groupby([preply['nationality']]).sum().astype(int).reset_index(name='count')
null_preply.sort_values(by='count', ascending=False).head(17)

Unnamed: 0,nationality,count
27,CN,27
62,JP,7
118,TW,2
26,CM,2
79,MX,2
60,IT,1
50,HK,1
119,UA,1
24,CI,1
43,GB,1


### Verbling

In [28]:
verbling.gender.value_counts()

female     1233
male        821
Unknown      58
unknown       4
Name: gender, dtype: int64

In [29]:
print("Un {}% son nulos para preply".format(100*62/len(verbling)))

Un 2.9300567107750473% son nulos para preply


In [30]:
import numpy as np
verbling.gender = verbling.gender.replace('Unknown', np.nan)
verbling.gender = verbling.gender.replace('unknown', np.nan)
verbling.gender.value_counts()

female    1233
male       821
Name: gender, dtype: int64

In [31]:
null_verbling = verbling.gender.isnull().groupby([verbling['nationality']]).sum().astype(int).reset_index(name='count')
null_verbling.sort_values(by='count', ascending=False).head(25)

Unnamed: 0,nationality,count
14,CN,7
74,US,6
24,FR,6
22,EG,6
40,KR,5
46,MX,5
68,TH,4
7,BR,4
23,ES,3
61,RU,2


### Italki

In [32]:
italki.gender.value_counts()

female     4165
male       3260
Unknown     242
unknown      17
Name: gender, dtype: int64

In [33]:
print("Un {}% son nulos para Italki".format(100*(242+17)/len(italki)))

Un 3.370640291514836% son nulos para Italki


In [34]:
import numpy as np
italki.gender = italki.gender.replace('Unknown', np.nan)
italki.gender = italki.gender.replace('unknown', np.nan)
italki.gender.value_counts()

female    4165
male      3260
Name: gender, dtype: int64

In [35]:
null_italki = italki.gender.isnull().groupby([italki['nationality']]).sum().astype(int).reset_index(name='count')
null_italki.sort_values(by='count', ascending=False).head(25)

Unnamed: 0,nationality,count
22,CN,47
58,JP,35
37,FR,30
61,KR,27
31,DZ,22
55,IT,9
102,TH,8
117,ZZ,5
25,CU,5
29,DE,5


In [None]:
temp = italki[italki['nationality']=='CN']
temp[temp['gender'].isnull()]

# Overall

In [37]:
nans = preply.gender.isna().sum() + verbling.gender.isna().sum() + italki.gender.isna().sum()
print("In total there are {} null values. Can we use crowd-sourcing??".format(nans))

In total there are 378 null values. Can we use crowd-sourcing??


# Export missing values

In [None]:
preply_null = preply[preply['gender'].isna()]
preply_null

In [39]:
preply_null.to_csv('../data/results/missing_values/preply_missing_values.csv')

In [21]:
verbling_null = verbling[verbling['gender'].isna()]

verbling_null.to_csv('../data/results/missing_values/verbling_missing_values.csv')

In [22]:
italki_null = italki[italki['gender'].isna()]

italki_null.to_csv('../data/results/missing_values/italki_missing_values.csv')

# Incorporate manual values

In [40]:
preply_fixed = pd.read_excel('../data/results/missing_values/preply_missing_values_solved.xlsx', engine='openpyxl', index_col=0)
preply_fixed_join = preply_fixed[['user_name', 'gender_manual']].drop_duplicates()

In [41]:
preply_fixed_join.head()

Unnamed: 0,user_name,gender_manual
164,신윤수 신.,m
57,Zhenia N.,m
102,LOUKOUMANOU M.,m
117,Moffo A.,m
517,Tchawa Zainab C.,f


In [42]:
verbling_fixed = pd.read_excel('../data/results/missing_values/verbling_missing_values_solved.xlsx', engine='openpyxl', index_col=0)
verbling_fixed_join = verbling_fixed[['url', 'gender_manual']].drop_duplicates()

In [43]:
verbling_fixed_join.head()

Unnamed: 0,url,gender_manual
5,https://www.verbling.com/teachers/135560719935...,f
7,https://www.verbling.com/teachers/izarel,f
63,https://www.verbling.com/teachers/amadeusherrera,m
74,https://www.verbling.com/teachers/856963870715...,m
126,https://www.verbling.com/teachers/inma,f


In [44]:
italki_fixed = pd.read_excel('../data/results/missing_values/italki_missing_values_solved.xlsx', engine='openpyxl', index_col=0)
italki_fixed_join = italki_fixed[['user_id', 'gender_manual']].drop_duplicates()

In [45]:
italki_fixed_join.head()

Unnamed: 0,user_id,gender_manual
31,5042130,f
94,3632892,f
101,249152,f
111,6492941,f
135,7233217,f


### Join results

In [46]:
gender_dict = {'m': 'male', 'f': 'female'}

In [47]:
# Preply

In [48]:
preply_correct = preply[~preply['gender'].isnull()]
preply_missing = preply[preply['gender'].isnull()]

In [49]:
preply_missing.columns

Index(['language', 'position', 'retrieval_date', 'is_featured', 'user_name',
       'url', 'nationality_full', 'avg_rating', 'num_ratings', 'teaches',
       'subjects', 'speaks', 'lessons', 'price', 'price_currency',
       'avatar_url', 'nationality', 'clean_name', 'sanitized_name', 'gender',
       'probability', 'count', 'first_name'],
      dtype='object')

In [50]:
preply_missing = preply_missing.drop(columns=['gender']).merge(preply_fixed_join, on='user_name')

In [51]:
preply_missing['gender_manual'] = preply_missing['gender_manual'].str.lower()
preply_missing = preply_missing.replace({"gender_manual": gender_dict})
preply_missing = preply_missing.rename(columns={'gender_manual': 'gender'})
preply_missing['probability'] = 1.0
preply_missing['count'] = np.nan

In [None]:
preply_missing.head()

In [53]:
preply = pd.concat([preply_correct, preply_missing])

In [54]:
# Verbling

In [55]:
verbling_correct = verbling[~verbling['gender'].isnull()]
verbling_missing = verbling[verbling['gender'].isnull()]

In [56]:
verbling_missing.columns

Index(['language', 'position', 'retrieval_date', 'is_featured', 'first_name',
       'last_name', 'url', 'nationality', 'location', 'avg_rating',
       'avg_lessons_per_students', 'num_ratings', 'teaching_levels', 'teaches',
       'class_details', 'speaks', 'lessons', 'students', 'dialect',
       'price_currency', 'avatar_url', 'clean_name', 'sanitized_name',
       'gender', 'probability', 'count', 'price_detail', 'price'],
      dtype='object')

In [57]:
verbling_missing = verbling_missing.drop(columns=['gender']).merge(verbling_fixed_join, on='url')

In [58]:
verbling_missing['gender_manual'] = verbling_missing['gender_manual'].str.lower()
verbling_missing = verbling_missing.replace({"gender_manual": gender_dict})
verbling_missing = verbling_missing.rename(columns={'gender_manual': 'gender'})
verbling_missing['probability'] = 1.0
verbling_missing['count'] = np.nan

In [None]:
verbling_missing.head()

In [60]:
verbling = pd.concat([verbling_correct, verbling_missing])

In [61]:
# Italki

In [62]:
italki_correct = italki[~italki['gender'].isnull()]
italki_missing = italki[italki['gender'].isnull()]

In [63]:
italki_missing.columns

Index(['position', 'retrieval_date', 'user_id', 'user_name',
       'avatar_file_name', 'video_picture', 'is_pro', 'nationality', 'teaches',
       'also_speaks', 'in_platform_since', 'rating', 'number_sessions',
       'price', 'price_time', 'price_currency', 'clean_name', 'sanitized_name',
       'gender', 'probability', 'count', 'language'],
      dtype='object')

In [64]:
italki_missing = italki_missing.drop(columns=['gender']).merge(italki_fixed_join, on='user_id')

In [65]:
italki_missing['gender_manual'] = italki_missing['gender_manual'].str.lower()
italki_missing = italki_missing.replace({"gender_manual": gender_dict})
italki_missing = italki_missing.rename(columns={'gender_manual': 'gender'})
italki_missing['probability'] = 1.0
italki_missing['count'] = np.nan

In [None]:
italki_missing.head()

In [67]:
italki = pd.concat([italki_correct, italki_missing])

# Store new dataframes

In [68]:
italki.to_csv('../data/results/final_dataframes/italki.csv')

In [69]:
preply.to_csv('../data/results/final_dataframes/preply.csv')

In [70]:
verbling.to_csv('../data/results/final_dataframes/verbling.csv')

# Change gender based on computed probability

In `Analyzing gender inference`, we found out that we must shift the decision boundary to balance recall for both genders. `Male if probability of male > 70%, else female`

In [71]:
def shift_prediction(x):
    if str(x['gender']) == 'nan':
        return np.nan
    elif x['probability_male'] > 0.7:
        return 'male'
    else:
        return 'female'

def change_gender_prediction(df, prob_column='probability', gender_col='gender', cutoff_male=0.7):
    # Create unified probability. Assume probability of the other gender is 1-prob
    df['probability_male'] = df.apply(lambda x: x[prob_column] if x[gender_col]=='male' else 1-x[prob_column], axis=1)
    df['probability_female'] = df.apply(lambda x: x[prob_column] if x[gender_col]=='female' else 1-x[prob_column], axis=1)
    
    df['gender_tuned'] = df.apply(lambda x: shift_prediction(x), axis=1)
    return df

In [72]:
preply = change_gender_prediction(preply)
verbling = change_gender_prediction(verbling)
italki = change_gender_prediction(italki)

In [73]:
italki.to_csv('../data/results/final_dataframes/italki.csv')

In [74]:
preply.to_csv('../data/results/final_dataframes/preply.csv')

In [75]:
verbling.to_csv('../data/results/final_dataframes/verbling.csv')

# Analyzing missing values

#### Italki

In [76]:
italki_missing.gender.value_counts()

female    172
male       89
Name: gender, dtype: int64

In [77]:
print("A {:.2f}% of missing values are women".format(100*italki_missing.gender.value_counts()['female']/len(italki_missing)))

A 65.90% of missing values are women


In [78]:
# Comparar con el tuned o con el original
italki.gender_tuned.value_counts()

female    4494
male      3192
Name: gender_tuned, dtype: int64

In [79]:
print("There are {:.2f}% women in the dataset".format(100*italki.gender_tuned.value_counts()['female']/len(italki)))

There are 58.47% women in the dataset


#### Verbling

In [80]:
verbling_missing.gender.value_counts()

female    41
male      21
Name: gender, dtype: int64

In [81]:
print("A {:.2f}% of missing values are women".format(100*verbling_missing.gender.value_counts()['female']/len(verbling_missing)))

A 66.13% of missing values are women


In [82]:
# Comparar con el tuned o con el original
verbling.gender_tuned.value_counts()

female    1331
male       785
Name: gender_tuned, dtype: int64

In [83]:
print("There are {:.2f}% women in the dataset".format(100*verbling.gender_tuned.value_counts()['female']/len(verbling)))

There are 62.90% women in the dataset


#### Preply

In [84]:
preply_missing.gender.value_counts()

female    47
male      10
Name: gender, dtype: int64

In [85]:
print("A {:.2f}% of missing values are women".format(100*preply_missing.gender.value_counts()['female']/len(preply_missing)))

A 82.46% of missing values are women


In [86]:
# Comparar con el tuned o con el original
preply.gender_tuned.value_counts()

female    4425
male      2165
Name: gender_tuned, dtype: int64

In [87]:
print("There are {:.2f}% women in the dataset".format(100*preply.gender_tuned.value_counts()['female']/len(preply)))

There are 67.15% women in the dataset
