In [1]:
import pandas as pd

In [2]:
average_number = pd.read_csv('../../TABLAS LATEX/languages.csv', sep=';', decimal=',')
average_number = average_number[['Language', 'Average']]
average_number

Unnamed: 0,Language,Average
0,English,5607.0
1,Spanish,2348.33
2,French,832.33
3,Chinese,770.0
4,Italian,579.0
5,Russian,560.33
6,Portuguese,462.67
7,Arabic,418.67
8,Japanese,410.33
9,German,388.67


# Preply

In [71]:
preply = pd.read_csv('../data/results/final_dataframes/preply.csv', index_col=0)
preply = preply.drop_duplicates(subset=['user_name', 'language'])

In [None]:
preply.head()

In [73]:
preply.columns

Index(['language', 'position', 'retrieval_date', 'is_featured', 'user_name',
       'url', 'nationality_full', 'avg_rating', 'num_ratings', 'teaches',
       'subjects', 'speaks', 'lessons', 'price', 'price_currency',
       'avatar_url', 'nationality', 'clean_name', 'sanitized_name', 'gender',
       'probability', 'count', 'first_name', 'probability_male',
       'probability_female', 'gender_tuned', 'income_level', 'Code'],
      dtype='object')

In [76]:
from scipy.stats import ks_2samp
import numpy as np

def hypothesis_test(group1, group2, alpha=0.1):
    st, p_value = ks_2samp(group1, group2)
    if p_value<alpha:
        return st, p_value
    else:
        return st, p_value
    
def compute_aggregated_feature_top_k(df, top_k, language_col, aggregation_col, target_cols, group1, group2):
    count_group1 = 'count_{}'.format(group1)
    count_group2 = 'count_{}'.format(group2)
    variance_group1 = 'variance_{}'.format(group1)
    variance_group2 = 'variance_{}'.format(group2)
    mean_group1 = 'mean_{}'.format(group1)
    mean_group2 = 'mean_{}'.format(group2)
    median_group1 = 'median_{}'.format(group1)
    median_group2 = 'median_{}'.format(group2)
    
    results = pd.DataFrame(columns=['language', 'top_k', 'target_col', 'aggregation_col', mean_group1, mean_group2, median_group1, median_group2, count_group1, count_group2, variance_group1, variance_group2, 'statistic', 'p_value'])
    
    for lang in df[language_col].unique():
        temp = df[df[language_col]==lang]
        temp = temp.sort_values(by='position', ascending=True)
        
        for target in target_cols:
            temp = temp.dropna(subset=[target])
            temp[target] = pd.to_numeric(temp[target], errors='coerce')
            if top_k is not None:
                temp = temp.head(top_k)
        
            g1 = temp[temp[aggregation_col]==group1][target].values
            g2 = temp[temp[aggregation_col]==group2][target].values
            
            g1_count = len(g1)
            g2_count = len(g2)

            g1_mean = np.nanmean(g1) if g1_count else None
            g2_mean = np.nanmean(g2) if g2_count else None
            
            g1_median = np.nanmedian(g1) if g1_count else None
            g2_median = np.nanmedian(g2) if g2_count else None
            
            g1_var = g1.var() if g1_count else None
            g2_var = g2.var() if g2_count else None
            
            #Hypothesis testing
            if len(g1)>0 and len(g2)>0:
                st, p_value = hypothesis_test(g1, g2)
            else:
                st, p_value = None, None
            
            results = results.append({'language': lang, 'top_k': len(temp), 'target_col': target, 'aggregation_col': aggregation_col,
                                      mean_group1: g1_mean, mean_group2: g2_mean, median_group1: g1_median, median_group2: g2_median, count_group1: g1_count, count_group2: g2_count,
                                      variance_group1: g1_var, variance_group2: g2_var, 'statistic': st, 'p_value': p_value}, ignore_index=True)

    return results

In [77]:
preply_results_40 = compute_aggregated_feature_top_k(preply, 40, 'language', 'gender_tuned', ['price', 'avg_rating', 'num_ratings'], 'male', 'female')

In [78]:
preply_results_all = compute_aggregated_feature_top_k(preply, None, 'language', 'gender_tuned', ['price', 'avg_rating', 'num_ratings'], 'male', 'female')

In [79]:
preply_results = pd.concat([preply_results_40, preply_results_all])

In [80]:
preply_results = pd.merge(preply_results, average_number, how='left', left_on='language', right_on='Language')
preply_results = preply_results.rename(columns={'Average': 'average_num_teachers'})

In [81]:
preply_results.head()

Unnamed: 0,language,top_k,target_col,aggregation_col,mean_male,mean_female,median_male,median_female,count_male,count_female,variance_male,variance_female,statistic,p_value,Language,average_num_teachers
0,Thai,40,price,gender_tuned,12.25,13.472222,13.0,13.0,4,36,1.6875,10.249228,0.416667,0.478608,Thai,51.67
1,Thai,40,avg_rating,gender_tuned,5.0,4.930769,5.0,5.0,4,36,0.0,,0.277778,0.890579,Thai,51.67
2,Thai,40,num_ratings,gender_tuned,2.5,8.153846,1.5,5.0,4,36,4.25,,0.527778,0.199584,Thai,51.67
3,German,40,price,gender_tuned,29.636364,29.586207,30.0,29.0,11,29,139.867769,73.552913,0.134796,0.992602,German,388.67
4,German,40,avg_rating,gender_tuned,4.91,4.962069,4.95,5.0,11,29,,0.007182,0.213166,0.771131,German,388.67


In [112]:
preply_results.to_csv('../data/results/features_analysis/gender/preply.csv', sep=',')

In [83]:
preply_results[preply_results['p_value']<0.1]

Unnamed: 0,language,top_k,target_col,aggregation_col,mean_male,mean_female,median_male,median_female,count_male,count_female,variance_male,variance_female,statistic,p_value,Language,average_num_teachers
9,Hebrew,40,price,gender_tuned,24.578947,31.380952,24.0,31.0,19,21,68.34903,133.473923,0.413534,0.0451588,Hebrew,46.67
10,Hebrew,40,avg_rating,gender_tuned,4.953846,4.826316,5.0,4.9,19,21,,,0.413534,0.0451588,Hebrew,46.67
18,Persian,24,price,gender_tuned,8.785714,18.3,8.0,17.0,14,10,18.311224,22.81,0.757143,0.0009871225,Persian,54.33
23,Dutch,40,num_ratings,gender_tuned,21.555556,3.181818,7.0,3.0,19,21,,,0.370927,0.09498668,Dutch,73.67
56,Japanese,40,num_ratings,gender_tuned,20.083333,8.625,16.0,6.0,13,27,,,0.470085,0.02984851,Japanese,410.33
68,Thai,58,num_ratings,gender_tuned,3.888889,8.342857,2.0,5.0,9,49,10.320988,,0.44898,0.06644773,Thai,51.67
72,Portuguese,354,price,gender_tuned,10.673913,12.425926,8.0,13.0,138,216,36.654537,45.337106,0.18901,0.004091501,Portuguese,462.67
75,Hebrew,59,price,gender_tuned,24.862069,31.966667,24.0,30.0,29,30,59.084423,160.298889,0.328736,0.05386391,Hebrew,46.67
80,Hindi,119,num_ratings,gender_tuned,15.809524,16.244898,6.0,9.0,45,74,,,0.235135,0.0744852,Hindi,78.0
84,Persian,24,price,gender_tuned,8.785714,18.3,8.0,17.0,14,10,18.311224,22.81,0.757143,0.0009871225,Persian,54.33


# Italki

In [86]:
italki = pd.read_csv('../data/results/final_dataframes/italki.csv', index_col=0)
italki = italki.drop_duplicates(subset=['user_id', 'language'])

In [None]:
italki.head()

In [88]:
italki.columns

Index(['position', 'retrieval_date', 'user_id', 'user_name',
       'avatar_file_name', 'video_picture', 'is_pro', 'nationality', 'teaches',
       'also_speaks', 'in_platform_since', 'rating', 'number_sessions',
       'price', 'price_time', 'price_currency', 'clean_name', 'sanitized_name',
       'gender', 'probability', 'count', 'language', 'probability_male',
       'probability_female', 'gender_tuned', 'income_level', 'Code'],
      dtype='object')

In [89]:
italki_results_40 = compute_aggregated_feature_top_k(italki, 40, 'language', 'gender_tuned', ['price', 'rating', 'number_sessions'], 'male', 'female')

In [90]:
italki_results_all = compute_aggregated_feature_top_k(italki, None, 'language', 'gender_tuned', ['price', 'rating', 'number_sessions'], 'male', 'female')

In [91]:
italki_results = pd.concat([italki_results_40, italki_results_all])

In [92]:
italki_results = pd.merge(italki_results, average_number, how='left', left_on='language', right_on='Language')
italki_results = italki_results.rename(columns={'Average': 'average_num_teachers'})

In [93]:
italki_results.head()

Unnamed: 0,language,top_k,target_col,aggregation_col,mean_male,mean_female,median_male,median_female,count_male,count_female,variance_male,variance_female,statistic,p_value,Language,average_num_teachers
0,Thai,40,price,gender_tuned,957.181818,1067.931034,950.0,1000.0,11,29,101139.4,85381.93,0.253918,0.589953,Thai,51.67
1,Thai,40,rating,gender_tuned,4.972727,4.817241,5.0,5.0,11,29,0.001983471,0.8303924,0.169279,0.941853,Thai,51.67
2,Thai,40,number_sessions,gender_tuned,136.272727,408.896552,73.0,160.0,11,29,26209.11,239900.8,0.291536,0.415214,Thai,51.67
3,German,40,price,gender_tuned,2366.384615,2482.142857,1900.0,1650.0,26,14,1960935.0,2481288.0,0.104396,0.99939,German,388.67
4,German,40,rating,gender_tuned,4.792308,4.992857,5.0,5.0,26,14,0.9199408,0.0006632653,0.120879,0.996088,German,388.67


In [111]:
italki_results.to_csv('../data/results/features_analysis/gender/italki.csv', sep=',')

In [95]:
italki_results[italki_results['p_value']<0.1]

Unnamed: 0,language,top_k,target_col,aggregation_col,mean_male,mean_female,median_male,median_female,count_male,count_female,variance_male,variance_female,statistic,p_value,Language,average_num_teachers
20,Chinese (Mandarin),40,number_sessions,gender_tuned,1426.272727,566.034483,442.0,205.0,11,29,3994764.0,1316269.0,0.554859,0.008342761,,
32,Russian,40,number_sessions,gender_tuned,3031.666667,686.029412,2408.5,371.5,6,34,5717581.0,559666.2,0.607843,0.02559361,Russian,560.33
45,Turkish,40,price,gender_tuned,879.73913,1125.294118,800.0,1100.0,23,17,140221.1,95766.09,0.460358,0.02115657,Turkish,151.33
63,Persian (Farsi),40,price,gender_tuned,804.958333,1143.6875,760.0,1000.0,24,16,80825.46,262810.2,0.416667,0.05755151,,
72,Portuguese,352,price,gender_tuned,1114.641176,1171.010989,950.0,1000.0,170,182,610942.8,248912.8,0.17744,0.006617717,Portuguese,462.67
74,Portuguese,352,number_sessions,gender_tuned,596.029412,742.967033,261.5,319.5,170,182,1350027.0,1163338.0,0.149321,0.03454602,Portuguese,462.67
78,Hindi,95,price,gender_tuned,724.792453,868.452381,640.0,800.0,53,42,77689.26,117386.0,0.244834,0.09761415,Hindi,78.0
87,Dutch,78,price,gender_tuned,1455.886364,1866.470588,1345.0,1630.0,44,34,294104.0,530864.0,0.319519,0.03001359,Dutch,73.67
96,Russian,400,price,gender_tuned,992.94382,1162.03537,900.0,1000.0,89,311,170505.7,226611.8,0.193649,0.00930326,Russian,560.33
108,Arabic,220,price,gender_tuned,1122.465649,1371.157303,1000.0,1000.0,131,89,541311.9,964090.2,0.174029,0.06925793,Arabic,418.67


# Verbling

In [114]:
verbling = pd.read_csv('../data/results/final_dataframes/verbling.csv', index_col=0)
verbling = verbling.drop_duplicates(subset=['first_name', 'last_name', 'language'])

In [None]:
verbling.head()

In [116]:
verbling.columns

Index(['language', 'position', 'retrieval_date', 'is_featured', 'first_name',
       'last_name', 'url', 'nationality', 'location', 'avg_rating',
       'avg_lessons_per_students', 'num_ratings', 'teaching_levels', 'teaches',
       'class_details', 'speaks', 'lessons', 'students', 'dialect',
       'price_currency', 'avatar_url', 'clean_name', 'sanitized_name',
       'gender', 'probability', 'count', 'price_detail', 'price',
       'probability_male', 'probability_female', 'gender_tuned',
       'income_level', 'Code'],
      dtype='object')

In [117]:
verbling_results_40 = compute_aggregated_feature_top_k(verbling, 40, 'language', 'gender_tuned', ['price', 'avg_rating', 'num_ratings', 'avg_lessons_per_students'], 'male', 'female')

In [118]:
verbling_results_all = compute_aggregated_feature_top_k(verbling, None, 'language', 'gender_tuned', ['price', 'avg_rating', 'num_ratings', 'avg_lessons_per_students'], 'male', 'female')

In [119]:
verbling_results = pd.concat([verbling_results_40, verbling_results_all])

In [120]:
verbling_results = pd.merge(verbling_results, average_number, how='left', left_on='language', right_on='Language')
verbling_results = verbling_results.rename(columns={'Average': 'average_num_teachers'})

In [121]:
verbling_results.to_csv('../data/results/features_analysis/gender/verbling.csv', sep=',')

In [122]:
verbling_results

Unnamed: 0,language,top_k,target_col,aggregation_col,mean_male,mean_female,median_male,median_female,count_male,count_female,variance_male,variance_female,statistic,p_value,Language,Average
0,Thai,18,price,gender_tuned,17.333333,18.200000,17.000000,17.000000,3,15,4.222222,8.560000,0.200000,1.000000,Thai,51.67
1,Thai,18,avg_rating,gender_tuned,4.990358,4.326513,4.994152,5.000000,3,15,0.000096,2.880090,0.333333,0.921569,Thai,51.67
2,Thai,18,num_ratings,gender_tuned,107.666667,54.533333,143.000000,6.000000,3,15,5786.888889,7176.915556,0.466667,0.571078,Thai,51.67
3,Thai,18,avg_lessons_per_students,gender_tuned,20.733333,13.500000,23.000000,11.100000,3,15,17.928889,116.926667,0.600000,0.272059,Thai,51.67
4,German,40,price,gender_tuned,30.436667,34.705263,30.000000,30.000000,21,19,70.078679,67.768920,0.318296,0.210598,German,388.67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,French,157,avg_lessons_per_students,gender_tuned,11.334375,12.961290,10.950000,12.500000,64,93,33.276943,44.922157,0.190020,0.110307,French,832.33
172,Korean,46,price,gender_tuned,22.428571,24.743750,25.000000,23.500000,14,32,24.959184,41.535586,0.191964,0.790171,Korean,175.00
173,Korean,46,avg_rating,gender_tuned,3.876069,4.350986,4.944928,4.993081,14,32,4.102226,2.708247,0.544643,0.003497,Korean,175.00
174,Korean,46,num_ratings,gender_tuned,148.428571,230.562500,33.000000,59.500000,14,32,35658.530612,133054.683594,0.160714,0.918684,Korean,175.00


In [105]:
verbling_results[verbling_results['p_value']<0.1]

Unnamed: 0,language,top_k,target_col,aggregation_col,mean_male,mean_female,median_male,median_female,count_male,count_female,variance_male,variance_female,statistic,p_value,Language_x,average_num_teachers,Language_y,Average
9,Hebrew,40,price,gender_tuned,24.578947,31.380952,24.0,31.0,19,21,68.34903,133.473923,0.413534,0.0451588,Hebrew,46.67,Hebrew,46.67
10,Hebrew,40,avg_rating,gender_tuned,4.953846,4.826316,5.0,4.9,19,21,,,0.413534,0.0451588,Hebrew,46.67,Hebrew,46.67
18,Persian,24,price,gender_tuned,8.785714,18.3,8.0,17.0,14,10,18.311224,22.81,0.757143,0.0009871225,Persian,54.33,Persian,54.33
23,Dutch,40,num_ratings,gender_tuned,21.555556,3.181818,7.0,3.0,19,21,,,0.370927,0.09498668,Dutch,73.67,Dutch,73.67
56,Japanese,40,num_ratings,gender_tuned,20.083333,8.625,16.0,6.0,13,27,,,0.470085,0.02984851,Japanese,410.33,Japanese,410.33
68,Thai,58,num_ratings,gender_tuned,3.888889,8.342857,2.0,5.0,9,49,10.320988,,0.44898,0.06644773,Thai,51.67,Thai,51.67
72,Portuguese,354,price,gender_tuned,10.673913,12.425926,8.0,13.0,138,216,36.654537,45.337106,0.18901,0.004091501,Portuguese,462.67,Portuguese,462.67
75,Hebrew,59,price,gender_tuned,24.862069,31.966667,24.0,30.0,29,30,59.084423,160.298889,0.328736,0.05386391,Hebrew,46.67,Hebrew,46.67
80,Hindi,119,num_ratings,gender_tuned,15.809524,16.244898,6.0,9.0,45,74,,,0.235135,0.0744852,Hindi,78.0,Hindi,78.0
84,Persian,24,price,gender_tuned,8.785714,18.3,8.0,17.0,14,10,18.311224,22.81,0.757143,0.0009871225,Persian,54.33,Persian,54.33
