In [1]:
import pandas as pd

In [2]:
def map_income(income, threshold=9036.8):
    """Classify a numeric income value as 'Low' or 'High'.

    Parameters
    ----------
    income : number
        Value to classify.
    threshold : number, default 9036.8
        Inclusive upper bound of the 'Low' class. The default preserves the
        cut-off this notebook has always used.
        NOTE(review): the origin/meaning of 9036.8 is not stated in this
        notebook -- confirm and document the source.

    Returns
    -------
    str
        'Low' when ``income <= threshold``, otherwise 'High'.

    Notes
    -----
    A missing value (NaN) fails the ``<=`` comparison and is therefore
    labelled 'High'; filter missing incomes beforehand if that is not wanted.
    """
    return 'Low' if income <= threshold else 'High'

In [3]:
# Read the per-language table (';' field separator, ',' decimal mark) and
# keep only the two columns used by the merges further down.
average_number = pd.read_csv(
    '../../TABLAS LATEX/languages.csv',
    sep=';',
    decimal=',',
).loc[:, ['Language', 'Average']]
average_number

Unnamed: 0,Language,Average
0,English,5607.0
1,Spanish,2348.33
2,French,832.33
3,Chinese,770.0
4,Italian,579.0
5,Russian,560.33
6,Portuguese,462.67
7,Arabic,418.67
8,Japanese,410.33
9,German,388.67


# Preply

In [11]:
# Load the Preply frame and keep the first row of every
# (user_name, language) pair.
preply = (
    pd.read_csv('../data/results/final_dataframes/preply.csv', index_col=0)
    .drop_duplicates(subset=['user_name', 'language'])
)

In [12]:
# Left-join the per-language 'Average' onto every Preply row and expose it
# under the name 'average_num_teachers'.
preply = (
    preply
    .merge(average_number, how='left', left_on='language', right_on='Language')
    .rename(columns={'Average': 'average_num_teachers'})
)

In [13]:
# Replace the numeric income with its 'Low'/'High' label. Running this cell
# twice would feed the labels back into map_income, so run it once per load.
preply['income_level'] = preply['income_level'].map(map_income)

In [None]:
# Preview the first rows of the Preply frame.
preply.head()

In [7]:
# List the columns of the Preply frame.
# NOTE(review): the saved output of this cell lacks 'Language' and
# 'average_num_teachers', so it appears to predate the merge cell above --
# re-run the notebook top to bottom to refresh it.
preply.columns

Index(['language', 'position', 'retrieval_date', 'is_featured', 'user_name',
       'url', 'nationality_full', 'avg_rating', 'num_ratings', 'teaches',
       'subjects', 'speaks', 'lessons', 'price', 'price_currency',
       'avatar_url', 'nationality', 'clean_name', 'sanitized_name', 'gender',
       'probability', 'count', 'first_name', 'probability_male',
       'probability_female', 'gender_tuned', 'income_level', 'Code'],
      dtype='object')

In [15]:
# Keep only rows whose language has 'average_num_teachers' of at least 100;
# rows without a match in the merge (NaN) are dropped as well.
preply = preply.loc[preply['average_num_teachers'].ge(100)]

In [26]:
from scipy.stats import ks_2samp
import numpy as np

def hypothesis_test(group1, group2, alpha=0.1):
    """Run a two-sample Kolmogorov-Smirnov test on two samples.

    Parameters
    ----------
    group1, group2 : array-like
        The two samples handed to ``scipy.stats.ks_2samp``.
    alpha : float, default 0.1
        Kept for backward compatibility only. The previous implementation
        compared ``p_value`` against it but returned the same pair on both
        branches, so it never influenced the result; callers that need a
        significance decision should compare the returned p-value themselves.

    Returns
    -------
    tuple
        ``(statistic, p_value)`` as produced by ``ks_2samp``.
    """
    st, p_value = ks_2samp(group1, group2)
    return st, p_value
    
def compute_aggregated_feature_top_k(df, top_k, language_col, aggregation_col1, aggregation_col2, target_cols, group1_1, group1_2, group2_1, group2_2):
    """Summarise target columns per language for the four crossed groups.

    For every distinct value of ``language_col`` the rows are ordered by the
    'position' column (ascending). For every column in ``target_cols`` the
    values are coerced to numbers, rows without a usable value are dropped,
    and the first ``top_k`` remaining rows are kept. Those rows are split
    into four groups by ``aggregation_col1`` x ``aggregation_col2`` and the
    mean and the median of the target are reported for each group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'position', ``language_col``, both aggregation columns
        and every column named in ``target_cols``.
    top_k : int or None
        Number of best-positioned rows to keep per language and target;
        ``None`` keeps all rows.
    language_col : str
        Column whose distinct values define the per-language slices.
    aggregation_col1, aggregation_col2 : str
        Columns used to form the four groups.
    target_cols : list
        Columns to aggregate.
    group1_1, group1_2 : object
        Values of ``aggregation_col1``; they are reported under the fixed
        labels 'High' and 'Low' respectively.
    group2_1, group2_2 : object
        Values of ``aggregation_col2``; they are reported under the fixed
        labels 'Men' and 'Women' respectively.

    Returns
    -------
    pandas.DataFrame
        Two rows (measure 'mean', then 'median') per language and target,
        with columns 'language', 'top_k' (number of rows actually used),
        'target_col', 'measure', the four group values and the four
        'count_*' group sizes. Empty groups are reported as NaN.

    Notes
    -----
    Differences from the earlier implementation:

    * rows are collected in a list and turned into a frame once, because
      ``DataFrame.append`` no longer exists in pandas 2.x;
    * every target starts from the full per-language slice, so filtering done
      for one target no longer leaks into the next one;
    * numeric coercion happens before the missing-value filter, so values
      that cannot be parsed do not occupy a top-k slot or turn a group
      mean into NaN.
    """
    value_labels = ['High|Men', 'High|Women', 'Low|Men', 'Low|Women']
    count_labels = ['count_' + label for label in value_labels]
    columns = ['language', 'top_k', 'target_col', 'measure'] + value_labels + count_labels

    # Same order as value_labels: (col1 value, col2 value) for each group.
    group_keys = [
        (group1_1, group2_1),
        (group1_1, group2_2),
        (group1_2, group2_1),
        (group1_2, group2_2),
    ]
    measures = (('mean', np.mean), ('median', np.median))

    records = []
    for lang in df[language_col].unique():
        ranked = df[df[language_col] == lang].sort_values(by='position', ascending=True)

        for target in target_cols:
            # Work on a private copy so neither the caller's frame nor the
            # slice used by the next target is modified.
            subset = ranked.copy()
            subset[target] = pd.to_numeric(subset[target], errors='coerce')
            subset = subset.dropna(subset=[target])
            if top_k is not None:
                subset = subset.head(top_k)

            groups = [
                subset.loc[
                    (subset[aggregation_col1] == key1) & (subset[aggregation_col2] == key2),
                    target,
                ].to_numpy()
                for key1, key2 in group_keys
            ]
            counts = [len(values) for values in groups]

            for measure_name, measure_fn in measures:
                row = {
                    'language': lang,
                    'top_k': len(subset),
                    'target_col': target,
                    'measure': measure_name,
                }
                for label, values in zip(value_labels, groups):
                    # An empty group has no mean/median; report it as missing.
                    row[label] = measure_fn(values) if len(values) else np.nan
                row.update(zip(count_labels, counts))
                records.append(row)

    return pd.DataFrame(records, columns=columns)

In [27]:
# Mean/median price of the 100 best-positioned Preply rows per language,
# split by income level and tuned gender.
preply_results_100 = compute_aggregated_feature_top_k(
    df=preply,
    top_k=100,
    language_col='language',
    aggregation_col1='income_level',
    aggregation_col2='gender_tuned',
    target_cols=['price'],
    group1_1='High',
    group1_2='Low',
    group2_1='male',
    group2_2='female',
)

In [30]:
# Display the Preply summary table.
preply_results_100

Unnamed: 0,language,top_k,target_col,measure,High|Men,High|Women,Low|Men,Low|Women,count_High|Men,count_High|Women,count_Low|Men,count_Low|Women
0,German,100,price,mean,27.588235,27.5,28.8,15.7143,34,54,5,7
1,German,100,price,median,28.0,25.5,21.0,17.0,34,54,5,7
2,Portuguese,100,price,mean,15.785714,15.0,11.55,13.0,14,26,20,40
3,Portuguese,100,price,median,17.0,16.5,12.5,13.0,14,26,20,40
4,Italian,100,price,mean,18.1,18.514706,,15.0,30,68,0,2
5,Italian,100,price,median,17.0,17.5,,15.0,30,68,0,2
6,Russian,100,price,mean,17.0625,15.472727,14.3333,16.1923,16,55,3,26
7,Russian,100,price,median,14.0,15.0,13.0,15.0,16,55,3,26
8,English,100,price,mean,15.956522,16.3,11.0,11.7143,23,40,9,28
9,English,100,price,median,16.0,16.0,8.0,12.0,23,40,9,28


In [32]:
# Persist the Preply summary table as CSV (the index is written too).
preply_results_100.to_csv('../data/results/features_analysis/combined/preply.csv')

# Italki

In [33]:
# Load the Italki frame and keep the first row of every
# (user_id, language) pair.
italki = (
    pd.read_csv('../data/results/final_dataframes/italki.csv', index_col=0)
    .drop_duplicates(subset=['user_id', 'language'])
)

In [34]:
# Left-join the per-language 'Average' onto every Italki row and expose it
# under the name 'average_num_teachers'.
italki = (
    italki
    .merge(average_number, how='left', left_on='language', right_on='Language')
    .rename(columns={'Average': 'average_num_teachers'})
)

In [35]:
# Replace the numeric income with its 'Low'/'High' label. Running this cell
# twice would feed the labels back into map_income, so run it once per load.
italki['income_level'] = italki['income_level'].map(map_income)

In [None]:
# Preview the first rows of the Italki frame.
italki.head()

In [37]:
# Keep only rows whose language has 'average_num_teachers' of at least 100;
# rows without a match in the merge (NaN) are dropped as well.
italki = italki.loc[italki['average_num_teachers'].ge(100)]

In [38]:
# List the columns of the Italki frame.
italki.columns

Index(['position', 'retrieval_date', 'user_id', 'user_name',
       'avatar_file_name', 'video_picture', 'is_pro', 'nationality', 'teaches',
       'also_speaks', 'in_platform_since', 'rating', 'number_sessions',
       'price', 'price_time', 'price_currency', 'clean_name', 'sanitized_name',
       'gender', 'probability', 'count', 'language', 'probability_male',
       'probability_female', 'gender_tuned', 'income_level', 'Code',
       'Language', 'average_num_teachers'],
      dtype='object')

In [41]:
# Mean/median price of the 100 best-positioned Italki rows per language,
# split by income level and tuned gender.
italki_results_100 = compute_aggregated_feature_top_k(
    df=italki,
    top_k=100,
    language_col='language',
    aggregation_col1='income_level',
    aggregation_col2='gender_tuned',
    target_cols=['price'],
    group1_1='High',
    group1_2='Low',
    group2_1='male',
    group2_2='female',
)

In [42]:
# Display the Italki summary table.
italki_results_100

Unnamed: 0,language,top_k,target_col,measure,High|Men,High|Women,Low|Men,Low|Women,count_High|Men,count_High|Women,count_Low|Men,count_Low|Women
0,German,100,price,mean,2273.089286,2375.0,836.0,1450.0,56,38,2,4
1,German,100,price,median,1800.0,2050.0,836.0,1500.0,56,38,2,4
2,Portuguese,100,price,mean,1429.444444,1857.142857,947.514,1090.19,9,7,37,47
3,Portuguese,100,price,median,1400.0,1700.0,800.0,1000.0,9,7,37,47
4,Italian,100,price,mean,1630.26087,1599.980769,,1250.0,46,52,0,2
5,Italian,100,price,median,1500.0,1500.0,,1250.0,46,52,0,2
6,Russian,100,price,mean,1212.222222,1256.808511,658.25,1124.84,18,47,4,31
7,Russian,100,price,median,1160.0,1100.0,666.5,1100.0,18,47,4,31
8,English,100,price,mean,1736.854167,1766.071429,1590.0,1302.78,48,28,6,18
9,English,100,price,median,1600.0,1650.0,1325.0,1200.0,48,28,6,18


In [43]:
# Persist the Italki summary table as CSV (the index is written too).
italki_results_100.to_csv('../data/results/features_analysis/combined/italki.csv')

# Verbling

In [44]:
# Load the Verbling frame and keep the first row of every
# (first_name, last_name, language) triple.
verbling = (
    pd.read_csv('../data/results/final_dataframes/verbling.csv', index_col=0)
    .drop_duplicates(subset=['first_name', 'last_name', 'language'])
)

In [45]:
# Left-join the per-language 'Average' onto every Verbling row and expose it
# under the name 'average_num_teachers'.
verbling = (
    verbling
    .merge(average_number, how='left', left_on='language', right_on='Language')
    .rename(columns={'Average': 'average_num_teachers'})
)

In [46]:
# Replace the numeric income with its 'Low'/'High' label. Running this cell
# twice would feed the labels back into map_income, so run it once per load.
verbling['income_level'] = verbling['income_level'].map(map_income)

In [None]:
# Preview the first rows of the Verbling frame.
verbling.head()

In [48]:
# Keep only rows whose language has 'average_num_teachers' of at least 100;
# rows without a match in the merge (NaN) are dropped as well.
verbling = verbling.loc[verbling['average_num_teachers'].ge(100)]

In [49]:
# List the columns of the Verbling frame.
verbling.columns

Index(['language', 'position', 'retrieval_date', 'is_featured', 'first_name',
       'last_name', 'url', 'nationality', 'location', 'avg_rating',
       'avg_lessons_per_students', 'num_ratings', 'teaching_levels', 'teaches',
       'class_details', 'speaks', 'lessons', 'students', 'dialect',
       'price_currency', 'avatar_url', 'clean_name', 'sanitized_name',
       'gender', 'probability', 'count', 'price_detail', 'price',
       'probability_male', 'probability_female', 'gender_tuned',
       'income_level', 'Code', 'Language', 'average_num_teachers'],
      dtype='object')

In [50]:
# Mean/median price of the 100 best-positioned Verbling rows per language,
# split by income level and tuned gender.
verbling_results_100 = compute_aggregated_feature_top_k(
    df=verbling,
    top_k=100,
    language_col='language',
    aggregation_col1='income_level',
    aggregation_col2='gender_tuned',
    target_cols=['price'],
    group1_1='High',
    group1_2='Low',
    group2_1='male',
    group2_2='female',
)

In [51]:
# Persist the Verbling summary table as CSV (the index is written too).
verbling_results_100.to_csv('../data/results/features_analysis/combined/verbling.csv')

In [52]:
# Display the Verbling summary table.
verbling_results_100

Unnamed: 0,language,top_k,target_col,measure,High|Men,High|Women,Low|Men,Low|Women,count_High|Men,count_High|Women,count_Low|Men,count_Low|Women
0,German,67,price,mean,29.307879,35.893824,,,33,34,0,0
1,German,67,price,median,29.0,33.0,,,33,34,0,0
2,Portuguese,89,price,mean,22.0,18.714286,13.9626,15.6,5,14,27,43
3,Portuguese,89,price,median,22.0,19.5,13.0,15.0,5,14,27,43
4,Italian,100,price,mean,18.998788,21.549254,,,33,67,0,0
5,Italian,100,price,median,19.0,20.0,,,33,67,0,0
6,Russian,89,price,mean,15.554444,16.538125,16.3333,15.2077,9,64,3,13
7,Russian,89,price,median,15.0,15.0,16.0,14.0,9,64,3,13
8,English,100,price,mean,22.96102,25.464048,12.5,15.0,49,42,3,6
9,English,100,price,median,21.0,24.6,14.5,14.5,49,42,3,6


In [53]:
# Persist the Verbling summary table as CSV.
# NOTE(review): the same to_csv call, with the same path, already appears in
# an earlier cell (In [51]); one of the two writes is redundant.
verbling_results_100.to_csv('../data/results/features_analysis/combined/verbling.csv')