In [3]:
from fairsearchcore.models import FairScoreDoc
import fairsearchcore as fsc
import pandas as pd
import numpy as np
import os
from copy import copy

In [4]:
def read_ranking(csv_path, index_col=0):
    df = pd.read_csv(csv_path, index_col=index_col)
    return df

def map_income(income):
    if income<=9036.8:
        return 'Low'
    else:
        return 'High'

def include_is_protected(df, protected_col: str, protected_values: list, output_col: str):
    df[output_col] = np.where(df[protected_col].isin(protected_values), True, False)
    return df

def generate_ranking(df, position_col: str, is_protected_col: str):
    ranking = []

    for index, row in df.iterrows():
        ranking.append(FairScoreDoc(row[position_col], row[position_col], row[is_protected_col]))

    return ranking


def is_fair_ranking(ranking, k=20, p=0.25, alpha=0.1):
    fair = fsc.Fair(k, p, alpha)
    return fair.is_fair(ranking)


def generate_p_values(start_value=0.5, end_value=0.95, distance=0.5):
    return list(np.arange(start_value * 100, end_value * 100, distance * 100) / 100)


def compute_min_p_fair(ranking, p_values, k=50, alpha=0.1):
    fair_results = {}
    for p_value in p_values:
        fair_results[p_value] = is_fair_ranking(ranking, k=k, p=p_value, alpha=alpha)

    max_p = 0

    for item in fair_results.items():
        if item[1] is True:
            max_p = item[0]
        else:
            break

    return max_p, fair_results

In [14]:
platforms = ['italki', 'preply', 'verbling']
results = pd.DataFrame(columns=['language', 'platform', 'k', 'max_p', 'original_proportion', 'alpha'])
duplicates_columns = {'italki': ['user_id', 'language'],
                     'preply': ['user_name', 'language'],
                     'verbling': ['first_name', 'last_name', 'language']}

for platform in ['preply']:
    path = '../data/results/final_dataframes/{}.csv'.format(platform)
    df = pd.read_csv(path, index_col=0)
    df['income_level'] = df['income_level'].apply(map_income)
    df = df.drop_duplicates(subset=duplicates_columns[platform])
    df = include_is_protected(df, "income_level", ['Low'], "is_nationality_protected")
    #languages = list(df.language.unique())
    languages = ['Spanish']
    for language in languages:
        print("{} - {}".format(platform, language))
        temp = copy(df[df['language']==language])

        ranking = generate_ranking(temp, 'position', 'is_nationality_protected')

        proportion = temp.is_nationality_protected.value_counts()[True]/len(temp) if True in temp.is_nationality_protected.value_counts().keys() else 0
        
        p_values = generate_p_values(0.10, 1, 0.1)

        if len(df) < 5:
            ks = [len(temp)]
        else:
            ks = [3, 5] + [i for i in range(10, min(len(temp), 99)+1, 30)] + [min(100, len(temp))]
            if len(ks)==2: # If df is shorter than 10, we include the whole dataframe
                ks.append(len(temp))

        for k in ks:
            alpha = 0.1
            max_p, fair_results = compute_min_p_fair(ranking[:k], p_values, k=k, alpha=alpha)
            results = results.append({'language': language, 'platform': platform, 'k': k, 'max_p': max_p, 'original_proportion': proportion, 'alpha': alpha}, ignore_index=True)

results.to_csv('../data/results/nationality_fair_analysis.csv')

italki - Thai




italki - German
italki - Portuguese
italki - Hebrew
italki - Hindi
italki - Romanian
italki - Chinese (Mandarin)
italki - Dutch
italki - Indonesian
italki - Italian
italki - Russian
italki - Greek
italki - English
italki - Polish
italki - Arabic
italki - Turkish
italki - Vietnamese
italki - Japanese
italki - Spanish
italki - French
italki - Korean
italki - Persian (Farsi)
preply - Thai
preply - German
preply - Portuguese
preply - Hebrew
preply - Hindi
preply - Romanian
preply - Persian
preply - Dutch
preply - Indonesian
preply - Italian
preply - Russian
preply - Greek
preply - English
preply - Polish
preply - Arabic
preply - Turkish
preply - Vietnamese
preply - Chinese
preply - Japanese
preply - Spanish
preply - French
preply - Korean
verbling - Thai
verbling - German
verbling - Portuguese
verbling - Hebrew
verbling - Hindi
verbling - Romanian
verbling - Persian
verbling - Dutch
verbling - Indonesian
verbling - Italian
verbling - Russian
verbling - Greek
verbling - English
verbling - P

In [15]:
temp.is_nationality_protected.value_counts()

False    47
Name: is_nationality_protected, dtype: int64

In [16]:
df

Unnamed: 0,language,position,retrieval_date,is_featured,first_name,last_name,url,nationality,location,avg_rating,...,probability,count,price_detail,price,probability_male,probability_female,gender_tuned,income_level,Code,is_nationality_protected
0,Thai,18,20201025,False,Wilailuk,Koinueng,https://www.verbling.com/teachers/232945035364...,TH,Asia/Bangkok,4.990291,...,1.00,26.0,"[{'num_lessons': 1, 'price': 17.0}, {'num_less...",17.0,0.00,1.00,female,Low,TH,True
1,Thai,5,20201025,False,Nattaporn,Claycham,https://www.verbling.com/teachers/496562010199...,TH,Asia/Bangkok,5.000000,...,0.72,137.0,"[{'num_lessons': 1, 'price': 17.0}, {'num_less...",17.0,0.28,0.72,female,Low,TH,True
2,Thai,4,20201025,False,Kruu Cherry,,https://www.verbling.com/teachers/kruucherry,TH,Asia/Bangkok,5.000000,...,0.95,499.0,"[{'num_lessons': 1, 'price': 25.0}, {'num_less...",25.0,0.05,0.95,female,Low,TH,True
3,Thai,16,20201025,False,Matika,Hongsranont,https://www.verbling.com/teachers/717376443054...,TH,Asia/Bangkok,5.000000,...,0.78,9.0,"[{'num_lessons': 1, 'price': 15.0}, {'num_less...",15.0,0.22,0.78,female,Low,TH,True
4,Thai,11,20201025,False,Weerin,Chaiariyakul,https://www.verbling.com/teachers/874241204366...,TH,Asia/Bangkok,5.000000,...,0.75,4.0,"[{'num_lessons': 1, 'price': 18.0}, {'num_less...",18.0,0.25,0.75,female,Low,TH,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2172,Korean,10,20201025,False,JungYeun,Park,https://www.verbling.com/teachers/jyisadreamer,KR,US/Pacific,0.000000,...,1.00,,"[{'num_lessons': 1, 'price': 45.0}, {'num_less...",45.0,0.00,1.00,female,High,KR,False
2174,Korean,6,20201025,False,Euisook,Kim,https://www.verbling.com/teachers/692982640680...,KR,Asia/Seoul,5.000000,...,1.00,,"[{'num_lessons': 1, 'price': 23.0}, {'num_less...",23.0,0.00,1.00,female,High,KR,False
2176,Korean,41,20201025,False,Heeon,Kim,https://www.verbling.com/teachers/680444100920...,KR,Asia/Qatar,5.000000,...,1.00,,"[{'num_lessons': 1, 'price': 30.0}, {'num_less...",30.0,0.00,1.00,female,High,KR,False
2178,Korean,46,20201025,False,안,영은,https://www.verbling.com/teachers/871850328127...,KR,Europe/Prague,5.000000,...,1.00,,"[{'num_lessons': 1, 'price': 20.0}, {'num_less...",20.0,0.00,1.00,female,High,KR,False


In [19]:
results.head(50)

Unnamed: 0,language,platform,k,max_p,original_proportion,alpha
0,Thai,italki,3,0.9,0.974684,0.1
1,Thai,italki,5,0.9,0.974684,0.1
2,Thai,italki,10,0.9,0.974684,0.1
3,Thai,italki,40,0.9,0.974684,0.1
4,Thai,italki,70,0.9,0.974684,0.1
5,Thai,italki,79,0.9,0.974684,0.1
6,German,italki,3,0.6,0.0825,0.1
7,German,italki,5,0.4,0.0825,0.1
8,German,italki,10,0.2,0.0825,0.1
9,German,italki,40,0.1,0.0825,0.1
