In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.graphics.regressionplots import abline_plot
import seaborn as sns

In [2]:
from googletrans import Translator

In [3]:
# Read both embedding tables and replace the raw census share with a
# "relative" share: female percentage minus male percentage (x - (100 - x)).
eng_embed = pd.read_excel("data/eng_norm.xlsx")
eng_female_pct = eng_embed['Census Female Perc']
eng_embed = (
    eng_embed
    .assign(**{'Relative Female Perc': eng_female_pct - (100 - eng_female_pct)})
    .drop(columns=["Tensor Difference", 'Census Female Perc', "Unnamed: 3"])
)

hin_embed = pd.read_excel("data/hin_norm.xlsx")
# The Hindi sheet is scaled by 100 before use so that it is a percentage too.
hin_female_pct = hin_embed['Census Female Perc'] * 100
hin_embed = (
    hin_embed
    .assign(**{'Relative Female Perc': hin_female_pct - (100 - hin_female_pct)})
    .drop(columns=["Tensor Difference", 'Census Female Perc', "Unnamed: 3"])
)

In [4]:
# Keep the original job titles in a separate column before 'Job' is overwritten.
hin_embed = hin_embed.assign(jobHindi=hin_embed['Job'])

In [5]:
# Translate each preserved job title one at a time, echoing the result so
# progress is visible, and collect the translations in row order.
# NOTE(review): translate() is called with library defaults for source and
# target language; presumably this yields English text -- confirm.
translator = Translator()
jobTranslated = []
for hindi_title in hin_embed['jobHindi']:
    english_title = translator.translate(hindi_title).text
    print(english_title)
    jobTranslated.append(english_title)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
# Replace 'Job' with the lower-cased translations (one per row, same order).
hin_embed['Job'] = list(map(str.lower, jobTranslated))

In [None]:
# Series.str.lower() returns a new Series rather than modifying in place,
# so the result has to be assigned back for the normalisation to take effect.
hin_embed['Job'] = hin_embed['Job'].str.lower()
eng_embed['Job'] = eng_embed['Job'].str.lower()

In [None]:
# Sanity check on the translated titles: only the last expression of a cell is
# rendered, so the element type is printed explicitly before showing the column.
print(type(hin_embed['Job'].loc[1]))
hin_embed['Job']

In [None]:
# Normalise the English job titles to lower case, then show the column.
eng_embed = eng_embed.assign(Job=eng_embed['Job'].str.lower())
eng_embed['Job']

In [None]:
# Print every English job title that occurs verbatim inside at least one
# translated Hindi job title.
#   regex=False -- titles are plain text; characters such as '(' or '+' must
#                  not be interpreted as regular-expression syntax.
#   na=False    -- a missing Hindi title counts as "no match" instead of NaN.
# Missing English titles are skipped because they cannot be searched for.
for item in eng_embed['Job'].dropna():
    if hin_embed['Job'].str.contains(item, regex=False, na=False).any():
        print(item)

In [None]:
# For each translated Hindi job title, build the set of its space-separated
# words minus a list of filler tokens, and pair it with the row label(s) in
# hin_embed carrying that exact title. Each pair is printed and collected.
hin_filler_words = ('&', 'and', 'any', '-', 'of', 'or', 'related', 'other',
                    'workers', 'operators', 'laborers', 'agents')
hin_sets = []
for title in hin_embed.Job:
    words = set(title.split(" "))
    for filler in hin_filler_words:
        words.discard(filler)
    rows = hin_embed.loc[hin_embed['Job'] == title].index
    print(words, rows)
    hin_sets.append((words, rows))

In [None]:
# For each English job title, build the set of its space-separated words minus
# a list of filler tokens, print it, and store it together with the row
# label(s) in eng_embed carrying that exact title.
eng_filler_words = ('&', 'and', 'any', '-', 'all', 'other', 'workers', 'of',
                    'or', 'operators', 'laborers', 'agents')
eng_sets = []
for title in eng_embed.Job:
    words = set(title.split(" "))
    for filler in eng_filler_words:
        words.discard(filler)
    print(words)
    rows = eng_embed.loc[eng_embed['Job'] == title].index
    eng_sets.append((words, rows))

In [None]:
# List candidate matches for manual review: every Hindi/English pair whose
# word sets share at least one word is printed, with a separator line after
# each Hindi occupation.
common_occ_sim = []
for hin_words, hin_rows in hin_sets:
    for eng_words, eng_rows in eng_sets:
        if hin_words.isdisjoint(eng_words):
            continue
        print('hindi:')
        print(hin_words, hin_rows)
        print('english:')
        print(eng_words, eng_rows)
        print("----")
    print('******************************************')

In [None]:
# Hand-curated mapping built by inspecting hin_sets and eng_sets.
# Key: row label in hin_embed  ->  value: row label in eng_embed.
# Several Hindi rows deliberately point at the same English row.
equivalents = {
    3: 340, 9: 48, 12: 156, 14: 71, 17: 299, 19: 51, 20: 16, 23: 287,
    25: 377, 30: 13, 33: 112, 37: 44, 39: 322, 40: 180, 45: 42, 46: 252,
    47: 277, 49: 44, 51: 52, 57: 376, 61: 54, 63: 391, 65: 155, 66: 76,
    68: 62, 70: 341, 71: 303, 79: 61, 80: 255, 82: 268, 84: 305, 85: 327,
    86: 233, 93: 19, 94: 108, 98: 302, 101: 299, 103: 334, 111: 387,
    113: 380, 119: 170, 125: 186, 126: 3, 127: 61, 131: 62, 132: 93, 138: 315,
    139: 200, 140: 183, 143: 315, 145: 211, 148: 12, 150: 360, 152: 163, 154: 370,
    159: 293, 161: 12, 162: 29, 165: 386, 169: 1, 173: 352, 185: 325, 190: 237,
    195: 140, 222: 75, 230: 68, 231: 99, 233: 341, 243: 164, 246: 266, 254: 401,
    255: 362, 265: 353, 268: 273, 269: 253, 271: 210, 275: 174, 279: 91, 281: 176,
    282: 89, 284: 337, 291: 32, 292: 243, 302: 251, 307: 202, 320: 364, 325: 222,
    327: 314,
}

In [None]:
# Split the mapping into two parallel lists (same order as the dict):
# hin_eq[i] is the Hindi row label, eng_eq[i] the matching English row label.
hin_eq = list(equivalents.keys())
eng_eq = list(equivalents.values())

In [None]:
# Select the matched rows by label, in the order of the mapping lists (labels
# that appear more than once in a list yield repeated rows).
# `.loc` is used because the `.ix` indexer no longer exists in pandas >= 1.0.
eng_df = eng_embed.loc[eng_eq]
hin_df = hin_embed.loc[hin_eq]

In [None]:
# Plot-ready Hindi frame: drop helper columns, tag the language, and expose
# the 'Cosine Distance' values under the name 'Cosine Similarity'
# (added as the last column, after 'Language').
hin_df_plot = (
    hin_df
    .drop(columns=['Unnamed: 4', 'jobHindi'])
    .assign(**{'Language': 'Hindi',
               'Cosine Similarity': hin_df['Cosine Distance']})
    .drop(columns='Cosine Distance')
)
hin_df_plot.head()

In [None]:
# Plot-ready English frame: drop the helper column, tag the language, and
# expose the 'Cosine Distance' values under the name 'Cosine Similarity'
# (added as the last column, after 'Language').
eng_df_plot = (
    eng_df
    .drop(columns='Unnamed: 4')
    .assign(**{'Language': 'English',
               'Cosine Similarity': eng_df['Cosine Distance']})
    .drop(columns='Cosine Distance')
)
eng_df_plot.head()

In [None]:
# Pearson correlation between the numeric columns of the English frame.
# Text columns ('Job', 'Language') are excluded explicitly: recent pandas
# versions raise on non-numeric columns instead of silently skipping them.
eng_df_plot.select_dtypes(include='number').corr(method='pearson')

In [None]:
# Pearson correlation between the numeric columns of the Hindi frame.
# Text columns ('Job', 'Language') are excluded explicitly: recent pandas
# versions raise on non-numeric columns instead of silently skipping them.
hin_df_plot.select_dtypes(include='number').corr(method='pearson')

In [None]:
# Stack the English and Hindi frames and draw one regression line per language
# of cosine similarity against relative female percentage.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
concatenated = pd.concat([eng_df_plot, hin_df_plot])
e = sns.lmplot(x='Relative Female Perc', y='Cosine Similarity', data=concatenated,
                hue='Language')
fig = e.fig
fig.suptitle("Embedding Bias for Equivalent Occupations", fontsize=20);
fig.set_size_inches(10,6)
# Save through the figure handle so the file always contains this plot,
# independent of which figure pyplot currently considers "active".
fig.savefig('equiv_occ_biasplot.png')

In [None]:
# Inspect the Hindi similarity of the last mapped occupation. Indexing through
# hin_eq keeps this cell runnable top-to-bottom on a fresh kernel.
hin_df_plot.loc[hin_eq[-1], 'Cosine Similarity']

In [None]:
# Pair up the English and Hindi cosine values of each matched occupation.
# Row i of eng_df_plot / hin_df_plot corresponds to eng_eq[i] / hin_eq[i], so
# the two columns are aligned by position. A label lookup is avoided on
# purpose: some English labels occur several times in eng_eq, and looking one
# of them up by label returns several rows instead of a single value.
cos_table = pd.DataFrame({
    'English cosine': eng_df_plot['Cosine Similarity'].values,
    'Hindi cosine': hin_df_plot['Cosine Similarity'].values,
})

In [None]:
# Some English rows are matched by more than one Hindi occupation, so their
# label occurs several times in eng_eq. For exactly those positions, write the
# English cosine value read straight from eng_embed (whose row labels are
# unique), so that every position sharing an English row receives the same,
# data-derived number rather than a hand-typed constant.
for pos, eng_label in enumerate(eng_eq):
    if eng_eq.count(eng_label) > 1:
        cos_table.loc[pos, 'English cosine'] = eng_embed.loc[eng_label, 'Cosine Distance']

In [None]:
# Make sure both cosine columns are plain floats before plotting/correlating.
cos_table = cos_table.astype('float64')

In [None]:
# Scatter plot with fitted regression line: English vs. Hindi cosine values.
cos_plot = sns.regplot(data=cos_table, x='English cosine', y='Hindi cosine')
cos_plot.set_title('English vs Hindi cosine similarity')

In [None]:
# Correlation matrix of the two cosine columns (DataFrame.corr defaults to Pearson).
cos_table.corr()