In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')

import pandas as pd
from pandas import DataFrame 
import json
import random
import re

In [2]:
# Load the saved result pages hypia20210810/1.json ... 12.json. Each page that
# carries an item list contributes that list to `results`; a page with neither
# 'items' nor 'data' in its result is skipped.
results = []

for i in range(1, 13):
    filename = 'hypia20210810/{}.json'.format(i)
    # JSON text is UTF-8; pass the encoding explicitly so non-ASCII characters
    # load the same way regardless of the platform's locale default.
    with open(filename, 'r', encoding='utf-8') as file:
        result = json.load(file)['result']
    if 'items' in result:
        results.append(result['items'])
    elif 'data' in result:  # applies only for page 1
        results.append(result['data']['items'])

len(results)

12

In [3]:
# One entry per profile across all pages: the profile's short description
# broken into its individual lines.
shortDescs = []
for page in results:
    for profile in page:
        shortDescs.append(profile['shortDescription'].split('\n'))

# Pull one description at random to eyeball the raw format.
sampleIndex = random.randrange(len(shortDescs))
sample = shortDescs[sampleIndex]

sample

['Name: Jaqueline Cassemiro',
 'Nationality or Ethnicity: Brazilian',
 'Where do you live?: São Paulo - Brazil ',
 'Languages: Portuguese, French, German, Italian, Spanish, English']

In [4]:
# Maps variant / misspelled field labels onto the canonical label.
keyFixes = {
    'Where do you live': 'Where do you live?',
    'Nationality': 'Nationality or Ethnicity',
    'Ethnicity': 'Nationality or Ethnicity',
    'Natuionality or Ethnicity': 'Nationality or Ethnicity',
    'Name and Title': 'Name',
    'Where do you live ?': 'Where do you live?',
    'Currently living in': 'Where do you live?'
}

# Field labels whose lines are dropped entirely.
ignoreKeys = ['*Alternatively', 'Rare language', 'Representative of rare language', 'Language Family', '(Her Youtube channel']


# Field labels whose values are folded into the single 'Languages' field.
#derived from uncleaned data using this code
#     colCounts = rawData.count().sort_values(ascending=False)
#     colCounts[colCounts<100].index
# NOTE(review): 'Role at HYPIA' does not look like a language heading, yet its
# value is merged into 'Languages' like the others -- confirm this is intended.
languageKeys = ['Conversational', 'Role at HYPIA', 'Mother Tongue', 'Fluent (C level)',
       'Learning', 'Fluent', 'Conversant', 'Old languages', 'Basic', 'Read',
       'Currently studying (basic level)', 'Basic (mainly written)', 'Reading',
       'Also a speaker of', 'Rare Language', 'Medial (B level)',
       'Some phrases', 'Family Language', 'Beginner in', 'Currently learning',
       '[Fluent]', 'Baza (A level)', 'Level A to B',
       'In a lesser degree but with a good basic knowledge anyway',
       'Colloquial ability', 'Reading Proficiency',
       'Upper-Intermediate (B2 level)', 'Lower-Intermediate (B1 level)',
       'Basic (A2 level)', 'Beginner (A1 level)', 'Broadcast ability',
       'Interview ability', 'Basic ability', 'Beginner',
       'Languages (In order of Fluency)', '• Native Languages',
       'English (nor Italian)', 'languages above',
       '• Languages of antiquity that I can write/read', 'Native Language',
       'Fluent Languages', '[Conversant]']

def processDescription(sd):
    """Parse one profile description into a ``{field label: value}`` dict.

    Parameters
    ----------
    sd : iterable of str
        The lines of a single description, e.g. ``'Name: Jane Doe'``.

    Returns
    -------
    dict
        One entry per recognised field. For each line containing ``': '``:

        * the text before the first ``': '`` is the label (stripped, then
          normalised through ``keyFixes``); the rest of the line is the value;
        * labels in ``ignoreKeys`` are dropped;
        * labels in ``languageKeys`` have their value appended to the
          ``'Languages'`` entry, joined with ``', '``;
        * any other label is stored under its own name (a repeated label
          overwrites the earlier value).

        Lines without ``': '`` are skipped.
    """
    output = {}
    for line in sd:
        if ': ' not in line:
            # The previous implementation tried to append such lines to the
            # last stored field, but it only ever appended an empty string and
            # crashed (UnboundLocalError / KeyError) when no plain field had
            # been stored yet. Skipping keeps the results identical without
            # the crash.
            # NOTE(review): the text of separator-less lines is discarded;
            # decide whether it should be kept as a continuation.
            continue

        key, *value = line.split(': ')
        key = key.strip()
        key = keyFixes.get(key, key)
        if key in ignoreKeys:
            continue

        # Re-join so that a value which itself contains ': ' stays intact.
        text = ': '.join(value)
        if key in languageKeys:
            if 'Languages' in output:
                output['Languages'] = output['Languages'] + ', ' + text
            else:
                output['Languages'] = text
        else:
            output[key] = text
    return output
        
# Sanity check: parse the randomly chosen sample description and display the result.
processDescription(sample)

{'Name': 'Jaqueline Cassemiro',
 'Nationality or Ethnicity': 'Brazilian',
 'Where do you live?': 'São Paulo - Brazil ',
 'Languages': 'Portuguese, French, German, Italian, Spanish, English'}

In [40]:
# Parse every description into one row per profile.
semiCleanData = DataFrame([processDescription(sd) for sd in shortDescs])

# Rename by source label rather than by position: the column order of a frame
# built from dicts follows the order in which keys are first seen, so assigning
# a bare list of names would silently mislabel columns if that order changed.
# The explicit selection fixes the column order and raises KeyError if an
# expected field is missing from the data altogether.
semiCleanData = semiCleanData.rename(columns={
    'Name': 'name',
    'Languages': 'languages',
    'Nationality or Ethnicity': 'nationality',
    'Where do you live?': 'location',
})[['name', 'languages', 'nationality', 'location']]

semiCleanData

Unnamed: 0,name,languages,nationality,location
0,Usman W. Chohan,"Urdu, Brazilian Portuguese, Spanish, English, ...",,
1,Matias Barmat,"Spanish (native), English (C1), Catalan (C1), ...",Argentina /,Buenos Aires
2,Eduardo Teiga,"Portuguese, Spanish, Catalan, English, German,...",Portuguese,Switzerland
3,Hugues Pluvinage,"French (native), English (C2), Dutch (C2), It...",Belgian,Italy
4,Carlos Yebra Lopez,"Spanish, Judeo-Spanish, Catalan, Portuguese, F...",Spanish,
...,...,...,...,...
221,Jakub Jarosz,"Polish, English, Italian, Spanish, Russian, Fr...",Polish,Poland
222,George L. O’Hara,"English (native), French, Spanish, Russian, Uk...","American, Irish","Washington, DC"
223,Helmar Böhnlein\t,"German, English, French, Portuguese, Spanish, ...",German,"Vienna, Austria"
224,Loana Eugenie Eleonora Kontogouri,"Romanian, English, French, German, Spanish, It...",Greek & Romanian,"Munich, Germany"


In [49]:
# Rows whose raw language text mentions Kazakh; missing text is treated as an
# empty string and therefore never matches.
semiCleanData.loc[semiCleanData['languages'].fillna('').str.contains('Kazakh')]

Unnamed: 0,name,languages,nationality,location
185,Cameron Farr,", English, Spanish, Portuguese, German, Dutch,...",American,
190,John Profaci,English Dutch German Yiddish Spanish Portugues...,American,"Mt. Laurel, NJ"


In [45]:
# Spellings that are replaced only when the whole name matches exactly.
_EXACT_LANGUAGE_ALIASES = {
    'Hindi': 'Hindustani',
    'Urdu': 'Hindustani',
    'Hindi/Urdu': 'Hindustani',
    'Bosnian-Serbian-Croatian': 'Serbo-Croatian',
    'Bosnian': 'Serbo-Croatian',
    'Serbian': 'Serbo-Croatian',
    'Croatian': 'Serbo-Croatian',
    'Mandarin Chinese': 'Mandarin',
    'Chinese Mandarin': 'Mandarin',
    'Chinese': 'Mandarin',
    'Standard Arabic': 'Arabic',
    'Modern Standard Arabic': 'Arabic',
    'Brazilian Portuguese': 'Portuguese',
    'Italiano': 'Italian',
    'Napulitano': 'Neapolitan',
    'Deutsch': 'German',
    'Nederlands': 'Dutch',
    'Ελληνικά': 'Greek',
    'Français': 'French',
    'Español': 'Spanish',
    'Português': 'Portuguese',
    'Română': 'Romanian',
}

# (fragment, canonical name) rules applied, in this order, to names that have
# no exact alias: any name containing the fragment becomes the canonical name.
_SUBSTRING_LANGUAGE_ALIASES = (
    ('Greek', 'Greek'),
    ('Bahasa', 'Indonesian'),
    ('Indonesia', 'Indonesian'),
)


def languageFix(l):
    """Return the canonical name for the language name ``l``.

    Exact aliases are looked up first (e.g. 'Hindi' and 'Urdu' both become
    'Hindustani'). Otherwise any name containing 'Greek' becomes 'Greek' and
    any name containing 'Bahasa' or 'Indonesia' becomes 'Indonesian'. Names
    matching none of the rules are returned unchanged.
    """
    if l in _EXACT_LANGUAGE_ALIASES:
        return _EXACT_LANGUAGE_ALIASES[l]
    for fragment, canonical in _SUBSTRING_LANGUAGE_ALIASES:
        if fragment in l:
            return canonical
    return l

# Raw strings keep the backslashes meant for the regex engine; the same
# patterns written as plain literals contain invalid string escapes such as
# '\.' and '\(' that newer Python versions warn about.

# Separators between entries: comma, full stop, ' and ', ampersand, semicolon.
_LANGUAGE_SEPARATORS = re.compile(r',|\.| and |&|;')

# Noise removed from each entry: everything from the first '(' onward,
# asterisks, the words 'Understands' and 'Just', a capital 'I' at a word end,
# lowercase letter runs at the start of a word, stray ')', level tags such as
# 'B2' or 'C1-', colons, and the literal 'Languages    /'.
_LANGUAGE_NOISE = re.compile(
    r'\(.*|\*|Understands|I\b|\b[a-z]+|\)|[ABC][12]-|[ABC][12]|Just|:|Languages    /'
)


def languageSplit(s):
    """Split a free-text language field into a list of canonical language names.

    Parameters
    ----------
    s : str or missing value
        Raw text of the languages field; a null value (per ``pd.isnull``)
        yields an empty list.

    Returns
    -------
    list of str
        One entry per non-empty piece of ``s``, with noise removed and the
        name normalised through ``languageFix``. Order follows the text and
        duplicates are kept.
    """
    if pd.isnull(s):
        return []

    languages = []
    for piece in _LANGUAGE_SEPARATORS.split(s):
        piece = piece.strip()
        if not piece:
            continue
        name = _LANGUAGE_NOISE.sub('', piece).strip()
        # An entry that consisted only of noise leaves nothing behind.
        if name:
            languages.append(languageFix(name))
    return languages

# Spot-check the splitter on three randomly drawn rows.
list(map(languageSplit, semiCleanData['languages'].sample(3)))

[['Spanish', 'Catalan', 'English', 'Dutch', 'French', 'Italian', 'Russian'],
 ['Japanese',
  'Mandarin',
  'Vietnamese',
  'English',
  'French',
  'German',
  'Korean',
  'Thai',
  'Spanish',
  'Portuguese',
  'Italian',
  'Norwegian',
  'Dutch',
  'Lao',
  'Swedish',
  'Danish',
  'Arabic',
  'Sanskrit',
  'Pali',
  'Han Nom',
  'Cham'],
 ['Italian',
  'Greek',
  'Greek',
  'Sicilian',
  'French',
  'English',
  'Spanish',
  'Portuguese']]

In [46]:
# Tally how often each normalised language name occurs across all profiles.
# The Series is named explicitly so the column is called 'Count' on every
# pandas version: value_counts() leaves its result unnamed before pandas 2.0
# (column label 0) but names it 'count' from 2.0 on, which would turn a
# rename of column 0 into a silent no-op.
langCounts = (
    pd.Series([l for ls in semiCleanData.languages for l in languageSplit(ls)])
    .value_counts()
    .rename('Count')
    .to_frame()
)

# Languages mentioned more than 10 times, most frequent first.
commonLangs = list(langCounts[langCounts['Count'] > 10].index)

print(len(commonLangs))

commonLangs

32


['English',
 'Spanish',
 'French',
 'Italian',
 'German',
 'Portuguese',
 'Russian',
 'Dutch',
 'Mandarin',
 'Greek',
 'Japanese',
 'Catalan',
 'Swedish',
 'Polish',
 'Arabic',
 'Hebrew',
 'Turkish',
 'Norwegian',
 'Esperanto',
 'Romanian',
 'Hindustani',
 'Serbo-Croatian',
 'Ukrainian',
 'Danish',
 'Korean',
 'Latin',
 'Indonesian',
 'Bulgarian',
 'Czech',
 'Galician',
 'Finnish',
 'Swahili']

In [58]:
# Parse each profile's language text exactly once; the membership test below
# would otherwise re-run the regex-based split for every (profile, language) pair.
spokenSets = [set(languageSplit(ls)) for ls in semiCleanData['languages']]

# One boolean column per common language, one row per profile, indexed by name.
HypiaLanguages = semiCleanData[['name']].copy()

for language in commonLangs:
    HypiaLanguages[language] = [language in spoken for spoken in spokenSets]

HypiaLanguages = HypiaLanguages.set_index('name')

HypiaLanguages

Unnamed: 0_level_0,English,Spanish,French,Italian,German,Portuguese,Russian,Dutch,Mandarin,Greek,...,Ukrainian,Danish,Korean,Latin,Indonesian,Bulgarian,Czech,Galician,Finnish,Swahili
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Usman W. Chohan,True,True,True,False,False,True,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
Matias Barmat,True,True,True,True,True,True,False,True,False,True,...,False,False,False,False,False,False,False,True,False,False
Eduardo Teiga,True,True,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Hugues Pluvinage,True,True,True,True,True,True,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
Carlos Yebra Lopez,True,True,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Jakub Jarosz,True,True,True,True,True,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
George L. O’Hara,True,True,True,False,False,False,True,False,False,False,...,True,False,False,False,False,False,False,False,False,False
Helmar Böhnlein\t,True,True,True,True,True,True,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
Loana Eugenie Eleonora Kontogouri,True,True,True,True,True,False,False,True,False,True,...,False,False,False,False,False,False,False,False,False,False


In [70]:
# Pairwise correlation between the language indicator columns, with rows
# ordered by their correlation with Catalan (strongest first).
(
    HypiaLanguages
    .corr()
    .sort_values(by='Catalan', ascending=False)
)

Unnamed: 0,English,Spanish,French,Italian,German,Portuguese,Russian,Dutch,Mandarin,Greek,...,Ukrainian,Danish,Korean,Latin,Indonesian,Bulgarian,Czech,Galician,Finnish,Swahili
Catalan,0.168275,0.138327,0.16506,0.276343,0.182392,0.242861,0.039165,0.161025,0.048376,-0.01581,...,-0.029934,0.001912,0.041201,0.171408,0.019205,0.028589,-0.086452,0.4353,0.121771,0.036094
Galician,0.081129,0.109308,0.119754,0.126263,0.044382,0.164083,0.005069,0.109681,0.115895,0.09394,...,-0.002485,0.06212,0.06212,0.137897,0.073647,0.079994,-0.063486,1.0,0.111002,0.032434
Italian,0.453056,0.473408,0.449074,1.0,0.36338,0.455046,0.230718,0.126272,0.113951,0.160132,...,-0.04736,0.033288,0.101992,0.16389,0.048456,0.112484,0.055292,0.126263,-0.009782,0.064489
Portuguese,0.369547,0.441377,0.337567,0.455046,0.260677,1.0,0.073806,0.084979,0.213875,0.033139,...,0.041581,-0.011365,0.118687,0.141552,0.165741,0.050741,0.061275,0.164083,0.033471,0.017996
Norwegian,0.078186,0.093793,0.079637,0.138779,0.225579,0.144437,0.090759,0.260534,0.117281,-0.010236,...,0.038192,0.466856,-0.017132,0.087797,-0.005408,0.105653,0.181934,0.080141,0.210432,0.039768
German,0.453056,0.396121,0.449074,0.36338,1.0,0.260677,0.134939,0.277378,0.048477,0.114524,...,0.023057,0.136344,-0.001064,-0.012152,0.048456,0.000987,0.055292,0.044382,-0.009782,0.064489
Romanian,0.120963,0.089237,0.143795,0.161127,0.161127,0.135516,0.13198,0.180168,0.127148,0.192783,...,0.143585,0.134219,0.085056,0.093197,0.101848,0.164277,0.018531,0.201961,0.156143,0.043482
Latin,0.096605,0.041823,-0.023951,0.16389,-0.012152,0.141552,0.094457,-0.028821,0.12649,0.232156,...,0.094551,0.028666,0.028666,1.0,-0.021934,0.173677,-0.0078,0.137897,0.076111,0.00941
English,1.0,0.661526,0.601407,0.453056,0.453056,0.369547,0.237849,0.163629,0.158779,0.141572,...,0.041472,0.0457,0.099492,0.096605,0.093659,0.090646,0.08439,0.081129,-0.055378,0.07428
French,0.601407,0.608109,1.0,0.449074,0.449074,0.337567,0.213104,0.240371,0.155771,0.018973,...,0.059324,0.024986,0.106235,-0.023951,0.052765,0.133801,0.077803,0.119754,0.014237,0.004868


In [100]:
# For every ordered pair of common languages, the share of profiles listing
# `prior` that also list `post`: (# listing both) / (# listing prior).
ld = {}

for post in commonLangs:
    ld[post] = {
        # Row-wise min over the two boolean columns is True only when both are.
        prior: HypiaLanguages[[post, prior]].min(axis=1).sum() / HypiaLanguages[prior].sum()
        for prior in commonLangs
    }

# Outer keys (`post`) become the columns, inner keys (`prior`) the rows.
LangDependencies = DataFrame(ld)

LangDependencies

Unnamed: 0,English,Spanish,French,Italian,German,Portuguese,Russian,Dutch,Mandarin,Greek,...,Ukrainian,Danish,Korean,Latin,Indonesian,Bulgarian,Czech,Galician,Finnish,Swahili
English,1.0,0.916667,0.887255,0.754902,0.754902,0.656863,0.490196,0.289216,0.279412,0.245098,...,0.083333,0.088235,0.093137,0.088235,0.083333,0.078431,0.068627,0.063725,0.04902,0.053922
Spanish,0.989418,1.0,0.915344,0.783069,0.767196,0.693122,0.486772,0.291005,0.275132,0.238095,...,0.079365,0.084656,0.084656,0.084656,0.079365,0.079365,0.068783,0.068783,0.05291,0.047619
French,0.989071,0.945355,1.0,0.786885,0.786885,0.677596,0.502732,0.31694,0.289617,0.229508,...,0.087432,0.087432,0.098361,0.076503,0.081967,0.087432,0.071038,0.071038,0.054645,0.04918
Italian,0.993548,0.954839,0.929032,1.0,0.8,0.748387,0.529032,0.303226,0.290323,0.270968,...,0.070968,0.090323,0.103226,0.109677,0.083871,0.090323,0.070968,0.077419,0.051613,0.058065
German,0.993548,0.935484,0.929032,0.8,1.0,0.683871,0.496774,0.348387,0.270968,0.258065,...,0.083871,0.109677,0.083871,0.077419,0.083871,0.070968,0.070968,0.064516,0.051613,0.058065
Portuguese,0.992593,0.97037,0.918519,0.859259,0.785185,1.0,0.481481,0.296296,0.333333,0.237037,...,0.088889,0.081481,0.111111,0.111111,0.111111,0.081481,0.074074,0.088889,0.059259,0.051852
Russian,0.980392,0.901961,0.901961,0.803922,0.754902,0.637255,1.0,0.333333,0.333333,0.27451,...,0.166667,0.107843,0.107843,0.107843,0.088235,0.127451,0.098039,0.058824,0.088235,0.078431
Dutch,0.983333,0.916667,0.966667,0.783333,0.9,0.666667,0.566667,1.0,0.333333,0.233333,...,0.133333,0.216667,0.116667,0.066667,0.15,0.116667,0.1,0.1,0.05,0.083333
Mandarin,0.982759,0.896552,0.913793,0.775862,0.724138,0.775862,0.586207,0.344828,1.0,0.258621,...,0.12069,0.12069,0.275862,0.137931,0.224138,0.086207,0.068966,0.103448,0.103448,0.103448
Greek,0.980392,0.882353,0.823529,0.823529,0.784314,0.627451,0.54902,0.27451,0.294118,1.0,...,0.078431,0.078431,0.117647,0.196078,0.098039,0.117647,0.078431,0.098039,0.058824,0.078431


In [103]:
# Flatten the matrix into one row per (row label, column label, value) triple.
# NOTE(review): `_` is also IPython's last-output variable; a descriptive name
# would avoid shadowing it.
_ = LangDependencies.stack().reset_index()
_.columns = ['from', 'to', 'd']

# Top 50 source languages for the target 'Russian', excluding the self-pair.
# The 'English' exclusion cannot remove anything while the target is 'Russian'.
(
    _
    .loc[lambda pairs: (pairs['to'] != 'English')
                       & (pairs['to'] != pairs['from'])
                       & (pairs['to'] == 'Russian')]
    .sort_values('d', ascending=False)
    .head(50)
)

Unnamed: 0,from,to,d
710,Ukrainian,Russian,0.944444
422,Polish,Russian,0.875
870,Bulgarian,Russian,0.8125
966,Finnish,Russian,0.75
998,Swahili,Russian,0.727273
902,Czech,Russian,0.714286
678,Serbo-Croatian,Russian,0.684211
518,Turkish,Russian,0.677419
454,Arabic,Russian,0.657895
646,Hindustani,Russian,0.65
