In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')

import pandas as pd
from pandas import DataFrame 
import json
import random
import re

In [2]:
# Load the 12 saved result pages (hypia20210810/1.json ... 12.json) and
# collect each page's list of profile items.
results = []

for i in range(1, 13):
    filename = 'hypia20210810/{}.json'.format(i)
    # JSON is UTF-8 by specification; be explicit so that non-ASCII text in
    # the profiles is not decoded with the platform's default encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        result = json.load(file)['result']
    if 'items' in result:
        results.append(result['items'])
    elif 'data' in result: # applies only for page 1
        results.append(result['data']['items'])
    # A page with neither layout is skipped (best effort); the count shown
    # below reveals whether any page was dropped.

len(results)

12

In [3]:
# Flatten all pages into one list holding, per profile, the lines of its
# 'shortDescription' text.
shortDescs = []
for page in results:
    for profile in page:
        shortDescs.append(profile['shortDescription'].split('\n'))

# Pick one description at random to eyeball the raw format.
sampleIndex = random.randrange(len(shortDescs))
sample = shortDescs[sampleIndex]

sample

['Name: Jaqueline Cassemiro',
 'Nationality or Ethnicity: Brazilian',
 'Where do you live?: São Paulo - Brazil ',
 'Languages: Portuguese, French, German, Italian, Spanish, English']

In [4]:
# Variant spellings of a description key -> the canonical key used as a column.
keyFixes = {
    'Where do you live': 'Where do you live?',
    'Nationality': 'Nationality or Ethnicity',
    'Ethnicity': 'Nationality or Ethnicity',
    'Natuionality or Ethnicity': 'Nationality or Ethnicity',
    'Name and Title': 'Name',
    'Where do you live ?': 'Where do you live?',
    'Currently living in': 'Where do you live?',
}

# Keys whose lines are dropped entirely by processDescription.
ignoreKeys = [
    '*Alternatively',
    'Rare language',
    'Representative of rare language',
    'Language Family',
    '(Her Youtube channel',
]

# Keys whose values are folded into the 'Languages' entry.
# The list was derived from the uncleaned data using this code:
#     colCounts = rawData.count().sort_values(ascending=False)
#     colCounts[colCounts<100].index
# NOTE(review): because the list is the raw output of that query, it holds a
# few entries that do not look like proficiency labels (e.g. 'Role at HYPIA',
# 'English (nor Italian)', 'languages above') -- confirm they belong here.
languageKeys = [
    'Conversational',
    'Role at HYPIA',
    'Mother Tongue',
    'Fluent (C level)',
    'Learning',
    'Fluent',
    'Conversant',
    'Old languages',
    'Basic',
    'Read',
    'Currently studying (basic level)',
    'Basic (mainly written)',
    'Reading',
    'Also a speaker of',
    'Rare Language',
    'Medial (B level)',
    'Some phrases',
    'Family Language',
    'Beginner in',
    'Currently learning',
    '[Fluent]',
    'Baza (A level)',
    'Level A to B',
    'In a lesser degree but with a good basic knowledge anyway',
    'Colloquial ability',
    'Reading Proficiency',
    'Upper-Intermediate (B2 level)',
    'Lower-Intermediate (B1 level)',
    'Basic (A2 level)',
    'Beginner (A1 level)',
    'Broadcast ability',
    'Interview ability',
    'Basic ability',
    'Beginner',
    'Languages (In order of Fluency)',
    '• Native Languages',
    'English (nor Italian)',
    'languages above',
    '• Languages of antiquity that I can write/read',
    'Native Language',
    'Fluent Languages',
    '[Conversant]',
]

def processDescription(sd):
    """Parse the lines of one profile's short description into a dict.

    Each line of the form ``'<key>: <value>'`` is handled as follows, after
    the key is stripped and normalised through ``keyFixes``:

    * keys listed in ``ignoreKeys`` are dropped;
    * keys listed in ``languageKeys`` have their value appended
      (comma-separated) to the ``'Languages'`` entry;
    * any other key is stored as ``output[key] = value``.

    A non-blank line without ``': '`` is treated as the continuation of the
    most recently stored entry and is appended to it, separated by a single
    space. Continuation lines that appear before any stored entry, or right
    after an ignored key, are discarded.

    Parameters
    ----------
    sd : iterable of str
        The description, already split into lines.

    Returns
    -------
    dict
        Mapping of normalised key to its text value.
    """
    output = {}
    # Key in `output` that a following colon-less line continues; None when
    # there is nothing to continue (start of the description / ignored key).
    _key = None
    for line in sd:
        if ': ' in line:
            # Split on the first ': ' only, so values may contain ': '.
            key, _, value = line.partition(': ')
            key = key.strip()
            key = keyFixes.get(key, key)
            if key in ignoreKeys:
                _key = None
                continue
            if key in languageKeys:
                if 'Languages' in output:
                    output['Languages'] = output['Languages'] + ', ' + value
                else:
                    output['Languages'] = value
                _key = 'Languages'
            else:
                output[key] = value
                _key = key
        elif _key is not None and line.strip():
            # Wrapped text: keep it with the entry it belongs to.
            output[_key] = output[_key] + ' ' + line.strip()
    return output
        
# Sanity check: parse the randomly chosen sample description and display the resulting dict.
processDescription(sample)

{'Name': 'Jaqueline Cassemiro',
 'Nationality or Ethnicity': 'Brazilian',
 'Where do you live?': 'São Paulo - Brazil ',
 'Languages': 'Portuguese, French, German, Italian, Spanish, English'}

In [5]:
# One row per profile, one column per key returned by processDescription.
# The columns deliberately keep those key names ('Name', 'Languages',
# 'Nationality or Ethnicity', 'Where do you live?'): the cells below read
# `semiCleanData.Languages`, so renaming the columns here would break them
# (and a positional rename fails outright unless there are exactly 4 columns).
semiCleanData = DataFrame([processDescription(sd) for sd in shortDescs])

semiCleanData

Unnamed: 0,Name,Languages,Nationality or Ethnicity,Where do you live?
0,Usman W. Chohan,"Urdu, Brazilian Portuguese, Spanish, English, ...",,
1,Matias Barmat,"Spanish (native), English (C1), Catalan (C1), ...",Argentina /,Buenos Aires
2,Eduardo Teiga,"Portuguese, Spanish, Catalan, English, German,...",Portuguese,Switzerland
3,Hugues Pluvinage,"French (native), English (C2), Dutch (C2), It...",Belgian,Italy
4,Carlos Yebra Lopez,"Spanish, Judeo-Spanish, Catalan, Portuguese, F...",Spanish,
...,...,...,...,...
221,Jakub Jarosz,"Polish, English, Italian, Spanish, Russian, Fr...",Polish,Poland
222,George L. O’Hara,"English (native), French, Spanish, Russian, Uk...","American, Irish","Washington, DC"
223,Helmar Böhnlein\t,"German, English, French, Portuguese, Spanish, ...",German,"Vienna, Austria"
224,Loana Eugenie Eleonora Kontogouri,"Romanian, English, French, German, Spanish, It...",Greek & Romanian,"Munich, Germany"


In [29]:
# Exact-match spellings / variants -> the canonical language name.
_LANGUAGE_ALIASES = {
    'Hindi': 'Hindustani',
    'Urdu': 'Hindustani',
    'Hindi/Urdu': 'Hindustani',
    'Bosnian-Serbian-Croatian': 'Serbo-Croatian',
    'Bosnian': 'Serbo-Croatian',
    'Serbian': 'Serbo-Croatian',
    'Croatian': 'Serbo-Croatian',
    'Mandarin Chinese': 'Mandarin',
    'Chinese': 'Mandarin',
    'Standard Arabic': 'Arabic',
    'Modern Standard Arabic': 'Arabic',
    'Brazilian Portuguese': 'Portuguese',
    'Italiano': 'Italian',
    'Napulitano': 'Neapolitan',
    'Deutsch': 'German',
    'Nederlands': 'Dutch',
    'Ελληνικά': 'Greek',
    'Français': 'French',
    'Español': 'Spanish',
    'Português': 'Portuguese',
    'Română': 'Romanian',
}


def languageFix(l):
    """Return the canonical name for the language label `l`.

    Labels listed in the alias table are mapped to their canonical name, any
    label containing the substring 'Greek' becomes 'Greek', and every other
    label is returned unchanged.
    """
    canonical = _LANGUAGE_ALIASES.get(l)
    if canonical is not None:
        return canonical
    # No alias contains 'Greek', so checking the substring rule after the
    # table lookup gives the same answer as checking it in between.
    if 'Greek' in l:
        return 'Greek'
    return l

def languageSplit(s):
    """Split a free-text 'Languages' value into a list of language names.

    The text is split on commas, full stops, ' and ', '&' and ';'. Each piece
    is then stripped of annotations (everything from an opening parenthesis
    onwards, asterisks, 'Understands', a standalone 'I', words starting with a
    lowercase ASCII letter, CEFR-style level codes such as 'B2', 'Just',
    colons, ...) and finally normalised with `languageFix`. Pieces that end up
    empty are dropped.

    Parameters
    ----------
    s : str or missing value
        The raw languages text; a missing value (per `pd.isnull`) yields [].

    Returns
    -------
    list of str
    """
    if pd.isnull(s):
        return []
    # Raw strings: the patterns contain backslash escapes (\. \( \* \) \b)
    # that are invalid escape sequences in an ordinary string literal.
    separators = r',|\.| and |&|;'
    noise = r'\(.*|\*|Understands|I\b|\b[a-z]+|\)|[ABC][12]-|[ABC][12]|Just|:|Languages    /'
    languages = [l.strip() for l in re.split(separators, s)]
    languages = [re.sub(noise, '', l).strip() for l in languages if l != '']
    return [languageFix(l) for l in languages if l != '']

# Spot-check the splitter on three randomly sampled 'Languages' values.
list(map(languageSplit, semiCleanData.Languages.sample(3)))

[['Korean', 'English', 'Portuguese', 'Japanese', 'Catalan', 'Turkish'],
 ['Polish',
  'English',
  'French',
  'Spanish',
  'Italian',
  'Portuguese',
  'Latin',
  'Russian',
  'Arabic',
  'Japanese',
  'ASL'],
 ['English',
  'German',
  'Scottish Gaelic',
  'French',
  'Spanish',
  'Welsh',
  'Greek',
  'Czech']]

In [38]:
# Count how many profiles mention each (normalised) language.
# to_frame('Count') names the column explicitly: the label of the
# value_counts() result differs between pandas versions (0 before 2.0,
# 'count' from 2.0 on), so renaming column 0 is not reliable.
langCounts = pd.Series(
    [l for ls in semiCleanData.Languages for l in languageSplit(ls)]
).value_counts().to_frame('Count')

# Languages mentioned more than 10 times, most frequent first.
commonLangs = list(langCounts[langCounts['Count'] > 10].index)

print(len(commonLangs))

commonLangs

32


['English',
 'Spanish',
 'French',
 'Italian',
 'German',
 'Portuguese',
 'Russian',
 'Dutch',
 'Mandarin',
 'Greek',
 'Japanese',
 'Catalan',
 'Swedish',
 'Polish',
 'Arabic',
 'Hebrew',
 'Turkish',
 'Norwegian',
 'Esperanto',
 'Romanian',
 'Hindustani',
 'Serbo-Croatian',
 'Ukrainian',
 'Danish',
 'Korean',
 'Latin',
 'Bulgarian',
 'Czech',
 'Galician',
 'Indonesian',
 'Finnish',
 'Swahili']

In [39]:
# Profiles whose every listed language is one of the common languages
# (profiles with no languages at all qualify trivially).
onlyCommonLangs = semiCleanData.Languages.apply(
    lambda entry: set(languageSplit(entry)).issubset(commonLangs)
)
semiCleanData[onlyCommonLangs]

Unnamed: 0,Name,Languages,Nationality or Ethnicity,Where do you live?
2,Eduardo Teiga,"Portuguese, Spanish, Catalan, English, German,...",Portuguese,Switzerland
8,Yulianna Ramón Martínez,"Spanish, English, French, Portuguese, Italian ...",Dominican,"Santo Domingo, Dominican Republic"
10,Anca Doina Cretu,"Romanian, English, French, Spanish, German, Bo...",Romanian,Switzerland (for the time being)
11,Enora Lessinger,"French, English, Spanish, Hebrew, Portuguese, ...",French,France
13,Alexandra Ivanova,"Bulgarian, German, English, French, Spanish, ...",Austrian / Bulgarian,
...,...,...,...,...
219,Marcos Duleba Mendoza,"Italian, Spanish, English, French, Portuguese,...",Argentinian / Italian,Argentina
221,Jakub Jarosz,"Polish, English, Italian, Spanish, Russian, Fr...",Polish,Poland
222,George L. O’Hara,"English (native), French, Spanish, Russian, Uk...","American, Irish","Washington, DC"
223,Helmar Böhnlein\t,"German, English, French, Portuguese, Spanish, ...",German,"Vienna, Austria"


In [204]:
# Profiles whose raw 'Languages' text mentions Kazakh; missing values are
# treated as empty text so they never match.
languagesText = semiCleanData.Languages.fillna('')
semiCleanData[languagesText.str.contains('Kazakh')]

Unnamed: 0,Name,Languages,Nationality or Ethnicity,Where do you live?
185,Cameron Farr,", English, Spanish, Portuguese, German, Dutch,...",American,
190,John Profaci,English Dutch German Yiddish Spanish Portugues...,American,"Mt. Laurel, NJ"
