In [51]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')

import pandas as pd
from pandas import DataFrame 
import json
import random

In [4]:
# Load the saved response pages 1-12 and collect each page's list of profile items.
results = []

for i in range(1, 13):
    filename = 'hypia20210810/{}.json'.format(i)
    # Decode explicitly as UTF-8: without an encoding argument open() falls back to
    # the platform locale, which can mangle the non-ASCII names in this data set.
    with open(filename, 'r', encoding='utf-8') as file:
        result = json.load(file)['result']
    # The file handle is no longer needed once the payload is parsed.
    if 'items' in result:
        results.append(result['items'])
    elif 'data' in result:  # applies only for page 1
        results.append(result['data']['items'])
    # A page matching neither layout is skipped, so len(results) below doubles as
    # a check that all 12 pages were recognised.

len(results)

12

In [79]:
# Flatten the per-page item lists into one list of descriptions, each description
# being the profile's 'shortDescription' text broken into its individual lines.
shortDescs = []
for page in results:
    shortDescs.extend(
        profile['shortDescription'].split('\n')
        for profile in page
    )

# Draw one description at random to eyeball the raw line format.
sample = shortDescs[random.randrange(len(shortDescs))]

sample

['Name: Kevin Fei Sun',
 'Nationality: United States. Ethnicity: Chinese',
 'Where do you live?: New York City',
 'Languages: English, Mandarin (native); Russian, Spanish, French, Portuguese, German, Serbo-Croatian, Italian, Hindi/Urdu (B2); Turkish, Korean, Japanese, Arabic, Hebrew (B1); Shanghainese (heritage speaker).']

In [112]:
# Maps label variants (including misspellings) found in the raw descriptions
# onto the canonical column name they should be stored under.
keyFixes = {
    'Where do you live': 'Where do you live?',
    'Nationality': 'Nationality or Ethnicity',
    'Ethnicity': 'Nationality or Ethnicity',
    'Natuionality or Ethnicity': 'Nationality or Ethnicity',
    'Name and Title': 'Name',
    'Where do you live ?': 'Where do you live?',
    'Currently living in': 'Where do you live?',
}

# Labels whose lines are dropped entirely while parsing.
ignoreKeys = [
    '*Alternatively',
    'Rare language',
    'Representative of rare language',
    'Language Family',
    '(Her Youtube channel',
]

# Labels whose values are merged into the single 'Languages' column.
# The list was derived from the uncleaned data with:
#     colCounts = rawData.count().sort_values(ascending=False)
#     colCounts[colCounts<100].index
languageKeys = [
    'Conversational',
    'Role at HYPIA',
    'Mother Tongue',
    'Fluent (C level)',
    'Learning',
    'Fluent',
    'Conversant',
    'Old languages',
    'Basic',
    'Read',
    'Currently studying (basic level)',
    'Basic (mainly written)',
    'Reading',
    'Also a speaker of',
    'Rare Language',
    'Medial (B level)',
    'Some phrases',
    'Family Language',
    'Beginner in',
    'Currently learning',
    '[Fluent]',
    'Baza (A level)',
    'Level A to B',
    'In a lesser degree but with a good basic knowledge anyway',
    'Colloquial ability',
    'Reading Proficiency',
    'Upper-Intermediate (B2 level)',
    'Lower-Intermediate (B1 level)',
    'Basic (A2 level)',
    'Beginner (A1 level)',
    'Broadcast ability',
    'Interview ability',
    'Basic ability',
    'Beginner',
    'Languages (In order of Fluency)',
    '• Native Languages',
    'English (nor Italian)',
    'languages above',
    '• Languages of antiquity that I can write/read',
    'Native Language',
    'Fluent Languages',
    '[Conversant]',
]

def processDescription(sd):
    """Parse one profile description into a dict of field name -> text.

    Parameters
    ----------
    sd : iterable of str
        The lines of a single description. A line containing ': ' is read as
        'label: value' (split at the first ': '); any other non-blank line is
        treated as a continuation of the most recently stored field.

    Returns
    -------
    dict
        Cleaned field values keyed by canonical label. Labels are normalised
        through ``keyFixes``, labels in ``ignoreKeys`` are dropped, and every
        label in ``languageKeys`` is merged into a single 'Languages' entry
        (joined with ', ').
    """
    output = {}
    # Field that continuation lines are appended to; '' means "none yet" or
    # "the previous labelled line was ignored".
    _key = ''
    for line in sd:
        if ': ' in line:
            # Only the first ': ' separates label from value, so values such as
            # 'United States. Ethnicity: Chinese' survive intact.
            key, _, value = line.partition(': ')
            key = key.strip()
            key = keyFixes.get(key, key)
            # Raw values can carry stray whitespace (e.g. a trailing tab).
            value = value.strip()
            if key in ignoreKeys:
                # Continuation lines of an ignored field must be ignored too.
                _key = ''
                continue
            if key in languageKeys:
                key = 'Languages'
                if key in output:
                    output[key] = output[key] + ', ' + value
                else:
                    output[key] = value
            else:
                output[key] = value
            _key = key
        elif _key and line.strip():
            # Continuation line: attach its text to the field being built.
            # Guarding on _key (not key) avoids failing when a description
            # starts with an unlabelled line or with a language/ignored label.
            output[_key] = output[_key] + ' ' + line.strip()
    return output
        
# Spot-check: parse the randomly drawn sample and display the resulting dict.
processDescription(sample)

{'Name': 'Kevin Fei Sun',
 'Nationality or Ethnicity': 'United States. Ethnicity: Chinese',
 'Where do you live?': 'New York City',
 'Languages': 'English, Mandarin (native); Russian, Spanish, French, Portuguese, German, Serbo-Croatian, Italian, Hindi/Urdu (B2); Turkish, Korean, Japanese, Arabic, Hebrew (B1); Shanghainese (heritage speaker).'}

In [114]:
# Parse every description and tabulate the resulting dicts, one row per profile.
semiCleanData = DataFrame(list(map(processDescription, shortDescs)))
semiCleanData

Unnamed: 0,Name,Languages,Nationality or Ethnicity,Where do you live?
0,Usman W. Chohan,"Urdu, Brazilian Portuguese, Spanish, English, ...",,
1,Matias Barmat,"Spanish (native), English (C1), Catalan (C1), ...",Argentina /,Buenos Aires
2,Eduardo Teiga,"Portuguese, Spanish, Catalan, English, German,...",Portuguese,Switzerland
3,Hugues Pluvinage,"French (native), English (C2), Dutch (C2), It...",Belgian,Italy
4,Carlos Yebra Lopez,"Spanish, Judeo-Spanish, Catalan, Portuguese, F...",Spanish,
...,...,...,...,...
221,Jakub Jarosz,"Polish, English, Italian, Spanish, Russian, Fr...",Polish,Poland
222,George L. O’Hara,"English (native), French, Spanish, Russian, Uk...","American, Irish","Washington, DC"
223,Helmar Böhnlein\t,"German, English, French, Portuguese, Spanish, ...",German,"Vienna, Austria"
224,Loana Eugenie Eleonora Kontogouri,"Romanian, English, French, German, Spanish, It...",Greek & Romanian,"Munich, Germany"


In [116]:
# Show every raw 'Languages' string in full (the frame display truncates them).
list(semiCleanData.Languages)

['Urdu, Brazilian Portuguese, Spanish, English, Punjabi, French, Hindi, Mandarin**, Japanese**',
 'Spanish (native), English (C1), Catalan (C1), Portuguese (C1), Italian (B2), French (B1), Galician (B1), Romanian (B1),  Basque (A2),  Greek (A2), Turkish (A2), Hebrew (A2), German (A2), Dutch (A2).',
 'Portuguese, Spanish, Catalan, English, German, French, Italian',
 'French (native), English (C2), Dutch (C2),  Italian (C2), Neapolitan (C1), Salentino (C1), German (C1) , Spanish (C1), Portuguese (B1) and Japanese (basic).',
 'Spanish, Judeo-Spanish, Catalan, Portuguese, French, Italian, English, German, Serbian, Arabic*',
 'Urdu, Spanish, English, Punjabi, French, Pashto',
 'English, Persian*, Urdu, French, Spanish**, Punjabi, Bengali',
 'English, German, Slovenian, Croatian, Bosnian, Serbian, Macedonian *, Italian *, Japanese *, Sanskrit**, Old Greek**, and Hittite**',
 'Spanish, English, French, Portuguese, Italian and German.',
 'French, English, Spanish, Vietnamese, Japanese and Germ