# AnalysePhoneticsBetweenSkills

In [1]:
#laod CMU dict
CMUpath = 'cmudict_SPHINX_40.txt'
cmu_dict = {}
with open(CMUpath, "r", encoding='utf8') as f:
    for line in f:
        #A line looks like this
        #ABORT	AH B AO R T
        if line and len(line.strip())>0:
            spl = line.split('\t')
            word = spl[0].strip()
            phonemes = spl[1].strip()
            cmu_dict[word] = phonemes.split(' ')

In [2]:
'''
Call getPhoneticTranslation to get the phonetic trasnlation of a sentence. 
It cleans the word and obtains a CMU translation of the sentence.
Replaces the non existent words in the dictionary by a word of the same length formed by '_'
'''

import string
invalidChars = set(string.punctuation)
numbers = {'0':'ZERO','1':'ONE','2':'TWO','3':'THREE','4':'FOUR','5':'FIVE',
           '6':'SIX','7':'SEVEN','8':'EIGHT','9':'NINE'}
def getPhoneticTranslation(sentence):
    #Transcriptions to be compared were converted to uppercase and all non-ASCII characters and punctuation were
    #removed. Digits were replaced with their corresponding texts (e.g., “2” became “two”)
    def PreProcessWord(word):
        for c in invalidChars:
            if c in word:
                word = word.replace(c,'')
        for n in numbers.keys():
            if n in word:
                word = word.replace(n, ' '+numbers[n]+' ')

        word = word.upper()
        word = word.replace('  ', ' ')
        return word.strip()

    a=PreProcessWord(sentence)
    cmutranslated = []
    for w in a.split(' '):
        if w in cmu_dict:
            b = cmu_dict[w]
        else:
            b = ['_']*len(w)
        cmutranslated+= b+[' ']
    return cmutranslated[:len(cmutranslated)-1]
    



#example of the process
word = '123feeli_ng down!'
print('*', word)
print(getPhoneticTranslation(word))
word = 'Apple watch --!'
print('*', word)
print(getPhoneticTranslation(word))
word = 'Unexxxxisting word but the rest is fine'
print('*', word)
print(getPhoneticTranslation(word))

* 123feeli_ng down!
['W', 'AH', 'N', ' ', 'T', 'UW', ' ', 'TH', 'R', 'IY', ' ', 'F', 'IY', 'L', 'IH', 'NG', ' ', 'D', 'AW', 'N']
* Apple watch --!
['AE', 'P', 'AH', 'L', ' ', 'W', 'AA', 'CH']
* Unexxxxisting word but the rest is fine
['_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', ' ', 'W', 'ER', 'D', ' ', 'B', 'AH', 'T', ' ', 'DH', 'AH', ' ', 'R', 'EH', 'S', 'T', ' ', 'IH', 'Z', ' ', 'F', 'AY', 'N']


### Load new Alexa Dataset

In [3]:
#Load dataset
import os
import json

def reloadData():
    def getFiles(dirName):
        listOfFile = os.listdir(dirName)
        completeFileList = list()
        for file in listOfFile:
            completePath = os.path.join(dirName, file)
            if os.path.isdir(completePath):
                completeFileList = completeFileList + getFiles(completePath)
            elif '.json' in completePath:
                completeFileList.append(completePath.replace('\\', '/'))

        return completeFileList

    allf = getFiles( "../NewFullDataset/")
    print(allf)

    #create dictionary of markets and skills
    skillsmarketd = {}
    for f in allf:
        marketname = f.split('/')[2].split('.json')[0]
        with open(f, 'r') as json_file:
            skillsmarketd[marketname] = json.load(json_file)

    print()
    print('Total unique skills per market as per last week')
    for key,value in skillsmarketd.items():
        print(key, ": ", len(value))
    print()
    print()
    return skillsmarketd

In [4]:
allskills = reloadData()

['../NewFullDataset/AU.json', '../NewFullDataset/CA.json', '../NewFullDataset/DE.json', '../NewFullDataset/ES.json', '../NewFullDataset/FR.json', '../NewFullDataset/IN.json', '../NewFullDataset/IT.json', '../NewFullDataset/JP.json', '../NewFullDataset/MX.json', '../NewFullDataset/UK.json', '../NewFullDataset/US.json']

Total unique skills per market as per last week
AU :  24062
CA :  26009
DE :  9082
ES :  4756
FR :  2282
IN :  29249
IT :  4202
JP :  3519
MX :  1972
UK :  30975
US :  60298




### Levenshtein Distance

In [5]:
import editdistance
#luckily I found this iplementation of levehenstein distance that goes almost 100 times faster than the defined above...
def levdistance(w1,w2):
    #if there is a difference of more than 3 words in the skil names, 
    #we are not interested in the real distance as they are not interesting
    if(w1 == w2):
        return 0
    if(len(w1) > len(w2)+2):
        return 1
    if(len(w2) > len(w1)+2):
        return 1
    return editdistance.eval(w1, w2)/ max(len(w1), len(w2))

In [6]:
levdistance(['ck', 'a', 't'], ['k', 'a', 't'])

0.3333333333333333

### Perform experiment for TABLE III.SKILL NAMES WITHLEVENSHTEIN DISTANCE≤0.2, considering english markets and ignoring words that do not have a CMU translation

#### NOTE: Do not run this, it takes a while!

In [8]:
import datetime

#explore the different markets per separate, otherwis eI get memory errors
data = reloadData()
englishmarkets = ['UK','AU', 'CA', 'IN', 'US']
for market in englishmarkets:
    uniqskcount = {} #dictionary that contains the quantity of skills for each element
    skilltocmu = {}  #set of unique skill names considered and their translation to phonetics
    
    '''
    Create skilltocmu dict
    '''
    print('Exploring market', market)
    skills = data[market]
    for skillid, skillobj in skills.items():
        name1 = skillobj['name'].lower()
        cmuname = getPhoneticTranslation(name1)
        #we ignore skills we couldnt completely trasnalte (include '_') [also to speed up the process]
        if '_' not in cmuname:
            skilltocmu[name1] = cmuname
            if(name1 in uniqskcount):
                uniqskcount[name1] += 1
            else:
                uniqskcount[name1] = 1
    print('finished creating crate skilltocmu dict for market', market)    

    '''
    Calculate levehenstein distances:
    This process is super slow. We do many tricks to speed up the process:
        1- since we are only interested in super similar skill names, we ignore words with different lengths (see lev function above)
        2- we only iterate trhough unique skill names transformed to lower, hence the results may be different as 
        in other figures since 'catFacts' and 'CatFacts' are considered the same skill
        3- We only identify which skill names have levehenstein distances < 0.1 or 0.2 (to reduce the final json files).
        Then, we count how many of skills with this name.lower() exist in the market to get hte data for table III
    '''
    alldistances = set([])
    uniqsk = list(skilltocmu.keys())
    totalanalysed = 0
    for i in range(len(uniqsk)):
        name1 = uniqsk[i].lower()
        if(name1 not in skilltocmu):
            continue
        cmuname1 = skilltocmu[name1]
        for j in range(i, len(uniqsk)):
            name2 = uniqsk[j].lower()
            if(name2 not in skilltocmu):
                continue
            cmuname2 = skilltocmu[name2]
            d=  levdistance(cmuname1, cmuname2)
            if(d<0.31):
                #only save these skills with smaller lev distances (otherwise memory error)
                alldistances.add( (d,name1,name2) )
            #count all distances processed
            totalanalysed +=1
            
        if(i%1000 == 0):
            now = datetime.datetime.now()
            print('\t[',now.hour,':',now.minute,'.',now.second,'] Distances computed ', i)
     
    
    
    #save analysis in PhonDict json
    path = 'PhonDict/'+market+'.json'
    with open(path, 'w') as json_file:
        print('Saving json in ',path)
        json.dump( {'phonetic_distances_minororequal03':list(alldistances), 'all_distances_processed':totalanalysed, 'skillnames_count_d':uniqskcount} , json_file)    
    
    #min01 = [d for d in alldistances if d <= 0.1 and d!=0]
    #min02 = [d for d in alldistances if d <= 0.2 and d!=0]
    ##we should count how many times this skills appear in the market!
    #print('There are ', len(min01), 'skills within a ld of 0.1')
    #print('There are ', len(min02), 'skills within a ld of 0.2')
    #print('Total unique skill names ', len(uniqsk))
    #
    now = datetime.datetime.now()
    print('[',now.hour,':',now.minute,'.',now.second,'] Finished estiamting skill names levehenstein distances for market', market)    

['../NewFullDataset/AU.json', '../NewFullDataset/CA.json', '../NewFullDataset/DE.json', '../NewFullDataset/ES.json', '../NewFullDataset/FR.json', '../NewFullDataset/IN.json', '../NewFullDataset/IT.json', '../NewFullDataset/JP.json', '../NewFullDataset/MX.json', '../NewFullDataset/UK.json', '../NewFullDataset/US.json']

Total unique skills per market as per last week
AU :  24062
CA :  26009
DE :  9082
ES :  4756
FR :  2282
IN :  29249
IT :  4202
JP :  3519
MX :  1972
UK :  30975
US :  60298


Exploring market UK
finished creating crate skilltocmu dict for market UK
	[ 9 : 21 . 37 ] Distances computed  0
	[ 9 : 22 . 5 ] Distances computed  1000
	[ 9 : 22 . 30 ] Distances computed  2000
	[ 9 : 22 . 55 ] Distances computed  3000
	[ 9 : 23 . 19 ] Distances computed  4000
	[ 9 : 23 . 40 ] Distances computed  5000
	[ 9 : 24 . 0 ] Distances computed  6000
	[ 9 : 24 . 20 ] Distances computed  7000
	[ 9 : 24 . 39 ] Distances computed  8000
	[ 9 : 24 . 54 ] Distances computed  9000
	[ 9 : 25 . 9 

### Load the dictionaries of words and get the data for table III

In [34]:
def getFiles(dirName):
    listOfFile = os.listdir(dirName)
    completeFileList = list()
    for file in listOfFile:
        completePath = os.path.join(dirName, file)
        if os.path.isdir(completePath):
            completeFileList = completeFileList + getFiles(completePath)
        elif '.json' in completePath:
            completeFileList.append(completePath.replace('\\', '/'))

    return completeFileList

#load the jsons
alljsons = getFiles('PhonDict/')
phondict = {}
for f in alljsons:
    market = f.split('/')[1].split('.json')[0]
    print('Loaded json for ', market, ' :', f)
    with open(f, 'r') as json_file:
        phondict[market] = json.load(json_file)
        
#get the data
for market in phondict:
    print('**Studying market', market)
    phondist = phondict[market]['phonetic_distances_minororequal03']
    totalcomparisons = phondict[market]['all_distances_processed']
    skillnamecount_d = phondict[market]['skillnames_count_d']
    totalskills_hits = sum(skillnamecount_d.values())
    print('Total hits for unique skill names.lower() ', totalskills_hits)
    
    '''
    find, for every unique name skill.lower(), the minimum distance to any other skill.lower()
    '''
    min_d_skillname = {}
    for skillname in skillnamecount_d.keys():
        #print('exploring skill ', skillname)
        alldforskill = [(d, sk1, sk2) for (d, sk1, sk2) in phondist if (sk1 == skillname or sk2 == skillname) and d!=0]        
        if(len(alldforskill)>0):
            min_d_skillname[skillname] = min([d for (d, sk1, sk2) in alldforskill ])
        else:
            min_d_skillname[skillname] = 1
    '''
    now, iterate over all unique skills.lower(), and bin them < 0.1 and < 0.2
    '''
    min_d_skillname_l = [ (k,v) for (k,v) in min_d_skillname.items()]
    #get these skills with a minimum lev distance to any other skill name of 0.1
    min01 = [(skname, skmin) for (skname, skmin) in min_d_skillname_l if skmin <= 0.1]
    #get how many skills are aggregated with that specific name
    total01 = sum([skillnamecount_d[skname] for (skname, skmin) in min01])
    print('There are ', len(min01), 'skills within a ld of 0.1 to any other skillname that does not contain "_", addding a total of hits ', total01, '(',total01/totalskills_hits,'%)')
    
    #get these skills with a minimum lev distance to any other skill name of 0.1
    min02 = [(skname, skmin) for (skname, skmin) in min_d_skillname_l if skmin <= 0.2]
    #get how many skills are aggregated with that specific name
    total02 = sum([skillnamecount_d[skname] for (skname, skmin) in min02])
    print('There are ', len(min02), 'skills within a ld of 0.2 to any other skillname that does not contain "_", addding a total of hits ', total02, '(',total02/totalskills_hits,'%)')
    
    


Loaded json for  AU  : PhonDict/AU.json
Loaded json for  CA  : PhonDict/CA.json
Loaded json for  IN  : PhonDict/IN.json
Loaded json for  UK  : PhonDict/UK.json
Loaded json for  US  : PhonDict/US.json
**Studying market AU
Total hits for unique skill names.lower()  17974
There are  673 skills within a ld of 0.1 to any other skillname that does not contain "_", addding a total of hits  1234 ( 0.06865472348948481 %)
There are  3099 skills within a ld of 0.1 to any other skillname that does not contain "_", addding a total of hits  4439 ( 0.24696784243907868 %)
**Studying market CA
Total hits for unique skill names.lower()  19209
There are  912 skills within a ld of 0.1 to any other skillname that does not contain "_", addding a total of hits  1514 ( 0.07881722109427872 %)
There are  3414 skills within a ld of 0.1 to any other skillname that does not contain "_", addding a total of hits  4836 ( 0.25175698891144777 %)
**Studying market IN
Total hits for unique skill names.lower()  21389
Ther

In [38]:
w1 = getPhoneticTranslation('Cat Facts')
w2 = getPhoneticTranslation('Fat Facts')
w3 = getPhoneticTranslation('Kite Facts')
w4 = getPhoneticTranslation('fast facts')
print(w1)
print(w2)
print( levdistance(w1,w2) ) 
print( levdistance(w1,w3) ) 
print( levdistance(w1,w4) ) 

['K', 'AE', 'T', ' ', 'F', 'AE', 'K', 'T', 'S']
['F', 'AE', 'T', ' ', 'F', 'AE', 'K', 'T', 'S']
0.1111111111111111
0.1111111111111111
0.2


In [39]:
w1 = getPhoneticTranslation('mi home')
w2 = getPhoneticTranslation('my home')
print(w1)
print(w2)
print( levdistance(w1,w2) ) 


['M', 'IY', ' ', 'HH', 'OW', 'M']
['M', 'AY', ' ', 'HH', 'OW', 'M']
0.16666666666666666


### analyse a few specific examples of phonetically similar skills

In [63]:
def getFiles(dirName):
    listOfFile = os.listdir(dirName)
    completeFileList = list()
    for file in listOfFile:
        completePath = os.path.join(dirName, file)
        if os.path.isdir(completePath):
            completeFileList = completeFileList + getFiles(completePath)
        elif '.json' in completePath:
            completeFileList.append(completePath.replace('\\', '/'))

    return completeFileList

#load the jsons
alljsons = getFiles('PhonDict/')
phondict = {}
for f in alljsons:
    market = f.split('/')[1].split('.json')[0]
    print('Loaded json for ', market, ' :', f)
    with open(f, 'r') as json_file:
        phondict[market] = json.load(json_file)
        
#get the data
for market in phondict:
    print()
    print()
    print('**Studying market', market)
    phondist = phondict[market]['phonetic_distances_minororequal03']
    interestingskills = [(d, sk1, sk2) for (d, sk1, sk2) in phondist if d<=0.1 and d!=0 and len(sk1)<15] 
    interestingskills2 = [(d, sk1, sk2) for (d, sk1, sk2) in phondist if d<=0.15 and d!=0 and len(sk1)<30 and 'food' in sk1 and 'fact' not in sk1] 
    print(interestingskills[:100])
    print()
    print(interestingskills2[:50])
        

Loaded json for  AU  : PhonDict/AU.json
Loaded json for  CA  : PhonDict/CA.json
Loaded json for  IN  : PhonDict/IN.json
Loaded json for  UK  : PhonDict/UK.json
Loaded json for  US  : PhonDict/US.json


**Studying market AU
[(0.09523809523809523, 'magic 102.5', 'magic 102.9'), (0.1, 'bird trivia', 'nerd trivia'), (0.1, 'fact facts', 'fax facts'), (0.08333333333333333, 'squat workout', 'squats workout'), (0.1, 'bear facts', 'barre facts'), (0.08333333333333333, 'letter i facts', 'letter e facts'), (0.1, 'philly facts', 'film facts'), (0.09090909090909091, 'random skill!', 'random scale'), (0.09090909090909091, 'mango facts', 'tango facts'), (0.09090909090909091, 'number facts', 'number fact'), (0.07692307692307693, 'elephant facts', 'element facts'), (0.1, 'wine trivia', 'jain trivia'), (0.09090909090909091, 'time traveller', 'time travel'), (0.09090909090909091, 'roman facts', 'roma facts'), (0.07692307692307693, 'letter p facts', 'letter c facts'), (0.07692307692307693, 'letter e facts

In [43]:
w1 = getPhoneticTranslation('bird trivia')
w2 = getPhoneticTranslation('nerd trivia')
print(w1)
print(w2)
print( levdistance(w1,w2) ) 

['B', 'ER', 'D', ' ', 'T', 'R', 'IH', 'V', 'IY', 'AH']
['N', 'ER', 'D', ' ', 'T', 'R', 'IH', 'V', 'IY', 'AH']
0.1


In [62]:
w1 = getPhoneticTranslation('cat facts')
w2 = getPhoneticTranslation('kite facts')
print(w1)
print(w2)
print( levdistance(w1,w2) ) 

['K', 'AE', 'T', ' ', 'F', 'AE', 'K', 'T', 'S']
['K', 'AY', 'T', ' ', 'F', 'AE', 'K', 'T', 'S']
0.1111111111111111
