In [1]:
from deep_translator import GoogleTranslator
from nltk.corpus import stopwords
import re
import pandas as pd
from LDistance import LDistance as LDistance
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
def stringDevowelize(strArg):
    """Return strArg with every lowercase 'a', 'e', 'i', 'o' and 'u' removed.

    Only those five characters are filtered; uppercase or accented vowels
    are not in the removal set and pass through unchanged.
    """
    tupVowels = ('a', 'e', 'i', 'o', 'u')
    return "".join(strChar for strChar in strArg if strChar not in tupVowels)

In [3]:
def csvFreqListLDistances(strSrcLangFile, strSrcLang, strTarLang, strSrcLangCode, strTarLangCode, intWordCount=1000):
    """Translate the most frequent non-stopwords of a language and score each pair.

    Reads a frequency list (one entry per line, the word being the text before
    the first space), keeps the first ``intWordCount`` words that are not NLTK
    stopwords for ``strSrcLang``, translates each one with GoogleTranslator,
    and computes ``LDistance.floatLDistance`` on the devowelized source word
    and devowelized translation. The resulting table is printed, written to
    ``<first 3 chars of strSrcLang><first 3 chars of strTarLang>1.csv`` and
    returned.

    Parameters
    ----------
    strSrcLangFile : str
        Path to the UTF-8 frequency list.
    strSrcLang : str
        Source language name; used (lowercased) to select the NLTK stopword
        list, and as-is for the first column name and the CSV file name.
    strTarLang : str
        Target language name; used for the second column name and the CSV
        file name.
    strSrcLangCode, strTarLangCode : str
        Language codes handed to GoogleTranslator as ``source`` / ``target``.
    intWordCount : int, optional
        Maximum number of words to keep (default 1000). If the file holds
        fewer qualifying words, all of them are used.

    Returns
    -------
    pandas.DataFrame
        Columns ``strSrcLang``, ``strTarLang`` and ``"Distances"``.
    """
    # NLTK's stopword files are named in lowercase, so lowercase the name to
    # keep e.g. "English" working on case-sensitive filesystems. A set makes
    # the per-word membership test constant-time.
    setStops = set(stopwords.words(strSrcLang.lower()))

    # Collect the first intWordCount non-stopword entries. Iterating the file
    # (instead of indexing a line list) ends cleanly when the file runs out,
    # and the with-block guarantees the handle is closed.
    listWords = []
    with open(strSrcLangFile, encoding="utf-8") as fileFreqList:
        for strLine in fileFreqList:
            if len(listWords) >= intWordCount:
                break
            # The word is everything before the first space; strip() drops the
            # newline that would otherwise stick to a line with no frequency.
            strWord = strLine.split(" ", 1)[0].strip().lower()
            if strWord and strWord not in setStops:
                listWords.append(strWord)

    # One translator instance is reused for every word.
    # NOTE(review): each translate() call presumably hits an online service,
    # so this loop dominates the run time - confirm before raising intWordCount.
    translator = GoogleTranslator(source=strSrcLangCode, target=strTarLangCode)
    listSrcTarWordLists = [
        [strWord, translator.translate(text=strWord).lower()]
        for strWord in listWords
    ]

    # LDistance.floatLDistance is invoked on the class itself, with the class
    # passed explicitly as its first argument, exactly as the project module
    # is used elsewhere in this notebook.
    listLDists = [
        LDistance.floatLDistance(LDistance, stringDevowelize(strAWord), stringDevowelize(strBWord))
        for strAWord, strBWord in listSrcTarWordLists
    ]

    # creates dataframe with columns: original word, translation, and l-distance
    df = pd.DataFrame(listSrcTarWordLists, columns=[strSrcLang, strTarLang])
    df["Distances"] = listLDists
    print(df)

    csvFileName = strSrcLang[0:3] + strTarLang[0:3] + "1.csv"
    # utf-8-sig writes a BOM at the start of the file.
    df.to_csv(csvFileName, encoding='utf-8-sig')

    # Return the frame so callers that assign the result receive the table.
    return df

In [4]:
def snsCreateHeatMap(listRatioValues, listLangNames, strFileSaveName):
    """Draw an annotated square heatmap of language-pair values and save it.

    Parameters
    ----------
    listRatioValues : sequence of sequences
        Square table of values: one row and one column per entry of
        ``listLangNames``.
    listLangNames : list of str
        Labels used for both the rows and the columns of the table.
    strFileSaveName : str
        Path handed to matplotlib's ``savefig``.

    Returns
    -------
    str or None
        An error message when the table is empty or is not
        ``len(listLangNames)`` x ``len(listLangNames)``; otherwise ``None``
        after the figure has been saved.
    """
    intSide = len(listLangNames)

    # Validate the shape row by row: comparing only the total cell count would
    # accept a non-square table and would raise on an empty one.
    if (
        intSide == 0
        or len(listRatioValues) != intSide
        or any(len(listRow) != intSide for listRow in listRatioValues)
    ):
        return "Incorrect number of data values, please edit listRatioValues."

    df = pd.DataFrame(listRatioValues, index=listLangNames, columns=listLangNames)

    # Plot the whole frame; df.head() would drop every row after the fifth.
    axHeatMap = sns.heatmap(df, annot=True, cmap='YlOrRd', cbar=True, fmt='.4f')

    # Label the axes before saving so the labels appear in the saved file.
    axHeatMap.set(xlabel='Source Language', ylabel='Target Language')
    plt.savefig(strFileSaveName)

In [5]:
# Build the English -> Spanish table from the en_50k frequency list; the call
# prints the frame and writes it to "EngSpa1.csv" in the working directory.
dfEngSpa = csvFreqListLDistances(
    strSrcLangFile="./freq_samples/en_50k.txt",
    strSrcLang="English",
    strTarLang="Spanish",
    strSrcLangCode="en",
    strTarLangCode="es",
)
dfEngSpa

      English    Spanish  Distances
0        know      saber   0.000000
1        like   me gusta   0.000000
2         get    obtener   0.250000
3          go      vamos   0.000000
4       right   correcto   0.400000
..        ...        ...        ...
995  drinking   bebiendo   0.166667
996     grand  grandioso   0.800000
997     worst    el peor   0.000000
998     match      juego   0.000000
999   nervous   nervioso   1.000000

[1000 rows x 3 columns]
