In [51]:
import pandas as pd
import wordfreq

### Read Words

We have created a .csv file containing the 1000 most frequent English words, each translated into 10 different languages:
 - English
 - Catalan
 - Spanish
 - German
 - French
 - Polish
 - Portuguese
 - Russian
 - Italian
 - Swedish

In [52]:
# Load the translation table and normalise every cell to a lowercase string.
# NOTE(review): astype(str) also turns missing cells into text (presumably 'nan') — confirm the CSV has no gaps.
df = (
    pd.read_csv('data/data.csv', sep=',')
    .apply(lambda column: column.astype(str).str.lower())
)

### Create Dataset

We build a new dataset by concatenating the words of all languages into a single column, and add a second column containing the language each word belongs to.

In [53]:
# Reshape the wide table (one column per language) into a long one: a 'word'
# column plus a 'lang' column holding the first three characters of the name
# of the column the word came from.
dfs = []
for lang in df.columns:
    df_lang = df[lang].to_frame(name='word').assign(lang=lang[:3])
    dfs.append(df_lang)
df = pd.concat(dfs, ignore_index=True)
df.head()

Unnamed: 0,word,lang
0,as,ang
1,his,ang
2,that,ang
3,he,ang
4,was,ang


### Feature Engineering

In this section we create new features to try to have more information for better predictions.

We've created different "types" of features:
 - Character counting
 - Groups of characters
 - Prefixes and Suffixes
 - Zipf's Law 

Having more information about the words will be useful in order to predict the language of the word. 

We must keep in mind that each of our features only captures a subset of a language's characteristics. For example, almost every word that contains the letter "ñ" is a Spanish word, but not all Spanish words contain the letter "ñ". 
So this feature will help us predict the words that *do* contain an "ñ", but not the ones that don't. Our objective is to create enough features to cover as many of each language's characteristics as possible. This will increase the chances of correctly predicting the language of a word. 

Add Word Length Feature

In [54]:
# Word length: number of characters in each word
df = df.assign(len=df['word'].str.len())

Count Special Characters in word
We've created several new features based on the characters in the words. 
These characteristics are divided into two groups: 
 - *General features:* features like the number of vowels, accents, number of spaces, etc
 - *Language specific features:* the number of special characters specific to the language, like the number of "ñ", "ç", "é", etc 

In [55]:
# Feature name -> characters whose occurrences are counted in each word.
# The mapping is deliberately not named `dict`: binding that name would shadow
# the builtin `dict` for every cell executed afterwards in the same kernel.
special_characters = {
    # General features
    'vow':'aeiou', 
    'num_words': ' ',  # counts the spaces in the entry, i.e. one less than its number of words
    "apos": "'",
    "hyph": "-",
    "rares": "kqwxyz",
    'acc':'àèìòùáéíóú', 
    'accl':'àèìòù', 
    'accr':'áéíóú', 
    'die':'äëïöü',
    'cir':'âêîôû', 
    # Language-specific characters
    'ñ':'ñ', 
    'ç':'ç', 
    'ale': 'ß', 
    'rus': 'бвгджзийклмнпрстфцчшщъыьэюя',
    'pol': 'ąćęłńóśźż', 
    'por': 'ãõ', 
    'sue': 'åäö',
    'esp': 'áéíóúü',
    'ita': 'àèéìíîòóùú',
    'fra': 'àâæçéèêëîïôœùûüÿ',
    'ger': 'äöüß',
    'cat': 'àèéíïòóúüç'
}


def count_special_characters(word: str, string: str) -> int:
    """Count how many characters of `word` appear in `string`.

    Every position of `word` is checked independently, so a character that
    occurs several times in `word` is counted once per occurrence.

    Args:
        word: text to scan.
        string: the set of characters to look for, given as a plain string.

    Returns:
        Number of characters of `word` that are contained in `string`.
    """
    return sum(1 for char in word if char in string)


# Add one count column per entry of the mapping, named after its key.
for column, characters in special_characters.items():
    df[column] = df['word'].apply(count_special_characters, args=(characters,))

We check for common prefixes in different languages, to try to obtain more information about the language.

In [56]:
# Feature name -> common prefixes of that language.
# Each list is written as one whitespace-separated string and split into its
# individual prefixes (no prefix contains whitespace, so the split is exact).
common_prefixes = {
    'pre_eng': "anti be de dis en ex im in non pre re un".split(),
    'pre_esp': "anti auto contra des en ex in inter pre re sub trans".split(),
    'pre_cat': "anti ab avant arxi dia hemi auto contra des en ex in inter pre re sub trans".split(),
    'pre_ita': "auto dis en ex im in ir mal per pre pro re sott sotto tran ab".split(),
    'pre_fra': "anti auto co con contre de des en ex in inter mal pre pro re sub sur".split(),
    'pre_por': "auto co contra des em en ex in inter pre pro re sub".split(),
    'pre_ale': "be ein ent er ge hin ver zer".split(),
    'pre_sue': "be för in om över under".split(),
    'pre_pol': "przed nad na pod z w".split(),
    'pre_rus': "анти без в во до за из над пере под по пре раз".split(),
}

def has_prefix(word: str, prefixes: list) -> int:
    """Tell whether `word` starts with any of the given prefixes.

    Args:
        word: text to test.
        prefixes: candidate prefixes (any iterable of strings).

    Returns:
        1 if `word` starts with at least one entry of `prefixes`, otherwise 0
        (including when `prefixes` is empty).
    """
    # str.startswith accepts a tuple of candidates and is true if any of them matches.
    return int(word.startswith(tuple(prefixes)))

# Add one 0/1 column per entry of common_prefixes, named after its key.
for column, prefixes in common_prefixes.items():
    df[column] = df['word'].apply(has_prefix, args=(prefixes,))

Similarly to the case above, we check for common suffixes in the different languages.

In [57]:
# Feature name -> common suffixes of that language.
# Entries must be separated by commas: two adjacent string literals are silently
# concatenated by Python (a missing comma used to turn "nça" "on" into "nçaon",
# so the suffix "on" was never tested). Repeated entries have been removed; they
# had no effect on the 0/1 suffix feature.
common_suffixes = {
    'suf_eng': ["able", "al", "ation", "er", "est", "ful", "ing", "ion", "ive", "less", "ly", "ness", "ous", "s", "y"],
    'suf_esp': ["ado", "ador", "aje", "anza", "ar", "ario", "ero", "iente", "illa", "ina", "izar", "oso", "ón", "udo", "er", "ir"],
    'suf_cat': ["ana", "aca", "ada", "al", "am", "ador", "tge", "isme", "nça", "ar", "ista", "istic", "mente", "ment", "ina", "tzar", "on", "um", "ut", "uda", "er", "ir", "re"],
    'suf_ita': ["abile", "are", "ario", "atore", "azione", "ente", "evole", "ificare", "ivo", "izzare", "ore", "orente", "orevole", "oso", "ura"],
    'suf_fra': ["age", "aille", "ance", "eau", "eux", "eur", "eurse", "ie", "iment", "ion", "ique", "isme", "iste", "ition", "ive", "oire", "ure", "y"],
    'suf_por': ["al", "ão", "ar", "ês", "ência", "eza", "ia", "ício", "imento", "ir", "or", "oso", "ura"],
    'suf_ale': ["bar", "e", "ei", "er", "heit", "ich", "ig", "in", "keit", "lich", "ling", "sam", "schaft", "ung"],
    'suf_sue': ["ande", "are", "bar", "dom", "else", "en", "eri", "het", "ing", "isk", "itet", "lig", "lighet", "ning", "ningen", "ningar"],
    # NOTE(review): only five distinct Polish suffixes were ever listed ("enie" was repeated 12 times) — consider extending.
    'suf_pol': ["acja", "ać", "anie", "eć", "enie"],
    'suf_rus': ["больше", "енький", "ик", "ичка", "ок", "онок", "ушко", "ца", "чек", "шка", "шко", "ящик", "ец", "ин", "ист", "ник", "овец", "щик", "ёнок", "ь"]
}

def has_suffix(word: str, suffixes: list) -> int:
    """Tell whether `word` ends with any of the given suffixes.

    Args:
        word: text to test.
        suffixes: candidate suffixes (any iterable of strings).

    Returns:
        1 if `word` ends with at least one entry of `suffixes`, otherwise 0
        (including when `suffixes` is empty).
    """
    # str.endswith accepts a tuple of candidates and is true if any of them matches.
    return int(word.endswith(tuple(suffixes)))

# Add one 0/1 column per entry of common_suffixes, named after its key.
for column, suffixes in common_suffixes.items():
    df[column] = df['word'].apply(has_suffix, args=(suffixes,))

We check each word for letter groups commonly used by each language. We also check for contiguous vowel pairs.


In [58]:
def diptongos() -> list:
    """Build every ordered pair of two vowels.

    The vowel alphabet is the plain vowels plus their grave, acute, diaeresis
    and circumflex variants. Pairs are generated with the first vowel varying
    slowest, and a vowel paired with itself (e.g. "aa") is included.

    Returns:
        List of two-character strings, one per ordered pair of vowels.
    """
    vowels = "aeiouàèìòùáéíóúäëïöüâêîôû"
    # A comprehension avoids a local named `list`, which would shadow the builtin.
    return [first + second for first in vowels for second in vowels]

# Feature name -> letter groups commonly used in that language.
# Each entry must appear only once per list: the group feature adds one for
# every listed entry found in a word, so a repeated entry would be counted
# twice. Repeats of "tr" (eng), "nc"/"pr" (cat) and "ch"/"gn"/"io" (ita) were removed.
groups = {
        "pairs_eng": ["sh", "th", "ch", "ck", "ph", "ng", "qu", "tr", "st", "wh"],
        "pairs_cat": ["ny", "tx", "sc", "nc", "rc", "ll", "pc", "pr", "br", "fr", "ts", "ix", "nd", "bl"],
        "pairs_esp": ["nd", "nt", "ch", "rr", "ll", "qu", "gu", "nc", "mb", "pr"],
        "pairs_ger": ["tch", "ck", "ng", "qu", "tz", "ss", "st", "sp", "str", "sch"],
        "pairs_por": ["tch", "lh", "nh", "qu", "sc", "rr", "nc", "gu", "lm", "rm"],
        # NOTE(review): "d±" looks like a mis-encoded character, and " Sc" contains an
        # uppercase letter, so it can never match the lowercased words — confirm the intended groups.
        "pairs_pol": ["ch", "dz", "d±", "di", "rz", "sz", " Sc", "ed", "id"],
        "pairs_ita": ["ch", "gl", "gn", "sc", "qu", "scl", "ci", "gli", "io", "la", "leu", "ii", "ne"],
        "pairs_swe": ["ch", "ck", "cid", "dt", "gg", "ll", "ng", "sk", "st", "tt"],
        "pairs_fre": ["ch", "che", "eau", "ent", "es", "ette", "eur", "iau", "ie", "in"],
        # Every ordered pair of two vowels
        "diptongos": diptongos()
        }

def count_group(word: str, groups: list) -> int:
    """Count how many entries of `groups` occur somewhere in `word`.

    Each entry contributes at most once, no matter how many times it appears
    in `word`; an entry listed twice in `groups` is counted twice.

    Args:
        word: text to scan.
        groups: letter groups (substrings) to look for.

    Returns:
        Number of entries of `groups` that are substrings of `word`.
    """
    return sum(1 for candidate in groups if candidate in word)

# Add one count column per entry of groups, named after its key.
for column, letter_groups in groups.items():
    df[column] = df['word'].apply(count_group, args=(letter_groups,))

Check Word Frequency in each language

To do this we rely on Zipf's law, which states that the frequency of a word is inversely proportional to its rank in the frequency table. For every language we store the value returned by `wordfreq.zipf_frequency` for the word.

In [59]:
# Language codes passed to wordfreq, one per language in the dataset
language_codes = ['en', 'es', 'ca', 'it', 'fr', 'pt', 'de', 'sv', 'pl', 'ru']

# One '<code>_freq' column per language, holding wordfreq.zipf_frequency(word, code)
for code in language_codes:
    df[f'{code}_freq'] = df['word'].apply(wordfreq.zipf_frequency, args=(code,))

Categorize Languages

In [60]:
# Encode the language labels as integer category codes (0 to 9)
lang_categories = df['lang'].astype('category')
df['lang'] = lang_categories.cat.codes

Save processed dataset

In [62]:
# Persist the feature table (the row index is written as the first, unnamed CSV column), then preview it
df.to_csv('data/final.csv', index=True)
df.head(20)

Unnamed: 0,word,lang,len,vow,num_words,apos,hyph,rares,acc,accl,...,en_freq,es_freq,ca_freq,it_freq,fr_freq,pt_freq,de_freq,sv_freq,pl_freq,ru_freq
0,as,1,2,1,0,0,0,0,0,0,...,6.77,4.66,4.22,4.26,5.79,6.73,4.57,4.66,4.51,3.89
1,his,1,3,1,0,0,0,0,0,0,...,6.51,3.63,3.12,3.67,3.7,3.94,3.93,3.89,3.82,3.62
2,that,1,4,1,0,0,0,0,0,0,...,7.01,4.12,3.67,4.24,4.12,4.38,4.56,4.5,4.4,3.98
3,he,1,2,1,0,0,0,0,0,0,...,6.69,5.77,5.87,4.0,4.02,4.01,4.36,4.32,4.39,3.74
4,was,1,3,1,0,0,0,1,0,0,...,6.82,3.84,3.78,3.94,3.83,3.99,6.49,4.01,5.57,3.76
5,for,1,3,1,0,0,0,0,0,0,...,7.01,4.66,4.73,4.86,4.6,5.62,4.88,4.94,4.98,4.71
6,on,1,2,1,0,0,0,0,0,0,...,6.91,4.66,6.18,4.98,6.71,5.17,4.95,4.93,5.98,4.49
7,are,1,3,2,0,0,0,0,0,0,...,6.74,4.07,3.67,4.22,4.05,4.32,4.29,4.42,4.42,3.94
8,with,1,4,1,0,0,0,1,0,0,...,6.85,4.2,4.24,4.38,4.4,4.67,4.58,4.52,4.52,4.22
9,they,1,4,1,0,0,0,1,0,0,...,6.5,3.61,3.42,3.76,3.6,3.76,3.83,3.82,3.93,3.45
