In [3]:
import pandas as pd
import numpy as np
import os
import re

In [163]:
current_dir = os.getcwd()

# Original Dataset 
filename = 'df_processed.csv'
filepath = os.path.normpath(os.path.join(current_dir, '../data/processed/', filename))


chunk_size = 10000
chunks = []

for chunk in pd.read_csv(filepath, chunksize=chunk_size):
    chunk.drop_duplicates(inplace=True)
    chunk.dropna(inplace=True)
    chunks.append(chunk)

df = pd.concat(chunks, ignore_index=True)

In [164]:
pd.options.mode.chained_assignment = None  # default='warn'

In [165]:
df.head()

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,lexeme_string,history_seen,history_correct,session_seen,session_correct
0,1.0,1362082032,444407,u:dDwF,es,en,73eecb492ca758ddab5371cf7b5cca32,bajo/bajo<pr>,3,3,1,1
1,1.0,1362082044,5963,u:FO,de,en,76390c1350a8dac31186187e2fe1e178,lernt/lernen<vblex><pri><p3><sg>,8,6,6,6
2,0.75,1362082044,5963,u:FO,de,en,7dfd7086f3671685e2cf1c1da72796d7,die/die<det><def><f><sg><nom>,6,5,4,3
3,0.888889,1362082044,5963,u:FO,de,en,35a54c25a2cda8127343f6a82e6f6b7d,mann/mann<n><m><sg><nom>,6,5,9,8
4,0.8,1362082044,5963,u:FO,de,en,0cf63ffe3dda158bc3dbd55682b355ae,frau/frau<n><f><sg><nom>,8,6,5,4


In [166]:
print(len(df))

12527558


In [167]:
# HYPOTHESIS 1 
""" Instead of the sparse indicator variables used here, it may be better to decompose lexeme tags 
into denser and more generic features of tag components (e.g., part of speech, tense, gender, case), 
and also use corpus frequency, word length, etc."""
lexeme_filepath = os.path.normpath(os.path.join(current_dir, '../data/resources', 'lexeme_reference.csv'))
lexeme_reference = pd.read_csv(lexeme_filepath, sep = ';', header=None, on_bad_lines='warn', 
                               names=["tag", "type", "description"])



In [168]:
lexemes_grouped = lexeme_reference.groupby('type')
lexemes_grouped.nunique()

Unnamed: 0_level_0,tag,description
type,Unnamed: 1_level_1,Unnamed: 2_level_1
POS,22,22
adjective,18,18
animacy,3,3
case,2,2
def,2,2
gender,5,5
number,4,4
other,14,14
person,3,3
propernoun,2,2


In [169]:
# Separate only the words to get new features
df_lexeme = df[['lexeme_string', 'learning_language']]

In [170]:
df_lexeme.drop_duplicates(inplace=True)

In [171]:
df_lexeme.head()

Unnamed: 0,lexeme_string,learning_language
0,bajo/bajo<pr>,es
1,lernt/lernen<vblex><pri><p3><sg>,de
2,die/die<det><def><f><sg><nom>,de
3,mann/mann<n><m><sg><nom>,de
4,frau/frau<n><f><sg><nom>,de


In [172]:
print(len(df_lexeme))

19009


In [173]:
def prepare_tags_reference(df): 
    df = df[df["type"].str.contains("adjective|animacy|other|propernoun|case") == False]
    tags_dict = df.set_index('tag')['type'].to_dict()
    types = set(tags_dict.values())
    return tags_dict, types

In [174]:
tags_dict, types = prepare_tags_reference(lexeme_reference)

In [175]:
for lexeme_type in types:
    df_lexeme[lexeme_type] = None

In [176]:
def extract_from_lexemestring(lexeme_string):
    tags = re.findall(r'<(.*?)>', lexeme_string)
    return tags 

df_lexeme['tags'] = df_lexeme['lexeme_string'].apply(extract_from_lexemestring)

In [177]:
df_lexeme

Unnamed: 0,lexeme_string,learning_language,gender,tense,POS,def,person,number,tags
0,bajo/bajo<pr>,es,,,,,,,[pr]
1,lernt/lernen<vblex><pri><p3><sg>,de,,,,,,,"[vblex, pri, p3, sg]"
2,die/die<det><def><f><sg><nom>,de,,,,,,,"[det, def, f, sg, nom]"
3,mann/mann<n><m><sg><nom>,de,,,,,,,"[n, m, sg, nom]"
4,frau/frau<n><f><sg><nom>,de,,,,,,,"[n, f, sg, nom]"
...,...,...,...,...,...,...,...,...,...
12494521,conférence/conférence<n><f><sg>,fr,,,,,,,"[n, f, sg]"
12494524,liens/lien<n><m><pl>,fr,,,,,,,"[n, m, pl]"
12505951,foi/ir<vblex><ifi><p3><sg>,pt,,,,,,,"[vblex, ifi, p3, sg]"
12514074,will/will<n><sg>,en,,,,,,,"[n, sg]"


In [178]:
# Some of the words contain "<sf>" at the begginig, there is no reference to it in lexeme_reference 
# The original word is restored and "sf" is then deleted 
# No idea what it means
df_lexeme[df_lexeme['lexeme_string'].str.contains("<*sf>")].head()

Unnamed: 0,lexeme_string,learning_language,gender,tense,POS,def,person,number,tags
57,<*sf>/traje<n><m><*numb>,pt,,,,,,,"[*sf, n, m, *numb]"
58,<*sf>/chapéu<n><m><*numb>,pt,,,,,,,"[*sf, n, m, *numb]"
59,<*sf>/fantasia<n><f><*numb>,pt,,,,,,,"[*sf, n, f, *numb]"
60,<*sf>/sapato<n><m><*numb>,pt,,,,,,,"[*sf, n, m, *numb]"
61,<*sf>/agasalho<n><m><*numb>,pt,,,,,,,"[*sf, n, m, *numb]"


In [179]:
# Some of the words show as duplicate later, because after splitting the tags, some of them are lost
# For example there is no "acc" in lexeme_reference as well as "nom"
# So in the effect both words contain only tags "n", "f", "sg" and get flagged as duplicates
# They will be deleted 

df_lexeme[df_lexeme['lexeme_string'].str.contains("freundin")]

Unnamed: 0,lexeme_string,learning_language,gender,tense,POS,def,person,number,tags
130,freundin/freundin<n><f><sg><nom>,de,,,,,,,"[n, f, sg, nom]"
135,freundin/freundin<n><f><sg><acc>,de,,,,,,,"[n, f, sg, acc]"


In [180]:
def assign_tags(tags):
    values = {'gender': np.nan, 'POS': np.nan, 'def': np.nan, 'tense':np.nan, 'person':np.nan, 'number':np.nan}
    for tag in tags:
        col = tags_dict.get(tag)
        if col and pd.isna(values[col]):  # Only assign if column is empty 
            values[col] = tag
    return pd.Series([values['gender'], values['POS'],  values['def'], values['tense'], values['person'], values['number']])

df_with_tags = df_lexeme.copy()
df_with_tags[['gender', 'POS', 'def', 'tense', 'person', 'number']] = df_with_tags['tags'].apply(assign_tags)

In [181]:
is_sf =  df_with_tags['lexeme_string'].str.contains("<*sf>")
df_sf = df_with_tags[is_sf]
df_without_sf = df_with_tags[~is_sf]

In [182]:
df_without_sf['word'] = df_without_sf['lexeme_string'].str.split("/").str[0]
df_sf['word'] = df_sf['lexeme_string'].str.split("/").str[1].str.split("<").str[0]

In [197]:
df_with_tags_final = pd.concat([df_without_sf, df_sf])

In [198]:
df_with_tags_final

Unnamed: 0,lexeme_string,learning_language,gender,tense,POS,def,person,number,tags,word
0,bajo/bajo<pr>,es,,,pr,,,,[pr],bajo
1,lernt/lernen<vblex><pri><p3><sg>,de,,pri,vblex,,p3,sg,"[vblex, pri, p3, sg]",lernt
2,die/die<det><def><f><sg><nom>,de,f,,det,def,,sg,"[det, def, f, sg, nom]",die
3,mann/mann<n><m><sg><nom>,de,m,,n,,,sg,"[n, m, sg, nom]",mann
4,frau/frau<n><f><sg><nom>,de,f,,n,,,sg,"[n, f, sg, nom]",frau
...,...,...,...,...,...,...,...,...,...,...
12241994,<*sf>/ausgabe<n><f><*numb><*case>,de,f,,n,,,,"[*sf, n, f, *numb, *case]",ausgabe
12260972,<*sf>/acontecer<vblex><prs><*pers><*numb>,pt,,prs,vblex,,,,"[*sf, vblex, prs, *pers, *numb]",acontecer
12303845,<*sf>/pegar<vblex><prs><*pers><*numb>,pt,,prs,vblex,,,,"[*sf, vblex, prs, *pers, *numb]",pegar
12315787,<*sf>/gara<n><f><*numb>,it,f,,n,,,,"[*sf, n, f, *numb]",gara


In [199]:
df_with_tags_final[df_with_tags_final['word']=='freundin']

Unnamed: 0,lexeme_string,learning_language,gender,tense,POS,def,person,number,tags,word
130,freundin/freundin<n><f><sg><nom>,de,f,,n,,,sg,"[n, f, sg, nom]",freundin
135,freundin/freundin<n><f><sg><acc>,de,f,,n,,,sg,"[n, f, sg, acc]",freundin


In [200]:
df_with_tags_final['word_len'] = df_with_tags_final['word'].apply(lambda x: len(x))

# Drop columns  
df_with_tags_final.drop(columns=['tags'], inplace=True)

In [201]:
df_with_tags_final.drop_duplicates(inplace=True)

In [202]:
df_with_tags_final

Unnamed: 0,lexeme_string,learning_language,gender,tense,POS,def,person,number,word,word_len
0,bajo/bajo<pr>,es,,,pr,,,,bajo,4
1,lernt/lernen<vblex><pri><p3><sg>,de,,pri,vblex,,p3,sg,lernt,5
2,die/die<det><def><f><sg><nom>,de,f,,det,def,,sg,die,3
3,mann/mann<n><m><sg><nom>,de,m,,n,,,sg,mann,4
4,frau/frau<n><f><sg><nom>,de,f,,n,,,sg,frau,4
...,...,...,...,...,...,...,...,...,...,...
12241994,<*sf>/ausgabe<n><f><*numb><*case>,de,f,,n,,,,ausgabe,7
12260972,<*sf>/acontecer<vblex><prs><*pers><*numb>,pt,,prs,vblex,,,,acontecer,9
12303845,<*sf>/pegar<vblex><prs><*pers><*numb>,pt,,prs,vblex,,,,pegar,5
12315787,<*sf>/gara<n><f><*numb>,it,f,,n,,,,gara,4


In [203]:
# Add SUBTLEX
folderpath = os.path.normpath(os.path.join(current_dir, '../data/resources/SUBTLEX'))
def prepare_subtlex(folderpath):
    dfs = []
    for filename in os.listdir(folderpath): 
        if filename.endswith(".txt"):
            language = os.path.splitext(filename)[0].split('_')[-2]
            filepath = os.path.join(folderpath, filename)
            df = pd.read_csv(filepath, on_bad_lines = 'skip', sep=' ', names=['word', 'SUBTLEX'])
            df["learning_language"] = language
            dfs.append(df)
    df = pd.concat(dfs, ignore_index=True)
    return df


df_subtlex = prepare_subtlex(folderpath)

In [204]:
word_complexity_df = df_with_tags_final.merge(df_subtlex, on = ['word', 'learning_language'], how='left')

In [205]:
word_complexity_df_final = word_complexity_df[word_complexity_df['SUBTLEX'].notna()]


In [206]:
word_complexity_df_final

Unnamed: 0,lexeme_string,learning_language,gender,tense,POS,def,person,number,word,word_len,SUBTLEX
0,bajo/bajo<pr>,es,,,pr,,,,bajo,4,111241.0
1,lernt/lernen<vblex><pri><p3><sg>,de,,pri,vblex,,p3,sg,lernt,5,3391.0
2,die/die<det><def><f><sg><nom>,de,f,,det,def,,sg,die,3,2484854.0
3,mann/mann<n><m><sg><nom>,de,m,,n,,,sg,mann,4,222707.0
4,frau/frau<n><f><sg><nom>,de,f,,n,,,sg,frau,4,143725.0
...,...,...,...,...,...,...,...,...,...,...,...
19004,<*sf>/ausgabe<n><f><*numb><*case>,de,f,,n,,,,ausgabe,7,947.0
19005,<*sf>/acontecer<vblex><prs><*pers><*numb>,pt,,prs,vblex,,,,acontecer,9,66924.0
19006,<*sf>/pegar<vblex><prs><*pers><*numb>,pt,,prs,vblex,,,,pegar,5,12910.0
19007,<*sf>/gara<n><f><*numb>,it,f,,n,,,,gara,4,10084.0


In [207]:
filepath = os.path.normpath(os.path.join(current_dir, '../data/features/'))
word_complexity_df_final.to_csv(os.path.join(filepath, 'word_complexity_features.csv'), sep='\t', index=False, header=True)

In [79]:
df_lexeme.groupby(df['learning_language']).size() # ES, DE and FR have the most words in dataset

learning_language
de    2720
en    2010
es    2828
fr    2523
it    1102
pt    1079
dtype: int64