In [3]:
import pandas as pd
import numpy as np
import os
import re

In [4]:
current_dir = os.getcwd()

# Original Dataset 
filename = 'df_processed.csv'
filepath = os.path.normpath(os.path.join(current_dir, '../data/processed/', filename))


chunk_size = 10000
chunks = []

for chunk in pd.read_csv(filepath, chunksize=chunk_size):
    chunk.drop_duplicates(inplace=True)
    chunk.dropna(inplace=True)
    chunks.append(chunk)

df = pd.concat(chunks, ignore_index=True)

In [5]:
pd.options.mode.chained_assignment = None  # default='warn'

In [6]:
df

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,lexeme_string,history_seen,history_correct,session_seen,session_correct
0,1.000000,1362082032,444407,u:dDwF,es,en,73eecb492ca758ddab5371cf7b5cca32,bajo/bajo<pr>,3,3,1,1
1,1.000000,1362082044,5963,u:FO,de,en,76390c1350a8dac31186187e2fe1e178,lernt/lernen<vblex><pri><p3><sg>,8,6,6,6
2,0.750000,1362082044,5963,u:FO,de,en,7dfd7086f3671685e2cf1c1da72796d7,die/die<det><def><f><sg><nom>,6,5,4,3
3,0.888889,1362082044,5963,u:FO,de,en,35a54c25a2cda8127343f6a82e6f6b7d,mann/mann<n><m><sg><nom>,6,5,9,8
4,0.800000,1362082044,5963,u:FO,de,en,0cf63ffe3dda158bc3dbd55682b355ae,frau/frau<n><f><sg><nom>,8,6,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...
12527553,0.800000,1363104897,368,u:i5D8,en,it,d5efc552aaea3109eb5388aa1ec8673d,the/the<det><def><sp>,6,4,5,4
12527554,0.800000,1363104897,368,u:i5D8,en,it,a826c47947d68549fa81e19cafa57ba0,eat/eat<vblex><pres>,4,4,5,4
12527555,1.000000,1363104897,368,u:i5D8,en,it,5e29d77697d23070a1fb92eb6c90e9b6,bread/bread<n><sg>,4,4,4,4
12527556,0.600000,1363104897,368,u:i5D8,en,it,cdfecc9247566d40bb964a218c54c783,drink/drink<vblex><pres>,3,2,5,3


In [7]:
# HYPOTHESIS 1 
""" Instead of the sparse indicator variables used here, it may be better to decompose lexeme tags 
into denser and more generic features of tag components (e.g., part of speech, tense, gender, case), 
and also use corpus frequency, word length, etc."""
lexeme_filepath = os.path.normpath(os.path.join(current_dir, '../data/resources', 'lexeme_reference.csv'))
lexeme_reference = pd.read_csv(lexeme_filepath, sep = ';', header=None, on_bad_lines='warn', 
                               names=["tag", "type", "description"])



In [8]:
lexemes_grouped = lexeme_reference.groupby('type')
lexemes_grouped.nunique()

Unnamed: 0_level_0,tag,description
type,Unnamed: 1_level_1,Unnamed: 2_level_1
POS,22,22
adjective,18,18
animacy,3,3
case,2,2
def,2,2
gender,5,5
number,4,4
other,14,14
person,3,3
propernoun,2,2


In [55]:
# Separate only the words to get new features
df_lexeme = df[['lexeme_string', 'learning_language']]

In [56]:
df_lexeme.drop_duplicates(inplace=True)

In [57]:
df_lexeme.head()

Unnamed: 0,lexeme_string,learning_language
0,bajo/bajo<pr>,es
1,lernt/lernen<vblex><pri><p3><sg>,de
2,die/die<det><def><f><sg><nom>,de
3,mann/mann<n><m><sg><nom>,de
4,frau/frau<n><f><sg><nom>,de


In [58]:
def prepare_tags_reference(df): 
    df = df[df["type"].str.contains("adjective|animacy|other|propernoun|case") == False]
    tags_dict = df.set_index('tag')['type'].to_dict()
    types = set(tags_dict.values())
    return tags_dict, types

In [59]:
tags_dict, types = prepare_tags_reference(lexeme_reference)

In [60]:
for lexeme_type in types:
    df_lexeme[lexeme_type] = None

In [61]:
def extract_from_lexemestring(lexeme_string):
    tags = re.findall(r'<(.*?)>', lexeme_string)
    return tags 

df_lexeme['tags'] = df_lexeme['lexeme_string'].apply(extract_from_lexemestring)

In [62]:
df_lexeme.head()

Unnamed: 0,lexeme_string,learning_language,gender,tense,POS,def,person,number,tags
0,bajo/bajo<pr>,es,,,,,,,[pr]
1,lernt/lernen<vblex><pri><p3><sg>,de,,,,,,,"[vblex, pri, p3, sg]"
2,die/die<det><def><f><sg><nom>,de,,,,,,,"[det, def, f, sg, nom]"
3,mann/mann<n><m><sg><nom>,de,,,,,,,"[n, m, sg, nom]"
4,frau/frau<n><f><sg><nom>,de,,,,,,,"[n, f, sg, nom]"


In [64]:
df_lexeme.iloc[44,:]

lexeme_string        <*sf>/traje<n><m><*numb>
learning_language                          pt
gender                                   None
tense                                    None
POS                                      None
def                                      None
person                                   None
number                                   None
tags                       [*sf, n, m, *numb]
Name: 57, dtype: object

In [65]:
def assign_tags(tags):
    values = {'gender': np.nan, 'POS': np.nan, 'def': np.nan, 'tense':np.nan, 'person':np.nan, 'number':np.nan}
    for tag in tags:
        col = tags_dict.get(tag)
        if col and pd.isna(values[col]):  # Only assign if column is empty 
            values[col] = tag
    return pd.Series([values['gender'], values['POS'],  values['def'], values['tense'], values['person'], values['number']])

df_lexeme[['gender', 'POS', 'def', 'tense', 'person', 'number']] = df_lexeme['tags'].apply(assign_tags)

In [66]:
df_lexeme

Unnamed: 0,lexeme_string,learning_language,gender,tense,POS,def,person,number,tags
0,bajo/bajo<pr>,es,,,pr,,,,[pr]
1,lernt/lernen<vblex><pri><p3><sg>,de,,pri,vblex,,p3,sg,"[vblex, pri, p3, sg]"
2,die/die<det><def><f><sg><nom>,de,f,,det,def,,sg,"[det, def, f, sg, nom]"
3,mann/mann<n><m><sg><nom>,de,m,,n,,,sg,"[n, m, sg, nom]"
4,frau/frau<n><f><sg><nom>,de,f,,n,,,sg,"[n, f, sg, nom]"
...,...,...,...,...,...,...,...,...,...
12494521,conférence/conférence<n><f><sg>,fr,f,,n,,,sg,"[n, f, sg]"
12494524,liens/lien<n><m><pl>,fr,m,,n,,,pl,"[n, m, pl]"
12505951,foi/ir<vblex><ifi><p3><sg>,pt,,ifi,vblex,,p3,sg,"[vblex, ifi, p3, sg]"
12514074,will/will<n><sg>,en,,,n,,,sg,"[n, sg]"


In [67]:
df_lexeme['word'] = df_lexeme['lexeme_string'].str.split("/").str[0]
df_lexeme['word_len'] = df_lexeme['word'].apply(lambda x: len(x))

# Drop columns  
df_lexeme.drop(columns=['tags', 'lexeme_string'], inplace=True)

In [68]:
df_lexeme.drop_duplicates(inplace=True)

In [69]:
df_lexeme.head()

Unnamed: 0,learning_language,gender,tense,POS,def,person,number,word,word_len
0,es,,,pr,,,,bajo,4
1,de,,pri,vblex,,p3,sg,lernt,5
2,de,f,,det,def,,sg,die,3
3,de,m,,n,,,sg,mann,4
4,de,f,,n,,,sg,frau,4


In [70]:
# Add SUBTLEX
folderpath = os.path.normpath(os.path.join(current_dir, '../data/resources/SUBTLEX'))
def prepare_subtlex(folderpath):
    dfs = []
    for filename in os.listdir(folderpath): 
        if filename.endswith(".txt"):
            language = os.path.splitext(filename)[0].split('_')[-2]
            filepath = os.path.join(folderpath, filename)
            df = pd.read_csv(filepath, on_bad_lines = 'skip', sep=' ', names=['word', 'SUBTLEX'])
            df["learning_language"] = language
            dfs.append(df)
    df = pd.concat(dfs, ignore_index=True)
    return df


df_subtlex = prepare_subtlex(folderpath)

In [71]:
word_complexity_df = df_lexeme.merge(df_subtlex, on = ['word', 'learning_language'], how='left')

In [73]:
word_complexity_df[word_complexity_df['SUBTLEX'].isna()]

Unnamed: 0,learning_language,gender,tense,POS,def,person,number,word,word_len,SUBTLEX
44,pt,m,,n,,,,<*sf>,5,
45,pt,f,,n,,,,<*sf>,5,
46,pt,,pri,vblex,,,,<*sf>,5,
85,de,,,n,,,,hähnchensandwich,16,
188,fr,f,,n,,,,<*sf>,5,
...,...,...,...,...,...,...,...,...,...,...
11733,it,,fti,vbmod,,,,<*sf>,5,
11852,pt,,pis,vblex,,,,<*sf>,5,
11853,pt,,pis,vbhaver,,,,<*sf>,5,
12136,it,,imp,vbmod,,,,<*sf>,5,


In [74]:
word_complexity_df_final = word_complexity_df[word_complexity_df['SUBTLEX'].notna()]


In [75]:
word_complexity_df_final

Unnamed: 0,learning_language,gender,tense,POS,def,person,number,word,word_len,SUBTLEX
0,es,,,pr,,,,bajo,4,111241.0
1,de,,pri,vblex,,p3,sg,lernt,5,3391.0
2,de,f,,det,def,,sg,die,3,2484854.0
3,de,m,,n,,,sg,mann,4,222707.0
4,de,f,,n,,,sg,frau,4,143725.0
...,...,...,...,...,...,...,...,...,...,...
12257,fr,f,,n,,,sg,conférence,10,8019.0
12258,fr,m,,n,,,pl,liens,5,5062.0
12259,pt,,ifi,vblex,,p3,sg,foi,3,752009.0
12260,en,,,n,,,sg,will,4,1969807.0


In [24]:
filepath = os.path.normpath(os.path.join(current_dir, '../data/features/'))
word_complexity_df.to_csv(os.path.join(filepath, 'word_complexity_features.csv'), sep='\t', index=False, header=True)

In [25]:
df_lexeme.groupby(df['learning_language']).size() # ES, DE and FR have the most words in dataset

learning_language
de    2720
en    2010
es    2828
fr    2523
it    1102
pt    1079
dtype: int64