In [1]:
import pandas as pd
import numpy as np
import os
import re

In [13]:
# Paths are resolved relative to the directory the notebook is started from.
current_dir = os.getcwd()

# Raw Duolingo learning-trace dataset.
filename = '13 million Duolingo student learning traces.csv'
filepath = os.path.normpath(os.path.join(current_dir, '../data/raw/', filename))

# Read the CSV in fixed-size pieces and clean each piece before keeping it.
# Note: duplicates are removed within each chunk only, not across chunk boundaries.
chunk_size = 10000
chunks = [
    piece.drop_duplicates().dropna()
    for piece in pd.read_csv(filepath, chunksize=chunk_size)
]

# Stitch the cleaned pieces together under a fresh 0..N-1 index.
df = pd.concat(chunks, ignore_index=True)

In [14]:
# Turns off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None  # default='warn'

In [15]:
df.head()

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,lexeme_string,history_seen,history_correct,session_seen,session_correct
0,1.0,1362076081,27649635,u:FO,de,en,76390c1350a8dac31186187e2fe1e178,lernt/lernen<vblex><pri><p3><sg>,6,4,2,2
1,0.5,1362076081,27649635,u:FO,de,en,7dfd7086f3671685e2cf1c1da72796d7,die/die<det><def><f><sg><nom>,4,4,2,1
2,1.0,1362076081,27649635,u:FO,de,en,35a54c25a2cda8127343f6a82e6f6b7d,mann/mann<n><m><sg><nom>,5,4,1,1
3,0.5,1362076081,27649635,u:FO,de,en,0cf63ffe3dda158bc3dbd55682b355ae,frau/frau<n><f><sg><nom>,6,5,2,1
4,1.0,1362076081,27649635,u:FO,de,en,84920990d78044db53c1b012f5bf9ab5,das/das<det><def><nt><sg><nom>,4,4,1,1


In [16]:
print(len(df))

12854145


In [17]:
# HYPOTHESIS 1
# Instead of the sparse indicator variables used here, it may be better to decompose lexeme tags
# into denser and more generic features of tag components (e.g., part of speech, tense, gender, case),
# and also use corpus frequency, word length, etc.
lexeme_filepath = os.path.normpath(
    os.path.join(current_dir, '../data/resources', 'lexeme_reference.csv')
)
# The reference file is ';'-separated and has no header row, so column names are supplied here.
lexeme_reference = pd.read_csv(
    lexeme_filepath,
    sep=';',
    header=None,
    names=["tag", "type", "description"],
    on_bad_lines='warn',
)



In [18]:
# Count the distinct tags and descriptions available for each tag type,
# and save the table as an HTML file next to the notebook.
lexemes_grouped = lexeme_reference.groupby(by='type')
lexemes_grouped_df = lexemes_grouped.nunique()
lexemes_grouped_df.to_html(buf='lexemes_grouped.html')


In [19]:
lexemes_grouped_df

Unnamed: 0_level_0,tag,description
type,Unnamed: 1_level_1,Unnamed: 2_level_1
POS,22,22
adjective,18,18
animacy,3,3
case,2,2
def,2,2
gender,5,5
number,4,4
other,14,14
person,3,3
propernoun,2,2


In [20]:
# Keep only the reference rows whose type is one of the six feature types.
relevant_type_mask = lexeme_reference['type'].isin(['gender', 'POS', 'def', 'tense', 'person', 'number'])
lexeme_reference_rel = lexeme_reference[relevant_type_mask]

In [21]:
lexeme_reference_rel

lexeme_reference_rel.to_html('lexeme_reference.html')

In [22]:
# Keep only the word-level columns and reduce them to one row per distinct
# (lexeme_id, lexeme_string, learning_language) combination.
lexeme_columns = ['lexeme_id', 'lexeme_string', 'learning_language']
df_lexeme = df[lexeme_columns].drop_duplicates()

In [23]:
df_lexeme.head()

Unnamed: 0,lexeme_id,lexeme_string,learning_language
0,76390c1350a8dac31186187e2fe1e178,lernt/lernen<vblex><pri><p3><sg>,de
1,7dfd7086f3671685e2cf1c1da72796d7,die/die<det><def><f><sg><nom>,de
2,35a54c25a2cda8127343f6a82e6f6b7d,mann/mann<n><m><sg><nom>,de
3,0cf63ffe3dda158bc3dbd55682b355ae,frau/frau<n><f><sg><nom>,de
4,84920990d78044db53c1b012f5bf9ab5,das/das<det><def><nt><sg><nom>,de


In [24]:
print(len(df_lexeme))

19279


In [25]:
def prepare_tags_reference(df):
    """Build the tag -> type lookup from the lexeme reference table.

    Rows whose ``type`` contains any of ``adjective``, ``animacy``, ``other``,
    ``propernoun`` or ``case`` (regular-expression match) are left out.

    Parameters
    ----------
    df : pandas.DataFrame
        Reference table; its ``tag`` and ``type`` columns are read.

    Returns
    -------
    tuple
        ``(tags_dict, types)``: ``tags_dict`` maps every remaining tag to its
        type, ``types`` is the set of distinct types in that mapping.
    """
    matches_excluded = df["type"].str.contains("adjective|animacy|other|propernoun|case")
    # Keep a row only when the match result is exactly False.
    kept_rows = df[matches_excluded.eq(False)]
    tags_dict = dict(zip(kept_rows['tag'], kept_rows['type']))
    types = set(tags_dict.values())
    return tags_dict, types

In [26]:
tags_dict, types = prepare_tags_reference(lexeme_reference)

In [27]:
types

{'POS', 'def', 'gender', 'number', 'person', 'tense'}

In [28]:
df_lexeme.head()

Unnamed: 0,lexeme_id,lexeme_string,learning_language
0,76390c1350a8dac31186187e2fe1e178,lernt/lernen<vblex><pri><p3><sg>,de
1,7dfd7086f3671685e2cf1c1da72796d7,die/die<det><def><f><sg><nom>,de
2,35a54c25a2cda8127343f6a82e6f6b7d,mann/mann<n><m><sg><nom>,de
3,0cf63ffe3dda158bc3dbd55682b355ae,frau/frau<n><f><sg><nom>,de
4,84920990d78044db53c1b012f5bf9ab5,das/das<det><def><nt><sg><nom>,de


In [19]:
# Add one placeholder column (filled with None) per tag type.
# `types` is a set, so its iteration order decides the order of the new columns.
for lexeme_type in types:
    df_lexeme[lexeme_type] = None

In [20]:
df_lexeme.head()

Unnamed: 0,lexeme_id,lexeme_string,learning_language,gender,number,def,tense,person,POS
0,76390c1350a8dac31186187e2fe1e178,lernt/lernen<vblex><pri><p3><sg>,de,,,,,,
1,7dfd7086f3671685e2cf1c1da72796d7,die/die<det><def><f><sg><nom>,de,,,,,,
2,35a54c25a2cda8127343f6a82e6f6b7d,mann/mann<n><m><sg><nom>,de,,,,,,
3,0cf63ffe3dda158bc3dbd55682b355ae,frau/frau<n><f><sg><nom>,de,,,,,,
4,84920990d78044db53c1b012f5bf9ab5,das/das<det><def><nt><sg><nom>,de,,,,,,


In [21]:
def extract_from_lexemestring(lexeme_string):
    """Return the texts found between ``<`` and ``>`` in ``lexeme_string``, in order of appearance."""
    return [match.group(1) for match in re.finditer(r'<(.*?)>', lexeme_string)]

df_lexeme['tags'] = df_lexeme['lexeme_string'].apply(extract_from_lexemestring)

In [22]:
df_lexeme.head()

Unnamed: 0,lexeme_id,lexeme_string,learning_language,gender,number,def,tense,person,POS,tags
0,76390c1350a8dac31186187e2fe1e178,lernt/lernen<vblex><pri><p3><sg>,de,,,,,,,"[vblex, pri, p3, sg]"
1,7dfd7086f3671685e2cf1c1da72796d7,die/die<det><def><f><sg><nom>,de,,,,,,,"[det, def, f, sg, nom]"
2,35a54c25a2cda8127343f6a82e6f6b7d,mann/mann<n><m><sg><nom>,de,,,,,,,"[n, m, sg, nom]"
3,0cf63ffe3dda158bc3dbd55682b355ae,frau/frau<n><f><sg><nom>,de,,,,,,,"[n, f, sg, nom]"
4,84920990d78044db53c1b012f5bf9ab5,das/das<det><def><nt><sg><nom>,de,,,,,,,"[det, def, nt, sg, nom]"


In [23]:
# Some lexeme strings start with "<*sf>" instead of a word; this tag has no entry in lexeme_reference.
# For those rows the word is later restored from the part after the "/", and "sf" is dropped.
# NOTE(review): "<*sf>" looks like a wildcard placeholder (compare "<*numb>" in the same rows) - TODO confirm.
# regex=False matches the literal text "<*sf>". Read as a regular expression, "<*" means
# "zero or more '<'", so the pattern would match every string that merely contains "sf>".
df_lexeme[df_lexeme['lexeme_string'].str.contains("<*sf>", regex=False)].head()

Unnamed: 0,lexeme_id,lexeme_string,learning_language,gender,number,def,tense,person,POS,tags
72,6e820e73c9936c1c90e20a174f3dd1ad,<*sf>/traje<n><m><*numb>,pt,,,,,,,"[*sf, n, m, *numb]"
73,38ec3b04540c300424b556dee6e5dbe5,<*sf>/chapéu<n><m><*numb>,pt,,,,,,,"[*sf, n, m, *numb]"
74,6e39fa977508d7c2e5990cfddc80b2f7,<*sf>/fantasia<n><f><*numb>,pt,,,,,,,"[*sf, n, f, *numb]"
75,b70b44f07b89cc7c5ac2626029a8952a,<*sf>/sapato<n><m><*numb>,pt,,,,,,,"[*sf, n, m, *numb]"
76,5377c84560aaf45988067be11302d1d8,<*sf>/agasalho<n><m><*numb>,pt,,,,,,,"[*sf, n, m, *numb]"


In [24]:
# Some words look like duplicates later, because tags that are missing from lexeme_reference are lost when the tags are split.
# For example, neither "acc" nor "nom" is in lexeme_reference.
# As a result, both words below keep only the tags "n", "f", "sg" and get flagged as duplicates.
# They will be deleted.
df_lexeme[df_lexeme['lexeme_string'].str.contains("freundin")]

Unnamed: 0,lexeme_id,lexeme_string,learning_language,gender,number,def,tense,person,POS,tags
145,26a4633147e001fe303af2f475221771,freundin/freundin<n><f><sg><nom>,de,,,,,,,"[n, f, sg, nom]"
150,9e1f56b08922d1d9f7ab663b58d88367,freundin/freundin<n><f><sg><acc>,de,,,,,,,"[n, f, sg, acc]"


In [35]:
df_lexeme

Unnamed: 0,lexeme_id,lexeme_string,learning_language,gender,number,def,tense,person,POS,tags
0,76390c1350a8dac31186187e2fe1e178,lernt/lernen<vblex><pri><p3><sg>,de,,,,,,,"[vblex, pri, p3, sg]"
1,7dfd7086f3671685e2cf1c1da72796d7,die/die<det><def><f><sg><nom>,de,,,,,,,"[det, def, f, sg, nom]"
2,35a54c25a2cda8127343f6a82e6f6b7d,mann/mann<n><m><sg><nom>,de,,,,,,,"[n, m, sg, nom]"
3,0cf63ffe3dda158bc3dbd55682b355ae,frau/frau<n><f><sg><nom>,de,,,,,,,"[n, f, sg, nom]"
4,84920990d78044db53c1b012f5bf9ab5,das/das<det><def><nt><sg><nom>,de,,,,,,,"[det, def, nt, sg, nom]"
...,...,...,...,...,...,...,...,...,...,...
12820232,aa25ea9dc156919749f444521720432c,conférence/conférence<n><f><sg>,fr,,,,,,,"[n, f, sg]"
12820235,3db97fbbdd0d4e08b8248b6219a86e4b,liens/lien<n><m><pl>,fr,,,,,,,"[n, m, pl]"
12832064,8b53845a67433bd62179e23bcb88b5c3,foi/ir<vblex><ifi><p3><sg>,pt,,,,,,,"[vblex, ifi, p3, sg]"
12839884,7f5f1bf5fd85866ff55673f72cb7d084,<*sf>/heißen<vblex><pri><*pers><*numb>,de,,,,,,,"[*sf, vblex, pri, *pers, *numb]"


In [25]:
def assign_tags(tags, tag_to_type=None):
    """Pick one tag per feature type from a lexeme's tag list.

    Parameters
    ----------
    tags : iterable of str
        Tags extracted from a lexeme string, in their original order.
    tag_to_type : dict, optional
        Mapping tag -> type. When omitted, the notebook-level ``tags_dict``
        is used, so existing ``assign_tags(tags)`` calls behave as before.

    Returns
    -------
    pandas.Series
        Six values in the fixed order gender, POS, def, tense, person, number.
        A type with no matching tag is NaN. When several tags share a type,
        the first one wins.
    """
    if tag_to_type is None:
        tag_to_type = tags_dict
    values = {'gender': np.nan, 'POS': np.nan, 'def': np.nan, 'tense': np.nan, 'person': np.nan, 'number': np.nan}
    for tag in tags:
        col = tag_to_type.get(tag)
        # Ignore unknown tags and tags whose type is not one of the six features
        # (previously such a type raised KeyError). Only fill a slot that is still empty.
        if col in values and pd.isna(values[col]):
            values[col] = tag
    feature_order = ('gender', 'POS', 'def', 'tense', 'person', 'number')
    return pd.Series([values[name] for name in feature_order])

# Work on a copy so df_lexeme itself is left as it was.
df_with_tags = df_lexeme.copy()
# The column list below is in the same order as the values returned by assign_tags
# (gender, POS, def, tense, person, number); keep the two in sync.
df_with_tags[['gender', 'POS', 'def', 'tense', 'person', 'number']] = df_with_tags['tags'].apply(assign_tags)

In [26]:
df_with_tags.head()

Unnamed: 0,lexeme_id,lexeme_string,learning_language,gender,number,def,tense,person,POS,tags
0,76390c1350a8dac31186187e2fe1e178,lernt/lernen<vblex><pri><p3><sg>,de,,sg,,pri,p3,vblex,"[vblex, pri, p3, sg]"
1,7dfd7086f3671685e2cf1c1da72796d7,die/die<det><def><f><sg><nom>,de,f,sg,def,,,det,"[det, def, f, sg, nom]"
2,35a54c25a2cda8127343f6a82e6f6b7d,mann/mann<n><m><sg><nom>,de,m,sg,,,,n,"[n, m, sg, nom]"
3,0cf63ffe3dda158bc3dbd55682b355ae,frau/frau<n><f><sg><nom>,de,f,sg,,,,n,"[n, f, sg, nom]"
4,84920990d78044db53c1b012f5bf9ab5,das/das<det><def><nt><sg><nom>,de,nt,sg,def,,,det,"[det, def, nt, sg, nom]"


In [27]:
# Flag rows whose lexeme string contains the literal text "<*sf>".
# regex=False is required: read as a regular expression, "<*" means "zero or more '<'",
# so the pattern would also flag any string that merely contains "sf>".
is_sf = df_with_tags['lexeme_string'].str.contains("<*sf>", regex=False)
# Take explicit copies so columns can be added to each part without writing into a selection of df_with_tags.
df_sf = df_with_tags[is_sf].copy()
df_without_sf = df_with_tags[~is_sf].copy()

In [28]:
# Regular rows: the word is the text before the first "/".
df_without_sf = df_without_sf.assign(
    word=df_without_sf['lexeme_string'].str.split("/").str[0]
)
# "<*sf>" rows: the word is the text between the "/" and the first "<" that follows it.
df_sf = df_sf.assign(
    word=df_sf['lexeme_string'].str.split("/").str[1].str.split("<").str[0]
)

In [29]:
df_with_tags_final = pd.concat([df_without_sf, df_sf])

In [30]:
df_with_tags_final.head()

Unnamed: 0,lexeme_id,lexeme_string,learning_language,gender,number,def,tense,person,POS,tags,word
0,76390c1350a8dac31186187e2fe1e178,lernt/lernen<vblex><pri><p3><sg>,de,,sg,,pri,p3,vblex,"[vblex, pri, p3, sg]",lernt
1,7dfd7086f3671685e2cf1c1da72796d7,die/die<det><def><f><sg><nom>,de,f,sg,def,,,det,"[det, def, f, sg, nom]",die
2,35a54c25a2cda8127343f6a82e6f6b7d,mann/mann<n><m><sg><nom>,de,m,sg,,,,n,"[n, m, sg, nom]",mann
3,0cf63ffe3dda158bc3dbd55682b355ae,frau/frau<n><f><sg><nom>,de,f,sg,,,,n,"[n, f, sg, nom]",frau
4,84920990d78044db53c1b012f5bf9ab5,das/das<det><def><nt><sg><nom>,de,nt,sg,def,,,det,"[det, def, nt, sg, nom]",das


In [31]:
# Number of characters in each word. The vectorised .str.len() yields NaN for a
# missing word, where the previous Python-level lambda raised an error.
df_with_tags_final['word_len'] = df_with_tags_final['word'].str.len()

In [32]:
# Build the lookup once; previously the tag column was converted to a list again for every tag of every row.
relevant_tags = set(lexeme_reference_rel['tag'])
# For each lexeme keep only the tags that appear in lexeme_reference_rel, preserving their order.
df_with_tags_final['tags_list'] = [
    [tag for tag in tags if tag in relevant_tags]
    for tags in df_with_tags_final['tags']
]
df_with_tags_final['tags_list'].value_counts()

tags_list
[n, f, sg]                1609
[n, m, sg]                1520
[vblex, inf]              1359
[n, f]                    1215
[n, m]                    1147
                          ... 
[det, ind, pl]               1
[n, n, m, pl]                1
[vaux, pp]                   1
[vbhaver, fti, p1, pl]       1
[vbmod, imp]                 1
Name: count, Length: 404, dtype: int64

In [33]:
# Show the rows that remain after dropping exact duplicates, compared on the string form of every
# column (presumably because the list-valued columns cannot be compared directly - TODO confirm).
# The result is only displayed: it is not assigned, so df_with_tags_final itself is unchanged.
df_with_tags_final.loc[df_with_tags_final.astype(str).drop_duplicates().index]

Unnamed: 0,lexeme_id,lexeme_string,learning_language,gender,number,def,tense,person,POS,tags,word,word_len,tags_list
0,76390c1350a8dac31186187e2fe1e178,lernt/lernen<vblex><pri><p3><sg>,de,,sg,,pri,p3,vblex,"[vblex, pri, p3, sg]",lernt,5,"[vblex, pri, p3, sg]"
1,7dfd7086f3671685e2cf1c1da72796d7,die/die<det><def><f><sg><nom>,de,f,sg,def,,,det,"[det, def, f, sg, nom]",die,3,"[det, def, f, sg]"
2,35a54c25a2cda8127343f6a82e6f6b7d,mann/mann<n><m><sg><nom>,de,m,sg,,,,n,"[n, m, sg, nom]",mann,4,"[n, m, sg]"
3,0cf63ffe3dda158bc3dbd55682b355ae,frau/frau<n><f><sg><nom>,de,f,sg,,,,n,"[n, f, sg, nom]",frau,4,"[n, f, sg]"
4,84920990d78044db53c1b012f5bf9ab5,das/das<det><def><nt><sg><nom>,de,nt,sg,def,,,det,"[det, def, nt, sg, nom]",das,3,"[det, def, nt, sg]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12579429,289ec4ae458f4741f9401eb234dd29a3,<*sf>/acontecer<vblex><prs><*pers><*numb>,pt,,,,prs,,vblex,"[*sf, vblex, prs, *pers, *numb]",acontecer,9,"[vblex, prs]"
12623869,6633292d640dda65f2160fcf2bbb0e2d,<*sf>/pegar<vblex><prs><*pers><*numb>,pt,,,,prs,,vblex,"[*sf, vblex, prs, *pers, *numb]",pegar,5,"[vblex, prs]"
12636265,832551ca2353cc290409e0a61a1e8f5a,<*sf>/gara<n><f><*numb>,it,f,,,,,n,"[*sf, n, f, *numb]",gara,4,"[n, f]"
12742729,551a67d04615085738d178e5a8432dc3,<*sf>/feld<n><nt><*numb><*case>,de,nt,,,,,n,"[*sf, n, nt, *numb, *case]",feld,4,"[n, nt]"


In [117]:
# df_with_tags_final.drop(columns=['tags'], inplace=True)

In [118]:
# Add SUBTLEX
folderpath = os.path.normpath(os.path.join(current_dir, '../data/resources/SUBTLEX'))
def prepare_subtlex(folderpath):
    """Load every ``.txt`` frequency list in ``folderpath`` into one DataFrame.

    Each file is read as space-separated text with two columns, ``word`` and
    ``SUBTLEX``; lines that cannot be parsed are skipped. The language code is
    the second-to-last ``_``-separated part of the file name without extension.
    NOTE(review): this assumes names shaped like ``<prefix>_<lang>_<suffix>.txt``;
    a name with no underscore raises IndexError - confirm against the files.

    Parameters
    ----------
    folderpath : str
        Directory containing the frequency lists.

    Returns
    -------
    pandas.DataFrame
        Columns ``word``, ``SUBTLEX`` and ``learning_language``, files
        concatenated in sorted file-name order under a fresh index.

    Raises
    ------
    ValueError
        If the directory contains no ``.txt`` file.
    """
    frames = []
    # os.listdir returns entries in arbitrary order; sort so the row order is reproducible.
    for filename in sorted(os.listdir(folderpath)):
        if not filename.endswith(".txt"):
            continue
        language = os.path.splitext(filename)[0].split('_')[-2]
        path = os.path.join(folderpath, filename)
        # NOTE(review): read_csv's default NA parsing turns words such as "null" or "nan"
        # into missing values - consider keep_default_na=False for the word column.
        frame = pd.read_csv(path, on_bad_lines='skip', sep=' ', names=['word', 'SUBTLEX'])
        frame["learning_language"] = language
        frames.append(frame)
    if not frames:
        # pd.concat would also raise ValueError here, but without naming the folder.
        raise ValueError(f"No SUBTLEX .txt files found in {folderpath!r}")
    return pd.concat(frames, ignore_index=True)


df_subtlex = prepare_subtlex(folderpath)

In [119]:
# Left join on (word, learning_language): every lexeme row is kept, and SUBTLEX is
# missing (NaN) where the frequency lists have no entry for that word in that language.
word_complexity_df = df_with_tags_final.merge(df_subtlex, on = ['word', 'learning_language'], how='left')

In [120]:
# Keep only the lexemes for which a SUBTLEX value was found.
word_complexity_df_final = word_complexity_df.dropna(subset=['SUBTLEX'])


In [121]:
# The raw lexeme string and the language code are not exported as features.
word_complexity_df_final = word_complexity_df_final.drop(columns=['lexeme_string', 'learning_language'])

In [122]:
word_complexity_df_final

Unnamed: 0,lexeme_id,gender,def,tense,POS,person,number,word,word_len,tags_list,SUBTLEX
0,76390c1350a8dac31186187e2fe1e178,,,pri,vblex,p3,sg,lernt,5,"[vblex, pri, p3, sg]",3391.0
1,7dfd7086f3671685e2cf1c1da72796d7,f,def,,det,,sg,die,3,"[det, def, f, sg]",2484854.0
2,35a54c25a2cda8127343f6a82e6f6b7d,m,,,n,,sg,mann,4,"[n, m, sg]",222707.0
3,0cf63ffe3dda158bc3dbd55682b355ae,f,,,n,,sg,frau,4,"[n, f, sg]",143725.0
4,84920990d78044db53c1b012f5bf9ab5,nt,def,,det,,sg,das,3,"[det, def, nt, sg]",3122198.0
...,...,...,...,...,...,...,...,...,...,...,...
19274,289ec4ae458f4741f9401eb234dd29a3,,,prs,vblex,,,acontecer,9,"[vblex, prs]",66924.0
19275,6633292d640dda65f2160fcf2bbb0e2d,,,prs,vblex,,,pegar,5,"[vblex, prs]",12910.0
19276,832551ca2353cc290409e0a61a1e8f5a,f,,,n,,,gara,4,"[n, f]",10084.0
19277,551a67d04615085738d178e5a8432dc3,nt,,,n,,,feld,4,"[n, nt]",3661.0


In [123]:
# Output directory for the feature file. Create it when missing, so the export
# does not fail if the folder has not been set up yet.
filepath = os.path.normpath(os.path.join(current_dir, '../data/features/'))
os.makedirs(filepath, exist_ok=True)
# Tab-separated export with a header row and without the DataFrame index.
word_complexity_df_final.to_csv(os.path.join(filepath, 'word_complexity_features.csv'), sep='\t', index=False, header=True)

In [None]:
# Number of distinct lexemes per learning language (ES, DE and FR have the most words in the dataset).
# Group by the frame's own column instead of aligning against the full trace table `df` by index.
df_lexeme.groupby('learning_language').size()