In [131]:
import pandas as pd
import numpy as np
import os
import re

In [132]:
current_dir = os.getcwd()

# Original Dataset
filename = '13 million Duolingo student learning traces.csv'
filepath = os.path.normpath(os.path.join(current_dir, '../data/raw/', filename))

# Stream the CSV in fixed-size pieces and clean each piece as it arrives:
# exact duplicate rows and rows with any missing value are removed.
# NOTE(review): duplicates are only detected within a single chunk, so identical
# rows that fall into different chunks survive — confirm whether that is intended.
chunk_size = 10000
chunks = [
    chunk.drop_duplicates().dropna()
    for chunk in pd.read_csv(filepath, chunksize=chunk_size)
]

# Stitch the cleaned pieces together under a fresh 0..n-1 index.
df = pd.concat(chunks, ignore_index=True)

In [146]:
# Turn off pandas' chained-assignment check (the pandas default is 'warn').
pd.set_option('mode.chained_assignment', None)

In [133]:
# Show the cleaned dataset (the rendered repr only lists the first and last rows).
df

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,lexeme_string,history_seen,history_correct,session_seen,session_correct
0,1.000000,1362076081,27649635,u:FO,de,en,76390c1350a8dac31186187e2fe1e178,lernt/lernen<vblex><pri><p3><sg>,6,4,2,2
1,0.500000,1362076081,27649635,u:FO,de,en,7dfd7086f3671685e2cf1c1da72796d7,die/die<det><def><f><sg><nom>,4,4,2,1
2,1.000000,1362076081,27649635,u:FO,de,en,35a54c25a2cda8127343f6a82e6f6b7d,mann/mann<n><m><sg><nom>,5,4,1,1
3,0.500000,1362076081,27649635,u:FO,de,en,0cf63ffe3dda158bc3dbd55682b355ae,frau/frau<n><f><sg><nom>,6,5,2,1
4,1.000000,1362076081,27649635,u:FO,de,en,84920990d78044db53c1b012f5bf9ab5,das/das<det><def><nt><sg><nom>,4,4,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
12854140,0.800000,1363104897,368,u:i5D8,en,it,d5efc552aaea3109eb5388aa1ec8673d,the/the<det><def><sp>,6,4,5,4
12854141,0.800000,1363104897,368,u:i5D8,en,it,a826c47947d68549fa81e19cafa57ba0,eat/eat<vblex><pres>,4,4,5,4
12854142,1.000000,1363104897,368,u:i5D8,en,it,5e29d77697d23070a1fb92eb6c90e9b6,bread/bread<n><sg>,4,4,4,4
12854143,0.600000,1363104897,368,u:i5D8,en,it,cdfecc9247566d40bb964a218c54c783,drink/drink<vblex><pres>,3,2,5,3


In [134]:
# HYPOTHESIS 1 
""" Instead of the sparse indicator variables used here, it may be better to decompose lexeme tags 
into denser and more generic features of tag components (e.g., part of speech, tense, gender, case), 
and also use corpus frequency, word length, etc."""
lexeme_filepath = os.path.normpath(
    os.path.join(current_dir, '../data/resources', 'lexeme_reference.csv')
)
# The reference file is ';'-separated and has no header row, so column names are
# supplied here; malformed lines are reported as warnings instead of aborting.
lexeme_reference = pd.read_csv(
    lexeme_filepath,
    sep=';',
    header=None,
    names=["tag", "type", "description"],
    on_bad_lines='warn',
)



In [135]:
# Bucket the reference rows by tag category, then count the distinct
# tags and descriptions that each category contains.
lexemes_grouped = lexeme_reference.groupby(by='type')
lexemes_grouped.nunique()

Unnamed: 0_level_0,tag,description
type,Unnamed: 1_level_1,Unnamed: 2_level_1
POS,22,22
adjective,18,18
animacy,3,3
case,2,2
def,2,2
gender,5,5
number,4,4
other,14,14
person,3,3
propernoun,2,2


In [147]:
# Separate only the words to get new features.
# Take an explicit copy: df_lexeme is de-duplicated in place and gains new columns
# in the following cells, and doing that on a column selection of `df` is what
# triggers pandas' SettingWithCopyWarning. An independent copy avoids it.
df_lexeme = df[['lexeme_string', 'learning_language']].copy()

In [148]:
# Keep one row per distinct (lexeme_string, learning_language) pair; df_lexeme is modified in place.
df_lexeme.drop_duplicates(inplace=True)

In [149]:
# Preview the first rows of the de-duplicated lexeme table.
df_lexeme.head()

Unnamed: 0,lexeme_string,learning_language
0,lernt/lernen<vblex><pri><p3><sg>,de
1,die/die<det><def><f><sg><nom>,de
2,mann/mann<n><m><sg><nom>,de
3,frau/frau<n><f><sg><nom>,de
4,das/das<det><def><nt><sg><nom>,de


In [150]:
def prepare_tags_reference(df):
    """Build a tag -> type lookup from the lexeme reference table.

    Rows whose ``type`` contains any of the substrings ``adjective``,
    ``animacy``, ``other``, ``propernoun`` or ``case`` are discarded before
    the lookup is built.

    Parameters
    ----------
    df : pandas.DataFrame
        Reference table with at least the columns ``tag`` and ``type``.

    Returns
    -------
    tuple of (dict, set)
        The mapping from each remaining tag to its type, and the set of
        distinct types that occur as values of that mapping.
    """
    is_excluded = df["type"].str.contains("adjective|animacy|other|propernoun|case")
    kept_rows = df[is_excluded.eq(False)]
    tag_to_type = kept_rows.set_index('tag')['type'].to_dict()
    return tag_to_type, set(tag_to_type.values())

In [151]:
# Build the tag -> type lookup and the set of retained types from the reference table.
tags_dict, types = prepare_tags_reference(lexeme_reference)

In [152]:
# Pre-create one empty column per retained tag type.
# `types` is a set, and the iteration order of a set of strings can differ between
# kernel sessions; sorting makes the column order of df_lexeme (and of the CSV
# exported from it later) reproducible across runs.
for lexeme_type in sorted(types, key=str):
    df_lexeme[lexeme_type] = None

In [153]:
def extract_from_lexemestring(lexeme_string):
    """Return the tag names enclosed in ``<...>`` within a lexeme string.

    Parameters
    ----------
    lexeme_string : str
        Text such as ``lernt/lernen<vblex><pri><p3><sg>``.

    Returns
    -------
    list of str
        The text found between each ``<`` and the next ``>``, in order of
        appearance; an empty list when there is no such pair.
    """
    return [match.group(1) for match in re.finditer(r'<(.*?)>', lexeme_string)]

# Parse every lexeme string into its list of tag names.
df_lexeme['tags'] = df_lexeme['lexeme_string'].map(extract_from_lexemestring)

In [154]:
# Preview the lexeme table, which now carries the 'tags' list column.
df_lexeme.head()

Unnamed: 0,lexeme_string,learning_language,tags
0,lernt/lernen<vblex><pri><p3><sg>,de,"[vblex, pri, p3, sg]"
1,die/die<det><def><f><sg><nom>,de,"[det, def, f, sg, nom]"
2,mann/mann<n><m><sg><nom>,de,"[n, m, sg, nom]"
3,frau/frau<n><f><sg><nom>,de,"[n, f, sg, nom]"
4,das/das<det><def><nt><sg><nom>,de,"[det, def, nt, sg, nom]"


In [155]:
def assign_tags(tags, tag_to_type=None):
    """Sort a list of lexeme tags into the six tracked feature slots.

    Each tag is looked up in the tag -> type mapping. The first tag found for a
    given type fills that slot; later tags of the same type are ignored. Tags
    that are missing from the mapping, or whose type is not one of the tracked
    slots, are skipped (previously an untracked type raised ``KeyError``).

    Parameters
    ----------
    tags : iterable of str
        Tag names extracted from one lexeme string.
    tag_to_type : dict, optional
        Mapping from tag to type. When omitted, the module-level ``tags_dict``
        is used, which keeps existing one-argument calls working.

    Returns
    -------
    pandas.Series
        Six values in the order gender, POS, def, tense, person, number;
        slots with no matching tag hold NaN.
    """
    mapping = tags_dict if tag_to_type is None else tag_to_type
    columns = ('gender', 'POS', 'def', 'tense', 'person', 'number')
    values = dict.fromkeys(columns, np.nan)
    for tag in tags:
        col = mapping.get(tag)
        # Membership test covers both "tag unknown" (col is None) and
        # "type not tracked"; only fill a slot that is still empty.
        if col in values and pd.isna(values[col]):
            values[col] = tag
    return pd.Series([values[name] for name in columns])

# Fill the six tag-feature columns from each row's tag list.
tag_feature_columns = ['gender', 'POS', 'def', 'tense', 'person', 'number']
df_lexeme[tag_feature_columns] = df_lexeme['tags'].apply(assign_tags)

In [158]:
# Inspect the lexeme table after the tag-feature columns have been filled.
df_lexeme

Unnamed: 0,lexeme_string,learning_language,tags,gender,POS,def,tense,person,number
0,lernt/lernen<vblex><pri><p3><sg>,de,"[vblex, pri, p3, sg]",,vblex,,pri,p3,sg
1,die/die<det><def><f><sg><nom>,de,"[det, def, f, sg, nom]",f,det,def,,,sg
2,mann/mann<n><m><sg><nom>,de,"[n, m, sg, nom]",m,n,,,,sg
3,frau/frau<n><f><sg><nom>,de,"[n, f, sg, nom]",f,n,,,,sg
4,das/das<det><def><nt><sg><nom>,de,"[det, def, nt, sg, nom]",nt,det,def,,,sg
...,...,...,...,...,...,...,...,...,...
12820232,conférence/conférence<n><f><sg>,fr,"[n, f, sg]",f,n,,,,sg
12820235,liens/lien<n><m><pl>,fr,"[n, m, pl]",m,n,,,,pl
12832064,foi/ir<vblex><ifi><p3><sg>,pt,"[vblex, ifi, p3, sg]",,vblex,,ifi,p3,sg
12839884,<*sf>/heißen<vblex><pri><*pers><*numb>,de,"[*sf, vblex, pri, *pers, *numb]",,vblex,,pri,,


In [160]:
# The word is the text before the first "/" of the lexeme string;
# word_len is the number of characters in that text.
df_lexeme['word'] = df_lexeme['lexeme_string'].str.split("/", n=1).str.get(0)
df_lexeme['word_len'] = df_lexeme['word'].map(len)

# Drop columns
df_lexeme.drop(['tags', 'lexeme_string'], axis=1, inplace=True)

In [37]:
# Remove rows that are identical across all remaining columns; df_lexeme is modified in place.
df_lexeme.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_lexeme.drop_duplicates(inplace=True)


In [162]:
# Preview the lexeme feature table after the helper columns were dropped.
df_lexeme.head()

Unnamed: 0,learning_language,gender,POS,def,tense,person,number,word,word_len
0,de,,vblex,,pri,p3,sg,lernt,5
1,de,f,det,def,,,sg,die,3
2,de,m,n,,,,sg,mann,4
3,de,f,n,,,,sg,frau,4
4,de,nt,det,def,,,sg,das,3


In [163]:
# Add SUBTLEX
# Folder holding the SUBTLEX files, resolved relative to the current working directory.
subtlex_relative_dir = '../data/resources/SUBTLEX'
folderpath = os.path.normpath(os.path.join(current_dir, subtlex_relative_dir))
def prepare_subtlex(folderpath):
    """Load every ``.txt`` file in a folder into one word-frequency table.

    Each file is read as space-separated text with two columns, ``word`` and
    ``SUBTLEX``; lines that cannot be parsed are skipped. The language label is
    the second-to-last ``_``-separated token of the file name without its
    extension (e.g. ``de`` for ``a_de_50k.txt``).

    Parameters
    ----------
    folderpath : str
        Directory containing the ``.txt`` files.

    Returns
    -------
    pandas.DataFrame
        Columns ``word``, ``SUBTLEX`` and ``learning_language``, with the files
        concatenated in sorted file-name order under a fresh integer index.

    Raises
    ------
    ValueError
        If the folder contains no ``.txt`` file.
    """
    frames = []
    # os.listdir returns entries in arbitrary order; sort for a reproducible row order.
    for filename in sorted(os.listdir(folderpath)):
        if not filename.endswith(".txt"):
            continue
        language = os.path.splitext(filename)[0].split('_')[-2]
        filepath = os.path.join(folderpath, filename)
        # NOTE(review): with pandas' default NA handling, words such as "null" or "NA"
        # are read as missing values — confirm whether that matters for these lists.
        frame = pd.read_csv(filepath, on_bad_lines='skip', sep=' ', names=['word', 'SUBTLEX'])
        frame["learning_language"] = language
        frames.append(frame)
    if not frames:
        # Same exception type pd.concat would raise on an empty list, but with a useful message.
        raise ValueError(f"No SUBTLEX '.txt' files found in {folderpath!r}")
    return pd.concat(frames, ignore_index=True)


# Load the '.txt' files found in the SUBTLEX folder into a single frame.
df_subtlex = prepare_subtlex(folderpath)

770227
1202520
798017
834768
1157685
1656996


In [164]:
# Left join: every lexeme row is kept, and rows without a matching
# (word, learning_language) entry in df_subtlex get NaN in the SUBTLEX column.
word_complexity_df = pd.merge(
    df_lexeme,
    df_subtlex,
    how='left',
    on=['word', 'learning_language'],
)

In [128]:
# Export the merged word-complexity features as a tab-separated file with a header row.
filepath = os.path.normpath(os.path.join(current_dir, '../data/features/'))
# to_csv does not create missing folders, so make sure the output directory exists first.
os.makedirs(filepath, exist_ok=True)
word_complexity_df.to_csv(os.path.join(filepath, 'word_complexity_features.csv'), sep='\t', index=False, header=True)

In [129]:
# Count lexeme rows per learning language using df_lexeme's own column.
# Grouping by df['learning_language'] (a Series from the much larger raw frame) only
# produced these counts because pandas aligned it to df_lexeme on the shared index.
df_lexeme.groupby('learning_language').size() # ES, DE and FR have the most words in dataset

learning_language
de    2767
en    2024
es    2865
fr    2649
it    1102
pt    1083
dtype: int64