In [25]:
import pandas as pd
import numpy as np
import os
import re

In [26]:
# Resolve the dataset path relative to the directory the notebook was started from.
current_dir = os.getcwd()
filename = '13 million Duolingo student learning traces.csv'
filepath = os.path.normpath(os.path.join(current_dir, '../data/', filename))

# Read the CSV in fixed-size pieces and clean each piece before keeping it.
chunk_size = 10000
chunks = []

for chunk in pd.read_csv(filepath, chunksize=chunk_size):
    # Same cleanup order as before (duplicates first, then rows with missing values),
    # written without in-place mutation of the chunk.
    chunks.append(chunk.drop_duplicates().dropna())

# Per-chunk de-duplication cannot see identical rows that landed in different chunks,
# so de-duplicate once more on the combined frame and renumber the rows.
df = pd.concat(chunks, ignore_index=True).drop_duplicates(ignore_index=True)

In [27]:
# HYPOTHESIS 1
""" Instead of the sparse indicator variables used here, it may be better to decompose lexeme tags 
into denser and more generic features of tag components (e.g., part of speech, tense, gender, case), 
and also use corpus frequency, word length, etc."""
# The reference file is ';'-separated and has no header row, so column names are supplied here.
lexeme_filepath = os.path.normpath(
    os.path.join(current_dir, '../data/', 'lexeme_reference.csv')
)
lexeme_reference = pd.read_csv(
    lexeme_filepath,
    sep=';',
    header=None,
    names=["tag", "type", "description"],
    on_bad_lines='warn',  # malformed lines are reported rather than aborting the read
)



In [28]:
# Group the reference rows by their 'type' and count the distinct values
# of the remaining columns within each group.
lexemes_grouped = lexeme_reference.groupby(by='type')
lexemes_grouped.nunique()

Unnamed: 0_level_0,tag,description
type,Unnamed: 1_level_1,Unnamed: 2_level_1
POS,22,22
adjective,18,18
animacy,3,3
case,2,2
def,2,2
gender,5,5
number,4,4
other,14,14
person,3,3
propernoun,2,2


In [53]:
# Show the tag -> type lookup.
# NOTE(review): in the visible notebook, tags_dict is only bound further down
# (tags_dict, types = prepare_tags_reference(lexeme_reference)); on a fresh kernel this
# cell raises NameError unless that cell has been run first.
tags_dict

{'adj': 'POS',
 'adv': 'POS',
 'cni': 'tense',
 'cnjadv': 'POS',
 'cnjcoo': 'POS',
 'cnjsub': 'POS',
 'def': 'def',
 'det': 'POS',
 'f': 'gender',
 'fti': 'tense',
 'fts': 'tense',
 'GD': 'gender',
 'ger': 'tense',
 'ifi': 'tense',
 'ij': 'POS',
 'imp': 'tense',
 'ind': 'def',
 'inf': 'tense',
 'm': 'gender',
 'mf': 'gender',
 'n': 'POS',
 'ND': 'number',
 'np': 'POS',
 'nt': 'gender',
 'num': 'POS',
 'p1': 'person',
 'p2': 'person',
 'p3': 'person',
 'past': 'tense',
 'pii': 'tense',
 'pis': 'tense',
 'pl': 'number',
 'pp': 'tense',
 'pprs': 'tense',
 'pr': 'POS',
 'preadv': 'POS',
 'predet': 'POS',
 'pres': 'tense',
 'pri': 'tense',
 'prn': 'POS',
 'pron': 'tense',
 'prpers': 'POS',
 'prs': 'tense',
 'sent': 'POS',
 'sg': 'number',
 'sp': 'number',
 'subs': 'tense',
 'vaux': 'POS',
 'vbdo': 'POS',
 'vbhaver': 'POS',
 'vblex': 'POS',
 'vbmod': 'POS',
 'vbser': 'POS'}

In [29]:
def prepare_tags_reference(df, excluded_types=("adjective", "animacy", "other", "propernoun", "case")):
    """Build a tag -> type lookup from a lexeme reference table.

    Parameters
    ----------
    df : pandas.DataFrame
        Reference table with at least a 'tag' and a 'type' column. It is not modified.
    excluded_types : iterable of str, optional
        Type names whose rows are left out of the lookup. The default is the
        list that was previously hard-coded in the filter.

    Returns
    -------
    tuple (dict, set)
        ``tags_dict`` mapping each remaining tag to its type, and ``types``,
        the set of distinct type names present in that mapping.
    """
    type_column = df["type"]
    # Exact-match exclusion: a regex substring test would also discard any type that
    # merely contains one of these words (e.g. a type named "lowercase" matches "case").
    # Rows with a missing type are still dropped, as before.
    keep_rows = type_column.notna() & ~type_column.isin(list(excluded_types))
    kept = df.loc[keep_rows]
    tags_dict = kept.set_index('tag')['type'].to_dict()
    types = set(tags_dict.values())
    return tags_dict, types

In [30]:
# Derive the tag -> type lookup and the set of type names from the lexeme reference table.
tags_dict, types = prepare_tags_reference(df=lexeme_reference)

In [31]:
# Add one placeholder column (filled with None) per lexeme type.
# `types` is a set of strings, whose iteration order can differ between kernel sessions
# because string hashing is randomised per process; sorting keeps the column order stable.
for lexeme_type in sorted(types):
    df[lexeme_type] = None

In [33]:
# Small positional sample (first 100 rows) for quick experiments.
df_test = df.iloc[:100]

In [57]:
# Display the sample frame.
# NOTE(review): the saved output includes a 'tags' column and populated tag columns,
# which are only produced by cells further down — the output appears to have been
# captured after out-of-order execution.
df_test

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,lexeme_string,history_seen,history_correct,session_seen,session_correct,gender,POS,def,tense,person,number,tags
0,1.000000,1362076081,27649635,u:FO,de,en,76390c1350a8dac31186187e2fe1e178,lernt/lernen<vblex><pri><p3><sg>,6,4,2,2,,vblex,,pri,p3,sg,"[vblex, pri, p3, sg]"
1,0.500000,1362076081,27649635,u:FO,de,en,7dfd7086f3671685e2cf1c1da72796d7,die/die<det><def><f><sg><nom>,4,4,2,1,f,det,def,,,sg,"[det, def, f, sg, nom]"
2,1.000000,1362076081,27649635,u:FO,de,en,35a54c25a2cda8127343f6a82e6f6b7d,mann/mann<n><m><sg><nom>,5,4,1,1,m,n,,,,sg,"[n, m, sg, nom]"
3,0.500000,1362076081,27649635,u:FO,de,en,0cf63ffe3dda158bc3dbd55682b355ae,frau/frau<n><f><sg><nom>,6,5,2,1,f,n,,,,sg,"[n, f, sg, nom]"
4,1.000000,1362076081,27649635,u:FO,de,en,84920990d78044db53c1b012f5bf9ab5,das/das<det><def><nt><sg><nom>,4,4,1,1,nt,det,def,,,sg,"[det, def, nt, sg, nom]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.666667,1362082511,228208,u:fxGh,es,en,7e416735d7d43c3c0fc30d10801a352d,si/si<cnjadv>,3,3,3,2,,cnjadv,,,,,[cnjadv]
96,1.000000,1362082511,64651,u:fxGh,es,en,69faeef930a44421ec22be4b06474a06,aunque/aunque<cnjadv>,4,4,1,1,,cnjadv,,,,,[cnjadv]
97,1.000000,1362082511,64651,u:fxGh,es,en,8a100871aa249eeef95c20f864b281a1,mientras/mientras<cnjadv>,4,4,1,1,,cnjadv,,,,,[cnjadv]
98,1.000000,1362082511,442104,u:fxGh,es,en,a764900ace90aa45b2466e3cb031072e,caminas/caminar<vblex><pri><p2><sg>,6,5,1,1,,vblex,,pri,p2,sg,"[vblex, pri, p2, sg]"


In [58]:
def extract_from_lexemestring(lexeme_string):
    """Return the text inside every ``<...>`` group of ``lexeme_string``.

    The groups are returned as a list of strings in order of appearance;
    the list is empty when the string contains no ``<...>`` group.
    """
    return [match.group(1) for match in re.finditer(r'<(.*?)>', lexeme_string)]

# Parse the tag list out of every lexeme string. This must be active: the tag-assignment
# cell below reads df['tags'], and no other line in the notebook creates that column.
df['tags'] = df['lexeme_string'].apply(extract_from_lexemestring)

In [42]:
def assign_tags(tags, tag_types=None):
    """Spread a list of lexeme tags over the six tag-type slots.

    Parameters
    ----------
    tags : iterable of str
        Tags of one lexeme, in the order they appear.
    tag_types : dict, optional
        Mapping from tag to type name. Defaults to the notebook-level
        ``tags_dict`` when omitted.

    Returns
    -------
    pandas.Series
        Six values in the fixed order gender, POS, def, tense, person, number
        (default integer index). A slot holds the first tag of that type, or
        NaN when no tag of that type was present.
    """
    if tag_types is None:
        tag_types = tags_dict
    columns = ('gender', 'POS', 'def', 'tense', 'person', 'number')
    values = dict.fromkeys(columns, np.nan)
    for tag in tags:
        col = tag_types.get(tag)
        # Skip tags that are unknown or whose type has no output slot (previously a
        # KeyError), and keep only the first tag seen for each slot.
        if col in values and pd.isna(values[col]):
            values[col] = tag
    return pd.Series([values[name] for name in columns])

# assign_tags returns its values in exactly this order, so the target columns are
# listed to match it.
tag_columns = ['gender', 'POS', 'def', 'tense', 'person', 'number']
df[tag_columns] = df['tags'].apply(assign_tags)

In [47]:
# New features
# 'word' is the part of lexeme_string before the first "/".
df['word'] = df['lexeme_string'].str.split("/").str[0]
# Use the string accessor rather than a per-row Python lambda to get the length.
df['word_len'] = df['word'].str.len()

# Interaction feature: "<ui_language>-<learning_language>"
df['lang_combination'] = df['ui_language'] + '-' + df['learning_language']

# Drop the raw columns that the derived features replace.
# Note: this makes the cell non-rerunnable without reloading df, because
# 'lexeme_string' is read at the top of the cell.
df.drop(columns=['tags', 'lexeme_string', 'lexeme_id'], inplace=True)

In [69]:
# HYPOTHESIS 2
""" User embeddings""" 
# Count the distinct user ids before building per-user features.
unique_user_count = df['user_id'].nunique()
print('number of unique users', unique_user_count)

number of unique users 115222


In [75]:
# Per-user mean of each practice metric (one row per user_id).
df_users = (
    df.groupby('user_id')[['delta', 'p_recall', 'history_seen', 'history_correct']]
    .mean()
)

In [76]:
# Summary statistics (count, mean, std, min, quartiles, max) of the per-user means.
df_users.describe()

Unnamed: 0,delta,p_recall,history_seen,history_correct
count,115222.0,115222.0,115222.0,115222.0
mean,1381635.0,0.893688,8.667307,7.730473
std,2982971.0,0.092706,16.454787,15.125486
min,1.0,0.0,1.0,1.0
25%,94766.6,0.861111,3.8,3.30303
50%,384396.4,0.909091,5.604651,4.944444
75%,1217518.0,0.95,9.36778,8.345135
max,39758320.0,1.0,1756.652794,1683.241974
