In [25]:
import pandas as pd
import numpy as np
import os
import re

In [26]:
# Resolve the raw traces CSV relative to the notebook's working directory.
current_dir = os.getcwd()
filename = '13 million Duolingo student learning traces.csv'
filepath = os.path.normpath(os.path.join(current_dir, '../data/', filename))


# Read the file in fixed-size chunks and clean each chunk before keeping it,
# so incomplete rows and within-chunk duplicates are discarded early.
chunk_size = 10000
chunks = []

for chunk in pd.read_csv(filepath, chunksize=chunk_size):
    chunks.append(chunk.drop_duplicates().dropna())

# Per-chunk de-duplication cannot see duplicates that land in different
# chunks, so de-duplicate once more over the assembled frame.
df = pd.concat(chunks, ignore_index=True).drop_duplicates(ignore_index=True)

In [27]:
# HYPOTHESIS 1 
""" Instead of the sparse indicator variables used here, it may be better to decompose lexeme tags 
into denser and more generic features of tag components (e.g., part of speech, tense, gender, case), 
and also use corpus frequency, word length, etc."""
# The reference file is ';'-separated and has no header row, so the three
# column names are supplied here; malformed lines are reported as warnings.
lexeme_filepath = os.path.normpath(os.path.join(current_dir, '../data/', 'lexeme_reference.csv'))
lexeme_reference = pd.read_csv(
    lexeme_filepath,
    sep=';',
    header=None,
    names=["tag", "type", "description"],
    on_bad_lines='warn',
)



In [28]:
# Bucket the reference rows by their `type` value, then count the distinct
# tags and descriptions inside each bucket.
lexemes_grouped = lexeme_reference.groupby(by='type')
lexemes_grouped.nunique()

Unnamed: 0_level_0,tag,description
type,Unnamed: 1_level_1,Unnamed: 2_level_1
POS,22,22
adjective,18,18
animacy,3,3
case,2,2
def,2,2
gender,5,5
number,4,4
other,14,14
person,3,3
propernoun,2,2


In [29]:
def prepare_tags_reference(df, excluded_types=("adjective", "animacy", "other", "propernoun", "case")):
    """Build the tag -> type lookup from the lexeme reference table.

    Args:
        df: Reference frame with at least a ``tag`` and a ``type`` column.
        excluded_types: Type names whose tags are left out of the lookup.
            Names are compared exactly, so a type that merely contains one
            of these words as a substring is kept.

    Returns:
        A ``(tags_dict, types)`` tuple: ``tags_dict`` maps each remaining tag
        to its type, and ``types`` is the set of types that remain.
    """
    type_col = df["type"]
    # Rows with a missing type carry no usable category, so drop them
    # explicitly rather than relying on a NaN comparison.
    keep = type_col.notna() & ~type_col.isin(list(excluded_types))
    tags_dict = df[keep].set_index('tag')['type'].to_dict()
    types = set(tags_dict.values())
    return tags_dict, types

In [30]:
# Build the tag -> type lookup and the set of retained types from the lexeme reference.
tags_dict, types = prepare_tags_reference(lexeme_reference)

In [31]:
# Add one empty placeholder column per retained tag type. `types` is a set,
# whose iteration order for strings can differ between interpreter runs, so
# iterate in sorted order to keep the column order reproducible.
for lexeme_type in sorted(types):
    df[lexeme_type] = None

In [33]:
# Keep the first 100 rows as a small sample for quick experiments.
df_test = df.iloc[:100]

In [34]:
# Display the sample; the per-type tag columns added above are still empty here.
df_test

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,lexeme_string,history_seen,history_correct,session_seen,session_correct,gender,POS,def,tense,person,number
0,1.000000,1362076081,27649635,u:FO,de,en,76390c1350a8dac31186187e2fe1e178,lernt/lernen<vblex><pri><p3><sg>,6,4,2,2,,,,,,
1,0.500000,1362076081,27649635,u:FO,de,en,7dfd7086f3671685e2cf1c1da72796d7,die/die<det><def><f><sg><nom>,4,4,2,1,,,,,,
2,1.000000,1362076081,27649635,u:FO,de,en,35a54c25a2cda8127343f6a82e6f6b7d,mann/mann<n><m><sg><nom>,5,4,1,1,,,,,,
3,0.500000,1362076081,27649635,u:FO,de,en,0cf63ffe3dda158bc3dbd55682b355ae,frau/frau<n><f><sg><nom>,6,5,2,1,,,,,,
4,1.000000,1362076081,27649635,u:FO,de,en,84920990d78044db53c1b012f5bf9ab5,das/das<det><def><nt><sg><nom>,4,4,1,1,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.666667,1362082511,228208,u:fxGh,es,en,7e416735d7d43c3c0fc30d10801a352d,si/si<cnjadv>,3,3,3,2,,,,,,
96,1.000000,1362082511,64651,u:fxGh,es,en,69faeef930a44421ec22be4b06474a06,aunque/aunque<cnjadv>,4,4,1,1,,,,,,
97,1.000000,1362082511,64651,u:fxGh,es,en,8a100871aa249eeef95c20f864b281a1,mientras/mientras<cnjadv>,4,4,1,1,,,,,,
98,1.000000,1362082511,442104,u:fxGh,es,en,a764900ace90aa45b2466e3cb031072e,caminas/caminar<vblex><pri><p2><sg>,6,5,1,1,,,,,,


In [41]:
def extract_from_lexemestring(lexeme_string):
    """Return the text inside every ``<...>`` group of ``lexeme_string``, in order of appearance."""
    return [hit.group(1) for hit in re.finditer(r'<(.*?)>', lexeme_string)]

# Store each row's list of angle-bracket tags in a new `tags` column.
df['tags'] = df['lexeme_string'].map(extract_from_lexemestring)

In [42]:
def assign_tags(tags, tag_to_type=None):
    """Spread a list of lexeme tags over the fixed tag-feature columns.

    Args:
        tags: Iterable of tag strings for one lexeme.
        tag_to_type: Optional mapping from tag to column name. When omitted,
            the notebook-level ``tags_dict`` is used.

    Returns:
        A ``pd.Series`` with a default integer index holding, in order, the
        values for gender, POS, def, tense, person and number. A column for
        which no tag was found is NaN.
    """
    columns = ('gender', 'POS', 'def', 'tense', 'person', 'number')
    lookup = tags_dict if tag_to_type is None else tag_to_type
    values = dict.fromkeys(columns, np.nan)
    for tag in tags:
        col = lookup.get(tag)
        # Skip unknown tags and tags whose type has no output column; the
        # first tag seen for a column wins, later ones are ignored.
        if col in values and pd.isna(values[col]):
            values[col] = tag
    return pd.Series([values[name] for name in columns])

# Fill the six tag-feature columns from each row's tag list.
tag_feature_columns = ['gender', 'POS', 'def', 'tense', 'person', 'number']
df[tag_feature_columns] = df['tags'].apply(assign_tags)

In [43]:
# Inspect the frame after the tag columns have been filled in.
df

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,lexeme_string,history_seen,history_correct,session_seen,session_correct,gender,POS,def,tense,person,number,tags
0,1.000000,1362076081,27649635,u:FO,de,en,76390c1350a8dac31186187e2fe1e178,lernt/lernen<vblex><pri><p3><sg>,6,4,2,2,,vblex,,pri,p3,sg,"[vblex, pri, p3, sg]"
1,0.500000,1362076081,27649635,u:FO,de,en,7dfd7086f3671685e2cf1c1da72796d7,die/die<det><def><f><sg><nom>,4,4,2,1,f,det,def,,,sg,"[det, def, f, sg, nom]"
2,1.000000,1362076081,27649635,u:FO,de,en,35a54c25a2cda8127343f6a82e6f6b7d,mann/mann<n><m><sg><nom>,5,4,1,1,m,n,,,,sg,"[n, m, sg, nom]"
3,0.500000,1362076081,27649635,u:FO,de,en,0cf63ffe3dda158bc3dbd55682b355ae,frau/frau<n><f><sg><nom>,6,5,2,1,f,n,,,,sg,"[n, f, sg, nom]"
4,1.000000,1362076081,27649635,u:FO,de,en,84920990d78044db53c1b012f5bf9ab5,das/das<det><def><nt><sg><nom>,4,4,1,1,nt,det,def,,,sg,"[det, def, nt, sg, nom]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12854140,0.800000,1363104897,368,u:i5D8,en,it,d5efc552aaea3109eb5388aa1ec8673d,the/the<det><def><sp>,6,4,5,4,,det,def,,,sp,"[det, def, sp]"
12854141,0.800000,1363104897,368,u:i5D8,en,it,a826c47947d68549fa81e19cafa57ba0,eat/eat<vblex><pres>,4,4,5,4,,vblex,,pres,,,"[vblex, pres]"
12854142,1.000000,1363104897,368,u:i5D8,en,it,5e29d77697d23070a1fb92eb6c90e9b6,bread/bread<n><sg>,4,4,4,4,,n,,,,sg,"[n, sg]"
12854143,0.600000,1363104897,368,u:i5D8,en,it,cdfecc9247566d40bb964a218c54c783,drink/drink<vblex><pres>,3,2,5,3,,vblex,,pres,,,"[vblex, pres]"


In [47]:
# New features
# `word` is the text before the first "/" of lexeme_string.
df['word'] = df['lexeme_string'].str.split("/").str[0]
# Vectorised string length instead of a per-row Python lambda.
df['word_len'] = df['word'].str.len()

# Interaction feature: UI language and learning language joined as "ui-learning".
df['lang_combination'] = df['ui_language'] + '-' + df['learning_language']

# Drop columns
# NOTE: this mutates df in place, so the cell cannot be re-run without
# rebuilding df first (lexeme_string is gone after the first run).
df.drop(columns=['tags', 'lexeme_string', 'lexeme_id'], inplace=True)

In [50]:
pip install sklearn.preprocessing

Collecting sklearn.preprocessing
  Downloading sklearn_preprocessing-0.1.0-py3-none-any.whl.metadata (70 bytes)
Downloading sklearn_preprocessing-0.1.0-py3-none-any.whl (10 kB)
Installing collected packages: sklearn.preprocessing
Successfully installed sklearn.preprocessing-0.1.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [52]:
from sklearn.preprocessing import OneHotEncoder

# Dense output so the encoded matrix can be wrapped in a DataFrame directly.
# NOTE(review): a dense matrix over the full ~12.8M-row frame may be large;
# consider passing a compact dtype (e.g. dtype=np.uint8) if memory is tight.
encoder = OneHotEncoder(sparse_output=False)

categorical_columns = ['gender', 'POS', 'def', 'tense', 'person', 'number', 'lang_combination', 'learning_language', 'ui_language']
# Fit the encoder on the categorical columns and encode them in one step;
# the DataFrame below is built from this matrix.
one_hot_encoded = encoder.fit_transform(df[categorical_columns])
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=df.index,  # keep rows aligned with df for the column-wise concat
)

# Append the indicator columns, then remove the original categorical columns.
df_encoded = pd.concat([df, one_hot_df], axis=1)
df_encoded = df_encoded.drop(categorical_columns, axis=1)

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# HYPOTHESIS 2 
""" User embeddings""" 
# Count the distinct learners to gauge the size of a per-user embedding table.
n_unique_users = df['user_id'].nunique()
print('number of unique users', n_unique_users)