In [2]:
import os 
import pandas as pd
import numpy as np

In [26]:
current_dir = os.getcwd()

filename = '13 million Duolingo student learning traces.csv'
filepath = os.path.normpath(os.path.join(current_dir, '../data/raw/', filename))

chunk_size = 10000
chunks = []

for chunk in pd.read_csv(filepath, chunksize=chunk_size):
    chunk.drop_duplicates(inplace=True)
    chunk.dropna(inplace=True)
    chunks.append(chunk)

df = pd.concat(chunks, ignore_index=True)

In [27]:
# HYPOTHESIS 2 
""" User embeddings""" 
print('number of unique users', df['user_id'].nunique())

number of unique users 115222


In [28]:
df.head()

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,lexeme_string,history_seen,history_correct,session_seen,session_correct
0,1.0,1362076081,27649635,u:FO,de,en,76390c1350a8dac31186187e2fe1e178,lernt/lernen<vblex><pri><p3><sg>,6,4,2,2
1,0.5,1362076081,27649635,u:FO,de,en,7dfd7086f3671685e2cf1c1da72796d7,die/die<det><def><f><sg><nom>,4,4,2,1
2,1.0,1362076081,27649635,u:FO,de,en,35a54c25a2cda8127343f6a82e6f6b7d,mann/mann<n><m><sg><nom>,5,4,1,1
3,0.5,1362076081,27649635,u:FO,de,en,0cf63ffe3dda158bc3dbd55682b355ae,frau/frau<n><f><sg><nom>,6,5,2,1
4,1.0,1362076081,27649635,u:FO,de,en,84920990d78044db53c1b012f5bf9ab5,das/das<det><def><nt><sg><nom>,4,4,1,1


In [29]:
df_users = df.drop(columns=['lexeme_id', 'lexeme_string', 'learning_language', 'ui_language'])

In [30]:
df_users.head()

Unnamed: 0,p_recall,timestamp,delta,user_id,history_seen,history_correct,session_seen,session_correct
0,1.0,1362076081,27649635,u:FO,6,4,2,2
1,0.5,1362076081,27649635,u:FO,4,4,2,1
2,1.0,1362076081,27649635,u:FO,5,4,1,1
3,0.5,1362076081,27649635,u:FO,6,5,2,1
4,1.0,1362076081,27649635,u:FO,4,4,1,1


In [31]:
df_users.describe()

Unnamed: 0,p_recall,timestamp,delta,history_seen,history_correct,session_seen,session_correct
count,12854140.0,12854140.0,12854140.0,12854140.0,12854140.0,12854140.0,12854140.0
mean,0.8961069,1362589000.0,729581.1,21.98107,19.35027,1.817689,1.644139
std,0.2714029,293208.8,2246504.0,129.5512,111.9684,1.360182,1.318795
min,0.0,1362076000.0,1.0,1.0,1.0,1.0,0.0
25%,1.0,1362343000.0,532.0,3.0,3.0,1.0,1.0
50%,1.0,1362591000.0,77134.0,6.0,6.0,1.0,1.0
75%,1.0,1362846000.0,442503.0,15.0,13.0,2.0,2.0
max,1.0,1363105000.0,40328360.0,13518.0,12888.0,20.0,20.0


In [32]:
df.groupby('user_id').agg({'delta':'mean', 'p_recall':'mean', 'history_seen':'mean', 'history_correct':'mean'})

Unnamed: 0_level_0,delta,p_recall,history_seen,history_correct
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
u:--N,1.510561e+07,0.809896,5.625000,5.093750
u:--U,5.608727e+06,0.928571,2.857143,2.571429
u:-3I,2.834205e+05,0.900000,8.428571,7.428571
u:-3n,1.126940e+07,0.829973,21.854839,19.225806
u:-4V,3.142199e+06,0.874419,17.879070,16.665116
...,...,...,...,...
u:zz3,1.457092e+06,0.947436,57.373626,52.752747
u:zz8,5.981280e+06,0.871111,11.358333,10.258333
u:zzI,3.835878e+06,1.000000,8.000000,6.857143
u:zzU,7.480000e+02,0.860833,4.100000,3.000000


In [40]:
# Feature engineering 
df_users['avg_user_p_recall'] = df_users.groupby('user_id')['p_recall'].transform('mean') # The average p_recall specific for each user

In [38]:
df_users['h_recall'] = df_users['history_correct']/df_users['history_seen']

In [44]:
df_users['avg_delta'] = df_users.groupby('user_id')['delta'].transform('mean') # Average interval between seeing 

Unnamed: 0,p_recall,timestamp,delta,user_id,history_seen,history_correct,session_seen,session_correct,h_recall,avg_user_p_recall
0,1.000000,1362076081,27649635,u:FO,6,4,2,2,0.666667,0.668996
1,0.500000,1362076081,27649635,u:FO,4,4,2,1,1.000000,0.668996
2,1.000000,1362076081,27649635,u:FO,5,4,1,1,0.800000,0.668996
3,0.500000,1362076081,27649635,u:FO,6,5,2,1,0.833333,0.668996
4,1.000000,1362076081,27649635,u:FO,4,4,1,1,1.000000,0.668996
...,...,...,...,...,...,...,...,...,...,...
12854140,0.800000,1363104897,368,u:i5D8,6,4,5,4,0.666667,0.948715
12854141,0.800000,1363104897,368,u:i5D8,4,4,5,4,1.000000,0.948715
12854142,1.000000,1363104897,368,u:i5D8,4,4,4,4,1.000000,0.948715
12854143,0.600000,1363104897,368,u:i5D8,3,2,5,3,0.666667,0.948715


In [46]:
df_users['std_delta'] = df_users.groupby('user_id')['delta'].transform('std') # Standard deviation from the practice

In [47]:
df

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,lexeme_string,history_seen,history_correct,session_seen,session_correct,std_delta
0,1.000000,1362076081,27649635,u:FO,de,en,76390c1350a8dac31186187e2fe1e178,lernt/lernen<vblex><pri><p3><sg>,6,4,2,2,1.175043e+07
1,0.500000,1362076081,27649635,u:FO,de,en,7dfd7086f3671685e2cf1c1da72796d7,die/die<det><def><f><sg><nom>,4,4,2,1,1.175043e+07
2,1.000000,1362076081,27649635,u:FO,de,en,35a54c25a2cda8127343f6a82e6f6b7d,mann/mann<n><m><sg><nom>,5,4,1,1,1.175043e+07
3,0.500000,1362076081,27649635,u:FO,de,en,0cf63ffe3dda158bc3dbd55682b355ae,frau/frau<n><f><sg><nom>,6,5,2,1,1.175043e+07
4,1.000000,1362076081,27649635,u:FO,de,en,84920990d78044db53c1b012f5bf9ab5,das/das<det><def><nt><sg><nom>,4,4,1,1,1.175043e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12854140,0.800000,1363104897,368,u:i5D8,en,it,d5efc552aaea3109eb5388aa1ec8673d,the/the<det><def><sp>,6,4,5,4,6.550006e+04
12854141,0.800000,1363104897,368,u:i5D8,en,it,a826c47947d68549fa81e19cafa57ba0,eat/eat<vblex><pres>,4,4,5,4,6.550006e+04
12854142,1.000000,1363104897,368,u:i5D8,en,it,5e29d77697d23070a1fb92eb6c90e9b6,bread/bread<n><sg>,4,4,4,4,6.550006e+04
12854143,0.600000,1363104897,368,u:i5D8,en,it,cdfecc9247566d40bb964a218c54c783,drink/drink<vblex><pres>,3,2,5,3,6.550006e+04


In [49]:
from datetime import datetime
df_users['hour'] = pd.to_datetime(df['timestamp'], unit='s').dt.hour
most_active_time = df_users.groupby('user_id')['hour'].agg(lambda x: x.mode()[0])

In [50]:
most_active_time

user_id
u:--N     6
u:--U    14
u:-3I    13
u:-3n    10
u:-4V    15
         ..
u:zz3    20
u:zz8     4
u:zzI    20
u:zzU    16
u:zzv    22
Name: hour, Length: 115222, dtype: int32