In [1]:
import os 
import pandas as pd
import numpy as np

In [2]:
# Resolve the processed dataset relative to the notebook's working directory.
current_dir = os.getcwd()

filename = 'df_processed.csv'
filepath = os.path.normpath(os.path.join(current_dir, '../data/processed/', filename))

# Read the CSV in fixed-size chunks so each piece is cleaned before the
# full frame is assembled.
chunk_size = 10000
chunks = []

for chunk in pd.read_csv(filepath, chunksize=chunk_size):
    # Clean each chunk without mutating it in place: drop exact duplicate
    # rows, then rows that contain any missing value.
    chunks.append(chunk.drop_duplicates().dropna())

# Per-chunk de-duplication only catches repeats that land inside the same
# chunk, so de-duplicate once more on the combined frame to remove repeats
# that span chunk boundaries, then restore a contiguous 0..n-1 index.
df_users = (
    pd.concat(chunks, ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [3]:
# Hypothesis 2: user embeddings.
# Report how many distinct users appear in the cleaned data.
print('number of unique users', df_users.loc[:, 'user_id'].nunique())

number of unique users 115215


In [4]:
# Preview the first five rows of the cleaned frame.
df_users.iloc[:5]

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,history_seen,history_correct,session_seen,session_correct,h_recall,lang_combination
0,1.0,1362076081,27649635,u:FO,de,en,76390c1350a8dac31186187e2fe1e178,6,4,2,2,0.666667,en-de
1,0.5,1362076081,27649635,u:FO,de,en,7dfd7086f3671685e2cf1c1da72796d7,4,4,2,1,1.0,en-de
2,1.0,1362076081,27649635,u:FO,de,en,35a54c25a2cda8127343f6a82e6f6b7d,5,4,1,1,0.8,en-de
3,0.5,1362076081,27649635,u:FO,de,en,0cf63ffe3dda158bc3dbd55682b355ae,6,5,2,1,0.833333,en-de
4,1.0,1362076081,27649635,u:FO,de,en,84920990d78044db53c1b012f5bf9ab5,4,4,1,1,1.0,en-de


In [5]:
# Per-user averages of the raw practice statistics (exploratory view only;
# the result is displayed, not stored).
df_users.groupby('user_id')[['delta', 'p_recall', 'history_seen', 'history_correct']].mean()

Unnamed: 0_level_0,delta,p_recall,history_seen,history_correct
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
u:--N,1.510561e+07,0.809896,5.625000,5.093750
u:--U,5.608727e+06,0.928571,2.857143,2.571429
u:-3I,2.834205e+05,0.900000,8.428571,7.428571
u:-3n,1.126940e+07,0.829973,21.854839,19.225806
u:-4V,3.142199e+06,0.874419,17.879070,16.665116
...,...,...,...,...
u:zz3,1.446055e+06,0.947145,47.049724,43.784530
u:zz8,5.981280e+06,0.871111,11.358333,10.258333
u:zzI,3.835878e+06,1.000000,8.000000,6.857143
u:zzU,7.480000e+02,0.860833,4.100000,3.000000


In [9]:
# Mean of `delta` within each (user, language pair) group, broadcast back
# onto every row of that group. The notebook treats this as the average
# interval between seeing an item.
df_users['avg_delta'] = (
    df_users
    .groupby(['user_id', 'lang_combination'])['delta']
    .transform('mean')
)

In [10]:
# Standard deviation of `delta` within each (user, language pair) group,
# broadcast back onto every row of that group (how irregular the practice is).
# The sample standard deviation is undefined (NaN) for a group with a single
# row; such a group shows no observed spread, so store 0.0 instead of letting
# NaN leak into the exported features.
df_users['std_delta'] = (
    df_users
    .groupby(['user_id', 'lang_combination'])['delta']
    .transform('std')
    .fillna(0.0)
)

In [11]:
# Mean of `h_recall` within each (user, language pair) group, broadcast back
# onto every row of that group.
df_users['avg_h_recall'] = (
    df_users
    .groupby(['user_id', 'lang_combination'])['h_recall']
    .transform('mean')
)

In [12]:
# Check that the three per-group feature columns were added.
df_users.head(n=5)

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,history_seen,history_correct,session_seen,session_correct,h_recall,lang_combination,avg_delta,std_delta,avg_h_recall
0,1.0,1362076081,27649635,u:FO,de,en,76390c1350a8dac31186187e2fe1e178,6,4,2,2,0.666667,en-de,6245869.0,11750430.0,0.884367
1,0.5,1362076081,27649635,u:FO,de,en,7dfd7086f3671685e2cf1c1da72796d7,4,4,2,1,1.0,en-de,6245869.0,11750430.0,0.884367
2,1.0,1362076081,27649635,u:FO,de,en,35a54c25a2cda8127343f6a82e6f6b7d,5,4,1,1,0.8,en-de,6245869.0,11750430.0,0.884367
3,0.5,1362076081,27649635,u:FO,de,en,0cf63ffe3dda158bc3dbd55682b355ae,6,5,2,1,0.833333,en-de,6245869.0,11750430.0,0.884367
4,1.0,1362076081,27649635,u:FO,de,en,84920990d78044db53c1b012f5bf9ab5,4,4,1,1,1.0,en-de,6245869.0,11750430.0,0.884367


In [13]:
# Keep only the group key (user, language pair) and the per-group features.
# Selecting the columns to keep, instead of dropping the others in place,
# gives the same frame but lets this cell be re-run without a KeyError.
# The explicit copy avoids chained-assignment warnings if the frame is
# modified afterwards.
df_users = df_users.loc[
    :, ['user_id', 'lang_combination', 'avg_delta', 'std_delta', 'avg_h_recall']
].copy()

In [14]:
# Keep the first occurrence of each distinct row and discard the repeats.
df_users = df_users.loc[~df_users.duplicated(keep='first')]

In [15]:
# Display the de-duplicated feature table (pandas truncates long frames).
df_users

Unnamed: 0,user_id,lang_combination,avg_delta,std_delta,avg_h_recall
0,u:FO,en-de,6.245869e+06,1.175043e+07,0.884367
7,u:dDwF,en-es,3.221094e+06,3.380166e+06,0.959344
64,u:g3WM,en-pt,4.993274e+05,6.501751e+05,0.875062
87,u:dwbJ,pt-en,1.419411e+06,2.088632e+06,0.920798
94,u:fxGh,en-es,1.132953e+06,1.391952e+06,0.916034
...,...,...,...,...,...
12725389,u:imXT,en-fr,1.035321e+06,0.000000e+00,0.916667
12725451,u:figN,en-es,1.665690e+06,9.697722e+05,0.915454
12725478,u:f_W4,pt-en,3.527883e+06,1.551145e+06,0.876156
12725513,u:i8m1,es-en,3.488540e+05,0.000000e+00,0.879592


In [17]:
# A user can study several languages and behave differently in each one;
# this user has one feature row per language pair.
df_users.loc[df_users['user_id'].eq('u:eUF1')]

Unnamed: 0,user_id,lang_combination,avg_delta,std_delta,avg_h_recall
2478,u:eUF1,en-es,1749756.0,2052413.0,0.931856
6084,u:eUF1,en-fr,1114582.0,1836707.0,0.912726


In [18]:
# Directory that receives the engineered per-user features.
filepath = os.path.normpath(os.path.join(current_dir, '../data/features/'))
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(filepath, exist_ok=True)
# NOTE(review): 'behaviur' looks like a typo of 'behaviour'; the file name is
# left unchanged in case other code reads this exact path.
df_users.to_csv(os.path.join(filepath, 'users_behaviur.csv'), sep=',', index=False, header=True)