In [2]:
import os 
import pandas as pd
import numpy as np

In [50]:
# Load the processed table from ../data/processed/ (relative to the current
# working directory) in fixed-size chunks, cleaning each chunk as it is read.
current_dir = os.getcwd()

filename = 'df_processed.csv'
filepath = os.path.normpath(os.path.join(current_dir, '../data/processed/', filename))

chunk_size = 10000

# Drop incomplete and duplicated rows chunk by chunk so they never accumulate
# in memory before the final concatenation.
chunks = [
    chunk.dropna().drop_duplicates()
    for chunk in pd.read_csv(filepath, chunksize=chunk_size)
]

# Per-chunk de-duplication only sees rows inside the same chunk, so identical
# rows that land in different chunks would survive; de-duplicate once more on
# the concatenated frame and renumber the index 0..n-1.
df_users = pd.concat(chunks, ignore_index=True).drop_duplicates(ignore_index=True)

In [51]:
# Hypothesis 2: user embeddings.
# Report how many distinct user_id values the cleaned table contains.
n_unique_users = df_users['user_id'].nunique()
print('number of unique users', n_unique_users)

number of unique users 112571


In [52]:
# Preview the first five rows of the cleaned table.
df_users.head(n=5)

Unnamed: 0,p_recall,timestamp,delta,user_id,learning_language,ui_language,lexeme_id,lexeme_string,history_seen,history_correct,session_seen,session_correct
0,1.0,1362082032,444407,u:dDwF,es,en,73eecb492ca758ddab5371cf7b5cca32,bajo/bajo<pr>,3,3,1,1
1,1.0,1362082044,5963,u:FO,de,en,76390c1350a8dac31186187e2fe1e178,lernt/lernen<vblex><pri><p3><sg>,8,6,6,6
2,0.75,1362082044,5963,u:FO,de,en,7dfd7086f3671685e2cf1c1da72796d7,die/die<det><def><f><sg><nom>,6,5,4,3
3,0.888889,1362082044,5963,u:FO,de,en,35a54c25a2cda8127343f6a82e6f6b7d,mann/mann<n><m><sg><nom>,6,5,9,8
4,0.8,1362082044,5963,u:FO,de,en,0cf63ffe3dda158bc3dbd55682b355ae,frau/frau<n><f><sg><nom>,8,6,5,4


In [23]:
# Per-user mean of delta, p_recall, history_seen and history_correct
# (one output row per user_id).
df_users.groupby('user_id')[['delta', 'p_recall', 'history_seen', 'history_correct']].mean()

Unnamed: 0_level_0,delta,p_recall,history_seen,history_correct
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
u:--N,6.269484e+06,0.800000,2.200000,2.200000
u:--U,5.608727e+06,0.928571,2.857143,2.571429
u:-3I,2.834205e+05,0.900000,8.428571,7.428571
u:-3n,5.499959e+05,0.884615,44.384615,37.692308
u:-4V,1.964277e+06,0.883598,19.502646,18.174603
...,...,...,...,...
u:zz3,1.446055e+06,0.947145,47.049724,43.784530
u:zz8,5.397128e+05,0.872254,14.184971,12.734104
u:zzI,4.532418e+05,1.000000,8.333333,7.000000
u:zzU,7.480000e+02,0.860833,4.100000,3.000000


In [53]:
# Feature engineering
# Interaction feature: "<ui_language>-<learning_language>" for every row,
# e.g. ui 'en' + learning 'es' -> 'en-es'.
df_users['lang_combination'] = (
    df_users['ui_language']
    .add('-')
    .add(df_users['learning_language'])
)

In [57]:
# Mean p_recall within each (user_id, lang_combination) group, broadcast back
# onto every row of that group via transform.
df_users['avg_user_p_recall'] = (
    df_users
    .groupby(by=['user_id', 'lang_combination'])['p_recall']
    .transform('mean')
)

In [54]:
# Mean delta within each (user_id, lang_combination) group, repeated on every
# row of the group.
df_users['avg_delta'] = (
    df_users
    .groupby(by=['user_id', 'lang_combination'])['delta']
    .transform('mean')
)

In [58]:
# Standard deviation of delta within each (user_id, lang_combination) group,
# repeated on every row of the group. pandas uses the sample standard
# deviation (ddof=1) here, so a group with a single row yields NaN.
df_users['std_delta'] = (
    df_users
    .groupby(by=['user_id', 'lang_combination'])['delta']
    .transform('std')
)

In [61]:
# Mean historical recall within each (user_id, lang_combination) group.
# No 'h_recall' column is created anywhere in the visible cells, and it is not
# among the columns shown by df_users.head(), so referencing it fails on a
# fresh kernel. Derive it here as a temporary Series instead of a column, which
# also means nothing extra has to be dropped before de-duplicating.
# NOTE(review): assumes h_recall = history_correct / history_seen - confirm
# against the original (missing) definition.
h_recall = df_users['history_correct'] / df_users['history_seen']
df_users['avg_h_recall'] = h_recall.groupby(
    [df_users['user_id'], df_users['lang_combination']]
).transform('mean')

In [49]:
from datetime import datetime

# Hour of day (0-23) of every row, read from the Unix-epoch-seconds
# 'timestamp' column; pandas interprets epoch values as UTC, so this is the
# UTC hour, not the user's local hour.
# The original referenced an undefined frame `df`; the timestamps live in
# df_users. The hours are kept as a standalone Series rather than a df_users
# column: a per-row 'hour' column is not removed by the later column drop and
# would stop drop_duplicates from collapsing to one row per user/language pair.
hour_of_day = pd.to_datetime(df_users['timestamp'], unit='s').dt.hour.rename('hour')

# Most frequent hour per user; mode() returns its values sorted, so ties
# resolve to the earliest hour.
most_active_time = hour_of_day.groupby(df_users['user_id']).agg(lambda x: x.mode()[0])

In [62]:
# Remove the per-row source columns now that their per-group aggregates have
# been added; only the user/language identifiers and aggregates remain.
df_users.drop(
    labels=[
        'p_recall',
        'timestamp',
        'delta',
        'lexeme_id',
        'lexeme_string',
        'history_seen',
        'history_correct',
        'session_seen',
        'session_correct',
    ],
    axis='columns',
    inplace=True,
)



In [63]:
# Disabled: would drop a per-row 'h_recall' column from df_users, if one exists at this point.
# df_users.drop(columns=['h_recall'], inplace=True)

In [64]:
# Collapse identical rows in place, keeping the first occurrence of each
# (the original row labels of the kept rows are preserved).
df_users.drop_duplicates(keep='first', inplace=True)

In [65]:
# Display the de-duplicated feature table (pandas truncates the rendering of large frames).
df_users

Unnamed: 0,user_id,learning_language,ui_language,lang_combination,avg_delta,avg_user_p_recall,std_delta,avg_h_recall
0,u:dDwF,es,en,en-es,2.475405e+06,0.885194,2.879771e+06,0.954897
1,u:FO,de,en,en-de,3.104417e+03,0.614120,2.977079e+03,0.890225
49,u:g3WM,pt,en,en-pt,4.993274e+05,0.873737,6.501751e+05,0.875062
72,u:dwbJ,en,pt,pt-en,1.298170e+06,0.917643,1.902581e+06,0.921389
79,u:fxGh,es,en,en-es,1.132953e+06,0.919753,1.391952e+06,0.916034
...,...,...,...,...,...,...,...,...
12527264,u:figN,es,en,en-es,1.665690e+06,0.857143,9.697722e+05,0.915454
12527285,u:btDs,es,en,en-es,1.271060e+05,0.916667,0.000000e+00,0.805556
12527291,u:f_W4,en,pt,pt-en,3.527883e+06,0.885714,1.551145e+06,0.876156
12527326,u:i8m1,en,es,es-en,3.488540e+05,0.910714,0.000000e+00,0.879592


In [66]:
# Rows whose user_id already appeared in an earlier row (the first occurrence
# of each user_id is not flagged).
df_users.loc[df_users['user_id'].duplicated(keep='first')]

Unnamed: 0,user_id,learning_language,ui_language,lang_combination,avg_delta,avg_user_p_recall,std_delta,avg_h_recall
5997,u:eUF1,fr,en,en-fr,8.914560e+05,0.948933,1.400867e+06,0.910827
27152,u:ggYG,es,en,en-es,4.011405e+05,0.883947,3.405392e+05,0.877957
39441,u:cWTo,es,en,en-es,1.615074e+06,0.765957,1.888191e+06,0.882460
51744,u:iKle,fr,en,en-fr,3.190000e+02,0.793939,0.000000e+00,0.789394
53096,u:eShq,de,en,en-de,2.617210e+06,0.916667,1.785884e+06,1.000000
...,...,...,...,...,...,...,...,...
12517273,u:i_Zn,es,en,en-es,2.124725e+05,1.000000,5.513937e+04,0.958611
12518019,u:g7yB,pt,en,en-pt,8.847857e+02,0.892857,3.863793e+02,0.950397
12519140,u:i16S,es,en,en-es,1.090000e+02,0.952381,0.000000e+00,0.857143
12522927,u:huBL,es,en,en-es,2.841417e+06,0.876667,3.718064e+01,0.721190


In [67]:
# Some users learn multiple languages and behave differently in each one,
# so a single user_id can have one row per language combination.
df_users.loc[df_users['user_id'].eq('u:eUF1')]

Unnamed: 0,user_id,learning_language,ui_language,lang_combination,avg_delta,avg_user_p_recall,std_delta,avg_h_recall
2442,u:eUF1,es,en,en-es,1749756.0,0.92329,2052413.0,0.931856
5997,u:eUF1,fr,en,en-fr,891456.0,0.948933,1400867.0,0.910827


In [68]:
# Write the feature table to ../data/features/ (relative to the working
# directory captured in current_dir), without the index column.
filepath = os.path.normpath(os.path.join(current_dir, '../data/features/'))
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs(filepath, exist_ok=True)
df_users.to_csv(os.path.join(filepath, 'users_behaviur.csv'), sep=',', index=False, header=True)