In [120]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input, Embedding, Flatten, Concatenate
from tensorflow.keras.layers import LSTM
from keras import regularizers
from keras import losses
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split as sklearn_train_test_split
from keras.optimizers import Adam
import os

#keras.layers.Flatten, Input

In [319]:
cols_to_drop = ['lexeme_id' ,'gender', 'def', 'tense', 'POS', 'person', 'number', 'word', 'session_seen', 'session_correct', 'avg_user_p_recall', 'timestamp', 'user_id', 'learning_language', 'ui_language']

current_dir = os.getcwd()
filename = 'df_processed.csv'
filepath = os.path.normpath(os.path.join(current_dir, '../data/processed/', filename))

chunk_size = 10000
chunks = []

for chunk in pd.read_csv(filepath, chunksize=chunk_size):
    chunk.drop_duplicates(inplace=True)
    chunk.dropna(inplace=True)
    chunks.append(chunk)

df = pd.concat(chunks, ignore_index=True)
df_users = pd.read_csv(os.path.normpath(os.path.join(current_dir, '../data/features/', 'users_behaviur.csv')))
df_words = pd.read_csv(os.path.normpath(os.path.join(current_dir, '../data/features/', 'word_complexity_features.csv')), sep='\t')
dff = pd.merge(pd.merge(df_words, df, on = 'lexeme_id', how='inner'), df_users, on = ['user_id', 'lang_combination'], how='inner')
dff.drop(columns=cols_to_drop, inplace=True)
dff.dropna(inplace=True)

In [320]:
dff.head()

Unnamed: 0,word_len,tags_list,SUBTLEX,p_recall,delta,history_seen,history_correct,h_recall,lang_combination,avg_delta,std_delta,avg_h_recall
0,5,"['vblex', 'pri', 'p3', 'sg']",3391.0,1.0,5963,8,6,0.75,en-de,3104.416667,2977.078695,0.890225
1,5,"['vblex', 'pri', 'p3', 'sg']",3391.0,0.0,253,14,12,0.857143,en-de,3104.416667,2977.078695,0.890225
2,5,"['vblex', 'pri', 'p3', 'sg']",3391.0,1.0,65,15,12,0.8,en-de,3104.416667,2977.078695,0.890225
3,5,"['vblex', 'pri', 'p3', 'sg']",3391.0,0.5,27,16,13,0.8125,en-de,3104.416667,2977.078695,0.890225
4,5,"['vblex', 'pri', 'p3', 'sg']",3391.0,1.0,179,15,15,1.0,en-de,87253.516209,141166.52974,0.91407


In [321]:
def prepare_dataset(df):
    categorical_features = df.select_dtypes(include='O').columns
    numeric_features = df.select_dtypes(exclude=['O']).columns.drop(['p_recall'])

    # normalize the dataset
    scaler = MinMaxScaler()
    dff[numeric_features] = scaler.fit_transform(dff[numeric_features])
    print('ok')

    for col in categorical_features:
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
        print(col, 'ok')
        
    return df, categorical_features, numeric_features 

In [322]:
dff_1 = dff.copy()
dff_1, categorical_features, numeric_features = prepare_dataset(dff_1)

ok
tags_list ok
lang_combination ok


In [323]:
dff_1.head()

Unnamed: 0,word_len,tags_list,SUBTLEX,p_recall,delta,history_seen,history_correct,h_recall,lang_combination,avg_delta,std_delta,avg_h_recall
0,5,291,3391.0,1.0,5963,8,6,0.75,0,3104.416667,2977.078695,0.890225
1,5,291,3391.0,0.0,253,14,12,0.857143,0,3104.416667,2977.078695,0.890225
2,5,291,3391.0,1.0,65,15,12,0.8,0,3104.416667,2977.078695,0.890225
3,5,291,3391.0,0.5,27,16,13,0.8125,0,3104.416667,2977.078695,0.890225
4,5,291,3391.0,1.0,179,15,15,1.0,0,87253.516209,141166.52974,0.91407


In [324]:
# Train test split
def split(df, numeric_features): 
    X = df.drop(columns=['p_recall'])
    y = df['p_recall']
    X_train, X_test, y_train, y_test = sklearn_train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        random_state=42)
    X_train_tags = X_train['tags_list']
    X_train_langs = X_train['lang_combination']
    X_train_numerical = X_train[numeric_features] 
    X_test_tags = X_test['tags_list']
    X_test_langs = X_test['lang_combination']
    X_test_numerical = X_test[numeric_features] 
    return df, X_train_tags, X_train_langs, X_train_numerical, X_test_tags,X_test_langs, X_test_numerical, X_test, y_train, y_test

dff, X_train_tags, X_train_langs, X_train_numerical, X_test_tags,X_test_langs, X_test_numerical, X_test, y_train, y_test = split(dff_1, numeric_features)

In [325]:
X_train

Unnamed: 0,word_len,tags_list,SUBTLEX,delta,history_seen,history_correct,h_recall,lang_combination,avg_delta,std_delta,avg_h_recall
4249897,0.047619,47,0.083481,0.065284,0.070905,0.081921,1.000000,1,0.087497,0.174216,0.922913
3099445,0.095238,80,0.004369,0.161580,0.004890,0.002825,0.650794,5,0.098613,0.109571,0.833445
1065245,0.190476,79,0.005238,0.000073,0.036675,0.039548,0.934524,7,0.099120,0.269973,0.887451
3785449,0.190476,122,0.014676,0.000019,0.014670,0.014124,0.850340,7,0.006947,0.025335,0.908853
7824664,0.142857,65,0.000486,0.000033,0.034230,0.031073,0.790476,2,0.004399,0.012931,0.809277
...,...,...,...,...,...,...,...,...,...,...,...
7611244,0.047619,134,0.154057,0.010965,0.105134,0.098870,0.809524,2,0.037111,0.075878,0.930164
5336329,0.142857,80,0.001690,0.024189,0.004890,0.000000,0.301587,7,0.021619,0.018292,0.707474
4066701,0.095238,31,0.021194,0.000042,0.012225,0.014124,1.000000,3,0.008160,0.036931,0.866056
5167650,0.095238,17,0.056762,0.000023,0.017115,0.019774,1.000000,5,0.010827,0.019037,0.935679


In [326]:
# Embeddings
len_tags = len(np.unique(dff['tags_list']))
len_langs = len(np.unique(dff['lang_combination']))

embedding_tags_size = int(min(np.ceil((len_tags)/2), 50))
embedding_lang_size = int(min(np.ceil((len_langs)/2), 50))

In [327]:
# tags_input, langs_input = Input(shape=(len_tags,)), Input(shape=(len_langs,))

tags_input = Input(shape=(1,))  # Reshape input to (None, 1)
langs_input = Input(shape=(1,))  # Reshape input to (None, 1)
numerical_input = Input(shape=(len(numeric_features),))  # Should be 11

tags_embedded = Embedding(input_dim=len_tags, output_dim=embedding_tags_size)(tags_input)  
langs_embedded = Embedding(input_dim=len_langs, output_dim=embedding_langs_size)(langs_input)  

flattened_tags = Flatten()(tags_embedded)
flattened_langs = Flatten()(langs_embedded)

# Concatenate layers
conc = Concatenate()([flattened_tags, flattened_langs, numerical_input])

In [328]:
# Neural Network Architectture
# input_dim = X_train.shape[1] 
hidden_dim = 4         
l2wt = 0.1              # L2 regularization weight
learning_rate = 0.001
epochs = 10
batch_size = 32

x = Dense(hidden_dim, activation="relu", kernel_regularizer=regularizers.l2(l2wt))(conc)
output = Dense(1, activation="sigmoid")(x) 

In [329]:
def nhlr_loss(y_true, y_pred):
    p_true, h_true = y_true[:, 0], y_true[:, 1]
    p_pred, h_pred = y_pred[:, 0], y_pred[:, 1]

    slp = tf.reduce_mean(tf.square(p_true - p_pred)) # p_recall loss 
    slh = tf.reduce_mean(tf.square(h_true - h_pred)) # half-life loss 

    return slp + slh 


In [330]:
print("Tags Input Shape:", tags_input.shape)
print("Langs Input Shape:", langs_input.shape)
print("Numerical Input Shape:", numerical_input.shape)

print("Flattened tags Embedded Shape:", flattened_tags.shape)
print("Flattened Langs Embedded Shape:", flattened_langs.shape)
print("Flattened Numerical Input Shape:", numerical_input.shape)

print("X_train_tags shape:", X_train_tags.shape)    # Should be (batch_size, 1)
print("X_train_langs shape:", X_train_langs.shape)  # Should be (batch_size, 1)
print("X_train_numerical shape:", X_train_numerical.shape)  

Tags Input Shape: (None, 1)
Langs Input Shape: (None, 1)
Numerical Input Shape: (None, 9)
Flattened tags Embedded Shape: (None, 50)
Flattened Langs Embedded Shape: (None, 50)
Flattened Numerical Input Shape: (None, 9)
X_train_tags shape: (10004398,)
X_train_langs shape: (10004398,)
X_train_numerical shape: (10004398, 9)


In [331]:
from tensorflow.keras.losses import MeanSquaredError


In [332]:
# Check for NaNs in training data
print(np.isnan(X_train_numerical).sum(), np.isnan(X_train_tags).sum(), np.isnan(X_train_langs).sum())
print(np.isnan(y_train).sum())

# Check for infinite values
print(np.isinf(X_train_numerical).sum(), np.isinf(X_train_tags).sum(), np.isinf(X_train_langs).sum())
print(np.isinf(y_train).sum())

word_len           0
SUBTLEX            0
delta              0
history_seen       0
history_correct    0
h_recall           0
avg_delta          0
std_delta          0
avg_h_recall       0
dtype: int64 0 0
0
word_len           0
SUBTLEX            0
delta              0
history_seen       0
history_correct    0
h_recall           0
avg_delta          0
std_delta          0
avg_h_recall       0
dtype: int64 0 0
0


In [333]:
model = Model(inputs=[tags_input, langs_input, numerical_input], outputs=output)
model.compile(loss=MeanSquaredError(), optimizer= Adam(learning_rate=learning_rate), metrics=['accuracy'])
model.fit([X_train_tags, X_train_langs, X_train_numerical], y_train, epochs=epochs, batch_size=batch_size, verbose=2)
model.summary()

Epoch 1/10
312638/312638 - 147s - 469us/step - accuracy: 0.8404 - loss: 0.0754
Epoch 2/10
312638/312638 - 137s - 438us/step - accuracy: 0.8405 - loss: 0.0729
Epoch 3/10
312638/312638 - 133s - 427us/step - accuracy: 0.8405 - loss: 0.0729
Epoch 4/10
312638/312638 - 133s - 426us/step - accuracy: 0.8405 - loss: 0.0729
Epoch 5/10
312638/312638 - 133s - 426us/step - accuracy: 0.8405 - loss: 0.0729
Epoch 6/10
312638/312638 - 130s - 415us/step - accuracy: 0.8405 - loss: 0.0729
Epoch 7/10
312638/312638 - 129s - 413us/step - accuracy: 0.8405 - loss: 0.0729
Epoch 8/10
312638/312638 - 127s - 407us/step - accuracy: 0.8405 - loss: 0.0729
Epoch 9/10
312638/312638 - 127s - 406us/step - accuracy: 0.8405 - loss: 0.0729
Epoch 10/10
312638/312638 - 127s - 407us/step - accuracy: 0.8405 - loss: 0.0729


In [334]:
y_pred

array([0.89837456, 0.89837456, 0.89837456, ..., 0.89837456, 0.89837456,
       0.89837456], dtype=float32)

In [335]:
y_pred = model.predict([X_test_tags, X_test_langs, X_test_numerical]).flatten()


[1m78160/78160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 287us/step


In [336]:
y_test

8466429     1.0
1798602     1.0
9209040     1.0
10115241    1.0
11916336    1.0
           ... 
4594657     1.0
4727614     1.0
6783868     1.0
1789219     1.0
3066258     1.0
Name: p_recall, Length: 2501100, dtype: float64

In [337]:
results = pd.DataFrame({'Actual':y_test, 'Predicted':y_pred})
# y_test['p_recall_pred'] = y_pred
# dff['half_life_pred'] = y_pred

mae_p = np.mean(np.abs(results['Actual'] - results['Predicted']))
# mae_h = np.mean(np.abs(dff['half_life'] - dff['half_life_pred']))

print(f"Final MAE - p_recall: {mae_p:.4f}")

Final MAE - p_recall: 0.1723
