In [54]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from keras import regularizers
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split as sklearn_train_test_split
from keras.optimizers import Adam


In [14]:
import os

In [47]:
# Resolve every input file relative to the notebook's working directory.
current_dir = os.getcwd()
filename = 'df_processed.csv'
filepath = os.path.normpath(os.path.join(current_dir, '../data/processed/', filename))

chunk_size = 10000
chunks = []

# Read the CSV chunk by chunk, dropping duplicate and incomplete rows from
# each chunk before it is stored.
for chunk in pd.read_csv(filepath, chunksize=chunk_size):
    chunks.append(chunk.drop_duplicates().dropna())

# Per-chunk de-duplication cannot see duplicates that land in different
# chunks, so de-duplicate once more over the assembled frame.
df = pd.concat(chunks, ignore_index=True).drop_duplicates(ignore_index=True)

df_users = pd.read_csv(os.path.normpath(os.path.join(current_dir, '../data/features/', 'users_behaviur.csv')))
# The word-feature file is tab-separated.
df_words = pd.read_csv(os.path.normpath(os.path.join(current_dir, '../data/features/', 'word_complexity_features.csv')), sep='\t')

# Inner joins: rows without a matching lexeme or (user, language pair) are dropped.
df_1 = df.merge(df_words, on='lexeme_id', how='inner')
df_2 = df_1.merge(df_users, on=['user_id', 'lang_combination'], how='inner')

# Remove the listed columns from the modelling frame; errors='ignore' skips
# any of them that are not present.
dff = df_2.drop(columns=['word', 'user_id', 'session_seen', 'session_correct', 'avg_user_p_recall', 'timestamp'], errors='ignore')

In [48]:
# Display the columns that remain in the modelling frame.
dff.columns

Index(['p_recall', 'delta', 'learning_language', 'ui_language', 'lexeme_id',
       'history_seen', 'history_correct', 'h_recall', 'lang_combination',
       'gender', 'def', 'tense', 'POS', 'person', 'number', 'word_len',
       'tags_list', 'SUBTLEX', 'avg_delta', 'std_delta', 'avg_h_recall'],
      dtype='object')

In [49]:
# Min-max scale the continuous columns of the modelling frame in place.
# NOTE(review): the scaler is fitted on every row before any train/test split,
# and the target 'p_recall' is scaled together with the features.
numeric_cols = [
    'p_recall', 'delta', 'history_seen', 'history_correct', 'h_recall',
    'word_len', 'SUBTLEX', 'avg_delta', 'std_delta', 'avg_h_recall',
]
scaler = MinMaxScaler()
dff[numeric_cols] = scaler.fit_transform(dff[numeric_cols])

In [50]:
def ohe(df):
    """
    One-hot encode the categorical (object-dtype) columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame mixing object-dtype columns with non-object columns.

    Returns
    -------
    pandas.DataFrame
        The non-object columns of ``df`` followed by one indicator column per
        category, with rows containing missing values removed. The result
        keeps the index labels of ``df``.
    """
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    passthrough = df.select_dtypes(exclude='O')

    # Nothing to encode: avoid fitting the encoder on an empty column selection.
    if not categorical_cols:
        return passthrough.dropna()

    encoder = OneHotEncoder(sparse_output=False)
    encoded = encoder.fit_transform(df[categorical_cols])
    ohe_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(categorical_cols),
        # Reuse the caller's index. pd.concat(axis=1) aligns on index labels,
        # so a default RangeIndex here would pair encoded rows with the wrong
        # numeric rows whenever df is a sample or otherwise not 0..n-1.
        index=df.index,
    )
    df_encoded = pd.concat([passthrough, ohe_df], axis=1)
    return df_encoded.dropna()

In [None]:
# Cap the sample at the number of available rows (DataFrame.sample raises when
# asked for more rows than exist without replacement) and seed it so the same
# rows are drawn on every run.
sample_size = min(1000000, len(dff))
dff_encoded = ohe(dff.sample(n=sample_size, random_state=42))

X = dff_encoded.drop(columns='p_recall')
# NOTE(review): the target is the single 'p_recall' column, but the model below
# has two outputs and nhlr_loss reads y_true[:, 1] as a half-life target.
# A half-life label column needs to be added to y before training can work.
y = dff_encoded['p_recall']

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = sklearn_train_test_split(X,
                                                    y,
                                                    train_size=0.8,
                                                    random_state=42)

In [None]:
# Hyperparameters
input_dim = X_train.shape[1]   # number of feature columns
hidden_dim = 4                 # units in the hidden layer
# hlwt = 0.01                  # Half-life loss weight
l2wt = 0.1                     # L2 regularization weight
learning_rate = 0.01
epochs = 100
batch_size = 32

# One L2-regularised ReLU hidden layer followed by a two-unit linear output.
model = Sequential()
model.add(
    Dense(
        hidden_dim,
        activation="relu",
        kernel_regularizer=regularizers.l2(l2wt),
        input_shape=(input_dim,),
    )
)
model.add(Dense(2))

In [None]:
def nhlr_loss(y_true, y_pred):
    """
    Sum of the mean squared errors of the two output columns.

    Column 0 of both tensors is treated as p_recall and column 1 as half-life;
    the two mean squared errors are added with equal weight.

    NOTE(review): y_true must therefore carry two columns. The target built
    earlier in this notebook is the single 'p_recall' column - confirm a
    half-life label is supplied before training.
    """
    def _column_mse(column):
        # Mean squared error restricted to one output column.
        residual = y_true[:, column] - y_pred[:, column]
        return tf.reduce_mean(tf.square(residual))

    recall_loss = _column_mse(0)
    half_life_loss = _column_mse(1)
    return recall_loss + half_life_loss
    

# Train with Adam on the combined recall + half-life loss.
# NOTE(review): nhlr_loss indexes y_true[:, 1], so y_train needs two target
# columns; as built above it only holds 'p_recall'.
optimizer = Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss=nhlr_loss)
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=2,
)

In [None]:
# Predict on the held-out rows only.
y_pred = np.asarray(model.predict(X_test))

# Ground truth for the same rows, as a 2-D array so that one- and two-column
# targets are handled alike.
y_true = np.asarray(y_test, dtype=float)
if y_true.ndim == 1:
    y_true = y_true[:, None]

# Keep predictions next to the matching test rows. Assigning them to the full
# `df` fails because its length differs from the test split.
test_results = pd.DataFrame(
    {
        'p_recall': y_true[:, 0],
        'p_recall_pred': y_pred[:, 0],
        'half_life_pred': y_pred[:, 1],
    },
    index=X_test.index,
)

mae_p = np.mean(np.abs(test_results['p_recall'] - test_results['p_recall_pred']))

# A half-life error can only be reported when the target carries a second column.
if y_true.shape[1] > 1:
    test_results['half_life'] = y_true[:, 1]
    mae_h = np.mean(np.abs(test_results['half_life'] - test_results['half_life_pred']))
else:
    mae_h = float('nan')

print(f"Final MAE - p_recall: {mae_p:.4f}, half-life: {mae_h:.4f}")