In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input, Embedding, Flatten, Concatenate, Lambda
from tensorflow.keras.layers import LSTM
from keras import regularizers
from keras import losses
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split as sklearn_train_test_split
from tensorflow.keras.losses import MeanSquaredError
from keras.optimizers import Adam
import os

#keras.layers.Flatten, Input

In [3]:
# Utility functions
def pclip(p):
    """Clip recall probability to avoid numerical issues."""
    return p.clip(0.0001, 0.9999)


def hclip(h):
    min_half_life = 15.0 / (24 * 60)  # 15 minutes in days
    max_half_life = 274.0   
    """Clip half-life to a reasonable range."""
    return h.clip(min_half_life, max_half_life)


In [30]:
current_dir = os.getcwd()
filename = 'df_processed.csv'
filepath = os.path.normpath(os.path.join(current_dir, '../data/processed/', filename))

chunk_size = 10000
chunks = []

for chunk in pd.read_csv(filepath, chunksize=chunk_size):
    chunk.drop_duplicates(inplace=True)
    chunk.dropna(inplace=True)
    chunks.append(chunk)

df = pd.concat(chunks, ignore_index=True)
df_users = pd.read_csv(os.path.normpath(os.path.join(current_dir, '../data/features/', 'users_behaviur.csv')))
df_words = pd.read_csv(os.path.normpath(os.path.join(current_dir, '../data/features/', 'word_complexity_features.csv')), sep='\t')
dff = pd.merge(pd.merge(df_words, df, on = 'lexeme_id', how='inner'), df_users, on = ['user_id', 'lang_combination'], how='inner')

In [31]:
cols_to_drop = ['lexeme_id' ,'gender', 'def', 'tense', 'POS', 'person', 'number', 'word', 'session_seen', 'session_correct', 'avg_user_p_recall', 'timestamp', 'user_id', 'learning_language', 'ui_language']
dff.drop(columns=cols_to_drop, inplace=True)
dff.dropna(inplace=True)


dff['delta'] = dff['delta']/(60*60*24) # convert time delta to days
dff['avg_delta'] = dff['avg_delta']/(60*60*24) 
dff['std_delta'] = dff['std_delta']/(60*60*24)
dff['p_recall'] = pclip(dff['p_recall'])
dff['half_life'] = hclip(-dff['delta']/np.log2(dff['p_recall']))


In [32]:
dff.head()

Unnamed: 0,word_len,tags_list,SUBTLEX,p_recall,delta,history_seen,history_correct,h_recall,lang_combination,avg_delta,std_delta,avg_h_recall,half_life
0,5,"['vblex', 'pri', 'p3', 'sg']",3391.0,0.9999,0.069016,8,6,0.75,en-de,0.035931,0.034457,0.890225,274.0
1,5,"['vblex', 'pri', 'p3', 'sg']",3391.0,0.0001,0.002928,14,12,0.857143,en-de,0.035931,0.034457,0.890225,0.010417
2,5,"['vblex', 'pri', 'p3', 'sg']",3391.0,0.9999,0.000752,15,12,0.8,en-de,0.035931,0.034457,0.890225,5.214388
3,5,"['vblex', 'pri', 'p3', 'sg']",3391.0,0.5,0.000313,16,13,0.8125,en-de,0.035931,0.034457,0.890225,0.010417
4,5,"['vblex', 'pri', 'p3', 'sg']",3391.0,0.9999,0.002072,15,15,1.0,en-de,1.009879,1.633872,0.91407,14.359623


In [33]:
# Encoding tags and langs 
tag_encoder = LabelEncoder()
lang_encoder = LabelEncoder()

dff['tags_list'] = tag_encoder.fit_transform(dff['tags_list'])
dff['lang_combination'] = lang_encoder.fit_transform(dff['lang_combination'])

In [34]:
def prepare_dataset(df):
    categorical_features = df.select_dtypes(include='O').columns
    numeric_features = df.select_dtypes(exclude=['O']).columns.drop(['p_recall', 'half_life', 'delta'])

    scaler = MinMaxScaler()
    df[numeric_features] = scaler.fit_transform(df[numeric_features])
    return df, categorical_features, numeric_features 

In [35]:
dff_1 = dff.copy()
dff_1, categorical_features, numeric_features = prepare_dataset(dff_1)

In [45]:
# Train test split
def split(df, numeric_features): 
    
    X = df.drop(columns=['p_recall', 'half_life'])
    y = df[['p_recall', 'half_life']]
    
    X_train, X_test, y_train, y_test = sklearn_train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        random_state=42)

    X_train_delta = X_train['delta']
    X_test_delta = X_test['delta'] 
    
    X_train_numerical = X_train[numeric_features]
    X_test_numerical = X_test[numeric_features]

    # In case we use encodings 
    X_train_tags = X_train['tags_list']
    X_train_langs = X_train['lang_combination']
    X_test_tags = X_test['tags_list']
    X_test_langs = X_test['lang_combination']
    

    # In case we use half-life regression
    y_train_p_recall = y_train['p_recall']
    y_train_half_life = y_train['half_life']
    y_test_p_recall = y_test['p_recall']
    y_test_half_life = y_test['half_life']


    # Embeddings 
    return df, X_train_tags, X_train_langs, X_train_numerical, X_test_tags, X_test_langs, X_test_numerical, X_train_delta, X_test_delta, X_test, y_train, y_test, y_train_half_life, y_train_p_recall, y_test_half_life, y_test_p_recall

    # No embeddings 
    # return df, X_train_delta, X_test_delta, X_train_numerical, X_test_numerical, y_train_p_recall, y_train_half_life, y_test_p_recall, y_test_half_life

# Embeddings 
df, X_train_tags, X_train_langs, X_train_numerical, X_test_tags, X_test_langs, X_test_numerical, X_train_delta, X_test_delta, X_test, y_train, y_test, y_train_half_life, y_train_p_recall, y_test_half_life, y_test_p_recall = split(dff_1, numeric_features)


# No embeddings
# df, X_train_delta, X_test_delta, X_train_numerical, X_test_numerical, y_train_p_recall, y_train_half_life, y_test_p_recall, y_test_half_life = split(dff_1.sample(frac=0.1), numeric_features)

In [46]:
# print('X_train_tags_size', X_train_tags.shape)
# print('X_train_langs_size', X_train_langs.shape)
# print('X_train_numerical_size', X_train_numerical.shape)
# print('X_test_tags_size', X_test_tags.shape)
# print('X_test_langs_size', X_test_langs.shape)
# print('X_test_numerical_size', X_test_numerical.shape)
# print('y_train_half_life_size', y_train_half_life.shape)
# print('y_train_p_recall_size', y_train_p_recall.shape)
# print('y_test_half_life_size', y_test_half_life.size)
# print('y_test_p_recall_size', y_test_p_recall.size)

In [47]:
# Embeddings
len_tags = len(np.unique(df['tags_list']))
len_langs = len(np.unique(df['lang_combination']))

embedding_tags_size = int(min(np.ceil((len_tags)/2), 50))
embedding_langs_size = int(min(np.ceil((len_langs)/2), 50))

tags_input = Input(shape=(1,))  # Reshape input to (None, 1)
langs_input = Input(shape=(1,))  # Reshape input to (None, 1)

tags_embedded = Embedding(input_dim=len_tags, output_dim=embedding_tags_size)(tags_input)  
langs_embedded = Embedding(input_dim=len_langs, output_dim=embedding_langs_size)(langs_input)  

flattened_tags = Flatten()(tags_embedded)
flattened_langs = Flatten()(langs_embedded)

numerical_input = Input(shape=(len(numeric_features),))  # Should be 11
delta_input = Input(shape=(1,))  # Time delta (Δ)


# Concatenate layers
conc = Concatenate()([flattened_tags, flattened_langs, numerical_input])

In [48]:
# Neural Network Architectture
# input_dim = X_train.shape[1] 
hidden_dim = 4         
l2wt = 0.1              # L2 regularization weight
learning_rate = 0.001
epochs = 10
batch_size = 32

# no embeddings
# x = Dense(hidden_dim, activation="relu", kernel_regularizer=regularizers.l2(l2wt))(numerical_input)

# embeddings 
x = Dense(hidden_dim, activation="relu", kernel_regularizer=regularizers.l2(l2wt))(conc)

# For half-life
half_life_output = Dense(1, activation="relu", name="half_life")(x) 
p_recall_output = Lambda(lambda inputs: tf.pow(2.0, -inputs[0] / (inputs[1] + 1e-6)), 
                         name="p_recall")([delta_input, half_life_output])


In [49]:
def nhlr_loss(y_true, y_pred):
    h_true, p_true = y_true[0], y_true[1]
    h_pred, p_pred = y_pred[0], y_pred[1]

    slh = tf.reduce_mean(tf.square(h_true - h_pred)) # half-life loss 
    slp = tf.reduce_mean(tf.square(p_true - p_pred)) # p_recall loss 

    return slp + slh 


In [50]:
print("Max tag index:", X_train_tags.max(), "Embedding input_dim:", len_tags)
print("Max lang index:", X_train_langs.max(), "Embedding input_dim:", len_langs)

print("Tags Input Shape:", tags_input.shape)
print("Langs Input Shape:", langs_input.shape)
print("Numerical Input Shape:", numerical_input.shape)

print("Flattened tags Embedded Shape:", flattened_tags.shape)
print("Flattened Langs Embedded Shape:", flattened_langs.shape)
print("Flattened Numerical Input Shape:", numerical_input.shape)

print("X_train_tags shape:", X_train_tags.shape)    # Should be (batch_size, 1)
print("X_train_langs shape:", X_train_langs.shape)  # Should be (batch_size, 1)
print("X_train_numerical shape:", X_train_numerical.shape)  
print("X_train_delta shape:", X_train_delta.shape)  

Max tag index: 1.0 Embedding input_dim: 401
Max lang index: 1.0 Embedding input_dim: 8
Tags Input Shape: (None, 1)
Langs Input Shape: (None, 1)
Numerical Input Shape: (None, 10)
Flattened tags Embedded Shape: (None, 50)
Flattened Langs Embedded Shape: (None, 4)
Flattened Numerical Input Shape: (None, 10)
X_train_tags shape: (10004398,)
X_train_langs shape: (10004398,)
X_train_numerical shape: (10004398, 10)
X_train_delta shape: (10004398,)


In [51]:
langs_input

<KerasTensor shape=(None, 1), dtype=float32, sparse=False, name=keras_tensor_46>

In [52]:
# no embeddings
# model = Model(inputs=[numerical_input, delta_input], outputs=[half_life_output, p_recall_output])

# embeddings 
model = Model(inputs=[tags_input, langs_input, numerical_input, delta_input], outputs=[half_life_output, p_recall_output])

model.compile(loss=nhlr_loss, optimizer= Adam(learning_rate=learning_rate), metrics=['MAE', 'MAE'])

# no embeddings 
# model.fit([X_train_numerical, X_train_delta], [y_train_half_life, y_train_p_recall], epochs=epochs, batch_size=batch_size, verbose=2)

# embeddings 
model.fit([X_train_tags, X_train_langs, X_train_numerical, X_train_delta], [y_train_half_life, y_train_p_recall], epochs=epochs, batch_size=batch_size, verbose=2)


# Without half lfie 
# model.fit([X_train_tags, X_train_langs, X_train_numerical], y_train, epochs=epochs, batch_size=batch_size, verbose=2)
# model.summary()

Epoch 1/10
312638/312638 - 149s - 475us/step - half_life_MAE: 120.3035 - half_life_loss: 30443.1973 - loss: 30522.3086 - p_recall_MAE: 0.1173 - p_recall_loss: 0.1624
Epoch 2/10
312638/312638 - 147s - 471us/step - half_life_MAE: 118.3181 - half_life_loss: 29826.4609 - loss: 29981.0000 - p_recall_MAE: 0.1161 - p_recall_loss: 0.1603
Epoch 3/10
312638/312638 - 150s - 480us/step - half_life_MAE: 117.5639 - half_life_loss: 29653.5781 - loss: 29800.0098 - p_recall_MAE: 0.1159 - p_recall_loss: 0.1610
Epoch 4/10
312638/312638 - 151s - 484us/step - half_life_MAE: 117.2330 - half_life_loss: 29559.7715 - loss: 29692.9160 - p_recall_MAE: 0.1159 - p_recall_loss: 0.1608
Epoch 5/10
312638/312638 - 146s - 466us/step - half_life_MAE: 117.0651 - half_life_loss: 29517.3516 - loss: 29641.6504 - p_recall_MAE: 0.1159 - p_recall_loss: 0.1608
Epoch 6/10
312638/312638 - 151s - 483us/step - half_life_MAE: 116.9837 - half_life_loss: 29510.9258 - loss: 29625.6074 - p_recall_MAE: 0.1159 - p_recall_loss: 0.1612
Epoc

<keras.src.callbacks.history.History at 0x16a31b5f0>

In [54]:
y_pred_half_life, y_pred_p_recall  = model.predict([X_test_tags, X_test_langs, X_test_numerical, X_test_delta])

y_test['p_recall_pred'] = y_pred_p_recall
y_test['half_life_pred'] = y_pred_half_life

[1m78160/78160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 302us/step


In [55]:
mae_p = np.mean(np.abs(y_test['p_recall'] - y_test['p_recall_pred']))
mae_h = np.mean(np.abs(y_test['half_life'] - y_test['half_life_pred']))

print(f"Final MAE - p_recall: {mae_p:.4f}")

Final MAE - p_recall: 0.1157


In [74]:
y_test.describe().apply(lambda s: s.apply('{0:.5f}'.format))


Unnamed: 0,p_recall,half_life,p_recall_pred,half_life_pred
count,2501100.0,2501100.0,2501100.0,2501100.0
mean,0.89736,155.58351,0.97944,154.6306
std,0.26974,124.07122,0.04045,25.28597
min,0.0001,0.01042,0.39903,1.82996
25%,0.9999,19.10324,0.97904,141.14271
50%,0.9999,274.0,0.99618,153.11523
75%,0.9999,274.0,0.99997,165.37991
max,0.9999,274.0,1.0,450.57632
