In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input, Embedding, Flatten, Concatenate, Lambda
from tensorflow.keras.layers import LSTM
from keras import regularizers
from keras import losses
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split as sklearn_train_test_split
from tensorflow.keras.losses import MeanSquaredError
from keras.optimizers import Adam
import os

#keras.layers.Flatten, Input

In [3]:
# Utility functions
def pclip(p):
    """Clip recall probability to avoid numerical issues."""
    return p.clip(0.0001, 0.9999)


def hclip(h):
    min_half_life = 15.0 / (24 * 60)  # 15 minutes in days
    max_half_life = 274.0   
    """Clip half-life to a reasonable range."""
    return h.clip(min_half_life, max_half_life)


In [4]:
current_dir = os.getcwd()
filename = 'df_processed.csv'
filepath = os.path.normpath(os.path.join(current_dir, '../data/processed/', filename))

chunk_size = 10000
chunks = []

for chunk in pd.read_csv(filepath, chunksize=chunk_size):
    chunk.drop_duplicates(inplace=True)
    chunk.dropna(inplace=True)
    chunks.append(chunk)

df = pd.concat(chunks, ignore_index=True)
df_users = pd.read_csv(os.path.normpath(os.path.join(current_dir, '../data/features/', 'users_behaviur.csv')))
df_words = pd.read_csv(os.path.normpath(os.path.join(current_dir, '../data/features/', 'word_complexity_features.csv')), sep='\t')
dff = pd.merge(pd.merge(df_words, df, on = 'lexeme_id', how='inner'), df_users, on = ['user_id', 'lang_combination'], how='inner')

In [5]:
cols_to_drop = ['lexeme_id' ,'gender', 'def', 'tense', 'POS', 'person', 'number', 'word', 'session_seen', 'session_correct', 'avg_user_p_recall', 'timestamp', 'user_id', 'learning_language', 'ui_language']
dff.drop(columns=cols_to_drop, inplace=True)
dff.dropna(inplace=True)


dff['delta'] = dff['delta']/(60*60*24) # convert time delta to days
dff['avg_delta'] = dff['avg_delta']/(60*60*24) 
dff['std_delta'] = dff['std_delta']/(60*60*24)
dff['p_recall'] = pclip(dff['p_recall'])
dff['half_life'] = hclip(-dff['delta']/np.log2(dff['p_recall']))


In [6]:
dff.head()

Unnamed: 0,word_len,tags_list,SUBTLEX,p_recall,delta,history_seen,history_correct,h_recall,lang_combination,avg_delta,std_delta,avg_h_recall,half_life
0,5,"['vblex', 'pri', 'p3', 'sg']",3391.0,0.9999,0.069016,8,6,0.75,en-de,0.035931,0.034457,0.890225,274.0
1,5,"['vblex', 'pri', 'p3', 'sg']",3391.0,0.0001,0.002928,14,12,0.857143,en-de,0.035931,0.034457,0.890225,0.010417
2,5,"['vblex', 'pri', 'p3', 'sg']",3391.0,0.9999,0.000752,15,12,0.8,en-de,0.035931,0.034457,0.890225,5.214388
3,5,"['vblex', 'pri', 'p3', 'sg']",3391.0,0.5,0.000313,16,13,0.8125,en-de,0.035931,0.034457,0.890225,0.010417
4,5,"['vblex', 'pri', 'p3', 'sg']",3391.0,0.9999,0.002072,15,15,1.0,en-de,1.009879,1.633872,0.91407,14.359623


In [7]:
# Encoding tags and langs 
tag_encoder = LabelEncoder()
lang_encoder = LabelEncoder()

dff['tags_list'] = tag_encoder.fit_transform(dff['tags_list'])
dff['lang_combination'] = lang_encoder.fit_transform(dff['lang_combination'])

In [8]:
def prepare_dataset(df):
    categorical_features = df.select_dtypes(include='O').columns
    numeric_features = df.select_dtypes(exclude=['O']).columns.drop(['p_recall', 'half_life', 'delta'])

    scaler = MinMaxScaler()
    df[numeric_features] = scaler.fit_transform(df[numeric_features])
    return df, categorical_features, numeric_features 

In [9]:
dff_1, categorical_features, numeric_features = prepare_dataset(dff)

In [10]:
# Train test split
def split(df, numeric_features): 
    
    X = df.drop(columns=['p_recall', 'half_life'])
    y = df[['p_recall', 'half_life']]
    
    X_train, X_test, y_train, y_test = sklearn_train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        random_state=42)

    X_train_delta = X_train['delta'].to_numpy().reshape(-1,1)
    X_test_delta = X_test['delta'].to_numpy().reshape(-1,1)
    
    X_train_numerical = X_train[numeric_features].to_numpy()
    X_test_numerical = X_test[numeric_features].to_numpy()

    # Embeddings 
    X_train_tags = X_train['tags_list'].to_numpy().reshape(-1,1)
    X_train_langs = X_train['lang_combination'].to_numpy().reshape(-1,1)
    X_test_tags = X_test['tags_list'].to_numpy().reshape(-1,1)
    X_test_langs = X_test['lang_combination'].to_numpy().reshape(-1,1)
    

    y_train_p_recall = y_train['p_recall']
    y_train_half_life = y_train['half_life']
    y_test_p_recall = y_test['p_recall']
    y_test_half_life = y_test['half_life']


    # Embeddings 
    return df, X_train_tags, X_train_langs, X_train_numerical, X_test_tags, X_test_langs, X_test_numerical, X_train_delta, X_test_delta, X_test, y_train, y_test, y_train_half_life, y_train_p_recall, y_test_half_life, y_test_p_recall

    # No embeddings 
    # return df, X_train_delta, X_test_delta, X_train_numerical, X_test_numerical, y_train_p_recall, y_train_half_life, y_test_p_recall, y_test_half_life

# Embeddings 
df, X_train_tags, X_train_langs, X_train_numerical, X_test_tags, X_test_langs, X_test_numerical, X_train_delta, X_test_delta, X_test, y_train, y_test, y_train_half_life, y_train_p_recall, y_test_half_life, y_test_p_recall = split(dff_1.sample(frac=0.3), numeric_features)


# No embeddings
# df, X_train_delta, X_test_delta, X_train_numerical, X_test_numerical, y_train_p_recall, y_train_half_life, y_test_p_recall, y_test_half_life = split(dff_1.sample(frac=0.1), numeric_features)

In [11]:
# print('X_train_tags_size', X_train_tags.shape)
# print('X_train_langs_size', X_train_langs.shape)
# print('X_train_numerical_size', X_train_numerical.shape)
# print('X_test_tags_size', X_test_tags.shape)
# print('X_test_langs_size', X_test_langs.shape)
# print('X_test_numerical_size', X_test_numerical.shape)
# print('y_train_half_life_size', y_train_half_life.shape)
# print('y_train_p_recall_size', y_train_p_recall.shape)
# print('y_test_half_life_size', y_test_half_life.size)
# print('y_test_p_recall_size', y_test_p_recall.size)

In [12]:
# Embeddings
len_tags = len(np.unique(df['tags_list']))
len_langs = len(np.unique(df['lang_combination']))

embedding_tags_size = int(min(np.ceil((len_tags)/2), 50))
embedding_langs_size = int(min(np.ceil((len_langs)/2), 50))

tags_input = Input(shape=(1,))  
langs_input = Input(shape=(1,))  

tags_embedded = Embedding(input_dim=len_tags, output_dim=embedding_tags_size)(tags_input)  
langs_embedded = Embedding(input_dim=len_langs, output_dim=embedding_langs_size)(langs_input)  

flattened_tags = Flatten()(tags_embedded)
flattened_langs = Flatten()(langs_embedded)

numerical_input = Input(shape=(len(numeric_features),)) 
delta_input = Input(shape=(1,))  

# # Concatenate layers
conc_layers = [flattened_tags, flattened_langs, numerical_input]
conc = Concatenate()(conc_layers)

In [13]:
def nhlr_loss(y_true, y_pred):
    h_true, p_true = y_true[0], y_true[1]
    h_pred, p_pred = y_pred[0], y_pred[1]

    slh = tf.reduce_mean(tf.square(h_true - h_pred)) # half-life loss 
    slp = tf.reduce_mean(tf.square(p_true - p_pred)) # p_recall loss 

    return slp + slh 

In [14]:
# Neural Network Architectture
from tensorflow.keras.layers import Dropout, BatchNormalization

hidden_dim = 16
l2wt = 0.1            
learning_rate = 0.005
epochs = 2
batch_size = 64


# no embeddings
# x = Dense(hidden_dim, activation="relu", kernel_regularizer=regularizers.l2(l2wt))(numerical_input)

# embeddings 
x = Dense(hidden_dim, activation="relu", kernel_regularizer=regularizers.l2(l2wt))(conc)
# x = BatchNormalization()(x)
# x = Dropout(0.1)(x)


# x = Dense(hidden_dim, activation="relu", kernel_regularizer=regularizers.l2(l2wt))(x)

half_life_output = Dense(1, activation="relu", name="half_life")(x) 
p_recall_output = Lambda(lambda inputs: tf.pow(2.0, -inputs[0] / (inputs[1] + 1e-6)), 
                         name="p_recall")([delta_input, half_life_output])


In [156]:
# print("Max tag index:", X_train_tags.max(), "Embedding input_dim:", len_tags)
# print("Max lang index:", X_train_langs.max(), "Embedding input_dim:", len_langs)

# print("Tags Input Shape:", tags_input.shape)
# print("Langs Input Shape:", langs_input.shape)
# print("Numerical Input Shape:", numerical_input.shape)

# print("Flattened tags Embedded Shape:", flattened_tags.shape)
# print("Flattened Langs Embedded Shape:", flattened_langs.shape)
# print("Flattened Numerical Input Shape:", numerical_input.shape)

# print("X_train_tags shape:", X_train_tags.shape)    
# print("X_train_langs shape:", X_train_langs.shape)  
# print("X_train_numerical shape:", X_train_numerical.shape)  
# print("X_train_delta shape:", X_train_delta.shape)  


# print(y_train_half_life.shape)
# print(y_train_p_recall.shape)


In [15]:
inputs_list = [tags_input, langs_input, numerical_input, delta_input]

# no embeddings
# model = Model(inputs=[numerical_input, delta_input], outputs=[half_life_output, p_recall_output])

# embeddings 
model = Model(inputs=inputs_list, outputs=[half_life_output, p_recall_output])

model.compile(loss=nhlr_loss, optimizer= Adam(learning_rate=learning_rate), metrics=['MAE', 'MAE'])

# no embeddings 
# model.fit([X_train_numerical, X_train_delta], [y_train_half_life, y_train_p_recall], epochs=epochs, batch_size=batch_size, verbose=2)

# embeddings 
X_train = [X_train_tags, X_train_langs, X_train_numerical, X_train_delta]
model.fit(X_train, [y_train_half_life, y_train_p_recall], epochs=epochs, batch_size=batch_size, verbose=2)
model.summary()

Epoch 1/2
46896/46896 - 17s - 365us/step - half_life_MAE: 119.2095 - half_life_loss: 30352.2129 - loss: 30463.1289 - p_recall_MAE: 0.1172 - p_recall_loss: 0.1624
Epoch 2/2
46896/46896 - 16s - 352us/step - half_life_MAE: 117.3298 - half_life_loss: 29663.0684 - loss: 29806.2500 - p_recall_MAE: 0.1159 - p_recall_loss: 0.1615


In [16]:
def build_model(hidden_dim=8, learning_rate=0.001):
    from tensorflow.keras.models import Model
    from tensorflow.keras.layers import Dense, Input, Lambda
    import tensorflow as tf
    from tensorflow.keras.optimizers import Adam

    # Inputs
    numerical_input = Input(shape=(X_train.shape[1],), name="numerical_input")
    delta_input = Input(shape=(1,), name="delta_input")

    # Hidden layer
    x = Dense(hidden_dim, activation="relu")(numerical_input)

    # Outputs
    half_life_output = Dense(1, activation="relu", name="half_life")(x)
    p_recall_output = Lambda(lambda inputs: tf.pow(2.0, -inputs[0] / (inputs[1] + 1e-6)),
                             name="p_recall")([delta_input, half_life_output])

    # Define model
    model = Model(inputs=[numerical_input, delta_input], outputs=[half_life_output, p_recall_output])
    model.compile(loss="mse", optimizer=Adam(learning_rate=learning_rate), metrics=['mae'])
    
    return model



In [101]:
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import make_scorer, r2_score
from scikeras.wrappers import KerasRegressor


X_train = np.concatenate([X_train_tags, X_train_langs, X_train_numerical, X_train_delta], axis=1)
y_train = np.column_stack([y_train_half_life, y_train_p_recall])  # Shape: (1000440, 2)


def multi_output_r2(y_true, y_pred):
    r2_half_life = r2_score(y_true[:, 0], y_pred[:, 0])
    r2_p_recall = r2_score(y_true[:, 1], y_pred[:, 1])
    return (r2_half_life + r2_p_recall) / 2  # Average R² for both outputs

scorer = make_scorer(multi_output_r2, greater_is_better=True)




grid = {
    'hidden_dims': [8,16,24],
    'learning_rates': [0.001, 0.1],
    'batches': [32, 64, 128, 512]
}

classifier = KerasRegressor(build_fn=build_model)
grid_search = GridSearchCV(estimator = classifier,
                           param_grid = grid,
                           scoring = scorer,
                           cv=3)
grid_search = grid_search.fit(X_train, y_train, verbose = 2)




AttributeError: 'super' object has no attribute '__sklearn_tags__'

In [None]:
best_parameters = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters: " + str(best_parameters))
print(f'Best Accuracy for {grid_result.best_score_:.4} using {grid_result.best_params_}')


In [None]:
# Predict 
# y_pred_half_life, y_pred_p_recall  = model.predict([X_test_tags, X_test_langs, X_test_numerical, X_test_delta])

dff_results = X_test.copy()
dff_results['p_recall_test'], dff_results['p_recall_pred'], dff_results['diff'] = y_test['p_recall'], y_pred_p_recall, y_test['p_recall'] - y_pred_p_recall
# dff_results['half_life'] = y_test['half_;ofe'] 
# dff_results['half_life_pred'] = y_pred_half_life

In [159]:
mae_p = np.mean(np.abs(dff_results['p_recall'] - dff_results['p_recall_pred']))
# mae_h = np.mean(np.abs(dff_results['half_life'] - dff_results['half_life_pred']))

print(f"MAE on test set - p_recall: {mae_p:.4f}")

Final MAE - p_recall: 0.1159


In [160]:
y_test.describe().apply(lambda s: s.apply('{0:.5f}'.format))

Unnamed: 0,p_recall,half_life,p_recall_pred,half_life_pred
count,750330.0,750330.0,750330.0,750330.0
mean,0.89733,155.52076,0.97925,154.07173
std,0.26957,124.1195,0.0411,25.09539
min,0.0001,0.01042,0.53583,29.83831
25%,0.9999,19.01246,0.97909,138.49146
50%,0.9999,274.0,0.99621,152.89944
75%,0.9999,274.0,0.99997,166.77799
max,0.9999,274.0,1.0,438.33646


In [154]:
y_test.head()

Unnamed: 0,p_recall,half_life,p_recall_pred,half_life_pred
286998,0.9999,274.000000,0.997990,140.761597
2520530,0.9999,274.000000,0.990775,155.305801
7641043,0.0001,0.379833,0.976814,149.125015
9598725,0.9999,67.225497,0.999953,144.152252
7621672,0.9999,25.991720,0.999984,158.171280
...,...,...,...,...
5610954,0.9999,274.000000,0.956181,151.383163
10712399,0.9999,274.000000,0.869644,162.844254
6804651,0.9999,274.000000,0.924342,185.164291
2177608,0.9999,274.000000,0.996383,116.192062


In [None]:
# Predicted vs Actual 
plt.scatter(dff_results['p_recall'], dff_results['p_recall_pred'], alpha=0.5)
plt.plot([0, 1], [0, 1], color="red", linestyle="--")
plt.xlabel("Actual Recall Probability")
plt.ylabel("Predicted Recall Probability")
plt.title("Predicted vs. Actual Recall Probability")
plt.show()

In [None]:
# Residuals histogram 
dff_results['diff'].hist()

In [None]:
import shap 

# Initialize SHAP explainer
explainer = shap.Explainer(model, X_train)
shap_values = explainer([X_test_tags, X_test_langs, X_test_numerical, X_test_delta], max_evals=1500661)

# Plot summary
shap.summary_plot(shap_values, [X_test_tags, X_test_langs, X_test_numerical, X_test_delta], feature_names=numeric_features)
