In [340]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input, Embedding, Flatten, Concatenate
from tensorflow.keras.layers import LSTM
from keras import regularizers
from keras import losses
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split as sklearn_train_test_split
from tensorflow.keras.losses import MeanSquaredError
from keras.optimizers import Adam
import os

#keras.layers.Flatten, Input

In [345]:
# Utility functions
def pclip(p):
    """Clip recall probability to avoid numerical issues."""
    return p.clip(0.0001, 0.9999)


def hclip(h):
    min_half_life = 15.0 / (24 * 60)  # 15 minutes in days
    max_half_life = 274.0   
    """Clip half-life to a reasonable range."""
    return h.clip(min_half_life, max_half_life)


In [None]:
current_dir = os.getcwd()
filename = 'df_processed.csv'
filepath = os.path.normpath(os.path.join(current_dir, '../data/processed/', filename))

chunk_size = 10000
chunks = []

for chunk in pd.read_csv(filepath, chunksize=chunk_size):
    chunk.drop_duplicates(inplace=True)
    chunk.dropna(inplace=True)
    chunks.append(chunk)

df = pd.concat(chunks, ignore_index=True)
df_users = pd.read_csv(os.path.normpath(os.path.join(current_dir, '../data/features/', 'users_behaviur.csv')))
df_words = pd.read_csv(os.path.normpath(os.path.join(current_dir, '../data/features/', 'word_complexity_features.csv')), sep='\t')
dff = pd.merge(pd.merge(df_words, df, on = 'lexeme_id', how='inner'), df_users, on = ['user_id', 'lang_combination'], how='inner')

In [None]:
cols_to_drop = ['lexeme_id' ,'gender', 'def', 'tense', 'POS', 'person', 'number', 'word', 'session_seen', 'session_correct', 'avg_user_p_recall', 'timestamp', 'user_id', 'learning_language', 'ui_language']
dff.drop(columns=cols_to_drop, inplace=True)
dff.dropna(inplace=True)

dff['p_recall'] = pclip(dff['p_recall'])
# dff['half_life'] = hclip(-dff['delta']/np.log2(dff['p_recall']))
dff['delta'] = dff['delta']/(60*60*24) # convert time delta to days
dff['avg_delta'] = dff['avg_delta']/(60*60*24) 
dff['std_delta'] = dff['std_delta']/(60*60*24)

In [None]:
dff.head()

In [None]:
tag_encoder = LabelEncoder()
lang_encoder = LabelEncoder()

dff['tags_list'] = tag_encoder.fit_transform(dff['tags_list'])
dff['lang_combination'] = lang_encoder.fit_transform(dff['lang_combination'])

In [None]:
def prepare_dataset(df):
    categorical_features = df.select_dtypes(include='O').columns
    numeric_features = df.select_dtypes(exclude=['O']).columns.drop(['p_recall'])

    scaler = MinMaxScaler()
    df[numeric_features] = scaler.fit_transform(df[numeric_features])
    return df, categorical_features, numeric_features 

In [None]:
dff_1 = dff.copy()
dff_1, categorical_features, numeric_features = prepare_dataset(dff_1)

In [None]:
# Train test split
def split(df, numeric_features): 
    X = df.drop(columns=['p_recall'])
    y = df['p_recall']
    X_train, X_test, y_train, y_test = sklearn_train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        random_state=42)
    X_train_tags = X_train['tags_list']
    X_train_langs = X_train['lang_combination']
    X_train_numerical = X_train[numeric_features] 
    
    X_test_tags = X_test['tags_list']
    X_test_langs = X_test['lang_combination']
    X_test_numerical = X_test[numeric_features] 

    # In case we use half-life regression
    # y_train_p_recall = y_train['p_recall']
    # y_train_half_life = y_train['half_life']
    # y_test_p_recall = y_test['p_recall']
    # y_test_half_life = y_test['half_life']


    
    return df, X_train_tags, X_train_langs, X_train_numerical, X_test_tags, X_test_langs, X_test_numerical, X_test, y_train, y_test
    # y_train_half_life, y_train_p_recall, y_test_half_life, y_test_p_recall

df_final, X_train_tags, X_train_langs, X_train_numerical,X_test_tags, X_test_langs, X_test_numerical, X_test, y_train, y_test = split(dff_1.sample(frac=0.1), numeric_features)

In [None]:
# print('X_train_tags_size', X_train_tags.shape)
# print('X_train_langs_size', X_train_langs.shape)
# print('X_train_numerical_size', X_train_numerical.shape)
# print('X_test_tags_size', X_test_tags.shape)
# print('X_test_langs_size', X_test_langs.shape)
# print('X_test_numerical_size', X_test_numerical.shape)
# print('y_train_half_life_size', y_train_half_life.shape)
# print('y_train_p_recall_size', y_train_p_recall.shape)
# print('y_test_half_life_size', y_test_half_life.size)
# print('y_test_p_recall_size', y_test_p_recall.size)

In [None]:
# Embeddings
len_tags = len(np.unique(df_final['tags_list']))
len_langs = len(np.unique(df_final['lang_combination']))

embedding_tags_size = int(min(np.ceil((len_tags)/2), 50))
embedding_lang_size = int(min(np.ceil((len_langs)/2), 50))


tags_input = Input(shape=(1,))  # Reshape input to (None, 1)
langs_input = Input(shape=(1,))  # Reshape input to (None, 1)
numerical_input = Input(shape=(len(numeric_features),))  # Should be 11

tags_embedded = Embedding(input_dim=len_tags, output_dim=embedding_tags_size)(tags_input)  
langs_embedded = Embedding(input_dim=len_langs, output_dim=embedding_langs_size)(langs_input)  

flattened_tags = Flatten()(tags_embedded)
flattened_langs = Flatten()(langs_embedded)

# Concatenate layers
conc = Concatenate()([flattened_tags, flattened_langs, numerical_input])

In [None]:
# print("Max tag index in X_train:", X_train_tags.max(), "Embedding input_dim:", len_tags)
# print("Max lang index in X_train:", X_train_langs.max(), "Embedding input_dim:", len_langs)
# print("Unique values in X_train_tags:", np.unique(X_train_tags))
# print("Unique values in X_train_langs:", np.unique(X_train_langs))

In [None]:
# Neural Network Architectture
# input_dim = X_train.shape[1] 
hidden_dim = 4         
l2wt = 0.1              # L2 regularization weight
learning_rate = 0.001
epochs = 10
batch_size = 32

x = Dense(hidden_dim, activation="relu", kernel_regularizer=regularizers.l2(l2wt))(conc)
output = Dense(1, activation="sigmoid")(x) 

# p_recall_output = Dense(1, activation="sigmoid", name="p_recall")(x)
# half_life_output = Dense(1, activation="relu", name="half_life")(x) 

In [None]:
def nhlr_loss(y_true, y_pred):
    p_true, h_true = y_true[:, 0], y_true[:, 1]
    p_pred, h_pred = y_pred[:, 0], y_pred[:, 1]

    slp = tf.reduce_mean(tf.square(p_true - p_pred)) # p_recall loss 
    slh = tf.reduce_mean(tf.square(h_true - h_pred)) # half-life loss 

    return slp + slh 


In [None]:
# print("Tags Input Shape:", tags_input.shape)
# print("Langs Input Shape:", langs_input.shape)
# print("Numerical Input Shape:", numerical_input.shape)

# print("Flattened tags Embedded Shape:", flattened_tags.shape)
# print("Flattened Langs Embedded Shape:", flattened_langs.shape)
# print("Flattened Numerical Input Shape:", numerical_input.shape)

# print("X_train_tags shape:", X_train_tags.shape)    # Should be (batch_size, 1)
# print("X_train_langs shape:", X_train_langs.shape)  # Should be (batch_size, 1)
# print("X_train_numerical shape:", X_train_numerical.shape)  

In [None]:
model = Model(inputs=[tags_input, langs_input, numerical_input], outputs=output)
model.compile(loss=MeanSquaredError(), optimizer= Adam(learning_rate=learning_rate), metrics=['MAE'])
model.fit([X_train_tags, X_train_langs, X_train_numerical], y_train, epochs=epochs, batch_size=batch_size, verbose=2)
model.summary()

In [None]:
y_pred = model.predict([X_test_tags, X_test_langs, X_test_numerical])

In [None]:
y_test['p_recall_pred'] = y_pred_p_recall
y_test['half_life_pred'] = y_pred_half_life

In [None]:
y_test.describe()

In [None]:
results = pd.DataFrame({'Actual':y_test, 'Predicted':y_pred})
# y_test['p_recall_pred'] = y_pred
# dff['half_life_pred'] = y_pred

mae_p = np.mean(np.abs(results['Actual'] - results['Predicted']))
# mae_h = np.mean(np.abs(dff['half_life'] - dff['half_life_pred']))

print(f"Final MAE - p_recall: {mae_p:.4f}")

In [399]:
print("Max tag index:", X_train_tags.max(), "Embedding input_dim:", len_tags)
print("Max lang index:", X_train_langs.max(), "Embedding input_dim:", len_langs)


Max tag index: 400 Embedding input_dim: 361
Max lang index: 7 Embedding input_dim: 8
