In [55]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import TransformedTargetRegressor
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split as sklearn_train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PowerTransformer, OneHotEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
current_dir = os.getcwd()
filename = 'df_processed.csv'
filepath = os.path.normpath(os.path.join(current_dir, '../data/processed/', filename))

chunk_size = 10000

# Clean every chunk while streaming so the raw file is never held in memory in full.
chunks = [
    chunk.drop_duplicates().dropna()
    for chunk in pd.read_csv(filepath, chunksize=chunk_size)
]

# Per-chunk de-duplication only compares rows inside the same chunk, so identical
# rows that land in different chunks would survive; one global pass removes them.
df = pd.concat(chunks, ignore_index=True).drop_duplicates(ignore_index=True)

# NOTE(review): the file name is spelled 'users_behaviur.csv' - confirm it matches the file on disk.
df_users = pd.read_csv(os.path.normpath(os.path.join(current_dir, '../data/features/', 'users_behaviur.csv')))
# The word-feature file is tab-separated.
df_words = pd.read_csv(os.path.normpath(os.path.join(current_dir, '../data/features/', 'word_complexity_features.csv')), sep='\t')

In [None]:
# Join the rows of df with the word-feature table on the lexeme identifier.
df_1 = pd.merge(df, df_words, how='inner', on='lexeme_id')
# Join the user-feature table, keyed by user and language combination.
df_2 = pd.merge(df_1, df_users, how='inner', on=['user_id', 'lang_combination'])

In [None]:
# Inspect which columns are available after both merges.
df_2.columns

In [None]:
### PREPARE DATA FOR ML
# Columns excluded from the modelling frame; errors='ignore' tolerates any that are absent.
columns_to_discard = [
    'timestamp',
    'lexeme_id',
    'word',
    'user_id',
    'session_seen',
    'session_correct',
    'avg_user_p_recall',
]
dff = df_2.drop(columns=columns_to_discard, errors='ignore')

In [None]:
# Inspect the columns that remain in the modelling frame.
dff.columns

In [None]:
# Two approaches to handle tags
# 1) Keep the combined tag label, but relabel values that occur at most
#    `rare_threshold` times as 'rare'
dff_1 = dff.copy()
tag_counts = dff_1['tags_list'].value_counts()
rare_threshold = 1000

# Vectorised frequency lookup (replaces a per-row Python lambda).
# A missing tag maps to NaN, compares False below and therefore stays NaN, so the
# dropna() further down removes the row instead of the lookup raising KeyError.
tag_frequency = dff_1['tags_list'].map(tag_counts)
dff_1['tags_list'] = dff_1['tags_list'].mask(tag_frequency <= rare_threshold, 'rare')

# The individual tag columns are redundant once the combined label is used.
dff_1 = dff_1.drop(columns=['POS', 'person', 'number', 'gender', 'tense', 'def']).dropna()

In [None]:
# Correlation between variables (non-object columns only)
numeric_corr = dff_1.select_dtypes(exclude='O').corr()
sns.heatmap(numeric_corr, annot=False, cmap='coolwarm')

In [None]:
# 2) Store each tag as column, replace NaN values with a placeholder for categorical columns
categorical_cols = dff.select_dtypes(include=['object']).columns.tolist()
# Fill only the object columns, then discard the combined tag label.
placeholder_by_column = {col: 'missing' for col in categorical_cols}
dff_2 = dff.fillna(placeholder_by_column).drop(columns=['tags_list'])

In [None]:
# VIF -> iteratively delete columns that had VIF > 10
def calculate_vif(X):
    """
    Compute the variance inflation factor of every column of X.

    Parameters:
    - X: DataFrame whose columns are the candidate features.

    Returns:
    - DataFrame with one row per column of X and the columns "Feature" and "VIF".
    """
    # Materialise the array once; calling X.values inside the loop would rebuild
    # it for every column.
    design_matrix = X.values
    return pd.DataFrame({
        "Feature": X.columns,
        "VIF": [
            variance_inflation_factor(design_matrix, column_idx)
            for column_idx in range(design_matrix.shape[1])
        ],
    })
    
# Target and the two features already excluded are left out of the VIF computation.
vif_candidates = dff_1.drop(columns=['p_recall', 'avg_h_recall', 'history_correct'])
# Cap the sample at the number of available rows (sample() raises when asked for
# more rows than exist) and fix the seed so the VIF table is reproducible.
dff_VIF = vif_candidates.sample(n=min(2000000, len(vif_candidates)), random_state=42)
vif = calculate_vif(dff_VIF.select_dtypes(exclude='O'))
vif

In [None]:
removed_features = ["avg_h_recall", "avg_user_p_recall", 'history_correct']

# 'avg_user_p_recall' is dropped when `dff` is built, so it never exists in dff_1
# and indexing dff_1 with it raises KeyError. Read the columns from df_2 instead,
# restricted to the rows that survived into dff_1 (dff_1 keeps df_2's index labels).
available_removed = [col for col in removed_features if col in df_2.columns]
missing_removed = [col for col in removed_features if col not in df_2.columns]
if missing_removed:
    print(f"Skipping columns not present in df_2: {missing_removed}")

correlation_with_target = df_2.loc[dff_1.index, available_removed + ["p_recall"]].corr()["p_recall"]
print(correlation_with_target)

In [None]:
# Display dff_1 as produced by the tag grouping, column drops and dropna above.
dff_1

In [None]:
# After deleting VIF
# Cap the sample at the available rows (sample() raises when asked for more rows
# than exist) and fix the seed so every downstream result is reproducible.
dff_final = dff_1.sample(n=min(1000000, len(dff_1)), random_state=42)
# dff_final = dff_2

In [None]:
# Transforming skewed variables
# NOTE(review): the 60*60*24 divisor suggests the delta columns are in seconds - confirm.
seconds_per_day = 60*60*24
for lag_col in ('delta', 'avg_delta'):
    dff_final[lag_col] = np.sqrt(dff_final[lag_col]/seconds_per_day)
for count_col in ('history_seen', 'history_correct'):
    dff_final[count_col] = np.sqrt(dff_final[count_col])

In [None]:
# Transformation of target variable 

# Logit transformation
def logit_transform(y, epsilon=1e-10):
    """Return log(y / (1 - y)) after clipping y into [epsilon, 1 - epsilon]."""
    clipped = np.clip(y, epsilon, 1 - epsilon)
    odds = clipped / (1 - clipped)
    return np.log(odds)

def inverse_logit_transform(y):
    """Map logit values back to probabilities with the logistic function."""
    denominator = 1 + np.exp(-y)
    return 1 / denominator


# Log transformation
def log_transform(y, epsilon=1e-10):
    """Return log(y + epsilon); the shift keeps y == 0 finite."""
    return np.log(y + epsilon)

def inverse_log_transform(y, epsilon=1e-10):
    """
    Invert log_transform.

    The forward transform shifts by epsilon before taking the log, so the same
    epsilon is subtracted after exponentiating to recover the original value.
    Use the same epsilon as in the forward call.
    """
    return np.exp(y) - epsilon


# Box-Cox transformation
# A single module-level transformer is shared so that the inverse uses the lambda
# fitted by the most recent power_transform call.
power_transformer = PowerTransformer(method='box-cox', standardize=False)
def power_transform(y, epsilon=1e-10):
    """
    Fit the Box-Cox transformer on y + epsilon and return the transformed values.

    The epsilon shift is there because Box-Cox requires strictly positive input.
    y is passed straight to PowerTransformer.fit_transform, which expects a 2-D array.
    """
    return power_transformer.fit_transform(y + epsilon)

def inverse_power_transform(y, epsilon=1e-10):
    """
    Invert power_transform using the most recently fitted transformer.

    The shift is undone after inverting; adding epsilon to the transformed values
    before inverting (as was done previously) does not recover the input.
    """
    return power_transformer.inverse_transform(y) - epsilon

In [None]:
# Column groups that the variants below leave out.
word_feature_cols = ['tags_list', 'word_len', 'SUBTLEX']
user_feature_cols = ['avg_delta', 'std_delta', 'avg_h_recall', 'h_recall']

# Create df without word features
dff_t = dff_final.drop(columns=word_feature_cols, errors='ignore')

# DF original
dff_s = dff_final.drop(
    columns=word_feature_cols + user_feature_cols + ['lang_combination'],
    errors='ignore',
)

# DF without user columns
dff_u = dff_final.drop(columns=user_feature_cols, errors='ignore')


In [None]:
def ohe(df):
    """
    One-hot encode the object-dtype columns of df.

    Parameters:
    - df: DataFrame mixing non-object and object (categorical) columns.

    Returns:
    - DataFrame holding the non-object columns followed by the one-hot columns,
      indexed like df, with rows containing NaN removed.
    """
    numeric_part = df.select_dtypes(exclude='O')
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

    # Nothing to encode: fitting the encoder on zero columns would fail.
    if not categorical_cols:
        return numeric_part.dropna()

    encoder = OneHotEncoder(sparse_output=False)
    encoded = encoder.fit_transform(df[categorical_cols])
    # Reuse df's index. With a default RangeIndex the concat below aligns on index
    # labels, so a sampled or filtered df gets its rows paired with the wrong
    # encodings or filled with NaN and then silently dropped.
    ohe_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(categorical_cols),
        index=df.index,
    )
    return pd.concat([numeric_part, ohe_df], axis=1).dropna()

In [None]:
def split_dataset(df):
    """
    Split df into train/test features and target.

    The target is the 'p_recall' column; all other columns are features.
    80% of the rows go to the training set, with a fixed random_state of 42.

    Returns:
    - (X_train, X_test, y_train, y_test)
    """
    target = df['p_recall']
    features = df.drop(columns='p_recall')
    parts = sklearn_train_test_split(features, target, train_size=0.8, random_state=42)
    return tuple(parts)

In [None]:
def cap_y(prediction):
    """Clamp predictions to the closed interval [0, 1]."""
    lower_bound, upper_bound = 0, 1
    return np.clip(prediction, lower_bound, upper_bound)

In [None]:
# One-hot encode each feature-set variant, in the same order as before.
dff_encoded, dff_t_encoded, dff_s_encoded, dff_u_encoded = [
    ohe(variant) for variant in (dff_final, dff_t, dff_s, dff_u)
]

# Train/test split for every encoded variant.
(X_train, X_test, y_train, y_test) = split_dataset(dff_encoded)
(X_train_t, X_test_t, y_train_t, y_test_t) = split_dataset(dff_t_encoded)
(X_train_s, X_test_s, y_train_s, y_test_s) = split_dataset(dff_s_encoded)
(X_train_u, X_test_u, y_train_u, y_test_u) = split_dataset(dff_u_encoded)

In [None]:
def create_pipeline(model_type="linear", alpha=1.0, **kwargs):
    """
    Create a scaling + regression pipeline.

    Parameters:
    - model_type: "linear" (LinearRegression), "ridge" (Ridge) or "lasso" (Lasso).
    - alpha: regularisation strength for ridge and lasso; ignored for linear.
    - **kwargs: accepted and ignored, so a config dict carrying extra keys
      (e.g. "name") can be unpacked directly into this function.

    Returns:
    - Pipeline object with the steps 'scaler' (StandardScaler) and 'model'.

    Raises:
    - ValueError: if model_type is not one of the supported names.
    """
    # Factories are lazy so that only the requested estimator is instantiated.
    # "lasso" previously fell through to LinearRegression, so results labelled
    # as lasso were in fact ordinary least squares.
    model_factories = {
        "linear": lambda: LinearRegression(),
        "ridge": lambda: Ridge(alpha=alpha, fit_intercept=True),
        "lasso": lambda: Lasso(alpha=alpha, fit_intercept=True),
    }
    if model_type not in model_factories:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {sorted(model_factories)}"
        )
    model = model_factories[model_type]()

    # To train on a transformed target, wrap `model` in TransformedTargetRegressor
    # with one of the transform / inverse-transform function pairs.
    steps = [
        ('scaler', StandardScaler()),
        ('model', model),
    ]
    return Pipeline(steps)

In [None]:
def evaluate_pipeline(pipeline, X_train, X_test, y_train, y_test, name="Model"):
    """
    Fit the pipeline on the training data, score it on both splits and print a summary.

    Predictions are clipped with cap_y before any metric is computed.

    Returns:
    - (train predictions, test predictions, test MAE, test R2)
    """
    pipeline.fit(X_train, y_train)

    # Capped predictions for both splits
    train_predictions = cap_y(pipeline.predict(X_train))
    test_predictions = cap_y(pipeline.predict(X_test))

    # Metrics per split
    r2_train = r2_score(y_train, train_predictions)
    mae_train = mean_absolute_error(y_train, train_predictions)
    r2_test = r2_score(y_test, test_predictions)
    mae_test = mean_absolute_error(y_test, test_predictions)

    summary_parts = [
        f"Train R2 = {round(r2_train, 4)}",
        f"Test R2 = {round(r2_test, 4)}",
        f"Train MAE = {round(mae_train, 4)}",
        f"Test MAE = {round(mae_test, 4)}",
    ]
    print(f"{name}: " + ", ".join(summary_parts))
    return train_predictions, test_predictions, mae_test, r2_test

In [None]:
# Define Configurations for Pipelines
# Each entry is unpacked into create_pipeline(**config); "name" labels the results.
pipelines_config = [
    dict(name="Linear Regression", model_type="linear"),
    dict(name="Ridge Regression", model_type="ridge", alpha=100),
    dict(name="Lasso Regression", model_type="lasso"),
]

def regression_results(pipelines_config, X_train, X_test, y_train, y_test, importance='No'):
    """
    Train and evaluate every configured pipeline on one train/test split.

    Parameters:
    - pipelines_config: list of dicts; each is unpacked into create_pipeline and
      must contain a "name" key used to label the results.
    - X_train, X_test, y_train, y_test: the split to train and evaluate on.
    - importance: pass 'Yes' to also collect each fitted model's coefficients.

    Returns:
    - results_df: one row per model with columns 'Model', 'Test MAE', 'Test R2'.
    - importance_df: one row per model with columns 'Model', 'Feature', 'Importance',
      where 'Feature' holds X_train.columns and 'Importance' the coefficient array.
      Empty unless importance == 'Yes'.
    """
    # Resolve the flag once. The loop used to overwrite `importance` with the
    # coefficient array, so only the first model's coefficients were recorded.
    collect_importance = isinstance(importance, str) and importance == 'Yes'

    results = []
    importances = []

    for config in pipelines_config:
        pipeline = create_pipeline(**config)
        # evaluate_pipeline returns (train predictions, test predictions, test MAE, test R2);
        # only the metrics are needed here.
        _train_pred, _test_pred, mae, r2 = evaluate_pipeline(
            pipeline, X_train, X_test, y_train, y_test, name=config["name"]
        )
        results.append((config["name"], mae, r2))

        if collect_importance:
            coefficients = pipeline.named_steps["model"].coef_
            importances.append((config["name"], X_train.columns, coefficients))

    importance_df = pd.DataFrame(importances, columns=['Model', 'Feature', 'Importance'])
    results_df = pd.DataFrame(results, columns=['Model', 'Test MAE', 'Test R2'])
    return results_df, importance_df

In [None]:
# Heading -> (train/test split, importance flag) for every feature-set variant.
feature_set_runs = {
    'Results for all features': ((X_train, X_test, y_train, y_test), 'No'),
    'Results after deleting word features': ((X_train_t, X_test_t, y_train_t, y_test_t), 'Yes'),
    'Results without word and user features': ((X_train_s, X_test_s, y_train_s, y_test_s), 'Yes'),
    'Results without user features': ((X_train_u, X_test_u, y_train_u, y_test_u), 'No'),
}

collected_runs = []
for heading, (split, importance_flag) in feature_set_runs.items():
    print(heading)
    collected_runs.append(
        regression_results(pipelines_config, *split, importance=importance_flag)
    )

# Each entry is the (results_df, importance_df) tuple returned by regression_results.
results_all, results_without_word, results_original, results_without_user = collected_runs

In [None]:
# Test MAE and R2 plots for the run on all features
# regression_results returns (results_df, importance_df), so results_all is a tuple;
# the metrics table is its first element. The table has no 'Test MSE' column -
# the error metric that was computed is the mean absolute error.
results_df = results_all[0]

fig, (ax_mae, ax_r2) = plt.subplots(1, 2, figsize=(10, 4))

ax_mae.bar(results_df['Model'], results_df['Test MAE'], color='skyblue')
ax_mae.set_title('Test MAE Comparison')
ax_mae.set_ylabel('Mean Absolute Error')
ax_mae.tick_params(axis='x', labelrotation=45)

ax_r2.bar(results_df['Model'], results_df['Test R2'], color='lightgreen')
ax_r2.set_title('Test R2 Comparison')
ax_r2.set_ylabel('R2 Score')
ax_r2.tick_params(axis='x', labelrotation=45)

fig.tight_layout()
plt.show()

In [None]:
# Check if residuals are normally distributed
# `y_pred_capped` was never defined and regression_results does not keep its
# predictions, so refit the baseline linear model on the all-features split.
residual_pipeline = create_pipeline(model_type="linear")
# evaluate_pipeline returns (train predictions, test predictions, test MAE, test R2);
# its predictions are already clipped to [0, 1].
_, y_pred_capped, _, _ = evaluate_pipeline(
    residual_pipeline, X_train, X_test, y_train, y_test, name="Linear Regression (residual check)"
)

residuals = y_test - y_pred_capped
# histplot replaces the deprecated distplot; kde=True keeps the density curve.
residual_ax = sns.histplot(residuals, bins=50, kde=True)
residual_ax.set(title='Test residuals (y_test - prediction)', xlabel='Residual');

# 