In [1]:
import sys, os
DMOL_DIR = "/Users/Henryye/research/shaf/DMOL"
sys.path.append(os.path.join(DMOL_DIR, "utils"))
from ml_utils import *

import numpy as np
import pandas as pd
import math
pd.options.display.max_columns = 999
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from astropy.stats import median_absolute_deviation
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("../data/class_data.csv")
df_proc = preprocess_df(df, verbose = False)

In [3]:
NEW_VAR_MAP.keys()

dict_keys(['week_1', 'week_2', 'week_3', 'week_4', 'week_5', 'post'])

In [4]:
def get_metrics(y_true, y_pred, k):
    print("MSE:  {}".format(mean_squared_error(y_true, y_pred)))
    print("MAE:  {}".format(mean_absolute_error(y_true, y_pred)))
    R = r2_score(y_true, y_pred)
    n = len(y_true)
    adjR = 1 - (1-R**2)*(n-1) / (n-k-1)
    print("r-squared:  {}".format(R))
    print("Adjusted r-squared:  {}".format(adjR))

def mad_remove_outlier(df):
    for i, c in enumerate(list(df.columns)):
        mad, mu = median_absolute_deviation(df[c]), df[c].mean()
        for idx in df.index:
            if df.loc[idx, c] > mu + 3 * mad:
                df.loc[idx, c] = mu + 3 * mad
            elif df.loc[idx, c] < mu - 3 * mad:
                df.loc[idx, c] = mu - 3 * mad
    return df
        
    
def group_construct_var(df, var_map_curr, n_comp = None, verbose = False):
    X = pd.DataFrame()
    pca = KernelPCA(n_components=n_comp, kernel="poly", degree=3) if n_comp is not None else None
    for construct in var_map_curr["Quant"]:
        temp = df[var_map_curr["Quant"][construct]].copy()
        temp = temp.fillna(temp.median())  # use median to fillna
        temp = mad_remove_outlier(temp)    # use mad to remove outliers
        if n_comp is None:
            temp_mean = pd.DataFrame(temp.mean(axis=1))
            temp_mean.columns = ["{}_mean".format(construct)]
            X = pd.concat([X, temp_mean], axis = 1)
        else:
            if len(var_map_curr["Quant"][construct]) < n_comp:
                X = pd.concat([X, temp.copy()], axis = 1)
            else:
                temp_pca = pd.DataFrame(pca.fit_transform(temp))
                temp_pca.columns = ["{}_PC{}".format(construct, i) for i in range(1, n_comp+1)]
                X = pd.concat([X, temp_pca], axis = 1)
                assert(temp_pca.shape[1] == 1)
    return X

def construct_X_y(df, week_num, n_comp, verbose):
    var_map_curr = NEW_VAR_MAP["post"] \
                   if week_num == "post" else NEW_VAR_MAP["week_{}".format(week_num)] 
    X = group_construct_var(df, var_map_curr, n_comp, verbose)
    y = df["gr_revqf"] if week_num == "post" else df["gr_revq{}".format(week_num)]
    y = y.fillna(y.median())
    week_num = 6 if week_num == "post" else week_num
    for gr in ["gr_revq{}".format(i) for i in range(1, week_num)]:
        X[gr] = df[gr]
    return X, y
    
def train_model(df, week_num, n_comp = None, use_fr = False, plot = False, verbose = False):
    X, y = construct_X_y(df, week_num, n_comp, verbose)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 42)

    reg =  Pipeline([("min_max_scaler", MinMaxScaler()), 
                     ('reg', RandomForestRegressor(random_state = 42, n_jobs = 8))])
    
    param_grid = [{'reg__criterion': ['mse', 'mae'], 'reg__n_estimators': [50, 100, 150, 200, 300],
                   'reg__max_depth': [5, 10, 15, 20, 50, 80, 120]}] 
#     param_grid = [{'reg__criterion': ['mse', 'mae'], 'reg__n_estimators': [10, 15],
#                'reg__max_depth': [5, 10]}] 
    model = GridSearchCV(reg, param_grid, cv = 5, scoring = "neg_mean_squared_error")
    model.fit(X_train, y_train)
    
    print("Found Best Params:  {}".format(model.best_params_))
    print("Training Score with Best Params:  ")
    get_metrics(model.predict(X_train), y_train, X_train.shape[1]); print()
    print("Testing Score with Best Params:  ")
    get_metrics(model.predict(X_test), y_test, X_test.shape[1]); print('\n')
    print()
    return model, X_train, X_test, y_train, y_test
    

def feature_importance_map(df_feature_master, week_num, model, X_train):
    df_feature_imp = pd.DataFrame(model.best_estimator_.named_steps['reg'].feature_importances_, 
                                  index=  X_train.columns, columns = ["week_{}".format(week_num)])
    df_feature_master = pd.concat([df_feature_master, df_feature_imp], axis=1)
    return df_feature_master

In [5]:
df_feature_master = pd.DataFrame()
for week_num in [i for i in range(1, 6)] + ["post"]:
    print("Week Num:  {}".format(week_num))
    model, X_train, X_test, y_train, y_test = train_model(df_proc, week_num, 1)
    df_feature_master = feature_importance_map(df_feature_master, week_num, model, X_train)

Week Num:  1
Found Best Params:  {'reg__criterion': 'mae', 'reg__max_depth': 5, 'reg__n_estimators': 300}
Training Score with Best Params:  
MSE:  9.609982936507937
MAE:  2.354087301587301
r-squared:  0.10381456781992349
Adjusted r-squared:  -0.3242817814063561

Testing Score with Best Params:  
MSE:  20.53975944444444
MAE:  3.613666666666666
r-squared:  -2.677430228766848
Adjusted r-squared:  -11.33726525982899



Week Num:  2
Found Best Params:  {'reg__criterion': 'mse', 'reg__max_depth': 10, 'reg__n_estimators': 50}
Training Score with Best Params:  
MSE:  3.9161105614892002
MAE:  1.328679993030926
r-squared:  0.8251213053399991
Adjusted r-squared:  0.604604313248607

Testing Score with Best Params:  
MSE:  45.05632673937306
MAE:  4.277910144604491
r-squared:  -18.863418501958765
Adjusted r-squared:  -2482.7999030602823



Week Num:  3
Found Best Params:  {'reg__criterion': 'mae', 'reg__max_depth': 5, 'reg__n_estimators': 300}
Training Score with Best Params:  
MSE:  4.3717417989417

In [6]:
df_feature_master

Unnamed: 0,week_1,week_2,week_3,week_4,week_5,week_post
Achievement Goals (Mastery approach)_PC1,0.027512,,,,,0.012684
Achievement Goals (Mastery avoid)_PC1,0.035068,,,,,0.009725
Achievement Goals (Performance approach)_PC1,0.051002,,,,,0.021273
Achievement Goals (Performance avoid)_PC1,0.069101,,,,,0.026831
Achievement Goals (Work avoidance)_PC1,0.021236,,,,,
Activities_PC1,,,,,,0.053612
Attainment Value_PC1,0.151428,0.023718,0.011756,0.008653,0.012776,0.011537
Cost Value (Emotional)_PC1,0.097976,0.029014,0.031218,0.018844,0.006364,0.012127
Cost Value (Loss of Valued Alternatives)_PC1,0.030014,0.024403,0.020959,0.015276,0.004441,0.016885
Cost Value (Outside Effort)_PC1,0.09281,0.01908,0.067056,0.012465,0.0221,0.017596


In [7]:
print("Top 5 important features")
for c in df_feature_master.columns:
    print("{}: \n{}\n\n".format(c, df_feature_master[c].nlargest(5)))

Top 5 important features
week_1: 
Attainment Value_PC1                         0.151428
Cost Value (Emotional)_PC1                   0.097976
Online Self Regulation_PC1                   0.095545
Cost Value (Outside Effort)_PC1              0.092810
Achievement Goals (Performance avoid)_PC1    0.069101
Name: week_1, dtype: float64


week_2: 
gr_revq1                           0.402022
Other Activities Rank_PC1          0.100183
Regret for Activity_PC1            0.092720
Satisfaction for Activity_PC1      0.072494
Grade Expectations - Course_PC1    0.058516
Name: week_2, dtype: float64


week_3: 
gr_revq2                           0.414147
gr_revq1                           0.199905
Cost Value (Outside Effort)_PC1    0.067056
Grade Expectations - Quiz_PC1      0.065041
Plan for Completion_PC1            0.053372
Name: week_3, dtype: float64


week_4: 
gr_revq3                          0.534232
gr_revq2                          0.069650
Grade Expectations - Quiz_PC1     0.069535
Regret 

In [8]:
print("Pre and Post Comp:  ")
df_feature_master[["week_1", "week_post"]].dropna(how="all")

Pre and Post Comp:  


Unnamed: 0,week_1,week_post
Achievement Goals (Mastery approach)_PC1,0.027512,0.012684
Achievement Goals (Mastery avoid)_PC1,0.035068,0.009725
Achievement Goals (Performance approach)_PC1,0.051002,0.021273
Achievement Goals (Performance avoid)_PC1,0.069101,0.026831
Achievement Goals (Work avoidance)_PC1,0.021236,
Activities_PC1,,0.053612
Attainment Value_PC1,0.151428,0.011537
Cost Value (Emotional)_PC1,0.097976,0.012127
Cost Value (Loss of Valued Alternatives)_PC1,0.030014,0.016885
Cost Value (Outside Effort)_PC1,0.09281,0.017596


In [9]:
print("Weekly Comp:  ")
df_feature_master[["week_{}".format(i) for i in range(2, 6)]].dropna(how="all")

Weekly Comp:  


Unnamed: 0,week_2,week_3,week_4,week_5
Attainment Value_PC1,0.023718,0.011756,0.008653,0.012776
Cost Value (Emotional)_PC1,0.029014,0.031218,0.018844,0.006364
Cost Value (Loss of Valued Alternatives)_PC1,0.024403,0.020959,0.015276,0.004441
Cost Value (Outside Effort)_PC1,0.01908,0.067056,0.012465,0.0221
Grade Expectations - Course_PC1,0.058516,0.027278,0.037757,0.06938
Grade Expectations - Final_PC1,,,0.039276,0.02236
Grade Expectations - Midterm_PC1,0.053301,,,
Grade Expectations - Quiz_PC1,0.040112,0.065041,0.069535,0.007885
Interest Value_PC1,0.0125,0.012641,0.015545,0.011664
Internal/External Motivation_PC1,0.008357,0.032401,0.014166,0.012936


### Without previous grade

In [10]:
def construct_X_y(df, week_num, n_comp, verbose):
    var_map_curr = NEW_VAR_MAP["post"] \
                   if week_num == "post" else NEW_VAR_MAP["week_{}".format(week_num)] 
    X = group_construct_var(df, var_map_curr, n_comp, verbose)
    y = df["gr_revqf"] if week_num == "post" else df["gr_revq{}".format(week_num)]
    y = y.fillna(y.median())
    return X, y

In [11]:
df_feature_master_wo_quiz = pd.DataFrame()
for week_num in [i for i in range(1, 6)] + ["post"]:
    print("Week Num:  {}".format(week_num))
    model, X_train, X_test, y_train, y_test = train_model(df_proc, week_num, 1)
    df_feature_master_wo_quiz = feature_importance_map(df_feature_master_wo_quiz, week_num, model, X_train)

Week Num:  1
Found Best Params:  {'reg__criterion': 'mae', 'reg__max_depth': 5, 'reg__n_estimators': 300}
Training Score with Best Params:  
MSE:  9.609982936507937
MAE:  2.354087301587301
r-squared:  0.10381456781992349
Adjusted r-squared:  -0.3242817814063561

Testing Score with Best Params:  
MSE:  20.53975944444444
MAE:  3.613666666666666
r-squared:  -2.677430228766848
Adjusted r-squared:  -11.33726525982899



Week Num:  2
Found Best Params:  {'reg__criterion': 'mse', 'reg__max_depth': 5, 'reg__n_estimators': 200}
Training Score with Best Params:  
MSE:  9.61659692823134
MAE:  2.3561978257364644
r-squared:  0.15231443382569443
Adjusted r-squared:  -0.1922709705825547

Testing Score with Best Params:  
MSE:  29.97076438659404
MAE:  4.080214405036676
r-squared:  -4.343130728086491
Adjusted r-squared:  -249.0789832974873



Week Num:  3
Found Best Params:  {'reg__criterion': 'mae', 'reg__max_depth': 5, 'reg__n_estimators': 300}
Training Score with Best Params:  
MSE:  19.670360119047

In [12]:
df_feature_master_wo_quiz

Unnamed: 0,week_1,week_2,week_3,week_4,week_5,week_post
Achievement Goals (Mastery approach)_PC1,0.027512,,,,,0.015243
Achievement Goals (Mastery avoid)_PC1,0.035068,,,,,0.032683
Achievement Goals (Performance approach)_PC1,0.051002,,,,,0.033877
Achievement Goals (Performance avoid)_PC1,0.069101,,,,,0.061544
Achievement Goals (Work avoidance)_PC1,0.021236,,,,,
Activities_PC1,,,,,,0.127069
Attainment Value_PC1,0.151428,0.044239,0.073406,0.017708,0.026699,0.032519
Cost Value (Emotional)_PC1,0.097976,0.028514,0.076947,0.017829,0.053659,0.024176
Cost Value (Loss of Valued Alternatives)_PC1,0.030014,0.023472,0.049899,0.042226,0.045689,0.02169
Cost Value (Outside Effort)_PC1,0.09281,0.083099,0.137246,0.051306,0.03732,0.036599


In [13]:
print("Top 5 important features")
for c in df_feature_master_wo_quiz.columns:
    print("{}: \n{}\n\n".format(c, df_feature_master_wo_quiz[c].nlargest(5)))

Top 5 important features
week_1: 
Attainment Value_PC1                         0.151428
Cost Value (Emotional)_PC1                   0.097976
Online Self Regulation_PC1                   0.095545
Cost Value (Outside Effort)_PC1              0.092810
Achievement Goals (Performance avoid)_PC1    0.069101
Name: week_1, dtype: float64


week_2: 
Other Activities Rank_PC1           0.278860
Grade Expectations - Course_PC1     0.097622
Cost Value (Outside Effort)_PC1     0.083099
Grade Expectations - Midterm_PC1    0.080721
Other Activities Time_PC1           0.067165
Name: week_2, dtype: float64


week_3: 
Plan for Completion_PC1             0.182521
Grade Expectations - Quiz_PC1       0.149661
Cost Value (Outside Effort)_PC1     0.137246
Internal/External Motivation_PC1    0.102078
Cost Value (Emotional)_PC1          0.076947
Name: week_3, dtype: float64


week_4: 
Regret for Activity_PC1                0.329668
Grade Expectations - Final_PC1         0.245333
Plan for Completion_PC1       