In [1]:
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib
from sklearn.base import clone
from rfpimp import *
import collections as c
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
model = pickle.load(open('random_forest_score.p', 'rb')) # load the model

In [3]:
# model = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
#             max_depth=20, max_features='auto', max_leaf_nodes=None,
#             min_impurity_decrease=0.0, min_impurity_split=None,
#             min_samples_leaf=1, min_samples_split=2,
#             min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
#             oob_score=False, random_state=None, verbose=0,
#             warm_start=False)

In [4]:
def dropcol_importances(rf, X_train, y_train):
    '''
    Calculates the drop-column feature importances of a Random Forest model. 
    Explanation here: https://explained.ai/rf-importance/index.html
    '''

    rf_ = clone(rf)
    rf_.random_state = 999
    rf_.fit(X_train, y_train)
    baseline = rf_.oob_score
    imp = []
    for col in X_train.columns:
        X = X_train.drop(col, axis=1)
        rf_ = clone(rf)
        rf_.random_state = 999
        rf_.fit(X, y_train)
        o = rf_.oob_score
        imp.append(baseline - o)
    imp = np.array(imp)
    I = pd.DataFrame(
            data={'Feature':X_train.columns,
                  'Importance':imp})
    I = I.set_index('Feature')
    I = I.sort_values('Importance', ascending=True)
    return I

In [5]:
def only_completed(X_train, y_train, X_test, y_test, y_train_not_comp, y_test_not_comp):
    '''
    Returns dataframes with only those students who completed the course for the purpose of regressing the final score.
    '''
    test_indices = []
    train_indices = []

    y_test_not_comp = y_test_not_comp[y_test_not_comp['module_not_completed'] == 1]
    for index, row in y_test_not_comp.iterrows():
        test_indices.append(index)

    y_train_not_comp = y_train_not_comp[y_train_not_comp['module_not_completed'] == 1]
    for index, row in y_train_not_comp.iterrows():
        train_indices.append(index)

    return X_train.drop(train_indices), y_train.drop(train_indices), X_test.drop(test_indices), y_test.drop(test_indices)

In [6]:
    X_train = pd.read_csv('../data/processed/X_train.csv')
    y_train = pd.read_csv('../data/processed/y_train.csv')
    y_train_not_comp = y_train[['module_not_completed']]
    y_train = y_train['estimated_final_score']
    X_test = pd.read_csv('../data/processed/X_test.csv')
    y_test = pd.read_csv('../data/processed/y_test.csv')
    y_test_not_comp = y_test[['module_not_completed']]
    y_test = y_test['estimated_final_score']
numeric_cols = ['num_of_prev_attempts', 'studied_credits',
'clicks_per_day', 'pct_days_vle_accessed','max_clicks_one_day',
'first_date_vle_accessed', 'avg_score', 'avg_days_sub_early', 'days_early_first_assessment',
'score_first_assessment']
# fill
    X_train.fillna(value = 0, inplace = True)
    y_train.fillna(value = 0, inplace = True)
    X_test.fillna(value = 0, inplace = True)
    y_test.fillna(value = 0, inplace = True)

In [7]:
    # only students who completed the course
    X_train, y_train, X_test, y_test = only_completed(X_train, y_train, X_test, y_test, y_train_not_comp, y_test_not_comp)


In [8]:
# model.fit(X_train, y_train)

In [9]:
predictions = model.predict(X_test)

In [10]:
rmse = np.sqrt(mean_squared_error(y_test, predictions))
rmse

7.341329793914999

In [11]:
r2 = r2_score(y_test, predictions)
r2

0.9223393063239247

In [13]:
# pd.DataFrame(data={'frps': fprs, 'tprs': tprs, 'Thresholds': thresh})

# Which features are most important?

In [19]:
feat_imp = importances(model, X_test, y_test)

In [22]:
feat_imp

Unnamed: 0_level_0,Importance
Feature,Unnamed: 1_level_1
code_module_GGG,1.374746
avg_score,0.3012493
days_early_first_assessment,0.1644599
score_first_assessment,0.02078149
avg_days_sub_early,0.0133279
code_module_BBB,0.01320231
pct_days_vle_accessed,0.006954371
clicks_per_day,0.00464037
code_module_CCC,0.003759122
first_date_vle_accessed,0.001019712


In [19]:
model.feature_importances_

array([8.07021634e-04, 2.46343904e-03, 7.49502154e-03, 6.57771031e-03,
       6.66257902e-03, 4.67870748e-03, 1.52472837e-01, 1.59989465e-02,
       8.85809462e-02, 1.36005807e-02, 7.02534237e-03, 1.75018119e-03,
       4.41847834e-04, 2.52877254e-04, 5.71618653e-04, 6.74198126e-01,
       0.00000000e+00, 5.77223761e-04, 7.33430181e-04, 6.95049772e-04,
       0.00000000e+00, 7.12977653e-04, 0.00000000e+00, 6.58347801e-04,
       4.99801614e-04, 6.65926797e-04, 2.62420781e-04, 4.23553672e-04,
       5.96651061e-04, 3.89146521e-04, 6.79164661e-04, 4.02729426e-04,
       4.01828945e-04, 4.86046594e-04, 2.60950031e-04, 0.00000000e+00,
       5.82890874e-04, 7.81097184e-04, 1.64281184e-04, 1.16456685e-04,
       0.00000000e+00, 5.29013716e-04, 5.32234831e-04, 5.62150590e-04,
       5.32971618e-04, 5.73773576e-04, 4.96028047e-04, 4.86645639e-04,
       5.05468907e-04, 5.12518824e-04, 2.13168388e-04, 6.78240808e-04,
       2.17248016e-04, 0.00000000e+00, 4.92778850e-04, 0.00000000e+00])

In [14]:
# feat_imp = list(list(model.feature_importances_))
features = list(X_test.columns)

In [20]:
coef_dict = c.OrderedDict((zip(feat_imp, features)))

In [21]:
sorted(coef_dict.items(), reverse=True)[:10]

[('Importance', 'num_of_prev_attempts')]

In [18]:
dropcol_importances(model, X_train, y_train)

KeyboardInterrupt: 

In [34]:
model.oob_score

False

In [None]:
pickle.dump(model, open('random_forest_completion.p', 'wb')) # save the model