In [1]:
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.model_selection import cross_validate
from sklearn.externals import joblib
from sklearn.base import clone
from rfpimp import *
import collections as c
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
rf_model = pickle.load(open('random_forest_score_first_quarter.p', 'rb')) # load the model

In [3]:
rf_model.get_params

<bound method BaseEstimator.get_params of RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=50,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.1, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=10,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score='True', random_state=None, verbose=0,
           warm_start=False)>

In [4]:
def only_completed(X_train, y_train, X_test, y_test, y_train_not_comp, y_test_not_comp):
    '''
    Returns dataframes with only those students who completed the course for the purpose of regressing the final score.
    '''
    test_indices = []
    train_indices = []

    y_test_not_comp = y_test_not_comp[y_test_not_comp['module_not_completed'] == 1]
    for index, row in y_test_not_comp.iterrows():
        test_indices.append(index)

    y_train_not_comp = y_train_not_comp[y_train_not_comp['module_not_completed'] == 1]
    for index, row in y_train_not_comp.iterrows():
        train_indices.append(index)

    return X_train.drop(train_indices), y_train.drop(train_indices), X_test.drop(test_indices), y_test.drop(test_indices)

In [5]:
    X_train = pd.read_csv('../data/processed/first_quarter/X_train.csv')
    y_train = pd.read_csv('../data/processed/first_quarter/y_train.csv')
    y_train_not_comp = y_train[['module_not_completed']]
    y_train = y_train['estimated_final_score']
    X_test = pd.read_csv('../data/processed/first_quarter/X_test.csv')
    y_test = pd.read_csv('../data/processed/first_quarter/y_test.csv')
    y_test_not_comp = y_test[['module_not_completed']]
    y_test = y_test['estimated_final_score']

    # fill
    X_train.fillna(value = 0, inplace = True)
    y_train.fillna(value = 0, inplace = True)
    X_test.fillna(value = 0, inplace = True)
    y_test.fillna(value = 0, inplace = True)

    # only students who completed the course
    X_train, y_train, X_test, y_test = only_completed(X_train, y_train, X_test, y_test, y_train_not_comp, y_test_not_comp)

In [6]:
    predictions = rf_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    evs = explained_variance_score(y_test, predictions)
    r2 = r2_score(y_test, predictions)

In [9]:
    print('Root Mean Squared Error: {}'.format(rmse))
    print('Target Standard Deviation: {}'.format(np.std(y_test)))
    print('R-Squared: {}'.format(r2))
    print('Explained Variance Score: {}'.format(evs))

Root Mean Squared Error: 10.05035675182217
Target Standard Deviation: 25.87929903087205
R-Squared: 0.8491803466541941
Explained Variance Score: 0.8491923775055047


In [8]:
    feat_imp = importances(rf_model, X_test, y_test)
    feat_imp.sort_values(by='Importance', ascending=False)[0:20]

Unnamed: 0_level_0,Importance
Feature,Unnamed: 1_level_1
code_module_GGG,1.060105
avg_score,0.244255
days_early_first_assessment,0.132039
avg_days_sub_early,0.020192
score_first_assessment,0.016391
sum_click_oucontent,0.012438
code_module_CCC,0.006613
sum_days_vle_accessed,0.00292
sum_click_quiz,0.001289
sum_click_ouwiki,0.000729
