In [1]:
import pandas as pd
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from sklearn.metrics import recall_score, roc_auc_score, roc_curve, confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
import statsmodels.api as sm
import collections as c
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_columns', 2000)
pd.set_option('display.max_rows', 2000)

In [2]:
# model = pickle.load(open('logistic_regression_completion.p', 'rb')) # load the model

In [3]:
model = LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=200, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start='False')

In [4]:
def scale_subset(df, columns):
    '''
    Use sklearn StandardScalar to scale only numeric columns.

    Parameters:
    ----------
    input {dataframe, list}: dataframe containing mixed feature variable types, list of names of numeric feature columns
    output: {dataframe}: dataframe with numeric features scaled and categorical features unchanged

    '''
    scalar = StandardScaler()
    numeric = df[columns]
    categorical = df.drop(columns, axis = 1)
    scalar.fit(numeric)
    num_scaled = pd.DataFrame(scalar.transform(numeric))
    num_scaled.rename(columns = dict(zip(num_scaled.columns, numeric_cols)), inplace = True)
    return pd.concat([num_scaled, categorical], axis = 1)


In [5]:
X_train = pd.read_csv('../data/processed/first_half/X_train.csv')
y_train = pd.read_csv('../data/processed/first_half/y_train.csv')
# y_train = y_train['module_not_completed']
X_test = pd.read_csv('../data/processed/first_half/X_test.csv')

y_test = pd.read_csv('../data/processed/first_half/y_test.csv')
# y_test = y_test['module_not_completed']

numeric_cols = ['num_of_prev_attempts', 'studied_credits',
'clicks_per_day', 'pct_days_vle_accessed','max_clicks_one_day',
'first_date_vle_accessed', 'avg_score', 'avg_days_sub_early', 'days_early_first_assessment',
'score_first_assessment']

# fill and scale
X_train.fillna(value = 0, inplace = True)
X_train = scale_subset(X_train, numeric_cols)
X_test.fillna(value = 0, inplace = True)
X_test = scale_subset(X_test, numeric_cols)



  return self.partial_fit(X, y)
  from ipykernel import kernelapp as app
  return self.partial_fit(X, y)
  from ipykernel import kernelapp as app


In [6]:
# drop features with collinearity (need VIF)
# X_train.drop(withdrawls, inplace=True)
X_train.drop(['days_early_first_assessment', 'pct_days_vle_accessed', 'score_first_assessment'], axis=1, inplace=True)
X_test.drop(['days_early_first_assessment', 'pct_days_vle_accessed', 'score_first_assessment'], axis=1, inplace=True)
# y_train.drop(withdrawls, inplace=True)

In [7]:
y_train = y_train['module_not_completed']
y_test = y_test['module_not_completed']

In [8]:
model.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=200, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start='False')

In [9]:
predictions = model.predict(X_test)

In [10]:
recall = recall_score(y_test, predictions, average = 'micro')
recall

0.6046857567869096

In [None]:
roc_auc = roc_auc_score(y_test, predictions)
roc_auc

In [11]:
probas = model.predict_proba(X_test)[:, :1]
# tprs, fprs, thresh = roc_curve(y_test, probas)

In [12]:
probas

array([[0.06198954],
       [0.36247184],
       [0.00352331],
       ...,
       [0.95273304],
       [0.17484739],
       [0.15133983]])

In [None]:
plt.figure(figsize=(12,10))
plt.plot(fprs, tprs, 
    label='ROC Curve AUC: {}\n\nRecall: {}'.format(roc_auc, recall),
    color='red')
plt.plot([0,1],[0,1], 'k:')
plt.legend()
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("Logistic Regression: Did the Student Complete the Course?")
plt.show()

In [None]:
# cross_val_score(model, X_train, y_train, scoring='recall')

In [None]:
# pd.DataFrame(data={'fprs': fprs, 'tprs': tprs, 'Thresholds': thresh})

# Which features are most important?

In [None]:
sm.stats.outliers_influence.va

In [None]:
y_test.shape

In [21]:
abs_coef = list(np.abs(model.coef_.ravel()))
features = list(X_test.columns)

In [22]:
coef_dict = c.OrderedDict((zip(abs_coef, features)))

In [23]:
sorted(coef_dict.items(), reverse=True)

[(2.105565159856323, 'avg_score'),
 (1.4252712693548633, 'code_module_GGG'),
 (1.0965596632191346, 'code_module_CCC'),
 (1.0455014915520826, 'code_module_FFF'),
 (0.7885425293641274, 'code_presentation_2013J'),
 (0.7483314467392886, 'code_module_DDD'),
 (0.6739150165744944, 'code_module_EEE'),
 (0.6667471488236419, 'code_presentation_2014J'),
 (0.6173057167636385, 'clicks_per_day'),
 (0.4809126585266301, 'highest_education_No Formal quals'),
 (0.4378541421246574, 'imd_band_nan'),
 (0.37881522213734087, 'age_band_55<='),
 (0.3116493628180959, 'highest_education_Lower Than A Level'),
 (0.3091706946677079, 'disability_Y'),
 (0.26008043733864294, 'gender_M'),
 (0.2393862523961181, 'age_band_35-55'),
 (0.23744545901619984, 'imd_band_90-100%'),
 (0.22634877891798053, 'region_West Midlands Region'),
 (0.21261759263963287, 'imd_band_70-80%'),
 (0.21020204714350546, 'region_Wales'),
 (0.19861602276342924, 'max_clicks_one_day'),
 (0.1918054804313232, 'region_East Midlands Region'),
 (0.188798669

In [13]:
print(classification_report(y_test, predictions))

               precision    recall  f1-score   support

  0_Withdrawn       0.67      0.71      0.69      3288
       1_Fail       0.51      0.21      0.30      2406
       2_Pass       0.58      0.85      0.69      4042
3_Distinction       0.55      0.23      0.32      1020

    micro avg       0.60      0.60      0.60     10756
    macro avg       0.58      0.50      0.50     10756
 weighted avg       0.59      0.60      0.57     10756



In [14]:
accuracy_score(y_test, predictions)

0.6046857567869096

In [15]:
print(confusion_matrix(y_test, predictions))

[[2350  236  672   30]
 [ 885  503  996   22]
 [ 251  236 3418  137]
 [   4    7  776  233]]


In [16]:
# pickle.dump(model, open('logistic_regression_completion.p', 'wb')) # pickle the model

Correlated predictor variables

In [17]:
cor = X_train.corr().abs()

In [18]:
s = cor.unstack()

In [19]:
so = s.sort_values(kind="quicksort", ascending=False)

In [20]:
so[::2]

disability_Y                                   disability_Y                                     1.000000
region_Scotland                                region_Scotland                                  1.000000
region_South Region                            region_South Region                              1.000000
region_Wales                                   region_Wales                                     1.000000
region_Yorkshire Region                        region_Yorkshire Region                          1.000000
highest_education_Lower Than A Level           highest_education_Lower Than A Level             1.000000
highest_education_Post Graduate Qualification  highest_education_Post Graduate Qualification    1.000000
imd_band_20-30%                                imd_band_20-30%                                  1.000000
imd_band_40-50%                                imd_band_40-50%                                  1.000000
imd_band_60-70%                                imd_band