In [17]:
import pandas as pd
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from sklearn.metrics import recall_score, roc_auc_score, roc_curve, confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
import statsmodels.api as sm
import collections as c
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_columns', 2000)
pd.set_option('display.max_rows', 2000)

In [2]:
# model = pickle.load(open('logistic_regression_completion.p', 'rb')) # load the model

In [3]:
model = LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=200, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start='False')

In [4]:
def scale_subset(df, columns):
    '''
    Use sklearn StandardScalar to scale only numeric columns.

    Parameters:
    ----------
    input {dataframe, list}: dataframe containing mixed feature variable types, list of names of numeric feature columns
    output: {dataframe}: dataframe with numeric features scaled and categorical features unchanged

    '''
    scalar = StandardScaler()
    numeric = df[columns]
    categorical = df.drop(columns, axis = 1)
    scalar.fit(numeric)
    num_scaled = pd.DataFrame(scalar.transform(numeric))
    num_scaled.rename(columns = dict(zip(num_scaled.columns, numeric_cols)), inplace = True)
    return pd.concat([num_scaled, categorical], axis = 1)


In [5]:
X_train = pd.read_csv('../data/processed/X_train.csv')
y_train = pd.read_csv('../data/processed/y_train.csv')
# y_train = y_train['module_not_completed']
X_test = pd.read_csv('../data/processed/X_test.csv')

y_test = pd.read_csv('../data/processed/y_test.csv')
# y_test = y_test['module_not_completed']

numeric_cols = ['num_of_prev_attempts', 'studied_credits',
'clicks_per_day', 'pct_days_vle_accessed','max_clicks_one_day',
'first_date_vle_accessed', 'avg_score', 'avg_days_sub_early', 'days_early_first_assessment',
'score_first_assessment']

# fill and scale
X_train.fillna(value = 0, inplace = True)
X_train = scale_subset(X_train, numeric_cols)
X_test.fillna(value = 0, inplace = True)
X_test = scale_subset(X_test, numeric_cols)



  return self.partial_fit(X, y)
  from ipykernel import kernelapp as app
  return self.partial_fit(X, y)
  from ipykernel import kernelapp as app


In [6]:
# drop student who did not complete
withdrawls = []
for index, row in y_train[y_train['final_result_num'] == 0].iterrows():
    withdrawls.append(index)

In [7]:
# drop features with collinearity (need VIF)
X_train.drop(withdrawls, inplace=True)
X_train.drop(['avg_score', 'avg_days_sub_early', 'clicks_per_day'], axis=1, inplace=True)
X_test.drop(['avg_score', 'avg_days_sub_early', 'clicks_per_day'], axis=1, inplace=True)
y_train.drop(withdrawls, inplace=True)

In [10]:
y_train = y_train['final_result_num']
y_test = y_test['final_result_num']

In [11]:
model.fit(X_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=200, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start='False')

In [12]:
predictions = model.predict(X_test)

In [13]:
recall = recall_score(y_test, predictions)
recall

ValueError: Target is multiclass but average='binary'. Please choose another average setting.

In [14]:
roc_auc = roc_auc_score(y_test, predictions)
roc_auc

ValueError: multiclass format is not supported

In [None]:
probas = model.predict_proba(X_test)[:, :1]
tprs, fprs, thresh = roc_curve(y_test, probas)

In [None]:
plt.figure(figsize=(12,10))
plt.plot(fprs, tprs, 
    label='ROC Curve AUC: {}\n\nRecall: {}'.format(roc_auc, recall),
    color='red')
plt.plot([0,1],[0,1], 'k:')
plt.legend()
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("Logistic Regression: Did the Student Complete the Course?")
plt.show()

In [None]:
# pd.DataFrame(data={'fprs': fprs, 'tprs': tprs, 'Thresholds': thresh})

# Which features are most important?

In [None]:
sm.stats.outliers_influence.va

In [None]:
y_train.shape

In [None]:
abs_coef = list(np.abs(model.coef_.ravel()))
features = list(X_test.columns)

In [None]:
coef_dict = c.OrderedDict((zip(abs_coef, features)))

In [None]:
sorted(coef_dict.items(), reverse=True)

In [15]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      3330
           1       0.32      0.65      0.42      2384
           2       0.62      0.87      0.72      4046
           3       0.59      0.13      0.21      1008

   micro avg       0.48      0.48      0.48     10768
   macro avg       0.38      0.41      0.34     10768
weighted avg       0.36      0.48      0.39     10768



  'precision', 'predicted', average, warn_for)


In [18]:
accuracy_score(y_test, predictions)

0.4812407132243685

In [16]:
print(confusion_matrix(y_test, predictions))

[[   0 2871  456    3]
 [   0 1540  841    3]
 [   0  450 3511   85]
 [   0   13  864  131]]


In [None]:
# pickle.dump(model, open('logistic_regression_completion.p', 'wb')) # pickle the model

In [None]:
c = X_train.corr().abs()

In [None]:
s = c.unstack()

In [None]:
so = s.sort_values(kind="quicksort", ascending=False)

In [None]:
so[::2]