In [341]:
import pandas as pd
import numpy as np

In [342]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import PolynomialFeatures

In [343]:
import matplotlib.pyplot as plt
import seaborn as sns 

In [344]:
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

In [345]:
from pathlib import Path

In [346]:
datasets = dict()
responses = dict()

def get_data(dict_X, dict_y):
    
    for i in range(1, 8):
        filename_X: Path = Path(f'X_{i}days.csv')
        filename_y: Path = Path(f'y_{i}days.csv')
        path: Path = Path("prepared_data_on_student_outflows")
        dict_X[f'X{i}days'] = pd.read_csv(path / filename_X)
        dict_y[f'y{i}days'] = pd.read_csv(path / filename_y)

get_data(datasets, responses)

In [347]:
# remove unnecessary columns from X and y
def data_transformation_y(dict_y):
    dict_y.drop("Unnamed: 0", axis=1, inplace=True)
    
def data_transformation_X(dict_X):
    dict_X.drop("user_id", axis=1, inplace=True)

In [348]:
for k, v in datasets.items():
        data_transformation_X(v)
        
for k, v in responses.items():
        data_transformation_y(v)

In [349]:
datasets["X3days"].head()

Unnamed: 0,day,total_steps,correct,wrong,correct_response_rate,viewed
0,1.0,2.0,2.0,0.0,1.0,9
1,1.0,4.0,4.0,4.0,0.5,20
2,1.0,11.0,9.0,21.0,0.3,154
3,1.0,1.0,0.0,1.0,0.0,9
4,3.0,23.0,23.0,27.0,0.46,132


In [350]:
responses["y3days"].head()

Unnamed: 0,passed_course
0,0
1,0
2,0
3,0
4,1


## LogisticRegression

In [351]:
# сreate a dataframe with the necessary metrics
results_logreg = pd.DataFrame()

In [352]:
results_logreg["names"] = [f'X{i}days' for i in range(1, 8)]
results_logreg["coef_C"] = np.zeros(results_logreg.shape[0])
results_logreg["precision"] = np.zeros(results_logreg.shape[0])
results_logreg["recall"] = np.zeros(results_logreg.shape[0])
results_logreg["accuracy_test_set"] = np.zeros(results_logreg.shape[0])
results_logreg["ROC_AUC"] = np.zeros(results_logreg.shape[0])

In [353]:
count_logreg = 0

In [354]:
def logistic_regression(X, y):
    
    global count_logreg
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=17)
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)
    
    c_values = np.logspace(-2, 3, 500)
    
    logreg = LogisticRegressionCV(Cs=c_values, cv=skf, n_jobs=-1)
    logreg.fit(X_train, y_train)
    
    best_coef_C = logreg.C_
    results_logreg.loc[count_logreg,'coef_C'] = best_coef_C
    
    y_pred_log = logreg.predict(X_test)
    
    precision = precision_score(y_test, y_pred_log)
    results_logreg.loc[count_logreg,'precision'] = precision
    
    recall = recall_score(y_test, y_pred_log)
    results_logreg.loc[count_logreg,'recall'] = recall
    
    results_logreg.loc[count_logreg,'accuracy_test_set'] = logreg.score(X_test, y_test)
    
    ROC_AUC = metrics.roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1])
    results_logreg.loc[count_logreg,'ROC_AUC'] = ROC_AUC
    
    count_logreg += 1
    

In [355]:
%%time
for i in range(1, 8):
    logistic_regression(datasets[f'X{i}days'], responses[f'y{i}days'])

CPU times: total: 8.91 s
Wall time: 16.7 s


### results of predicting user churn after the first 1 to 7 days on the course using logistic regression

In [356]:
results_logreg

Unnamed: 0,names,coef_C,precision,recall,accuracy_test_set,ROC_AUC
0,X1days,0.051454,0.54717,0.05653,0.914478,0.814089
1,X2days,0.01,0.495935,0.124236,0.917172,0.826936
2,X3days,0.155734,0.496689,0.163399,0.922559,0.845188
3,X4days,0.123647,0.560976,0.184,0.919192,0.85065
4,X5days,0.012308,0.614583,0.227799,0.920202,0.859005
5,X6days,0.020447,0.613208,0.27027,0.927104,0.872503
6,X7days,0.093744,0.676617,0.276423,0.929125,0.878134


## DecisionTreeClassifier

In [357]:
results_dt = pd.DataFrame()

results_dt["names"] = [f'X{i}days' for i in range(1, 8)]
results_dt["precision"] = np.zeros(results_dt.shape[0])
results_dt["recall"] = np.zeros(results_dt.shape[0])
results_dt["accuracy_test_set"] = np.zeros(results_dt.shape[0])
results_dt["ROC_AUC"] = np.zeros(results_dt.shape[0])

In [358]:
count_dt = 0

In [359]:
def decision_tree_classifier(X, y):
    
    global count_dt
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=17)
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)
    clf = tree.DecisionTreeClassifier()
    
    parametrs = {'max_depth': range(1, 31, 5),
          'min_samples_leaf': range(1, 51, 5),
          'min_samples_split': range(2, 71, 5)   
    }
    
    tree_grid = GridSearchCV(clf, parametrs, cv=skf)
    tree_grid.fit(X_train, y_train)
    
    best_parametrs = tree_grid.best_params_
    
    best_clf = tree_grid.best_estimator_
    
    results_dt.loc[count_dt, 'accuracy_test_set'] = best_clf.score(X_test, y_test)
        
    y_pred = best_clf.predict(X_test)
    
    precision = precision_score(y_test, y_pred)
    results_dt.loc[count_dt, 'precision'] = precision
    
    recall = recall_score(y_test, y_pred)
    results_dt.loc[count_dt, 'recall'] = recall
    
    ROC_AUC = metrics.roc_auc_score(y_test, best_clf.predict_proba(X_test)[:, 1])
    results_dt.loc[count_dt, 'ROC_AUC'] = best_clf.score(X_test, y_test)
    
    feature_importances = best_clf.feature_importances_
    feature_importances_df = pd.DataFrame({"features": list(X), 
                                       "feature_importances": feature_importances})
    feature_importances_df = feature_importances_df.sort_values("feature_importances", ascending=False)

    count_dt += 1
    
    print(best_parametrs)
    print(feature_importances_df[:3])
    

### the best tree parameters and the 3 most important features

In [360]:
%%time
for i in range(1, 8):
    decision_tree_classifier(datasets[f'X{i}days'], responses[f'y{i}days'])

{'max_depth': 1, 'min_samples_leaf': 1, 'min_samples_split': 2}
      features  feature_importances
2      correct                  1.0
0          day                  0.0
1  total_steps                  0.0
{'max_depth': 6, 'min_samples_leaf': 26, 'min_samples_split': 2}
                features  feature_importances
1            total_steps             0.780157
4  correct_response_rate             0.110316
5                 viewed             0.041635
{'max_depth': 6, 'min_samples_leaf': 21, 'min_samples_split': 2}
                features  feature_importances
1            total_steps             0.820680
5                 viewed             0.070490
4  correct_response_rate             0.043678
{'max_depth': 6, 'min_samples_leaf': 1, 'min_samples_split': 37}
      features  feature_importances
1  total_steps             0.648301
2      correct             0.122003
0          day             0.103794
{'max_depth': 6, 'min_samples_leaf': 31, 'min_samples_split': 2}
      features  feat

### results of predicting user churn after the first 1 to 7 days on the course using decision tree classifier

In [361]:
results_dt

Unnamed: 0,names,precision,recall,accuracy_test_set,ROC_AUC
0,X1days,0.0,0.0,0.913636,0.913636
1,X2days,0.510204,0.050916,0.917508,0.917508
2,X3days,0.490683,0.172113,0.922222,0.922222
3,X4days,0.506757,0.15,0.916162,0.916162
4,X5days,0.590909,0.225869,0.918855,0.918855
5,X6days,0.549356,0.266112,0.922896,0.922896
6,X7days,0.647059,0.313008,0.928956,0.928956


## RandomForestClassifier

In [362]:
results_rf = pd.DataFrame()

results_rf["names"] = [f'X{i}days' for i in range(1, 8)]
results_rf["precision"] = np.zeros(results_rf.shape[0])
results_rf["recall"] = np.zeros(results_rf.shape[0])
results_rf["accuracy_test_set"] = np.zeros(results_rf.shape[0])
results_rf["ROC_AUC"] = np.zeros(results_rf.shape[0])

In [363]:
count_rf = 0

In [364]:
def random_forest_classifier(X, y):
        
    global count_rf
            
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=17)
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)
    clf = RandomForestClassifier()
    
    parametrs = {'n_estimators':range(10, 200), 
                'max_depth': range(1, 31), 
                'min_samples_leaf': range(1, 51), 
                'min_samples_split': range(2, 51)   
                }
    
    grid = RandomizedSearchCV(clf, parametrs, cv=skf, n_jobs=-1)
    grid.fit(X_train, y_train)
    
    best_parametrs = grid.best_params_
    
    best_clf = grid.best_estimator_
    
    results_rf.loc[count_rf, 'accuracy_test_set'] = best_clf.score(X_test, y_test)
        
    y_pred = best_clf.predict(X_test)
    
    precision = precision_score(y_test, y_pred)
    results_rf.loc[count_rf, 'precision'] = precision
    
    recall = recall_score(y_test, y_pred)
    results_rf.loc[count_rf, 'recall'] = recall
    
    ROC_AUC = metrics.roc_auc_score(y_test, best_clf.predict_proba(X_test)[:, 1])
    results_rf.loc[count_rf, 'ROC_AUC'] = best_clf.score(X_test, y_test)
    
    feature_importances = best_clf.feature_importances_
    feature_importances_df = pd.DataFrame({"features": list(X), 
                                       "feature_importances": feature_importances})
    feature_importances_df = feature_importances_df.sort_values("feature_importances", ascending=False)
    
    count_rf += 1
    
    print(best_parametrs)
    print(feature_importances_df[:3])

### the best tree parameters and the 3 most important features

In [365]:
%%time
for i in range(1, 8):
    random_forest_classifier(datasets[f'X{i}days'], responses[f'y{i}days'])

{'n_estimators': 95, 'min_samples_split': 26, 'min_samples_leaf': 23, 'max_depth': 19}
      features  feature_importances
2      correct             0.328682
1  total_steps             0.216020
5       viewed             0.198422
{'n_estimators': 121, 'min_samples_split': 20, 'min_samples_leaf': 50, 'max_depth': 28}
      features  feature_importances
1  total_steps             0.335521
2      correct             0.293423
5       viewed             0.155837
{'n_estimators': 162, 'min_samples_split': 17, 'min_samples_leaf': 9, 'max_depth': 24}
      features  feature_importances
1  total_steps             0.258822
5       viewed             0.237189
2      correct             0.218740
{'n_estimators': 23, 'min_samples_split': 31, 'min_samples_leaf': 15, 'max_depth': 17}
      features  feature_importances
2      correct             0.321020
1  total_steps             0.237974
0          day             0.154504
{'n_estimators': 140, 'min_samples_split': 37, 'min_samples_leaf': 7, 'max_

### results of predicting user churn after the first 1 to 7 days on the course using random forest classifier

In [366]:
results_rf

Unnamed: 0,names,precision,recall,accuracy_test_set,ROC_AUC
0,X1days,0.75,0.011696,0.91431,0.91431
1,X2days,0.528302,0.057026,0.917845,0.917845
2,X3days,0.486842,0.16122,0.922054,0.922054
3,X4days,0.566434,0.162,0.919024,0.919024
4,X5days,0.63253,0.202703,0.920202,0.920202
5,X6days,0.623529,0.220374,0.926094,0.926094
6,X7days,0.716867,0.24187,0.929293,0.929293
