In [1]:
# Import the usual suspects. Any new functions will be introduced individually for clarity.
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.datasets import make_classification #generates a random classification problem
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC, SVC
from xgboost import XGBClassifier
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.plotting import plot_decision_regions #plotting regions
%matplotlib inline

from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, 
                              AdaBoostClassifier, BaggingRegressor)

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.datasets.california_housing import fetch_california_housing
import sklearn.linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
import sklearn.svm
from imblearn.over_sampling import ADASYN

# make prettier plots
# %config InlineBackend.figure_format = 'svg' 

Using TensorFlow backend.


In [2]:
df = pd.read_pickle("./raw_data.pkl")

In [3]:
X = df.loc[:, df.columns != 'target']
y = df.target

In [None]:
df.drop(columns=('top_100_count'))


#### Create ADASYN

In [6]:
X_adasyn, y_adasyn = ADASYN(random_state=42).fit_sample(X,y)

In [7]:
X_train_adasyn, X_test_adasyn, y_train_adasyn, y_test_adasyn = train_test_split(X_adasyn,y_adasyn 
                                                                                , test_size=0.2
                                                                                , random_state=42)

X_train_adasyn, X_val_adasyn, y_train_adasyn, y_val_adasyn = train_test_split(X_train_adasyn
                                                                              ,y_train_adasyn 
                                                                              , test_size=0.25
                                                                              , random_state=42)

In [8]:
Counter(y_adasyn)

Counter({0: 7020, 1: 7021})

#### XGBoost

In [17]:
clf = XGBClassifier().fit(X_train_adasyn, y_train_adasyn)
y_pred_adasyn = clf.predict(X_val_adasyn)

In [20]:
recall = metrics.recall_score(y_val_adasyn,y_pred_adasyn)
print(f"Recall Score: {recall}")
print(f"FBeta: {metrics.fbeta_score(y_val_adasyn, y_pred_adasyn, average='macro', beta=0.4)}")
      

Recall Score: 0.995053003533569
FBeta: 0.9872947877983895


#### KNN

In [11]:
knn_param = {'n_neighbors' : range(1,100), 'algorithm':['auto', 'ball_tree','kd_tree','brute'], 
             'weights':['uniform', 'distance']}

In [12]:
knn = KNeighborsClassifier()
rand_search = RandomizedSearchCV(knn,knn_param, cv=5,scoring='recall',iid=True)
rand_search.fit(X_train_adasyn, y_train_adasyn)
print(rand_search.best_score_)
print(rand_search.best_params_)

0.9265336031222889
{'weights': 'distance', 'n_neighbors': 27, 'algorithm': 'brute'}


In [21]:
# Train on training set, and Test on testing set
knn = KNeighborsClassifier(n_neighbors=27, weights = 'distance', algorithm='brute')
knn.fit(X_train_adasyn, y_train_adasyn)
y_pred_adasyn = knn.predict(X_val_adasyn)

print(f"Accuracy Score: {metrics.accuracy_score(y_val_adasyn, y_pred_adasyn)}")
print(f"Recall Score: {metrics.recall_score(y_val_adasyn,y_pred_adasyn)}")
print(f"F1 Score: {metrics.f1_score(y_val_adasyn,y_pred_adasyn)}")
print(f"FBeta: {metrics.fbeta_score(y_val_adasyn, y_pred_adasyn, average='macro', beta=0.4)}")

Accuracy Score: 0.9205840455840456
Recall Score: 0.9321554770318021
F1 Score: 0.9220552254456483
FBeta: 0.9207416272485827


#### Decision Tree

In [14]:
decision_tree_param = {"class_weight": ['balanced',None],"max_depth": [3, None],
              "max_features": ['auto','sqrt','log2'],
              "min_samples_split": range(10,500,10),
              "criterion": ["gini", "entropy"],
                       'max_depth':range(1,10,2),
                      'random_state': [42]}



In [15]:
decision_tree = DecisionTreeClassifier()
rand_search = RandomizedSearchCV(decision_tree, decision_tree_param, cv=5,scoring='recall',iid=True)

rand_search.fit(X_train_adasyn, y_train_adasyn)
print(rand_search.best_score_)
print(rand_search.best_params_)

0.9462660175792941
{'random_state': 42, 'min_samples_split': 430, 'max_features': 'auto', 'max_depth': 9, 'criterion': 'entropy', 'class_weight': None}


In [22]:
decision_tree = DecisionTreeClassifier(criterion='entropy',class_weight= None
                                       ,random_state = 42, max_features = 'auto', min_samples_leaf = 2
                                       ,min_samples_split = 430
                                       ,max_depth = 9)
decision_tree.fit(X_train_adasyn, y_train_adasyn)
y_pred_adasyn = decision_tree.predict(X_val_adasyn)


print(f"Accuracy Score: {metrics.accuracy_score(y_val_adasyn, y_pred_adasyn)}")
print(f"Recall Score: {metrics.recall_score(y_val_adasyn,y_pred_adasyn)}")
print(f"F1 Score: {metrics.f1_score(y_val_adasyn,y_pred_adasyn)}")
print(f"FBeta: {metrics.fbeta_score(y_val_adasyn, y_pred_adasyn, average='macro', beta=0.4)}")


Accuracy Score: 0.8963675213675214
Recall Score: 0.9250883392226148
F1 Score: 0.8999656239257476
FBeta: 0.8971889972456903


#### Logistic Regression

In [23]:
logistic_regression_param = {"C":np.logspace(-4, 4, 20), "penalty":["l2"]}


In [24]:
logistic_regression = LogisticRegression(solver="lbfgs")
rand_search = RandomizedSearchCV(logistic_regression, logistic_regression_param, cv=5,scoring='recall',iid=True)

rand_search.fit(X_train_adasyn, y_train_adasyn)
print(rand_search.best_score_)
print(rand_search.best_params_)

0.9360496163771823
{'penalty': 'l2', 'C': 10000.0}


In [25]:
logistic_regression = LogisticRegression(penalty= 'l2', C=0.0001)
logistic_regression.fit(X_train_adasyn, y_train_adasyn)
y_pred_adasyn = logistic_regression.predict(X_val_adasyn)

print(f"Accuracy Score: {metrics.accuracy_score(y_val_adasyn, y_pred_adasyn)}")
print(f"Recall Score: {metrics.recall_score(y_val_adasyn,y_pred_adasyn)}")
print(f"F1 Score: {metrics.f1_score(y_val_adasyn,y_pred_adasyn)}")
print(f"FBeta: {metrics.fbeta_score(y_val_adasyn, y_pred_adasyn, average='macro', beta=0.4)}")

Accuracy Score: 0.7183048433048433
Recall Score: 0.9194346289752651
F1 Score: 0.7668729737695256
FBeta: 0.736729212689692




#### Random Forest

In [26]:
random_forest_param = {'n_estimators': [4,5,6,7,8, 9], 
              'max_features': ['log2', 'sqrt','auto',None],
              'criterion': ['entropy', 'gini'], 
              'max_depth': [2, 3, 5, 10], 
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1,5,8], 'class_weight' : ['balanced','balanced_subsample', None]}


In [27]:
random_forest = RandomForestClassifier(n_jobs = -1)
rand_search = RandomizedSearchCV(random_forest, random_forest_param, cv=5,scoring='recall',iid=True)

rand_search.fit(X_train_adasyn, y_train_adasyn)
print(rand_search.best_score_)
print(rand_search.best_params_)

0.9824061709734895
{'n_estimators': 4, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': 10, 'criterion': 'entropy', 'class_weight': None}


In [28]:
random_forest = RandomForestClassifier(n_estimators= 4, min_samples_split=3
                                       ,min_samples_leaf= 5, max_features= None, max_depth= 10
                                       , criterion= 'gini'
                                       ,class_weight='balanced_subsample')

random_forest.fit(X_train_adasyn, y_train_adasyn)
y_pred_adasyn = random_forest.predict(X_val_adasyn)

print(f"Accuracy Score: {metrics.accuracy_score(y_val_adasyn, y_pred_adasyn)}")
print(f"Recall Score: {metrics.recall_score(y_val_adasyn,y_pred_adasyn)}")
print(f"F1 Score: {metrics.f1_score(y_val_adasyn,y_pred_adasyn)}")
print(f"FBeta: {metrics.fbeta_score(y_val_adasyn, y_pred_adasyn, average='macro', beta=0.4)}")

Accuracy Score: 0.9732905982905983
Recall Score: 0.9858657243816255
F1 Score: 0.9738219895287958
FBeta: 0.9735325236069964


#### SVC Model

In [None]:
svc_model_param = {'kernel':['linear', 'poly', 'rbf','sigmoid']
                   ,'gamma':[0.1, 1, 10, 100]
                   ,'C':[0.1, 1, 10, 100, 1000]
                   , 'degree':[0, 1, 2, 3, 4, 5, 6]}


In [None]:
svc_model = svm.SVC()
rand_search = RandomizedSearchCV(svc_model, svc_model_param, cv=5,scoring='recall',iid=True)

rand_search.fit(X_train_adasyn, y_train_adasyn)
print(rand_search.best_score_)
print(rand_search.best_params_)

In [None]:
svc_model = svm.SCV()

scv_model.fit(X_train_adasyn, y_train_adasyn)
y_pred_adasyn = scv_model.predict(X_val_adasyn)

print(f"Accuracy Score: {metrics.accuracy_score(y_val_adasyn, y_pred_adasyn)}")
print(f"Recall Score: {metrics.recall_score(y_val_adasyn,y_pred_adasyn)}")
print(f"F1 Score: {metrics.f1_score(y_val_adasyn,y_pred_adasyn)}")
print(f"FBeta: {metrics.fbeta_score(y_val_adasyn, y_pred_adasyn, average='macro', beta=0.4)}")