In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, auc, roc_auc_score, roc_curve, f1_score, precision_score, recall_score

%matplotlib inline

### Logistic Regression

In [None]:
# Switch on seaborn's default theme so the matplotlib figures below use it.
sns.set()

In [None]:
# Toy training set: 13 scalar inputs stored as a single-column matrix (the
# layout scikit-learn estimators expect) and one binary label per input.
x = np.array([0, 1, 1.5, 2, 3, 4, 5, 8, 11, 12, 13, 14, 15])[:, np.newaxis]
y = np.array([1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0])

In [None]:
# Logistic regression on the raw feature; C is scikit-learn's inverse
# regularisation strength.
model = LogisticRegression(C=10)
model = model.fit(x, y)  # fit() returns the estimator itself

In [None]:
# Fitted parameters of the model: weight matrix (a0) and intercept vector (b0).
a0, b0 = model.coef_, model.intercept_

In [None]:
# Show the first row of the weight matrix next to the intercept array.
print(a0[0], b0)

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)).

    Works element-wise on NumPy arrays (the result has the input's shape)
    and on plain scalars.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [None]:
# Training points together with the probability curve of the 1-degree model.
plt.figure(figsize=(8,6))
plt.scatter(x,y,color='b')
# Dense grid extending past the training range; later cells reuse x_ and y_lr1.
x_ = np.linspace(-5, 25, 100)
y_lr1 = sigmoid(a0[0]*x_ + b0[0])
# Label the curve itself. plt.legend([...]) pairs labels with artists purely
# by position, so the single label could end up on the scatter points.
plt.plot(x_, y_lr1, linestyle='dashed', color='g', label='Logit function')
# Red guide lines at probability 0.5 and x = 5.5 (hand-placed; presumably the
# point where the curve crosses 0.5 -- verify after refitting).
plt.axhline(y=0.5, xmin=0.05, xmax=0.36, linewidth=1, linestyle='dashed', color='r')
plt.axvline(x=5.5, ymin=0.12, ymax=0.92, linewidth=1, linestyle='dashed', color='r')
plt.yticks([0, 0.25,0.5,0.75,1])
plt.ylim([-0.2,1.2])
plt.legend()
plt.show()

In [None]:
# Hard class predictions of the 1-degree model on the training inputs.
pred_train = model.predict(x)

In [None]:
# Display the predicted training labels.
pred_train

In [None]:
# Per-class probabilities for the training inputs (one column per class).
probabilities = model.predict_proba(x)
# Display the second column, i.e. the probability of the second class
# (label 1 for this 0/1 target).
probabilities[:,1]

In [None]:
# Share of training points whose predicted label matches the true one.
accuracy_score(y_true=y, y_pred=pred_train)

In [None]:
# Precision of the positive class on the training data.
precision_score(y_true=y, y_pred=pred_train)

In [None]:
# Recall of the positive class on the training data. The predictions must be
# pred_train: no `pred_test` is defined in this notebook, and y holds the
# training labels.
recall_score(y, pred_train)

In [None]:
# Per-class precision / recall / F1 summary for the training predictions.
print(classification_report(y_true=y, y_pred=pred_train))

In [None]:
# ROC AUC scored on the hard 0/1 predictions (pred_train), not on
# probabilities; the next cell scores the same model on probabilities[:, 1].
roc_auc_score(y, pred_train)

In [None]:
# ROC AUC scored on the predicted probability of the positive class.
roc_auc_score(y_true=y, y_score=probabilities[:, 1])

In [None]:
# ROC analysis of the 1-degree model on the training data: curve points
# first, then the area under them.
fpr, tpr, thresholds = roc_curve(y_true=y, y_score=probabilities[:, 1])
roc_auc = auc(fpr, tpr)

# Model curve against the diagonal of a random classifier.
plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc)
plt.plot((0, 1), (0, 1), 'k--')  # chance level
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim(-0.1, 1.1)
plt.ylim(-0.1, 1.1)
plt.legend(loc="lower right")
plt.show()

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Polynomial expansion up to degree 3 without the constant column, fitted on
# the training inputs.
transformer = PolynomialFeatures(include_bias=False, degree=3).fit(x)
transformer

In [None]:
# Training inputs expanded to the degree-3 polynomial features.
x_3 = transformer.transform(x)

In [None]:
# Display the expanded training matrix.
x_3

In [None]:
# Cubic-feature logistic regression with weak regularisation (large C).
# NOTE(review): the polynomial features are unscaled; check whether the
# solver reports a ConvergenceWarning here.
model2 = LogisticRegression(C=1000)
model2 = model2.fit(x_3, y)  # fit() returns the estimator itself

In [None]:
# Display the fitted weights of model2 (one per polynomial feature).
model2.coef_

In [None]:
# Display the fitted intercept of model2.
model2.intercept_

In [None]:
# Unpack model2's weights: d2 multiplies x, c2 multiplies x**2, b2 multiplies
# x**3 (the order of the weight row); e2 is the intercept.
d2, c2, b2 = model2.coef_[0]
e2 = model2.intercept_[0]

In [None]:
# Training points together with the probability curve of the cubic model
# fitted with C=1000.
plt.figure(figsize=(8,6))
plt.scatter(x,y,color='b')
# Read the weights straight from model2: the helper names b2/c2/d2/e2 are
# rebound to model3's parameters by a later cell, so re-running this cell
# afterwards would otherwise draw the wrong curve.
coef2 = model2.coef_[0]  # weights for x, x**2, x**3 in that order
y_lr2 = sigmoid(coef2[2]*(x_**3) + coef2[1]*(x_)**2 + coef2[0]*x_ + model2.intercept_[0])
# Label the curve itself. plt.legend([...]) pairs labels with artists purely
# by position, so the single label could end up on the scatter points.
plt.plot(x_, y_lr2, linestyle='dashed', color='g', label='Logit function')
# Red guide lines at probability 0.5 and at three hand-placed x positions
# (presumably where the curve crosses 0.5 -- verify after refitting).
plt.axhline(y=0.5, xmin=0.05, xmax=0.54, linewidth=1, linestyle='dashed', color='r')
plt.axvline(x=2.8, ymin=0.12, ymax=0.92, linewidth=1, linestyle='dashed', color='r')
plt.axvline(x=7.6, ymin=0.12, ymax=0.92, linewidth=1, linestyle='dashed', color='r')
plt.axvline(x=11.15, ymin=0.12, ymax=0.92, linewidth=1, linestyle='dashed', color='r')
plt.yticks([0, 0.25,0.5,0.75,1])
plt.ylim([-0.2,1.2])
plt.legend()
plt.show()

In [None]:
# Hard class predictions of model2 on the expanded training inputs.
y_pred = model2.predict(x_3)

In [None]:
# Display model2's predicted training labels.
y_pred 

In [None]:
# Training accuracy of model2.
accuracy_score(y_true=y, y_pred=y_pred)

In [None]:
# Per-class precision / recall / F1 summary for model2 on the training data.
print(classification_report(y_true=y, y_pred=y_pred))

In [None]:
# ROC analysis of model2 on the training data, scored on the predicted
# probability of the positive class.
fpr, tpr, thresholds = roc_curve(y_true=y, y_score=model2.predict_proba(x_3)[:, 1])
roc_auc = auc(fpr, tpr)

# Model curve against the diagonal of a random classifier.
plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc)
plt.plot((0, 1), (0, 1), 'k--')  # chance level
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim(-0.1, 1.1)
plt.ylim(-0.1, 1.1)
plt.legend(loc="lower right")
plt.show()

In [None]:
# Same cubic features as model2, but with strong regularisation (small C).
model3 = LogisticRegression(C=0.1)
model3 = model3.fit(x_3, y)  # fit() returns the estimator itself

In [None]:
# Display the fitted weights of model3 (one per polynomial feature).
model3.coef_

In [None]:
# Display the fitted intercept of model3.
model3.intercept_

In [None]:
# Rebind the helper names to model3's parameters: d2 multiplies x, c2
# multiplies x**2, b2 multiplies x**3; e2 is the intercept.
d2, c2, b2 = model3.coef_[0]
e2 = model3.intercept_[0]

In [None]:
# Training points together with the probability curve of the cubic model
# fitted with C=0.1.
plt.figure(figsize=(8,6))
plt.scatter(x,y,color='b')
# Read the weights straight from model3 so the curve never depends on which
# model the shared helper names b2/c2/d2/e2 were last bound to.
coef3 = model3.coef_[0]  # weights for x, x**2, x**3 in that order
y_lr3 = sigmoid(coef3[2]*(x_**3) + coef3[1]*(x_)**2 + coef3[0]*x_ + model3.intercept_[0])
# Label the curve itself. plt.legend([...]) pairs labels with artists purely
# by position, so the single label could end up on the scatter points.
plt.plot(x_, y_lr3, linestyle='dashed', color='g', label='Logit function')
# Red guide lines at probability 0.5 and x = 7.7 (hand-placed; presumably the
# point where the curve crosses 0.5 -- verify after refitting).
plt.axhline(y=0.5, xmin=0.05, xmax=0.43, linewidth=1, linestyle='dashed', color='r')
plt.axvline(x=7.7, ymin=0.12, ymax=0.92, linewidth=1, linestyle='dashed', color='r')
plt.yticks([0, 0.25,0.5,0.75,1])
plt.ylim([-0.2,1.2])
plt.legend()
plt.show()

In [None]:
# Hard class predictions of model3 on the expanded training inputs.
y_pred3 = model3.predict(x_3)

In [None]:
# Display model3's predicted training labels.
y_pred3

In [None]:
# Training accuracy of model3.
accuracy_score(y_true=y, y_pred=y_pred3)

In [None]:
# Per-class precision / recall / F1 summary for model3 on the training data.
print(classification_report(y_true=y, y_pred=y_pred3))

In [None]:
# ROC analysis of model3 on the training data, scored on the predicted
# probability of the positive class.
fpr, tpr, thresholds = roc_curve(y_true=y, y_score=model3.predict_proba(x_3)[:, 1])
roc_auc = auc(fpr, tpr)

# Model curve against the diagonal of a random classifier.
plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc)
plt.plot((0, 1), (0, 1), 'k--')  # chance level
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim(-0.1, 1.1)
plt.ylim(-0.1, 1.1)
plt.legend(loc="lower right")
plt.show()

In [None]:
# Held-out points: five scalar inputs as a single-column matrix, plus their
# binary labels.
x_test = np.array([0.5, 3.5, 5.5, 10, 19])[:, np.newaxis]
y_test = np.array([1, 0, 1, 0, 0])

In [None]:
# Train and test points together with the probability curves of the three
# logistic-regression models.
plt.figure(figsize=(8,6))
# Every artist carries its own label. plt.legend([...]) pairs labels with
# artists purely by position, and the two scatters are drawn before the
# curves, so a bare label list can name the wrong artists.
plt.scatter(x, y, color='b', label='data')
plt.scatter(x_test, y_test, color='black', marker='*', label='test')
plt.plot(x_, y_lr1, linestyle='dashed', color='magenta', label='1 degree')
plt.plot(x_, y_lr2, linestyle='dashed', color='g', label='3 degree, C=1000')
plt.plot(x_, y_lr3, linestyle='dashed', color='orange', label='3 degree, C=0.1')
# Dotted guide at the 0.5 probability level.
plt.axhline(y=0.5, xmin=0.05, xmax=0.93, linewidth=1, linestyle='dotted', color='r')
plt.yticks([0, 0.25,0.5,0.75,1])
plt.ylim([-0.2,1.2])
plt.legend()
plt.show()

In [None]:
# Expand the test inputs with the transformer already fitted on the training
# data. Calling transform() alone keeps the preprocessing tied to the training
# fit instead of refitting it on the test set.
x_test_3 = transformer.transform(x_test)

In [None]:
# ROC curves and AUC on the test set for the three logistic-regression
# models; the cubic models are scored on the expanded test features.
fpr1, tpr1, thresholds1 = roc_curve(y_test, model.predict_proba(x_test)[:,1])
fpr2, tpr2, thresholds2 = roc_curve(y_test, model2.predict_proba(x_test_3)[:,1])
fpr3, tpr3, thresholds3 = roc_curve(y_test, model3.predict_proba(x_test_3)[:,1])
roc_auc1 = auc(fpr1, tpr1)
roc_auc2 = auc(fpr2, tpr2)
roc_auc3 = auc(fpr3, tpr3)
# Plot the ROC curves. Each legend entry names its model; identical
# "ROC curve" labels would leave the three lines distinguishable by colour only.
plt.plot(fpr1, tpr1, label='1 degree (area = %0.3f)' % roc_auc1, color='magenta')
plt.plot(fpr2, tpr2, label='3 degree, C=1000 (area = %0.3f)' % roc_auc2, color='g')
plt.plot(fpr3, tpr3, label='3 degree, C=0.1 (area = %0.3f)' % roc_auc3, color='orange')
plt.plot([0, 1], [0, 1], 'k--')  # random predictions curve
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

### Exercise
Compare the other metrics on the train and test datasets and draw conclusions.

### K-nearest Neighbours

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# 2-nearest-neighbours classifier fitted on the training data; the trailing
# expression shows the fitted estimator.
neigh = KNeighborsClassifier(n_neighbors=2).fit(x, y)
neigh

In [None]:
# Hard class predictions of the 2-NN classifier on both splits.
train_knn = neigh.predict(x)
test_pr_knn = neigh.predict(x_test)

In [None]:
# Per-class precision / recall / F1 summary of the 2-NN classifier on the test set.
print(classification_report(y_true=y_test, y_pred=test_pr_knn))

In [None]:
# Per-class precision / recall / F1 summary of the 2-NN classifier on the training set.
print(classification_report(y_true=y, y_pred=train_knn))

In [None]:
# Test-set ROC AUC of the 2-NN classifier. Score on the predicted probability
# of class 1 rather than on hard labels: thresholded 0/1 predictions reduce
# the ROC curve to a single operating point.
roc_auc_score(y_test, neigh.predict_proba(x_test)[:, 1])

In [None]:
# Training-set ROC AUC of the 2-NN classifier. Score on the predicted
# probability of class 1 rather than on hard labels: thresholded 0/1
# predictions reduce the ROC curve to a single operating point.
roc_auc_score(y, neigh.predict_proba(x)[:, 1])

In [None]:
# Evaluation grid (as a column) for drawing class-1 probability curves.
x_linsp = np.linspace(0,25,100).reshape(-1,1)
# Dedicated names for the KNN variants: binding them to model2/model3 would
# overwrite the logistic-regression models fitted above and break re-running
# the earlier cells that use them.
knn1 = KNeighborsClassifier(n_neighbors=1).fit(x,y)
y_knn_1 = knn1.predict_proba(x_linsp)[:,1]
knn2 = KNeighborsClassifier(n_neighbors=2).fit(x,y)
y_knn_2 = knn2.predict_proba(x_linsp)[:,1]
# Five neighbours, with votes weighted by inverse distance.
knn3 = KNeighborsClassifier(n_neighbors=5, weights='distance').fit(x,y)
y_knn_3 = knn3.predict_proba(x_linsp)[:,1]

In [None]:
# Train and test points together with the class-1 probability curves of the
# three KNN variants.
plt.figure(figsize=(8,6))
# Every artist carries its own label. plt.legend([...]) pairs labels with
# artists purely by position, and the scatters are drawn before the curves,
# so a bare label list can name the wrong artists.
plt.scatter(x, y, color='b', label='data')
plt.scatter(x_test, y_test, color='black', marker='*', label='test')
plt.plot(x_linsp, y_knn_1, linestyle='dashed', color='orange', label='knn-1')
plt.plot(x_linsp, y_knn_2, linestyle='dashed', color='green', label='knn-2')
plt.plot(x_linsp, y_knn_3, linestyle='dashed', color='red', label='knn-5')

plt.legend()
plt.yticks([0, 0.25,0.5,0.75,1])
plt.ylim([-0.2,1.2])
plt.show()

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Depth-2 decision tree fitted on the training data, with hard predictions
# for both splits (not read again in the cells shown here).
dt = DecisionTreeClassifier(max_depth=2).fit(x, y)
train_pred = dt.predict(x)
test_pred = dt.predict(x_test)

In [None]:
# Evaluation grid (as a column) for drawing class-1 probability curves.
x_linsp = np.linspace(0,25,100).reshape(-1,1)
# Dedicated names for the tree variants (parallel to rf1.. and gbc1.. below)
# instead of reusing model1/model2/model3, which earlier sections bind to
# other estimators.
dt1 = DecisionTreeClassifier(max_depth=2).fit(x,y)
y_dt_1 = dt1.predict_proba(x_linsp)[:,1]
dt2 = DecisionTreeClassifier(max_depth=3).fit(x,y)
y_dt_2 = dt2.predict_proba(x_linsp)[:,1]
dt3 = DecisionTreeClassifier(max_depth=5).fit(x,y)
y_dt_3 = dt3.predict_proba(x_linsp)[:,1]

In [None]:
# Train and test points together with the class-1 probability curves of the
# three decision trees.
plt.figure(figsize=(8,6))
# Every artist carries its own label. plt.legend([...]) pairs labels with
# artists purely by position, and the scatters are drawn before the curves,
# so a bare label list can name the wrong artists.
plt.scatter(x, y, color='b', label='data')
plt.scatter(x_test, y_test, color='black', marker='*', label='test')
plt.plot(x_linsp, y_dt_1, linestyle='dashed', color='orange', label='max_depth=2')
plt.plot(x_linsp, y_dt_2, linestyle='dashed', color='green', label='max_depth=3')
plt.plot(x_linsp, y_dt_3, linestyle='dashed', color='red', label='max_depth=5')
plt.legend()
plt.yticks([0, 0.25,0.5,0.75,1])
plt.ylim([-0.2,1.2])
plt.show()

### Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Six-tree random forest with a fixed seed, fitted on the training data,
# with hard predictions for both splits (not read again in the cells shown here).
rf = RandomForestClassifier(random_state=40, n_estimators=6).fit(x, y)
train_pred = rf.predict(x)
test_pred = rf.predict(x_test)

In [None]:
# Depth-3 forests with 3, 5 and 6 trees, evaluated on the grid x_linsp.
# random_state is fixed (same seed as `rf` above) so the bootstrap samples,
# and therefore the plotted curves, are identical on every run.
rf1 = RandomForestClassifier(n_estimators=3, max_depth=3, random_state=40).fit(x,y)
y_rf_1 = rf1.predict_proba(x_linsp)[:,1]
rf2 = RandomForestClassifier(n_estimators=5, max_depth=3, random_state=40).fit(x,y)
y_rf_2 = rf2.predict_proba(x_linsp)[:,1]
rf3 = RandomForestClassifier(n_estimators=6, max_depth=3, random_state=40).fit(x,y)
y_rf_3 = rf3.predict_proba(x_linsp)[:,1]

In [None]:
# Train and test points together with the class-1 probability curves of the
# three random forests.
plt.figure(figsize=(8,6))
# Every artist carries its own label. plt.legend([...]) pairs labels with
# artists purely by position, and the scatters are drawn before the curves,
# so a bare label list can name the wrong artists.
plt.scatter(x, y, color='b', label='data')
plt.scatter(x_test, y_test, color='black', marker='*', label='test')
# rf1 is built with n_estimators=3, so its curve is labelled as 3 trees.
plt.plot(x_linsp, y_rf_1, linestyle='dashed', color='orange', label='RF_3trees')
plt.plot(x_linsp, y_rf_2, linestyle='dashed', color='green', label='RF_5trees')
plt.plot(x_linsp, y_rf_3, linestyle='dashed', color='red', label='RF_6trees')
plt.legend()
plt.yticks([0, 0.25,0.5,0.75,1])
plt.ylim([-0.2,1.2])
plt.show()

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting with 50 depth-2 trees and a small learning rate, fitted
# on the training data, with hard predictions for both splits (not read
# again in the cells shown here).
gbc = GradientBoostingClassifier(
    max_depth=2,
    learning_rate=0.02,
    n_estimators=50,
).fit(x, y)
train_pred = gbc.predict(x)
test_pred = gbc.predict(x_test)

In [None]:
# Depth-2 boosted ensembles of increasing size (5, 10, 20, 50 trees); each is
# fitted on the training data and evaluated on the grid x_linsp.
gbc1 = GradientBoostingClassifier(max_depth=2, n_estimators=5)
gbc1.fit(x, y)
y_gbc_1 = gbc1.predict_proba(x_linsp)[:, 1]

gbc2 = GradientBoostingClassifier(max_depth=2, n_estimators=10)
gbc2.fit(x, y)
y_gbc_2 = gbc2.predict_proba(x_linsp)[:, 1]

gbc3 = GradientBoostingClassifier(max_depth=2, n_estimators=20)
gbc3.fit(x, y)
y_gbc_3 = gbc3.predict_proba(x_linsp)[:, 1]

gbc4 = GradientBoostingClassifier(max_depth=2, n_estimators=50)
gbc4.fit(x, y)
y_gbc_4 = gbc4.predict_proba(x_linsp)[:, 1]

In [None]:
# Train and test points together with the class-1 probability curves of the
# four gradient-boosting ensembles.
plt.figure(figsize=(8,6))
# Every artist carries its own label. plt.legend([...]) pairs labels with
# artists purely by position, and the scatters are drawn before the curves,
# so a bare label list can name the wrong artists.
plt.scatter(x, y, color='b', label='data')
plt.scatter(x_test, y_test, color='black', marker='*', label='test')
plt.plot(x_linsp, y_gbc_1, linestyle='dashed', color='orange', label='5 estimators')
plt.plot(x_linsp, y_gbc_2, linestyle='dashed', color='green', label='10 estimators')
plt.plot(x_linsp, y_gbc_3, linestyle='dashed', color='grey', label='20 estimators')
plt.plot(x_linsp, y_gbc_4, linestyle='dashed', color='red', label='50 estimators')
plt.legend()
plt.yticks([0, 0.25,0.5,0.75,1])
plt.ylim([-0.2,1.2])
plt.show()