### Importing

In [2]:
import numpy as np
import pandas as pd
import pickle
import joblib

from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_iris
from sklearn.datasets import load_wine
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score,\
     precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

### Decision Tree

In [None]:
# data
iris = load_iris(as_frame=True)
X = iris.data
y = iris.target

In [None]:
X.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [None]:
X.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, y_train.shape

((120, 4), (120,))

In [None]:
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [None]:
X_train_scaled[:5]

array([[0.08823529, 0.66666667, 0.        , 0.04166667],
       [0.41176471, 1.        , 0.0877193 , 0.125     ],
       [0.70588235, 0.45833333, 0.59649123, 0.54166667],
       [0.14705882, 0.58333333, 0.10526316, 0.04166667],
       [0.02941176, 0.5       , 0.05263158, 0.04166667]])

In [None]:
# Decision Tree

model_dt = DecisionTreeClassifier()
model_dt.fit(X_train_scaled, y_train)
model_dt.score(X_test_scaled, y_test)

0.9666666666666667

In [None]:
# hyperparameter tuning

parameters = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_depth': [2,3,4,5],
    'min_samples_split': [2,4,6,8],
    'min_samples_leaf': [1,2,3]
}

model_dt_h = GridSearchCV(DecisionTreeClassifier(), parameters, scoring=['accuracy'], refit='accuracy')
model_dt_h.fit(X_train_scaled, y_train)

In [None]:
model_dt_h.best_params_

{'criterion': 'gini',
 'max_depth': 3,
 'min_samples_leaf': 3,
 'min_samples_split': 2,
 'splitter': 'random'}

In [None]:
y_pred = model_dt_h.predict(X_test_scaled)
y_pred[:5]

array([1, 0, 2, 1, 1])

In [None]:
y_test[:5]

73     1
18     0
118    2
78     1
76     1
Name: target, dtype: int64

In [None]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.82      1.00      0.90         9
           2       1.00      0.82      0.90        11

    accuracy                           0.93        30
   macro avg       0.94      0.94      0.93        30
weighted avg       0.95      0.93      0.93        30



In [None]:
acc = accuracy_score(y_test, y_pred)
# precision = precision_score(y_test, y_pred)
# recall = precision_score(y_test, y_pred)
# roc_auc = roc_auc_score(y_test, y_pred)
# f1 = f1_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print(f"accuracy - {acc}")
# print(f"precision - {precision}")
# print(f"recall - {recall}")
# print(f"roc auc score - {roc_auc}")
# print(f"f1 score - {f1}")
print(f"confusion matrix \n{confusion}")

accuracy - 0.9333333333333333
confusion matrix 
[[10  0  0]
 [ 0  9  0]
 [ 0  2  9]]


### Random Forest

In [None]:
breast_cancer = load_breast_cancer(as_frame=True)
X = breast_cancer.data
y = breast_cancer.target

X.shape, y.shape

((569, 30), (569,))

In [None]:
X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [None]:
X.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [None]:
np.unique(y)

array([0, 1])

In [None]:
sum(y==1) / len(y)

0.6274165202108963

In [None]:
transformer = PCA(n_components=5)
X_pca = transformer.fit_transform(X)
X_pca[:5]

array([[ 1.16014257e+03, -2.93917544e+02,  4.85783976e+01,
        -8.71197531e+00,  3.20004861e+01],
       [ 1.26912244e+03,  1.56301818e+01, -3.53945342e+01,
         1.78612832e+01, -4.33487404e+00],
       [ 9.95793889e+02,  3.91567432e+01, -1.70975298e+00,
         4.19934010e+00, -4.66529118e-01],
       [-4.07180803e+02, -6.73803198e+01,  8.67284783e+00,
        -1.17598673e+01,  7.11546109e+00],
       [ 9.30341180e+02,  1.89340742e+02,  1.37480074e+00,
         8.49918256e+00,  7.61328922e+00]])

In [None]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

X_train.shape, y_train.shape

((455, 5), (455,))

In [None]:
scaler2 = MinMaxScaler()

X_train_scaled = scaler2.fit_transform(X_train)
X_test_scaled = scaler2.fit_transform(X_test)
X_train_scaled[:5]

array([[0.02749364, 0.34581246, 0.19957842, 0.56726111, 0.35453034],
       [0.46813975, 0.33927906, 0.07342801, 0.42226664, 0.38046771],
       [0.02605476, 0.35941014, 0.21443748, 0.60277145, 0.42376553],
       [0.06218263, 0.35771521, 0.172822  , 0.36475231, 0.21057348],
       [0.04254619, 0.37047378, 0.23007997, 0.58648373, 0.4596695 ]])

In [None]:
# Random forest

model_rd = RandomForestClassifier()
model_rd.fit(X_train, y_train)

In [None]:
y_pred = model_rd.predict(X_test_scaled)
accuracy_score(y_test, y_pred)

0.6228070175438597

In [None]:
# hyper prameter tuning

params = {
    'n_estimators': [50, 60, 70, 80, 90, 100, 110, 120],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth':[2, 4, 6, 8, 10, 20, 30],
    'min_samples_split':[2, 4, 6, 8],
    'min_samples_leaf': [1,2,3,4,5,6,7],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
}

model_rd_h = RandomizedSearchCV(RandomForestClassifier(), params, n_iter=30, scoring='accuracy')
model_rd_h.fit(X_train_scaled, y_train)

In [None]:
cv_scores = cross_val_score(model_rd_h, X_scaled, y, cv=5, scoring='accuracy')
print(f'mean accuracy - {cv_scores.mean()}')

mean accuracy - 0.9648657040832168


In [None]:
y_pred_train = model_rd_h.predict(X_train_scaled)
y_pred_test = model_rd_h.predict(X_test_scaled)

y_pred_acc_train = accuracy_score(y_train, y_pred_train)
y_pred_acc_test = accuracy_score(y_test, y_pred_test)

print(f"accuracy in training data = {y_pred_acc_train}")
print(f"accuracy in testing data = {y_pred_acc_test}")

accuracy in training data = 0.9956043956043956
accuracy in testing data = 0.9035087719298246


In [None]:
report = classification_report(y_test, y_pred_test)
print('test data report\n')
print(report)

test data report
              precision    recall  f1-score   support

           0       0.80      1.00      0.89        43
           1       1.00      0.85      0.92        71

    accuracy                           0.90       114
   macro avg       0.90      0.92      0.90       114
weighted avg       0.92      0.90      0.90       114



In [None]:
report = classification_report(y_train, y_pred_train)
print('train data report\n')
print(report)

train data report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       169
           1       0.99      1.00      1.00       286

    accuracy                           1.00       455
   macro avg       1.00      0.99      1.00       455
weighted avg       1.00      1.00      1.00       455



In [None]:
acc = accuracy_score(y_test, y_pred_test)
precision = precision_score(y_test, y_pred_test)
recall = precision_score(y_test, y_pred_test)
roc_auc = roc_auc_score(y_test, y_pred_test)
f1 = f1_score(y_test, y_pred_test)
confusion = confusion_matrix(y_test, y_pred_test)

print('In test data - \n')
print(f"accuracy - {acc}")
print(f"precision - {precision}")
print(f"recall - {recall}")
print(f"roc auc score - {roc_auc}")
print(f"f1 score - {f1}")
print(f"confusion matrix \n{confusion}")

In test data - 

accuracy - 0.9035087719298246
precision - 1.0
recall - 1.0
roc auc score - 0.9225352112676056
f1 score - 0.916030534351145
confusion matrix 
[[43  0]
 [11 60]]


In [None]:
acc = accuracy_score(y_train, y_pred_train)
precision = precision_score(y_train, y_pred_train)
recall = precision_score(y_train, y_pred_train)
roc_auc = roc_auc_score(y_train, y_pred_train)
f1 = f1_score(y_train, y_pred_train)
confusion = confusion_matrix(y_train, y_pred_train)

print('In train data - \n')
print(f"accuracy - {acc}")
print(f"precision - {precision}")
print(f"recall - {recall}")
print(f"roc auc score - {roc_auc}")
print(f"f1 score - {f1}")
print(f"confusion matrix \n{confusion}")

In train data - 

accuracy - 0.9956043956043956
precision - 0.9930555555555556
recall - 0.9930555555555556
roc auc score - 0.9940828402366865
f1 score - 0.9965156794425087
confusion matrix 
[[167   2]
 [  0 286]]


### SVM

In [4]:
wine = load_wine(as_frame=True)
X = wine.data
y = wine.target

X.shape, y.shape

((178, 13), (178,))

In [5]:
X.sample(5)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
139,12.84,2.96,2.61,24.0,101.0,2.32,0.6,0.53,0.81,4.92,0.89,2.15,590.0
97,12.29,1.41,1.98,16.0,85.0,2.55,2.5,0.29,1.77,2.9,1.23,2.74,428.0
101,12.6,1.34,1.9,18.5,88.0,1.45,1.36,0.29,1.35,2.45,1.04,2.77,562.0
154,12.58,1.29,2.1,20.0,103.0,1.48,0.58,0.53,1.4,7.6,0.58,1.55,640.0
124,11.87,4.31,2.39,21.0,82.0,2.86,3.03,0.21,2.91,2.8,0.75,3.64,380.0


In [7]:
X.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


In [9]:
X.isna().sum()

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
dtype: int64

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape

((142, 13), (36, 13))

In [13]:
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [15]:
# svm model

model_svm = SVC()
model_svm.fit(X_train_scaled, y_train)
model_svm.score(X_test_scaled, y_test)

0.9166666666666666

In [19]:
params = {
    'C' : [1,2,3,4, 5,6,7,8],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [2,3,4,5,6, 7],
    'gamma': ['scale', 'auto'],
}

model_svm_h = GridSearchCV(SVC(), params)
model_svm_h.fit(X_train_scaled, y_train)

In [20]:
score_train = model_svm_h.score(X_train_scaled, y_train)
score_test = model_svm_h.score(X_test_scaled, y_test)

print(f"train dataset accuracy - {score_train}")
print(f"test dataset accuracy - {score_test}")

train dataset accuracy - 0.9859154929577465
test dataset accuracy - 0.9722222222222222


In [21]:
model_svm_h.best_params_

{'C': 1, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}

### Saving model

In [None]:
with open('model.pkl', 'wb') as file:
  pickle.dump(model_svm_h)