In [82]:
# Data Manupulation
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# impute
from sklearn.impute import SimpleImputer, KNNImputer

# ColumnTransformer
from sklearn.compose import ColumnTransformer

# preprocessing
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, StandardScaler, MinMaxScaler, MaxAbsScaler, PolynomialFeatures

# model_selection
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, RepeatedKFold

# metrics
from sklearn.metrics import accuracy_score, confusion_matrix

# liner_model
from sklearn.linear_model import LogisticRegression, SGDClassifier

# svm
from sklearn.svm import SVC, LinearSVC

# ensemble
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, StackingClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, AdaBoostClassifier

# tree
from sklearn.tree import DecisionTreeClassifier

# naive_bayes
from sklearn.naive_bayes import GaussianNB

# neighbors
from sklearn.neighbors import KNeighborsClassifier

# xgboost
from xgboost import XGBClassifier

# Read Dataset

In [2]:
df = pd.read_csv('../../test_data/iris.csv')
df.head(2)

Unnamed: 0.1,Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,0,5.1,3.5,1.4,0.2,setosa
1,1,4.9,3.0,1.4,0.2,setosa


In [3]:
del df['Unnamed: 0']
df.head(2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [4]:
df['species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [5]:
df['species'].value_counts()

setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64

# x, y split

In [6]:
x = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [7]:
x[0:2]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2]])

In [8]:
y[0:2]

array(['setosa', 'setosa'], dtype=object)

# LabelEncoder

In [9]:
le = LabelEncoder()
y = le.fit_transform(y)

In [10]:
y[0:2]

array([0, 0])

# Deal with Missing Values

In [11]:
imputer = KNNImputer(n_neighbors=5, missing_values=np.nan)
x[:, :] = imputer.fit_transform(x[:, :])

In [12]:
x[0:2]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2]])

# Train Test Split

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# Scalling

In [14]:
scaler = StandardScaler()
x_train[:, :] = scaler.fit_transform(x_train[:, :])
x_test[:, :] = scaler.fit_transform(x_test[:, :])

In [15]:
x_train[0:2]

array([[-1.01827123,  1.2864604 , -1.39338902, -1.3621769 ],
       [-0.7730102 ,  2.43545215, -1.33550342, -1.49647603]])

In [16]:
y_train[0:2]

array([0, 0])

In [17]:
x_test[0:2]

array([[ 0.25621067, -0.71903739,  0.58987181,  0.05585913],
       [-0.21299441,  1.61629772, -1.03834577, -1.06789518]])

In [18]:
y_test[0:2]

array([1, 0])

# Model Quality

In [19]:
def print_summary(classifier=None, x_train=None, y_train=None, y_test=None, y_pred=None):
    # confusion matrix
    con_matrix = confusion_matrix(y_test, y_pred)
    print("confusion matrix = \n", con_matrix)

    # accuracy score
    acc_score = accuracy_score(y_test, y_pred)
    print("accuracy score = ", acc_score)

    np.set_printoptions(precision=2)
    print("Compare with Predict and actual\n", np.concatenate((y_test.reshape(-1,1), y_pred.reshape(-1,1)), 1)[0:10])

# Logistic Regression

In [20]:
params = {
    'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 
    'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 
    'random_state': 0, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False
}
classifier = LogisticRegression()
classifier.set_params(**params)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

print_summary(classifier, x_train, y_train, y_test, y_pred)

confusion matrix = 
 [[15  0  0]
 [ 0 10  1]
 [ 0  0 12]]
accuracy score =  0.9736842105263158
Compare with Predict and actual
 [[1 1]
 [0 0]
 [2 2]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [2 2]
 [1 2]
 [1 1]]


# Support Vector

In [24]:
params = {'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 
          'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 
          'probability': False, 'random_state': 0, 'shrinking': True, 'tol': 0.001, 'verbose': False}

classifier = SVC()
classifier.set_params(**params)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

print_summary(classifier, x_train, y_train, y_test, y_pred)

confusion matrix = 
 [[15  0  0]
 [ 0 11  0]
 [ 0  0 12]]
accuracy score =  1.0
Compare with Predict and actual
 [[1 1]
 [0 0]
 [2 2]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [2 2]
 [1 1]
 [1 1]]


# LinearSVC

In [28]:
params = {
    'C': 1.0, 'class_weight': None, 'dual': True, 'fit_intercept': True, 'intercept_scaling': 1, 
    'loss': 'squared_hinge', 'max_iter': 1000, 'multi_class': 'ovr', 'penalty': 'l2', 'random_state': None,
    'tol': 0.0001, 'verbose': 0
}
classifier = LinearSVC()
classifier.set_params(**params)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

print_summary(classifier, x_train, y_train, y_test, y_pred)

confusion matrix = 
 [[15  0  0]
 [ 0 10  1]
 [ 0  0 12]]
accuracy score =  0.9736842105263158
Compare with Predict and actual
 [[1 1]
 [0 0]
 [2 2]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [2 2]
 [1 1]
 [1 1]]


# Random Forest Classifier

In [33]:
params = {
    'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None,
    'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0,
    'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100,
    'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False
}
classifier = RandomForestClassifier()
classifier.set_params(**params)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

print_summary(classifier, x_train, y_train, y_test, y_pred)

confusion matrix = 
 [[15  0  0]
 [ 0 11  0]
 [ 0  0 12]]
accuracy score =  1.0
Compare with Predict and actual
 [[1 1]
 [0 0]
 [2 2]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [2 2]
 [1 1]
 [1 1]]


# Decision Tree Classifier

In [38]:
params = {
    'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None,
    'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2,
    'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'
}
classifier = DecisionTreeClassifier()
classifier.set_params(**params)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

print_summary(classifier, x_train, y_train, y_test, y_pred)

confusion matrix = 
 [[15  0  0]
 [ 0 11  0]
 [ 0  0 12]]
accuracy score =  1.0
Compare with Predict and actual
 [[1 1]
 [0 0]
 [2 2]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [2 2]
 [1 1]
 [1 1]]


# SGD Classifier

In [42]:
params = {
    'alpha': 0.0001, 'average': False, 'class_weight': None, 'early_stopping': False, 'epsilon': 0.1, 
    'eta0': 0.0, 'fit_intercept': True, 'l1_ratio': 0.15, 'learning_rate': 'optimal', 'loss': 'hinge',
    'max_iter': 1000, 'n_iter_no_change': 5, 'n_jobs': None, 'penalty': 'l2', 'power_t': 0.5,
    'random_state': None, 'shuffle': True, 'tol': 0.001, 'validation_fraction': 0.1, 'verbose': 0,
    'warm_start': False
}
classifier = SGDClassifier()
classifier.set_params(**params)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

print_summary(classifier, x_train, y_train, y_test, y_pred)

confusion matrix = 
 [[15  0  0]
 [ 0 10  1]
 [ 0  0 12]]
accuracy score =  0.9736842105263158
Compare with Predict and actual
 [[1 1]
 [0 0]
 [2 2]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [2 2]
 [1 2]
 [1 1]]


# K-Neighbors Classifier

In [60]:
params = {
    'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 
    'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'
}
classifier = KNeighborsClassifier()
classifier.set_params(**params)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

print_summary(classifier, x_train, y_train, y_test, y_pred)

confusion matrix = 
 [[15  0  0]
 [ 0 11  0]
 [ 0  0 12]]
accuracy score =  1.0
Compare with Predict and actual
 [[1 1]
 [0 0]
 [2 2]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [2 2]
 [1 1]
 [1 1]]


# Gaussian Naive Bayes

In [54]:
params = {
    'priors': None, 'var_smoothing': 1e-09
}
classifier = GaussianNB()
classifier.set_params(**params)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

print_summary(classifier, x_train, y_train, y_test, y_pred)

confusion matrix = 
 [[15  0  0]
 [ 0 10  1]
 [ 0  0 12]]
accuracy score =  0.9736842105263158
Compare with Predict and actual
 [[1 1]
 [0 0]
 [2 2]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [2 2]
 [1 1]
 [1 1]]


# XGB Classifier

In [80]:
params = {
    'objective': 'binary:logistic', 'use_label_encoder': False, 'base_score': None, 'booster': None,
    'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 
    'enable_categorical': False, 'gamma': None, 'gpu_id': None, 'importance_type': None, 
    'interaction_constraints': None, 'learning_rate': None, 'max_delta_step': None, 'max_depth': None,
    'min_child_weight': None, 'missing': np.nan, 'monotone_constraints': None, 'n_estimators': 100,
    'n_jobs': None, 'num_parallel_tree': None, 'predictor': None, 'random_state': None, 'reg_alpha': None,
    'reg_lambda': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None,
    'validate_parameters': None, 'verbosity': None
}
classifier = XGBClassifier()
classifier.set_params(**params)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

print_summary(classifier, x_train, y_train, y_test, y_pred)

confusion matrix = 
 [[15  0  0]
 [ 0 11  0]
 [ 0  0 12]]
accuracy score =  1.0
Compare with Predict and actual
 [[1 1]
 [0 0]
 [2 2]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [2 2]
 [1 1]
 [1 1]]


# AdaBoost Classifier

In [83]:
params = {
    'algorithm': 'SAMME.R', 'base_estimator': None, 'learning_rate': 1.0, 'n_estimators': 50, 
    'random_state': None
}
classifier = AdaBoostClassifier()
classifier.set_params(**params)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

print_summary(classifier, x_train, y_train, y_test, y_pred)

confusion matrix = 
 [[15  0  0]
 [ 0 11  0]
 [ 0  0 12]]
accuracy score =  1.0
Compare with Predict and actual
 [[1 1]
 [0 0]
 [2 2]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [2 2]
 [1 1]
 [1 1]]


# Voting Classifier

In [51]:
# Example 1
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

voting_clf = VotingClassifier(
    estimators=[
        ('clf1', log_clf), ('clf2', rnd_clf), ('clf3', svm_clf)
    ],
    voting='hard'
)

voting_clf.fit(x_train, y_train)
y_pred = voting_clf.predict(x_test)
print_summary(classifier, x_train, y_train, y_test, y_pred)

print("#"*20)

# Example 2
clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()

eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')
eclf1 = eclf1.fit(x_train, y_train)
print(eclf1.predict(x_test))

eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft')
eclf2 = eclf2.fit(x_train, y_train)
print(eclf2.predict(x_test))

eclf3 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft', weights=[2,1,1], flatten_transform=True)
eclf3 = eclf3.fit(x_train, y_train)
print(eclf3.predict(x_test))
print_summary(classifier, x_train, y_train, y_test, y_pred)

confusion matrix = 
 [[15  0  0]
 [ 0 11  0]
 [ 0  0 12]]
accuracy score =  1.0
Compare with Predict and actual
 [[1 1]
 [0 0]
 [2 2]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [2 2]
 [1 1]
 [1 1]]
####################
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0]
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0]
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0]
confusion matrix = 
 [[15  0  0]
 [ 0 11  0]
 [ 0  0 12]]
accuracy score =  1.0
Compare with Predict and actual
 [[1 1]
 [0 0]
 [2 2]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [2 2]
 [1 1]
 [1 1]]


# Bagging Classifier

In [64]:
params = {
    'base_estimator': DecisionTreeClassifier(), 
    'bootstrap': True, 'bootstrap_features': False, 'max_features': 1.0, 
    'max_samples': 1.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None,
    'verbose': 0, 'warm_start': False
}
classifier = BaggingClassifier()
classifier.set_params(**params)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

print_summary(classifier, x_train, y_train, y_test, y_pred)

confusion matrix = 
 [[15  0  0]
 [ 0 11  0]
 [ 0  0 12]]
accuracy score =  1.0
Compare with Predict and actual
 [[1 1]
 [0 0]
 [2 2]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [2 2]
 [1 1]
 [1 1]]


# Stacking

In [67]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

estimators=[
    ('clf1', log_clf), ('clf2', rnd_clf), ('clf3', svm_clf)
]
classifier = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

print_summary(classifier, x_train, y_train, y_test, y_pred)

confusion matrix = 
 [[15  0  0]
 [ 0 11  0]
 [ 0  0 12]]
accuracy score =  1.0
Compare with Predict and actual
 [[1 1]
 [0 0]
 [2 2]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [2 2]
 [1 1]
 [1 1]]


# Gradient Boosted

In [70]:
params = {
    'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 1.0, 'loss': 'deviance', 
    'max_depth': 1, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 
    'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 
    'n_iter_no_change': None, 'random_state': 0, 'subsample': 1.0, 'tol': 0.0001, 
    'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False
}
classifier = GradientBoostingClassifier()
classifier.set_params(**params)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

print_summary(classifier, x_train, y_train, y_test, y_pred)

confusion matrix = 
 [[15  0  0]
 [ 0 10  1]
 [ 0  0 12]]
accuracy score =  0.9736842105263158
Compare with Predict and actual
 [[1 1]
 [0 0]
 [2 2]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [2 2]
 [1 2]
 [1 1]]


# Hist Gradient Boosting Classifier

In [72]:
params = {
    'categorical_features': None, 'early_stopping': 'auto', 'l2_regularization': 0.0, 'learning_rate': 0.1,
    'loss': 'auto', 'max_bins': 255, 'max_depth': None, 'max_iter': 100, 'max_leaf_nodes': 31, 
    'min_samples_leaf': 20, 'monotonic_cst': None, 'n_iter_no_change': 10, 'random_state': None, 
    'scoring': 'loss', 'tol': 1e-07, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False
}
classifier = HistGradientBoostingClassifier()
classifier.set_params(**params)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

print_summary(classifier, x_train, y_train, y_test, y_pred)

confusion matrix = 
 [[15  0  0]
 [ 0 11  0]
 [ 0  0 12]]
accuracy score =  1.0
Compare with Predict and actual
 [[1 1]
 [0 0]
 [2 2]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [2 2]
 [1 1]
 [1 1]]
