# <a href="https://github.com/DataDisca/classification_challenge"> DataDisca_classification_challenge </a>

In [3]:
import openml
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Classification Algorithm
* K-nearest neighbour classification
* Linear discriminant analysis
* Naïve Bayes classifier
* Support vector machine
* Logistic regression
* Random forests
* Ada boost
* Gradient boost
* XGBoost

### Parameters
* K-nearest neighbour classification
    * <b> n_neighbors </b> int, default=5 Number of neighbors to use 
    * <b>weight:</b> The basic nearest neighbors regression uses uniform weights: that is, each point in the local neighborhood contributes uniformly to the classification of a query point. Under some circumstances, it can be advantageous to weight points such that nearby points contribute more to the regression than faraway points.
    * <b> algorithm: </b>
        * <b> brute </b> The brute-force computation of distances between all pairs of points in the dataset: for  samples in  dimensions. Efficient brute-force neighbors searches can be very competitive for small data samples. However, as the number of samples  grows, the brute-force approach quickly becomes infeasible.
        * <b> kd_tree </b> KD tree are the k dim binary tree. Split one of dimention(x1) to median, left child would be smaller and right would be greater then median of x1 then split from median x2 in both left and right child. Do until <b> leaf_size </b> condition meet. (D dimentions) if D<20 works well. high D cause curse of dimentionality.
        * <b> ball_tree </b> computational expensive because of hyper-sphere boundaries. but suitable with high dimentions. 
        
        
* Linear discriminant analysis
* Naïve Bayes classifier
* Support vector machine
* Logistic regression
* Random forests
* Ada boost
* Gradient boost
* XGBoost

In [11]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import minmax_scale
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

### import classification model 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer



In [5]:
def getDataset(dataset):
    import openml
    data =  openml.datasets.get_dataset(dataset).get_data()
    y_col = data[3][np.where(data[2])[0][0]]   ### get the index of target col and get name of col
    y = data[0][y_col]                         ### target 
    X = data[0].drop(y_col, axis=1)            ### features
    return X,y
 
X,y = getDataset('iris')

In [178]:
# def classifiers(dataset, model):
    
#     X,y = getDataset(dataset)
    
#     pipelines = {
#     "KNN":Pipeline([('scale',MinMaxScaler()),('KNN', KNeighborsClassifier())]),
#     "LDA":Pipeline([('scale',MinMaxScaler()),('LDA', LinearDiscriminantAnalysis())]),
#     "Naive_Bayes": Pipeline([('scale',MinMaxScaler()),('Naive Bayes', GaussianNB())]),
#     "SVC":Pipeline([('scale',MinMaxScaler()),('SVC',SVC())]),
#     "Logistic_Regression": Pipeline([('scale',MinMaxScaler()),('Logistic Regression', LogisticRegression())]),
#     "Random_Forest": Pipeline([('scale',MinMaxScaler()),('Random Forest', RandomForestClassifier())]),
#     "Ada_Boost": Pipeline([('scale',MinMaxScaler()),('Ada Boost', AdaBoostClassifier())]),
#     "Gradient_Boost": Pipeline([('scale',MinMaxScaler()),('Gradient Boost', GradientBoostingClassifier())]),
#     "XGBoost": Pipeline([('scale',MinMaxScaler()),('XGBoost', XGBClassifier())])
#     }
    
    
#     return pipelines[model].fit(X,y).score(X,y)


In [6]:
pipelines = {
    "KNN":Pipeline([('scale',MinMaxScaler()),('KNN', KNeighborsClassifier())]),
    "LDA":Pipeline([('scale',MinMaxScaler()),('LDA', LinearDiscriminantAnalysis())]),
    "Naive_Bayes": Pipeline([('scale',MinMaxScaler()),('Naive Bayes', GaussianNB())]),
    "SVC":Pipeline([('scale',MinMaxScaler()),('SVC',SVC())]),
    "Logistic_Regression": Pipeline([('scale',MinMaxScaler()),('Logistic Regression', LogisticRegression())]),
    "Random_Forest": Pipeline([('scale',MinMaxScaler()),('Random Forest', RandomForestClassifier())]),
    "Ada_Boost": Pipeline([('scale',MinMaxScaler()),('Ada Boost', AdaBoostClassifier())]),
    "Gradient_Boost": Pipeline([('scale',MinMaxScaler()),('Gradient Boost', GradientBoostingClassifier())]),
    "XGBoost": Pipeline([('scale',MinMaxScaler()),('XGBoost', XGBClassifier())])
    }

In [42]:
param = {
    "KNN":{
        'KNN__n_neighbors': range(1,20,2),
        'KNN__metric': ['euclidean','manhattan'], 
        'KNN__weights':['uniform', 'distance']
    },
    "LDA": {
        'LDA__n_components': range(1,10),
#         'LDA__solver': ['svd', 'lsqr', 'eigen'], 
#         'LDA__shrinkage':range(0, 1)
    },
    "Naive_Bayes": {
        
    },
    "SVC": {
        'SVC__C': range(0,11), 
        'SVC__kernel': {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}
    },  
    
    "Logistic_Regression": {
        'Logistic Regression__C': [0.001,0.01,0.1,1.0,10,100],
        'Logistic Regression__penalty': ['none','l1','l2','elasticnet']
    },
    "Random_Forest": {
        'Random Forest__max_depth': [1,2,3,4,5,10,15,20,30,40,50,70,100],
        'Random Forest__max_features': ['auto', 'sqrt', 'log2'],
        'Random Forest__min_samples_leaf': [1,2,4],
        'Random Forest__min_samples_split': [2,5,10],
        'Random Forest__n_estimators': [10,50,100,500,1000]
    },
    "Ada_Boost": {
        'Ada Boost__learning_rate': [0.001,0.01,0.1,0.3,0.5,0.7,0.9,1.0],
        'Ada Boost__n_estimators': [10,50,100,500,1000]
    },
    "Gradient_Boost": {
        'Gradient Boost__learning_rate': [0.001,0.01,0.1,0.3,0.5,0.7,0.9,1.0],
        'Gradient Boost__max_depth': [1,2,3,4,5,10,15,20,30,40,50,70,100],
        'Gradient Boost__max_features': ['auto', 'sqrt', 'log2'],
        'Gradient Boost__min_samples_leaf': [1,2,4],
        'Gradient Boost__min_samples_split': [2,5,10],
        'Gradient Boost__n_estimators': [10,50,100,500,1000]
    },
    "XGBoost": {
        'XGBoost__learning_rate': [0.001,0.01,0.1,0.3,0.5,0.7,0.9,1.0],
        'XGBoost__gamma': [0,0.5,1],
        'XGBoost__max_depth': [1,2,3,4,5,10,15,20,30,40,50,70,100],
        'XGBoost__n_estimators': [10,50,100,500,1000]
               
    }
    }


In [8]:
# pipelines['Naive_Bayes'].get_params()

In [43]:
# models = ["KNN","LDA","Naive_Bayes","SVC","Logistic_Regression","Random_Forest","Ada_Boost","Gradient_Boost",
#           "XGBoost" ]

models = ['LDA']
dataset = 'iris'

# score = {'F1_Score' : f1_score}
columns = ['Dataset','Classifier','Random_state','F1_Score']   ## + split score


X,y = getDataset(dataset)

f1_scorer = make_scorer(f1_score, average='macro')


rand_state = [0] #list(range(0,9,1))

for m in models:
    for rs in rand_state:
        print(m)
        print(rs)
        gs = GridSearchCV(estimator=pipelines[m],
                          param_grid=param[m],
                          scoring = f1_scorer,
                          return_train_score=True,
                          cv=StratifiedKFold(n_splits=10, random_state=rs, shuffle=True))
        gs.fit(X,y)
        
        t = pd.DataFrame(gs.cv_results_)
        t['Random_state'] = rs
    t['classification']  = m


    
        
        




LDA
0


Traceback (most recent call last):
  File "/home/jd/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/jd/.local/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/jd/.local/lib/python3.8/site-packages/sklearn/discriminant_analysis.py", line 539, in fit
    raise ValueError(
ValueError: n_components cannot be larger than min(n_features, n_classes - 1).

Traceback (most recent call last):
  File "/home/jd/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/jd/.local/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/jd/.local/lib/python3.8/site-packages/sklearn/discriminant_analy

Traceback (most recent call last):
  File "/home/jd/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/jd/.local/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/jd/.local/lib/python3.8/site-packages/sklearn/discriminant_analysis.py", line 539, in fit
    raise ValueError(
ValueError: n_components cannot be larger than min(n_features, n_classes - 1).

Traceback (most recent call last):
  File "/home/jd/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/jd/.local/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/jd/.local/lib/python3.8/site-packages/sklearn/discriminant_analy

Traceback (most recent call last):
  File "/home/jd/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/jd/.local/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/jd/.local/lib/python3.8/site-packages/sklearn/discriminant_analysis.py", line 539, in fit
    raise ValueError(
ValueError: n_components cannot be larger than min(n_features, n_classes - 1).

Traceback (most recent call last):
  File "/home/jd/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/jd/.local/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/jd/.local/lib/python3.8/site-packages/sklearn/discriminant_analy

In [141]:
rand_state = list(range(0,9,1))
for m in models:
    for rs in rand_state:
        print(m)
        print(rs)
        gs = GridSearchCV(estimator=pipelines[m], param_grid=param[m],cv=StratifiedKFold(n_splits=10, random_state=rs, shuffle=True))
        gs.fit(X,y)
        
#         temp = {'Random_state' : rs , 'Classifier' : m, }
        print(pd.DataFrame(gs.cv_results_))df = pd.DataFrame(columns=columns)

In [147]:
df.append({'Dataset':'iris', 'Classifier':'KNN','Random_state':0,'KNN__n_neighbors': 1, 'KNN__metric':1,'KNN__weights':2}, ignore_index = True)
 

Unnamed: 0,Dataset,Classifier,Random_state,KNN__n_neighbors,KNN__metric,KNN__weights
0,iris,KNN,0,1,1,2


In [192]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score, make_scorer

pipelines['KNN'].fit(X,y).score(X,y)



# f1_score()

0.96

In [30]:
# t.append({'a':100}, ignore_index=True)

In [40]:
t

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_KNN__metric,param_KNN__n_neighbors,param_KNN__weights,params,split0_test_score,split1_test_score,...,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score,Random_state,classification
0,0.004362,0.000168,0.003921,0.000318,euclidean,1,uniform,"{'KNN__metric': 'euclidean', 'KNN__n_neighbors...",1.0,0.93266,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0,KNN
1,0.00463,0.000683,0.004074,0.001176,euclidean,1,distance,"{'KNN__metric': 'euclidean', 'KNN__n_neighbors...",1.0,0.93266,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0,KNN
2,0.004308,0.000102,0.00385,0.0001,euclidean,3,uniform,"{'KNN__metric': 'euclidean', 'KNN__n_neighbors...",1.0,0.93266,...,0.962958,0.970356,0.962958,0.962958,0.97037,0.962958,0.965181,0.003394,0,KNN
3,0.004255,4.9e-05,0.003462,0.000104,euclidean,3,distance,"{'KNN__metric': 'euclidean', 'KNN__n_neighbors...",1.0,0.93266,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0,KNN
4,0.004307,3.7e-05,0.003836,3.9e-05,euclidean,5,uniform,"{'KNN__metric': 'euclidean', 'KNN__n_neighbors...",1.0,0.93266,...,0.948142,0.962922,0.962958,0.955534,0.97037,0.970356,0.961462,0.006458,0,KNN
5,0.004282,6e-05,0.003488,0.000228,euclidean,5,distance,"{'KNN__metric': 'euclidean', 'KNN__n_neighbors...",1.0,0.93266,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0,KNN
6,0.004911,0.000354,0.004124,0.000269,euclidean,7,uniform,"{'KNN__metric': 'euclidean', 'KNN__n_neighbors...",1.0,0.93266,...,0.962958,0.962922,0.970356,0.962922,0.977775,0.977753,0.968867,0.006469,0,KNN
7,0.004601,0.000939,0.003416,0.000104,euclidean,7,distance,"{'KNN__metric': 'euclidean', 'KNN__n_neighbors...",1.0,0.93266,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0,KNN
8,0.004603,0.000425,0.004241,0.000283,euclidean,9,uniform,"{'KNN__metric': 'euclidean', 'KNN__n_neighbors...",1.0,0.93266,...,0.97037,0.97037,0.970356,0.97037,0.977775,0.970356,0.970362,0.004691,0,KNN
9,0.00424,0.000138,0.003423,7.8e-05,euclidean,9,distance,"{'KNN__metric': 'euclidean', 'KNN__n_neighbors...",1.0,0.93266,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0,KNN


In [37]:
t.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_KNN__metric', 'param_KNN__n_neighbors', 'param_KNN__weights',
       'params', 'split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'split5_test_score',
       'split6_test_score', 'split7_test_score', 'split8_test_score',
       'split9_test_score', 'mean_test_score', 'std_test_score',
       'rank_test_score', 'split0_train_score', 'split1_train_score',
       'split2_train_score', 'split3_train_score', 'split4_train_score',
       'split5_train_score', 'split6_train_score', 'split7_train_score',
       'split8_train_score', 'split9_train_score', 'mean_train_score',
       'std_train_score', 'Random_state'],
      dtype='object')