In [1]:
import pandas as pd
import numpy as np
import os
import shutil
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from numpy import mean
from numpy import std
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaggingClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score

In [2]:
# read csv using pandas
from numpy import nan
from sklearn import preprocessing


df = pd.read_csv('./secondary_data.csv',sep = ';')
label_encoder = preprocessing.LabelEncoder()
df['class'] = label_encoder.fit_transform(df['class'])

# temp = label_encoder.fit_transform(df['class'])

df = df.replace(nan,'z')
print(df)

       class  cap-diameter cap-shape cap-surface cap-color  \
0          1         15.26         x           g         o   
1          1         16.60         x           g         o   
2          1         14.07         x           g         o   
3          1         14.17         f           h         e   
4          1         14.64         x           h         o   
...      ...           ...       ...         ...       ...   
61064      1          1.18         s           s         y   
61065      1          1.27         f           s         y   
61066      1          1.27         s           s         y   
61067      1          1.24         f           s         y   
61068      1          1.17         s           s         y   

      does-bruise-or-bleed gill-attachment gill-spacing gill-color  \
0                        f               e            z          w   
1                        f               e            z          w   
2                        f               e   

In [3]:
categorical_col = ['cap-shape',
                   
                   'cap-surface','cap-color','does-bruise-or-bleed','gill-attachment','gill-spacing',
                  'gill-color','stem-root','stem-surface','stem-color','veil-type','veil-color','has-ring',
                  'ring-type','spore-print-color','habitat','season']
transformer = make_column_transformer(
    (OneHotEncoder(sparse=False), categorical_col),
    remainder='passthrough')

transformed = transformer.fit_transform(df)
transformed_df = pd.DataFrame(transformed, columns=transformer.get_feature_names())
print(transformed_df.head())


   onehotencoder__x0_b  onehotencoder__x0_c  onehotencoder__x0_f  \
0                  0.0                  0.0                  0.0   
1                  0.0                  0.0                  0.0   
2                  0.0                  0.0                  0.0   
3                  0.0                  0.0                  1.0   
4                  0.0                  0.0                  0.0   

   onehotencoder__x0_o  onehotencoder__x0_p  onehotencoder__x0_s  \
0                  0.0                  0.0                  0.0   
1                  0.0                  0.0                  0.0   
2                  0.0                  0.0                  0.0   
3                  0.0                  0.0                  0.0   
4                  0.0                  0.0                  0.0   

   onehotencoder__x0_x  onehotencoder__x1_d  onehotencoder__x1_e  \
0                  1.0                  0.0                  0.0   
1                  1.0                  0.0   



In [4]:
y = (transformed_df.loc[:,['class']].values).reshape(-1)
transformed_df.drop(['class'], axis=1)
X = transformed_df.iloc[:,0:-1].values
print(X)

[[ 0.    0.    0.   ...  1.   15.26 16.95]
 [ 0.    0.    0.   ...  1.   16.6  17.99]
 [ 0.    0.    0.   ...  1.   14.07 17.8 ]
 ...
 [ 0.    0.    0.   ...  1.    1.27  3.86]
 [ 0.    0.    1.   ...  1.    1.24  3.56]
 [ 0.    0.    0.   ...  1.    1.17  3.25]]


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [6]:
param_grid = {'max_depth': [5,10],'n_estimators': [200]}

model = RandomForestClassifier()
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv = 10)
grid.fit(X_train, y_train)

best_grid = grid.best_params_
print('RandomForestClassifier:',best_grid)
rfc=RandomForestClassifier(random_state=0, n_estimators= best_grid['n_estimators'], max_depth=best_grid['max_depth'])

rfc.fit(X_train,y_train)

print('Model Train Score: %.3f, ' %rfc.score(X_train, y_train))
joblib.dump(rfc, 'rfc.pkl')

RandomForestClassifier: {'max_depth': 5, 'n_estimators': 200}
Model Train Score: 1.000, 


['rfc.pkl']

In [None]:
# param_grid = {'C': [0.01,0.1, 1, 10], 
#               'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
#               'kernel': ['rbf']} 

# model = svm.SVC()

# grid = GridSearchCV(estimator=model, param_grid=param_grid, cv = 15)
# grid.fit(X_train, y_train)

# best_grid = grid.best_params_
# print('SVM Grid:',best_grid)

# model = svm.SVC(C=best_grid['C'],kernel = best_grid['kernel'],
#                   gamma = best_grid['gamma'],random_state=42)

# model.fit(train_X,train_Y)
# print('Model Train Score: %.3f, ' %model.score(X_train, y_train))
# joblib.dump(model, 'svm_grid.pkl')

In [9]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

target_names = ['1', '0']

yhat_rfc = rfc.predict(X_test)
rfc_report = classification_report(y_test, yhat_rfc, target_names=target_names)
rfc_accuracy = accuracy_score(y_test, yhat_rfc)
print("Random Forest Classfication Test Set Prediction:")
print(rfc_report)
print(f"Test Accuracy :{rfc_accuracy} ")



print(y_test)
print(yhat_rfc)
print('accuracy for ',accuracy_score(y_test,yhat_rfc))
# print('jaccard_similarity_score for k=',jaccard_similarity_score(y_test,y_hat))
# print('f1_score for k=',f1_score(y_test,y_hat,average='macro'))

Random Forest Classfication Test Set Prediction:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00      8923
           0       1.00      1.00      1.00     11230

    accuracy                           1.00     20153
   macro avg       1.00      1.00      1.00     20153
weighted avg       1.00      1.00      1.00     20153

Test Accuracy :1.0 
[0. 1. 0. ... 0. 1. 0.]
[0. 1. 0. ... 0. 1. 0.]
accuracy for  1.0
