In [1]:
# Required libraries
import pickle
import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('dataset/hist.csv')
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,247,248,249,250,251,252,253,254,255,labels
0,11.0,11.0,32.0,70.0,191.0,388.0,579.0,559.0,673.0,569.0,...,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,metal
1,48974.0,178.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,27.0,29.0,31.0,56.0,52.0,69.0,78.0,69.0,60.0,glass
2,48418.0,516.0,162.0,35.0,17.0,3.0,0.0,1.0,0.0,0.0,...,86.0,76.0,56.0,36.0,23.0,19.0,17.0,9.0,21.0,paper
3,49152.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,1.0,1.0,3.0,3.0,0.0,4.0,3.0,plastic
4,49152.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,14.0,21.0,43.0,35.0,57.0,55.0,63.0,72.0,cardboard


In [3]:
X = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [4]:
# Random forest
rfc = RandomForestClassifier() 
rfc.fit(X_train, y_train)
preds = rfc.predict(X_test)
print('Random Forest accuracy :- ', accuracy_score(preds, y_test) * 100)

Random Forest accuracy :-  93.45238095238095


In [5]:
# XGBoost
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
preds = xgb.predict(X_test)
print('XGBoost accuracy :- ', accuracy_score(preds, y_test) * 100)

XGBoost accuracy :-  74.14285714285714


In [6]:
# Decision tree
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
preds = dtc.predict(X_test)
print('Decision Tree accuracy :- ', accuracy_score(preds, y_test) * 100)

Decision Tree accuracy :-  81.9047619047619


In [7]:
# Tuning Random Forest with GridSearchCV
clc = RandomForestClassifier()
parameters = [{ 'max_depth': [20, 30, 40], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 5, 10], 
               'n_estimators': [100, 200, 300] }]
grid_search = GridSearchCV(estimator = clc, param_grid = parameters, 
                scoring = 'accuracy', cv = 5, n_jobs = -1, verbose = 2)

grid_search = grid_search.fit(X, y)
best_acc = grid_search.best_score_
best_param = grid_search.best_params_

# Printing best parameters and accuracy
print(best_param)
print(best_acc)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 150 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done 353 tasks      | elapsed: 22.8min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed: 26.1min finished


{'max_depth': 40, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
0.9784285714285714


In [10]:
# Tuning Random Forest with GridSearchCV
clc = RandomForestClassifier()
parameters = [{ 'max_depth': [40, 60, 80], 'min_samples_leaf': [1], 'min_samples_split': [2], 
               'n_estimators': [200, 250] }]
grid_search = GridSearchCV(estimator = clc, param_grid = parameters, 
                scoring = 'accuracy', cv = 5, n_jobs = -1, verbose = 2)

grid_search = grid_search.fit(X, y)
best_acc = grid_search.best_score_
best_param = grid_search.best_params_

# Printing best parameters and accuracy
print(best_param)
print(best_acc)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.2min finished


{'max_depth': 80, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 250}
0.9782142857142857


In [11]:
# Tuning Random Forest with GridSearchCV
clc = RandomForestClassifier()
parameters = [{ 'max_depth': [80, 100, 120], 'min_samples_leaf': [1], 'min_samples_split': [2], 
               'n_estimators': [250] }]
grid_search = GridSearchCV(estimator = clc, param_grid = parameters, 
                scoring = 'accuracy', cv = 5, n_jobs = -1, verbose = 2)

grid_search = grid_search.fit(X, y)
best_acc = grid_search.best_score_
best_param = grid_search.best_params_

# Printing best parameters and accuracy
print(best_param)
print(best_acc)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed:   57.7s remaining:   14.3s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.4min finished


{'max_depth': 80, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 250}
0.9786428571428571


In [12]:
# Random Forest training on complete data
final_clc = RandomForestClassifier(max_depth=40, min_samples_leaf=1, min_samples_split=2, n_estimators=200)
final_clc.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=40, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [13]:
# Save classifier
filename = 'model/RFCmodel.sav'
pickle.dump(final_clc, open(filename, 'wb'))

In [4]:
# Random Forest training on complete data
final_clc = XGBClassifier()
final_clc.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [5]:
# Save classifier
filename = 'model/XGBmodel.sav'
pickle.dump(final_clc, open(filename, 'wb'))