In [1]:
# Load libraries
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV 

import warnings
warnings.filterwarnings('ignore')

## Exploratory Data Analysis & Data Preparation

In [2]:
# Load dataset
dataset = pd.read_csv("Data.csv")

In [3]:
# dataset = dataset.drop(columns = ['UnitPrice','Rating'])
dataset.head()
# dataset.sample(10)

Unnamed: 0,Collar_bone_x,Collar_bone_y,Collar_bone_z,Fore_arm_x,Fore_arm_y,Fore_arm_z,Hand_x_test,Hand_y_test,Hand_z_test,Upper_arm_x,Upper_arm_y,Upper_arm_z,Output
0,-0.155914,0.228653,2.120145,-0.530222,0.282706,2.109175,-0.58682,0.292541,2.103178,-0.343568,0.267775,2.130795,0
1,-0.118622,0.211508,2.183545,-0.369436,0.278433,1.948353,-0.39021,0.338841,1.909705,-0.285245,0.148957,2.067946,0
2,-0.209549,0.211172,2.116959,-0.243034,0.294249,1.793241,-0.22086,0.311429,1.783123,-0.270904,0.142925,1.896383,0
3,-0.124988,0.2148,2.193182,-0.140547,0.25977,1.712908,-0.141839,0.256737,1.714774,-0.233295,0.215391,1.948592,0
4,-0.163393,0.347503,2.142695,-0.184431,-0.156693,2.006275,-0.173476,-0.223631,1.98903,-0.197073,0.073565,2.104431,0


In [4]:
# shape
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 13 columns):
Collar_bone_x    20 non-null float64
Collar_bone_y    20 non-null float64
Collar_bone_z    20 non-null float64
Fore_arm_x       20 non-null float64
Fore_arm_y       20 non-null float64
Fore_arm_z       20 non-null float64
Hand_x_test      20 non-null float64
Hand_y_test      20 non-null float64
Hand_z_test      20 non-null float64
Upper_arm_x      20 non-null float64
Upper_arm_y      20 non-null float64
Upper_arm_z      20 non-null float64
Output           20 non-null int64
dtypes: float64(12), int64(1)
memory usage: 2.1 KB


In [5]:
# shape
print(dataset.shape)
# dataset descriptions
print(dataset.describe())

(20, 13)
       Collar_bone_x  Collar_bone_y  Collar_bone_z  Fore_arm_x  Fore_arm_y  \
count      20.000000      20.000000      20.000000   20.000000   20.000000   
mean       -0.156388       0.287747       2.151467   -0.281257    0.232572   
std         0.051161       0.061330       0.073076    0.171858    0.149264   
min        -0.254668       0.191613       1.965306   -0.596950   -0.156693   
25%        -0.181370       0.220749       2.119349   -0.444100    0.164294   
50%        -0.157482       0.309796       2.142040   -0.206816    0.269652   
75%        -0.118338       0.337123       2.197529   -0.144663    0.305285   
max        -0.062534       0.368385       2.290074   -0.079191    0.549684   

       Fore_arm_z  Hand_x_test  Hand_y_test  Hand_z_test  Upper_arm_x  \
count   20.000000    20.000000    20.000000    20.000000    20.000000   
mean     1.906220    -0.277814     0.229388     1.881084    -0.273937   
std      0.193315     0.204655     0.180510     0.212168     0.077391

In [6]:
# class distribution
dataset.groupby('Output').size()

Output
0    10
1    10
dtype: int64

## Selecting and Training Models

In [8]:
# Split dataset into train, test and validation sets
# train - 80
# test - 20
X = dataset.drop(columns=['Output'])
Y = dataset['Output']
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=0.20)

In [11]:
# Make predictions on validation dataset with KNN classifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))

0.75
[[3 1]
 [0 0]]
              precision    recall  f1-score   support

           0       1.00      0.75      0.86         4
           1       0.00      0.00      0.00         0

    accuracy                           0.75         4
   macro avg       0.50      0.38      0.43         4
weighted avg       1.00      0.75      0.86         4



In [12]:
# Make predictions on validation dataset with SVM classifier
SVM = SVC(probability=True)
SVM.fit(X_train, Y_train)
predictions = SVM.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))

0.0
[[0 4]
 [0 0]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       4.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       4.0
   macro avg       0.00      0.00      0.00       4.0
weighted avg       0.00      0.00      0.00       4.0



In [13]:
# Make predictions on validation dataset with CART (Decision Tree Classifier)
CART =DecisionTreeClassifier()
CART.fit(X_train, Y_train)
predictions = CART.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))

0.25
[[1 3]
 [0 0]]
              precision    recall  f1-score   support

           0       1.00      0.25      0.40         4
           1       0.00      0.00      0.00         0

    accuracy                           0.25         4
   macro avg       0.50      0.12      0.20         4
weighted avg       1.00      0.25      0.40         4



## Hyperparameter Tuning using GridSearchCV

In [14]:
# defining parameter range 
# param_grid = {'C': [0.1, 1, 10, 100, 1000],  
#               'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
#               'kernel': ['rbf']}  
  
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf']} 

grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3) 
  
# fitting the model for grid search 
grid.fit(X_train, Y_train) 

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.500 total time=   0.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.667 total time=   0.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.667 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.667 total time=   0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.667 total time=   0.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.500 total time=   0.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.667 total time=   0.0s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.667 total time=   0.0s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.667 total time=   0.0s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.667 total time=   0.0s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.500 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

[CV 3/5] END .......C=1000, gamma=1, kernel=rbf;, score=1.000 total time=   0.0s
[CV 4/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.333 total time=   0.0s
[CV 5/5] END .......C=1000, gamma=1, kernel=rbf;, score=1.000 total time=   0.0s
[CV 1/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.250 total time=   0.0s
[CV 2/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.667 total time=   0.0s
[CV 3/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.667 total time=   0.0s
[CV 4/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.000 total time=   0.0s
[CV 5/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.333 total time=   0.0s
[CV 1/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.250 total time=   0.0s
[CV 2/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.667 total time=   0.0s
[CV 3/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.667 total time=   0.0s
[CV 4/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.333 total time=   0.0s
[CV 5/5] END ....C=1000, gam

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             verbose=3)

In [15]:
# print best parameter after tuning 
# print(grid.best_params_) 

# print how our model looks after hyper-parameter tuning 
# print(grid.best_estimator_) 

grid_predictions = grid.predict(X_validation) 
print(accuracy_score(Y_validation, grid_predictions))
print(confusion_matrix(Y_validation, grid_predictions))
print(classification_report(Y_validation, grid_predictions))

0.5
[[2 2]
 [0 0]]
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         4
           1       0.00      0.00      0.00         0

    accuracy                           0.50         4
   macro avg       0.50      0.25      0.33         4
weighted avg       1.00      0.50      0.67         4



## Model Training

In [16]:
# model training
model = grid.fit(X_train, Y_train)
print(model)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.500 total time=   0.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.667 total time=   0.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.667 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.667 total time=   0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.667 total time=   0.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.500 total time=   0.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.667 total time=   0.0s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.667 total time=   0.0s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.667 total time=   0.0s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.667 total time=   0.0s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.500 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

[CV 3/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.667 total time=   0.0s
[CV 4/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.000 total time=   0.0s
[CV 5/5] END .....C=1000, gamma=0.1, kernel=rbf;, score=0.333 total time=   0.0s
[CV 1/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.250 total time=   0.0s
[CV 2/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.667 total time=   0.0s
[CV 3/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.667 total time=   0.0s
[CV 4/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.333 total time=   0.0s
[CV 5/5] END ....C=1000, gamma=0.01, kernel=rbf;, score=0.667 total time=   0.0s
[CV 1/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.500 total time=   0.0s
[CV 2/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.667 total time=   0.0s
[CV 3/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.667 total time=   0.0s
[CV 4/5] END ...C=1000, gamma=0.001, kernel=rbf;, score=0.667 total time=   0.0s
[CV 5/5] END ...C=1000, gamm

## Sample Test

In [26]:
def predict_mpg(config, model):
    
    if type(config) == dict:
        dataset = pd.DataFrame(config)
    else:
        dataset = config
    
    
    y_pred = model.predict(dataset)
#     print(y_pred)
    return y_pred
    

In [27]:
config = {
    'Collar_bone_x': [-0.1559136],
    'Collar_bone_y': [0.2286534],
    'Collar_bone_z': [2.120145],
    'Fore_arm_x': [-0.5302216],
    'Fore_arm_y': [0.2827062],
    'Fore_arm_z': [2.109175],
    'Hand_x_test': [-0.5868202],
    'Hand_y_test': [0.2925405],
    'Hand_z_test': [2.103178],
    'Upper_arm_x': [-0.3435675],
    'Upper_arm_y': [0.2677751],
    'Upper_arm_z': [2.130795]
    
}

predict_mpg(config, model)

array([0], dtype=int64)

In [28]:
# dataset.sample(10)

In [29]:
# config=[[0,1,3,20,8.5]]
# new_output = model.predict(config)

# print(new_output)

## Save the Model

In [30]:
import pickle

# Save to file in the current working directory
pkl_filename = "model.bin"
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)

# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
    
# Calculate the accuracy score and predict target values
score = pickle_model.score(X_validation, Y_validation)
print("Test score: {0:.2f} %".format(100 * score))
Ypredict = pickle_model.predict(X_validation)

Test score: 50.00 %


In [31]:
##loading the model from the saved file
pkl_filename = "model.bin"
with open(pkl_filename, 'rb') as f_in:
    model = pickle.load(f_in)

predict(config, model)

   Collar_bone_x  Collar_bone_y  Collar_bone_z  Fore_arm_x  Fore_arm_y  \
0      -0.155914       0.228653       2.120145   -0.530222    0.282706   

   Fore_arm_z  Hand_x_test  Hand_y_test  Hand_z_test  Upper_arm_x  \
0    2.109175     -0.58682      0.29254     2.103178    -0.343568   

   Upper_arm_y  Upper_arm_z  
0     0.267775     2.130795  
[0]


array([0], dtype=int64)