In [None]:
# Load libraries
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV 

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by the
# libraries used below; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

## Exploratory Data Analysis & Data Preparation

In [None]:
# Load the dataset from data.csv (relative path, resolved against the working directory)
dataset = pd.read_csv("data.csv")

In [None]:
# Optional (disabled): drop the price and rating columns before exploring
# dataset = dataset.drop(columns = ['UnitPrice','Rating'])
# Preview the first rows of the frame
dataset.head()
# Alternative (disabled): look at a random sample of rows instead
# dataset.sample(10)

In [None]:
# Frame summary: column names, non-null counts and dtypes (not the shape)
dataset.info()

In [None]:
# Frame dimensions as (rows, columns), followed by the summary statistics
# of its columns, each printed on its own line
print(dataset.shape, dataset.describe(), sep="\n")

In [None]:
# Class distribution: number of rows for each value of the target column 'ItemName'
dataset.groupby('ItemName').size()

In [None]:
# Box-and-whisker plots: one panel per column on a 3x3 grid, with independent axes
dataset.plot.box(subplots=True, layout=(3, 3), sharex=False, sharey=False)
plt.show()

In [None]:
# Histograms: one panel per numeric column of the frame
dataset.hist()
plt.show()

In [None]:
# Scatter-plot matrix: pairwise scatter plots between the frame's numeric columns
scatter_matrix(dataset)
plt.show()

## Selecting and Training Models

In [None]:
# Split the dataset into a training set (80%) and a held-out validation set (20%).
# Features are every column except the target 'ItemName'.
X = dataset.drop(columns=['ItemName'])
Y = dataset['ItemName']
# A fixed random_state keeps the split, and every score computed from it, reproducible
# across "Restart & Run All"; without it each run evaluates on different rows.
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=0.20, random_state=6
)

In [None]:
# Build the candidate classifiers and compare them with 10-fold cross-validation
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
    ('GradientBoosting', GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=0)),
]

# One splitter shared by all models so every model is scored on the same folds.
# shuffle=True is required for random_state to have any effect: current scikit-learn
# raises a ValueError when random_state is set while shuffle is left at False.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=6)

# Evaluate each model in turn, keeping the per-fold accuracies for the comparison plot
results = []
names = []
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # Report as "<name>: <mean accuracy> (<standard deviation>)"
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Box plot of the per-fold accuracies collected for each algorithm
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

In [None]:
# Fit a 3-nearest-neighbours classifier on the training split, then report its
# accuracy, confusion matrix and per-class metrics on the validation split
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
predictions = knn.predict(X_validation)
print(
    accuracy_score(Y_validation, predictions),
    confusion_matrix(Y_validation, predictions),
    classification_report(Y_validation, predictions),
    sep="\n",
)

In [None]:
# Fit a support-vector classifier (probability estimates enabled) on the training
# split, then report accuracy, confusion matrix and per-class metrics on validation
SVM = SVC(probability=True).fit(X_train, Y_train)
predictions = SVM.predict(X_validation)
print(
    accuracy_score(Y_validation, predictions),
    confusion_matrix(Y_validation, predictions),
    classification_report(Y_validation, predictions),
    sep="\n",
)

In [None]:
# Fit a decision tree (CART) on the training split, then report accuracy,
# confusion matrix and per-class metrics on the validation split
CART = DecisionTreeClassifier().fit(X_train, Y_train)
predictions = CART.predict(X_validation)
print(
    accuracy_score(Y_validation, predictions),
    confusion_matrix(Y_validation, predictions),
    classification_report(Y_validation, predictions),
    sep="\n",
)

## Hyperparameter Tuning using GridSearchCV

In [None]:
# Search space for an RBF-kernel SVC: five candidate values each for C and gamma
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)

# Exhaustive search over the grid; refit=True retrains the best combination on the
# full training split, and verbose=3 logs each fit
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, refit=True, verbose=3)

# Run the search on the training split
grid.fit(X_train, Y_train)

In [None]:
# Disabled: show the winning parameter combination and the tuned estimator
# print(grid.best_params_)
# print(grid.best_estimator_)

# Score the tuned model on the validation split: accuracy, confusion matrix
# and per-class metrics
grid_predictions = grid.predict(X_validation)
print(
    accuracy_score(Y_validation, grid_predictions),
    confusion_matrix(Y_validation, grid_predictions),
    classification_report(Y_validation, grid_predictions),
    sep="\n",
)

## Model Training

In [None]:
# model training
# The grid search has already been fitted on (X_train, Y_train), and with refit=True
# it holds the best estimator retrained on the whole training split. fit() returns
# the search object itself, so binding it directly yields the same `model` without
# repeating the entire search.
model = grid
print(model)

## Sample Test

In [None]:
def predict(config, model):
    """Run ``model.predict`` on a configuration mapping or a ready-made frame.

    Args:
        config: Either a dict (or dict subclass) mapping column names to lists of
            values, which is converted to a ``pandas.DataFrame``, or any object
            that ``model.predict`` accepts directly, such as an existing DataFrame.
        model: Any object exposing a ``predict(X)`` method.

    Returns:
        Whatever ``model.predict`` returns for the prepared input.
    """
    # isinstance also accepts dict subclasses (OrderedDict, defaultdict), which an
    # exact type comparison would pass through to the model unconverted.
    if isinstance(config, dict):
        features = pd.DataFrame(config)
    else:
        features = config

    # Echo the model input so the notebook output shows what was scored
    print(features)

    y_pred = model.predict(features)
    print(y_pred)
    return y_pred
    

In [None]:
# One hand-written sample: a single value per feature column.
# NOTE(review): assumes these keys match the columns the model was trained on
# (every dataset column except 'ItemName'), in the same order; confirm against data.csv.
config = {
    'CustomerType': [1],
    'Gender': [0],
    'Age': [3],
    'UnitPrice': [35],
    'Rating': [7]
    
}

# Predict the target for the sample with the trained model
predict(config, model)

In [None]:
# dataset.sample(10)

In [None]:
# config=[[0,1,3,20,8.5]]
# new_output = model.predict(config)

# print(new_output)

## Save the Model

In [None]:
import pickle

# Serialise the fitted model into a file in the current working directory
pkl_filename = "model.bin"
with open(pkl_filename, 'wb') as file:
    file.write(pickle.dumps(model))

# Read the model back. NOTE: unpickling can execute code embedded in the file,
# so only load files that this notebook produced.
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.loads(file.read())

# Check the round trip: score the reloaded model on the validation split and
# keep its predictions
score = pickle_model.score(X_validation, Y_validation)
print("Test score: {0:.2f} %".format(score * 100))
Ypredict = pickle_model.predict(X_validation)

In [None]:
# Reload the saved model from disk and repeat the sample prediction with it
pkl_filename = "model.bin"
with open(pkl_filename, 'rb') as f_in:
    model = pickle.loads(f_in.read())

predict(config, model)