## 1. Preparation

### 1.1. Import Library

### 1.1.1. Import Standard Library

In [None]:
import pandas as pd

### 1.1.2. Import Plotting Library

In [None]:
import matplotlib.pyplot as plt

### 1.1.3. Import Progress Bar Library

In [None]:
from tqdm import tqdm_notebook

### 1.1.4. Import Machine Learning Model Library

In [None]:
# Linear Model
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Nonlinear Model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

### 1.1.5. Import Model Selection Library

In [None]:
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

### 1.2. Import Data

In [None]:
# Load the Excel workbook into the dataframe used by the rest of the notebook.
# NOTE(review): absolute Windows drive path - adjust it to wherever the workbook lives on your machine.
data = pd.read_excel('G:\\data.xlsx')

In [None]:
# Preview the first rows of the freshly loaded dataframe.
data.head()

In [None]:
# Show the dimensions of the dataframe as a (rows, columns) tuple.
data.shape

In [None]:
# Overview of the columns: data type and non-null count of each one.
data.info()

In [None]:
# Overview of basic descriptive statistics of the numerical columns.
data.describe()

## 2. Data Transformation

### 2.1. Option 1: Categorization

In [None]:
# Group the "Rank" values into 5 equal-width intervals and store the interval of each record in "RankBins."
data['RankBins'] = pd.cut(data['Rank'], bins=5)

In [None]:
# Preview the first rows to inspect the new "RankBins" column.
data.head()

In [None]:
# Check the column data types to make sure that "RankBins" is stored as categorical data.
data.info()

### 2.2. Option 2: Category Labeling

In [None]:
# Label every record by the range its "Rank" falls into:
# "Rank" <= 10 -> 1, (10, 20] -> 2, (20, 30] -> 3, (30, 40] -> 4, (40, 50] -> 5.
# Records with a "Rank" above 50 are not assigned a label.
rank = data['Rank']
# The first range has no lower bound.
data.loc[rank <= 10, 'RankBins'] = 1
# The remaining ranges are each 10 wide and closed on the right.
for label, upper in enumerate((20, 30, 40, 50), start=2):
    lower = upper - 10
    data.loc[(rank > lower) & (rank <= upper), 'RankBins'] = label

In [None]:
# Preview the first rows to inspect the labeled "RankBins" column.
data.head()

In [None]:
# Check the column data types to make sure that "RankBins" is stored as numerical data.
data.info()

## 3. Data Splitting

### 3.1. Independent and Dependent Variable

### 3.1.1. Independent Variable

In [None]:
# Positions (0-based) of the columns used as independent variables: columns 5 through 9.
l = list(range(5, 10))
# Select those columns, for every row, by position.
X = data.iloc[:, l]

### 3.1.2. Dependent Variable

In [None]:
# The dependent variable (the class to predict) is the "RankBins" column built in section 2.
Y = data['RankBins']

### 3.2. Training and Validation Data

In [None]:
# Share of the dataset held out as "validation" data (20%); the remaining 80% is "training" data.
validation_size: float = 0.20
# Random seed that makes the random selection of the "validation" data reproducible.
seed: int = 7

In [None]:
# Split the independent (X) and dependent (Y) variables into matching "training" and "validation" sets.
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X,
    Y,
    test_size=validation_size,
    random_state=seed,
)

In [None]:
# To make sure that "training" data is 80% of the given data (the other 20% is "validation" data).
len(Y_train)

## 4. Model Selection

### 4.1. Data Transformation

In [None]:
# 4.1.1. Option 1: "RankBins" is a Categorical Data
# Run this cell only when "RankBins" was built with Option 1 (section 2.1).
# The categorical labels are converted to plain strings before they are used as class labels for training.
Y_train = Y_train.astype(str)

In [None]:
# 4.1.2. Option 2: "RankBins" is a Numerical Data
# Data transformation will not be necessary.

### 4.2. Model Deployment

In [None]:
# The 6 supervised learning models to compare, as (short name, unfitted estimator) pairs.
models = [
    # Linear models
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    # Nonlinear models
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

### 4.3. Model Evaluation

In [None]:
# Random seed that makes the shuffled assignment of records to folds reproducible.
seed = 7
# Record the returned "predictive accuracy" of each model which will be used to plot "Boxplots."
# (6 Models * 10 Folds = 60 Results)
results = []
# Names of the models, in the same order as "results," which will be used to label the "Boxplots."
names = []
# Model evaluation will be based on "predictive accuracy."
scoring = 'accuracy'

# Specify how to conduct cross-validation: 10 folds selected randomly in accordance with random seed 7.
# "shuffle=True" is required for "random_state" to take effect: recent scikit-learn versions raise a
# ValueError when "random_state" is set while "shuffle" is False, and older versions silently ignored the seed.
# With a fixed integer seed the splitter yields the same folds on every use, so one instance is shared by
# all models and every model is evaluated on identical folds.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

# Model Evaluation
for name, model in tqdm_notebook(models):
    # Conduct cross-validation with the return of 10 "predictive accuracy" results (one per fold).
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    # Record 10 "predictive accuracy" results.
    results.append(cv_results)
    # Record the name of the model which will be used to plot "Boxplots."
    names.append(name)
    # Report mean and standard deviation of the 10 "predictive accuracy" results.
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Compare the models visually: one boxplot of cross-validation accuracies per model.
fig = plt.figure()
# A single subplot filling the whole figure.
ax = fig.add_subplot(111)
fig.suptitle('Algorithm Comparison')
# One box per entry of "results" (the fold accuracies of one model).
ax.boxplot(results)
# Label each box on the X-axis with the name of its model.
ax.set_xticklabels(names)
plt.show()

### 4.4. Data Training

In [None]:
# Based on the comparison above, the best-performing model for predicting "RankBins" (the "Sales Rank" of the
# fruits sold by the fruit shop) is CART (Classification and Regression Trees).

In [None]:
# Create an unfitted CART (decision tree) classifier with default settings.
cart = DecisionTreeClassifier()
# Fit the CART model to the training data; "Y_train" holds the class labels to learn.
cart.fit(X_train, Y_train)

### 4.5. Data Validation

In [None]:
# Use the fitted CART model to predict the "RankBins" class of every record in the "validation" set.
predictions = cart.predict(X_validation)

In [None]:
# Compare the predictions with the actual results.
# 4.5.1. Option 1: "RankBins" is a Categorical Data
# The model was trained on string labels (see 4.1.1), so the actual labels are converted to strings as well.
# "predictions" is a "numpy array," hence "Y_validation" is turned from a "series" into an "array" via "values."
actual_labels = Y_validation.astype(str).values
print(accuracy_score(actual_labels, predictions))

In [None]:
# 4.5.2. Option 2: "RankBins" is a Numerical Data
# No label conversion is needed; only the "series" is turned into an "array" via "values."
actual_labels = Y_validation.values
print(accuracy_score(actual_labels, predictions))

## 5. Prediction

In [None]:
# A single new record to classify, given as one value per feature column.
feature_names = ['Sales Volume', 'Turnover', 'Cost', 'Gross Profit', 'Gross Margin']
feature_values = [350, 200, 120, 80, 0.40]
df = pd.DataFrame([feature_values], columns=feature_names)

In [None]:
# Predict the "RankBins" class of the new record with the fitted CART model.
# Note: this overwrites the "predictions" computed on the "validation" set in section 4.5.
predictions = cart.predict(df)

In [None]:
# Display the predicted class of the new record.
predictions 