## 4. Model Selection

### 4.1. Data Transformation

In [None]:
# 4.1.1. Option 1: "RankBins" is categorical data
# Cast the training labels to plain strings so the classifiers below are fitted on string class labels.
# NOTE(review): presumably Y_train is a pandas Series with a categorical dtype -- confirm against the cell that builds it.
# The matching cast is applied to Y_validation when the accuracy is computed in section 4.5.1.
Y_train = Y_train.astype(str)

In [None]:
# 4.1.2. Option 2: "RankBins" is numerical data
# No data transformation is necessary in this case.

### 4.2. Model Deployment

In [None]:
# Candidate supervised learning models to compare, stored as (short name, unfitted estimator) pairs.
# All six estimators are created with their default hyperparameters.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

### 4.3. Model Evaluation

In [None]:
# Seed for the shuffled K-fold split, so the same random folds are reproduced on every run.
seed = 7
# One array of per-fold accuracy scores per model, in the same order as "names".
# Used later to plot the "Boxplots." (6 Models * 10 Folds = 60 Results)
results = []
# Short names of the evaluated models, used as the boxplot tick labels.
names = []
# Model evaluation is based on "predictive accuracy."
scoring = 'accuracy'

# 10-fold cross-validation splitter shared by every model.
# shuffle=True is required for random_state to have any effect: without it the folds are
# taken in row order, and recent scikit-learn versions raise a ValueError when a
# random_state is supplied together with shuffle=False.
# With an integer random_state every split() call yields the same folds, so all models
# are scored on identical train/test partitions.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

# Model Evaluation
for name, model in tqdm_notebook(models):
    # Cross-validate the model; returns one accuracy score per fold (10 values).
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    # Keep the per-fold scores and the model name for the boxplot comparison.
    results.append(cv_results)
    names.append(name)
    # Report the mean and the standard deviation of the 10 accuracy scores.
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Compare the models visually: one box per model, summarising its cross-validation accuracy scores.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
# Each entry of "results" becomes one box, drawn in list order along the X-axis.
ax.boxplot(results)
# Label the boxes with the model names collected alongside "results".
ax.set_xticklabels(names)
plt.show()

### 4.4. Data Training

In [None]:
# The model selected to predict "RankBins" (i.e., the "Sales Rank" of the fruits sold by the fruit shop)
# is CART (Classification and Regression Trees).

In [None]:
# Create an unfitted CART (decision tree) classifier with default hyperparameters.
# NOTE(review): no random_state is passed, so the fitted tree may differ between runs -- confirm this is acceptable.
cart = DecisionTreeClassifier()
# Fit the CART model on the full training set (features X_train, labels Y_train).
cart.fit(X_train, Y_train)

### 4.5. Data Validation

In [None]:
# Use the fitted CART model to predict the labels of the validation features.
# The predicted labels have the same type as the labels the model was fitted on (Y_train).
predictions = cart.predict(X_validation)

In [None]:
# Compare the predictions with the actual validation labels and print the accuracy.
# ".values" extracts the underlying array from "Y_validation" (described in this notebook as a "series"),
# so that it is compared with "predictions" array-to-array.
# 4.5.1. Option 1: "RankBins" is categorical data
# Use this cell when Y_train was cast to strings in section 4.1.1: the model then predicts string
# labels, so Y_validation must be cast to strings the same way before the comparison.
print(accuracy_score(Y_validation.astype(str).values, predictions))

In [None]:
# 4.5.2. Option 2: "RankBins" is numerical data
# No data transformation is necessary: the validation labels are compared with the predictions as they are.
# Use this cell only when Y_train was NOT cast to strings in section 4.1.1; otherwise the
# predictions are strings and will not match the untransformed Y_validation labels.
print(accuracy_score(Y_validation.values, predictions))