# Getting and Visualizing the Data

In [None]:
import pandas as pd
from pandas.plotting import scatter_matrix
# Read the raw dataset from the CSV file next to the notebook.
data = pd.read_csv('New_Data.csv')

# Target: the 'Name' column. Features: the first four columns by position.
Y = data['Name']
X = data.iloc[:, 0:4]

In [None]:
# Display the target column (data['Name']) as this cell's output.
Y

In [None]:
# Print pandas' summary of the loaded DataFrame.
data.info()

In [None]:

%matplotlib inline
from pandas.plotting import scatter_matrix

# Build a frame from X restricted to the four named feature columns.
# NOTE(review): when X is already a DataFrame, `columns=` selects by label rather
# than renaming - confirm the CSV headers match these names. This frame is not
# used by the plot below, which works on X directly.
dataFrame = pd.DataFrame(X, columns=["Cheek diff", "Eye diff", "Mouth diff", "Nose diff"])

# Pairwise scatter plots of the four features, coloured by the target in Y
# (assumes Y holds numeric class codes - TODO confirm).
# The marker must be lowercase 'o' (circle); uppercase 'O' is not a valid
# matplotlib marker and makes the call fail.
scatter_matrix(X, figsize = (10, 10), c = Y, alpha = 0.8, marker = 'o')

In [None]:
# Display the (rows, columns) shape of the feature frame X.
X.shape

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the samples for testing; random_state fixes the shuffle so
# the split is the same on every run.
X_train_org, X_test_org, y_train, y_test = train_test_split(X, Y, random_state = 0, test_size = 0.2)

# Rescale every feature with MinMaxScaler because the raw columns differ in
# magnitude (check X.describe()).
scaler = MinMaxScaler()
# Learn the per-feature min/max from the training data only...
X_train = scaler.fit_transform(X_train_org)
# ...and reuse those same statistics for the test data. Refitting the scaler on
# the test set (fit_transform) would scale the two sets differently and leak
# test-set information into preprocessing.
X_test = scaler.transform(X_test_org)

In [None]:
# Display the training features exactly as split, before MinMax scaling.
X_train_org  # Original without scaling (values vary a lot).

In [None]:
# Display the scaled training features (the array returned by the scaler).
X_train

In [None]:
# Wrap the scaled training array in a labelled frame so it renders as a table.
feature_names = ["Cheek diff", "Eye diff", "Mouth diff", "Nose diff"]
dataFrame = pd.DataFrame(data=X_train, columns=feature_names)
dataFrame

In [None]:
import numpy as np
columns = X
target = Y

# Pearson correlation between each scaled feature column and the training label.
# A larger absolute value means a stronger linear relationship with the label.
# NOTE(review): assumes y_train is numeric; if the class codes are nominal this
# is only a rough indicator of feature relevance.
# The number of features is read from X_train instead of being hard-coded.
corr_list = [
    np.corrcoef(X_train[:, col], y_train)[0, 1]  # every row of column `col`
    for col in range(X_train.shape[1])
]
print(corr_list)

# KNN Classifier

### Training KNN for different k's

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Accuracy on both splits for k-nearest-neighbours models with k = 1..29.
train_score_array = []
test_score_array = []

for k in range(1, 30):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    train_accuracy = knn.score(X_train, y_train)
    train_score_array.append(train_accuracy)
    test_accuracy = knn.score(X_test, y_test)
    test_score_array.append(test_accuracy)

### Determining Optimal k Value Through Code

In [None]:
# Pick k in two ways from the scores collected in train_score_array and
# test_score_array (index i corresponds to k = i + 1):
#   k_value  - the k with the smallest |train - test| accuracy gap
#   k_value2 - the k with the highest mean of train and test accuracy
# Only indices 3..29 (k = 4..30) are considered; the starting values below are
# kept when no index qualifies.
# The running best values are deliberately NOT named `min`/`max`, so the
# builtins stay usable in every later cell.
k_value = 1
smallest_gap = 100
k_value2 = 10
highest_mean = 0

for i, (train_score, test_score) in enumerate(zip(train_score_array, test_score_array)):
    if not (2 < i < 30):
        continue

    gap = abs(train_score - test_score)
    mean_score = (train_score + test_score) / 2

    if mean_score > highest_mean:
        highest_mean = mean_score
        k_value2 = i + 1

    if gap < smallest_gap:
        smallest_gap = gap
        k_value = i + 1

print("The optimal k should be:",k_value, "based on the minimum distance between training and testing set accuracies")
print("The k value with the highest average accuracy was:",k_value2)

### Determining Optimal k Value through Graph

In [None]:
import matplotlib.pyplot as plt
x_axis = range(1,30) # x_axis values
%matplotlib inline
#x-values, y-values, Name for legend, color
plt.plot(x_axis, train_score_array , label = "Train Score", c= "g") #Plots a green line
plt.plot(x_axis, test_score_array, label = "Test Score", c= "b")  #Plots a blue line
plt.xlabel('k')
plt.ylabel('Accuracy')
plt.legend()

## Grid Search Algorithm for best k

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate neighbour counts for the KNN grid search: k = 1..39.
vals = range(1, 40)
param_grid = dict(n_neighbors=vals)
print("Parameter grid:\n{}".format(param_grid))

In [None]:
# 15-fold cross-validated search over the KNN grid, keeping train scores too.
grid_search = GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    cv=15,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

In [None]:
# Report the best mean cross-validation score and the parameters that gave it.
best_cv_score = grid_search.best_score_
best_cv_params = grid_search.best_params_
print("Best score: {:.4f}".format(best_cv_score))
print("Best parameters: {}".format(best_cv_params))

### Accuracy

In [None]:
# Refit KNN with a single neighbour and report accuracy on both splits.
# NOTE(review): k is hard-coded to 1 rather than read from
# grid_search.best_params_ - confirm it matches the search result.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)

train_accuracy = knn.score(X_train, y_train)  # accuracy on the data it was fit on
print("Training Accuracy:", train_accuracy)
test_accuracy = knn.score(X_test, y_test)  # accuracy on the held-out data
print("Testing Accuracy:", test_accuracy)

### Predicting Custom Inputs

In [None]:
# Two hand-written samples with four feature values each
# (presumably already on the scaled range used for training - confirm).
new_x = [
    [0.470437, 0.530412, 0.486742, 0.141006],
    [0.273302, 0.402896, 0.928058, 0.127295],
]
y = knn.predict(new_x)
print(y)

### Expected Outputs vs Predicted Outputs

In [None]:
# Predict the held-out samples and tabulate [expected, predicted] pairs.
predictions = knn.predict(X_test)
values = [list(pair) for pair in zip(y_test, predictions)]
dataFrame = pd.DataFrame(values, columns=["Expected", "Predicted"])
dataFrame

# Support Vector Classifier (SVC)

### Train SVC

In [None]:
from sklearn.svm import SVC  
# Support vector classifier with a linear kernel, fit on the scaled training data.
svclassifier = SVC(
    kernel='linear',
)
svclassifier.fit(X_train, y_train)

### Accuracy

In [None]:
# Accuracy of the SVC on the data it was fit on, then on the held-out data.
train_accuracy = svclassifier.score(X_train, y_train)
print("Training Accuracy:", train_accuracy)
test_accuracy = svclassifier.score(X_test, y_test)
print("Testing Accuracy:", test_accuracy)

## Grid Search Algorithm for Optimal Gamma and C Values

### Declaring parameters grid

In [None]:
# The same ladder of candidate values is searched for both C and gamma.
# NOTE(review): per scikit-learn's SVC docs gamma only applies to the
# 'rbf'/'poly'/'sigmoid' kernels, so the 'linear' candidates are refit once per
# gamma value without changing - consider a separate grid per kernel.
svc_candidates = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000]
param_grid = {
    'C': list(svc_candidates),
    'gamma': list(svc_candidates),
    'kernel': ['linear', 'rbf'],
}
print("Parameter grid:\n{}".format(param_grid))

### Training the Grid Search Algorithm with the parameters.

In [None]:
# 6-fold cross-validated search over the SVC grid, keeping train scores too.
grid_search = GridSearchCV(
    SVC(),
    param_grid,
    cv=6,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

### Accuracy

In [None]:
# Report the best mean cross-validation score and the parameters that gave it.
best_cv_score = grid_search.best_score_
best_cv_params = grid_search.best_params_
print("Best score: {:.4f}".format(best_cv_score))
print("Best parameters: {}".format(best_cv_params))

### Expected Outputs vs Predicted Outputs

In [None]:
# Refit the SVC with hard-coded RBF settings (presumably taken from an earlier
# grid-search run - confirm against grid_search.best_params_).
svclassifier = SVC(kernel='rbf', C=5, gamma=10)
svclassifier.fit(X_train, y_train)

# Predict the held-out samples, print the test accuracy, and tabulate
# [expected, predicted] pairs.
y_pred = svclassifier.predict(X_test)
print(svclassifier.score(X_test, y_test))
values = [list(pair) for pair in zip(y_test, y_pred)]
dataFrame = pd.DataFrame(values, columns=["Expected", "Predicted"])
dataFrame

# Decision Trees - Gini

### Train Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Gini-criterion decision tree, limited to depth 5 with at least 5 samples per
# leaf; random_state fixes the tree's randomness.
clf_gini = DecisionTreeClassifier(
    criterion="gini",
    random_state=10,
    max_depth=5,
    min_samples_leaf=5,
)
# Fit on the scaled training data.
clf_gini.fit(X_train, y_train)

### Accuracy

In [None]:
print("Training Accuracy:",clf_gini.score(X_train, y_train)) #Accuracy of the model when training.
print("Testing Accuracy:", clf_gini.score(X_test, y_test) )#Accuracy of the test.

train_score_array = []
test_score_array = []

for k in range (1, 6):
    clf = DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth=k, min_samples_leaf=5)  
    clf.fit(X_train, y_train)
    train_score_array.append(clf.score(X_train, y_train))
    test_score_array.append(clf.score(X_test, y_test))
x_axis = range(1,6) # x_axis values
%matplotlib inline
#x-values, y-values, Name for legend, color
plt.plot(x_axis, train_score_array , label = "Train Score", c= "g") #Plots a green line
plt.plot(x_axis, test_score_array, label = "Test Score", c= "b")  #Plots a blue line
plt.xlabel('Maximum depth of trees')
plt.ylabel('Accuracy')
plt.legend()

### Expected Outputs vs Predicted Outputs

In [None]:
# Predict the held-out samples with the Gini tree and tabulate
# [expected, predicted] pairs.
y_pred = clf_gini.predict(X_test)
values = [list(pair) for pair in zip(y_test, y_pred)]
dataFrame = pd.DataFrame(values, columns=["Expected", "Predicted"])
dataFrame

# Decision Trees - Entropy

### Train Decision Tree

In [None]:
# Entropy-criterion decision tree, limited to depth 5 with at least 5 samples
# per leaf; random_state fixes the tree's randomness.
clf_entropy = DecisionTreeClassifier(
    criterion="entropy",
    random_state=100,
    max_depth=5,
    min_samples_leaf=5,
)
# Fit on the scaled training data.
clf_entropy.fit(X_train, y_train)

### Accuracy

In [None]:
print("Training Accuracy:",clf_entropy.score(X_train, y_train)) #Accuracy of the model when training.
print("Testing Accuracy:", clf_entropy.score(X_test, y_test) )#Accuracy of the test.

train_score_array = []
test_score_array = []

for k in range (1, 6):
    clf = DecisionTreeClassifier(criterion = "entropy", random_state = 100, max_depth = k, min_samples_leaf = 5) 
    clf.fit(X_train, y_train)
    train_score_array.append(clf.score(X_train, y_train))
    test_score_array.append(clf.score(X_test, y_test))
x_axis = range(1,6) # x_axis values
%matplotlib inline
#x-values, y-values, Name for legend, color
plt.plot(x_axis, train_score_array , label = "Train Score", c= "g") #Plots a green line
plt.plot(x_axis, test_score_array, label = "Test Score", c= "b")  #Plots a blue line
plt.xlabel('Maximum depth of tree')
plt.ylabel('Accuracy')
plt.legend()

### Expected Outputs vs Predicted Outputs

In [None]:
# Predict the held-out samples with the entropy tree and tabulate
# [expected, predicted] pairs.
y_pred = clf_entropy.predict(X_test)
values = [list(pair) for pair in zip(y_test, y_pred)]
dataFrame = pd.DataFrame(values, columns=["Expected", "Predicted"])
dataFrame

# Decision Tree - Grid Search Algorithm for best parameters

In [None]:
# Search depths and minimum leaf sizes 1..15 under both split criteria.
one_to_fifteen = list(range(1, 16))
param_grid = {
    'max_depth': list(one_to_fifteen),
    'min_samples_leaf': list(one_to_fifteen),
    'criterion': ['entropy', 'gini'],
}
print("Parameter grid:\n{}".format(param_grid))

In [None]:
# 6-fold cross-validated search over the decision-tree grid, keeping train
# scores too. The base estimator is seeded so repeated runs select the same
# best parameters; an unseeded DecisionTreeClassifier can break ties between
# equally good splits differently on every fit.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state = 10),
    param_grid,
    cv = 6,
    return_train_score = True,
)
grid_search.fit(X_train, y_train)

In [None]:
# Report the best mean cross-validation score and the parameters that gave it.
best_cv_score = grid_search.best_score_
best_cv_params = grid_search.best_params_
print("Best score: {:.4f}".format(best_cv_score))
print("Best parameters: {}".format(best_cv_params))

In [None]:
# Refit a tree with hard-coded settings (presumably taken from an earlier
# grid-search run - confirm against grid_search.best_params_) and print its
# accuracy on the held-out data.
clf = DecisionTreeClassifier(
    criterion="entropy",
    random_state=10,
    max_depth=4,
    min_samples_leaf=1,
)
clf.fit(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print(test_accuracy)

# Random Forests

### Train Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 20 decision trees (n_estimators is the number of trees);
# random_state fixes the forest's randomness.
rf = RandomForestClassifier(
    n_estimators=20,
    random_state=42,
)
rf.fit(X_train, y_train);  # trailing ';' suppresses the estimator repr in the output

### Accuracy

In [None]:
print("Training Accuracy:",rf.score(X_train, y_train)) #Accuracy of the model when training.
print("Testing Accuracy:", rf.score(X_test, y_test) )#Accuracy of the test.

train_score_array = []
test_score_array = []

for k in range (1, 21):
    rf = RandomForestClassifier(n_estimators = k, random_state = 42)
    rf.fit(X_train, y_train)
    train_score_array.append(rf.score(X_train, y_train))
    test_score_array.append(rf.score(X_test, y_test))
x_axis = range(1,21) # x_axis values
%matplotlib inline
#x-values, y-values, Name for legend, color
plt.plot(x_axis, train_score_array , label = "Train Score", c= "g") #Plots a green line
plt.plot(x_axis, test_score_array, label = "Test Score", c= "b")  #Plots a blue line
plt.xlabel('# of decision trees')
plt.ylabel('Accuracy')
plt.legend()

## Grid Search Algorithm for best Random Forest parameters.

In [None]:
# Search forest size, minimum leaf size and depth over 1..15 each, with the
# seed pinned to 42 through the grid itself.
one_to_fifteen = list(range(1, 16))
param_grid = {
    'n_estimators': list(one_to_fifteen),
    'min_samples_leaf': list(one_to_fifteen),
    'max_depth': list(one_to_fifteen),
    'random_state': [42],
}
print("Parameter grid:\n{}".format(param_grid))

In [None]:
# 6-fold cross-validated search over the random-forest grid, keeping train
# scores too.
grid_search = GridSearchCV(
    RandomForestClassifier(),
    param_grid,
    cv=6,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

In [None]:
# Report the best mean cross-validation score and the parameters that gave it.
best_cv_score = grid_search.best_score_
best_cv_params = grid_search.best_params_
print("Best score: {:.4f}".format(best_cv_score))
print("Best parameters: {}".format(best_cv_params))

In [None]:
# Refit the forest with hard-coded settings.
# NOTE(review): n_estimators=35 lies outside the 1..15 range listed in this
# notebook's random-forest parameter grid - confirm where the value came from.
rf = RandomForestClassifier(
    n_estimators=35,
    random_state=42,
    max_depth=5,
    min_samples_leaf=1,
)
rf.fit(X_train, y_train)

In [None]:
# Print the accuracy of the current `rf` model on the held-out data.
print(rf.score(X_test, y_test))

### Expected Outputs vs Predicted Outputs

In [None]:
# Predict the held-out samples with the forest and tabulate
# [expected, predicted] pairs.
y_pred = rf.predict(X_test)
values = [list(pair) for pair in zip(y_test, y_pred)]
dataFrame = pd.DataFrame(values, columns=["Expected", "Predicted"])
dataFrame

# Logistic Regression

### Train Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with scikit-learn's default settings, fit on the scaled
# training data.
lr = LogisticRegression().fit(X_train, y_train)
lr

### Accuracy

In [None]:
# Accuracy of the default logistic regression on the training data, then on the
# held-out data.
train_accuracy = lr.score(X_train, y_train)
print("Training Accuracy:", train_accuracy)
test_accuracy = lr.score(X_test, y_test)
print("Testing Accuracy:", test_accuracy)

## Grid Search Algorithm for Logistic Regression (max_iter and multi_class)

In [None]:
# Search the iteration cap and the multi-class strategy.
# NOTE(review): recent scikit-learn releases deprecate `multi_class` - confirm
# against the installed version.
param_grid = dict(
    max_iter=[50, 100, 200, 500, 1000, 1500],
    multi_class=['ovr', 'auto'],
)
print("Parameter grid:\n{}".format(param_grid))

In [None]:
# 6-fold cross-validated search over the logistic-regression grid, keeping
# train scores too.
grid_search = GridSearchCV(
    LogisticRegression(),
    param_grid,
    cv=6,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

In [None]:
# Report the best mean cross-validation score and the parameters that gave it.
best_cv_score = grid_search.best_score_
best_cv_params = grid_search.best_params_
print("Best score: {:.4f}".format(best_cv_score))
print("Best parameters: {}".format(best_cv_params))

In [None]:
# Refit logistic regression with hard-coded settings (presumably taken from an
# earlier grid-search run - confirm against grid_search.best_params_).
lreg = LogisticRegression(
    max_iter=50,
    multi_class='ovr',
)
lreg.fit(X_train, y_train)

In [None]:
# Display the accuracy of the tuned model `lreg` on the held-out data.
lreg.score(X_test, y_test)

### Expected Outputs vs Predicted Outputs

In [None]:
# Predict the held-out samples with the tuned model `lreg` (the one fitted and
# scored in the cells just above) and tabulate [expected, predicted] pairs.
# Using the untuned `lr` here would show predictions that do not correspond to
# the accuracy reported for the tuned model.
y_pred = lreg.predict(X_test)

values = [list(pair) for pair in zip(y_test, y_pred)]
dataFrame = pd.DataFrame(values, columns = ["Expected", "Predicted"])
dataFrame

# Naive Bayes 

### Train Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings, fit on the scaled training data.
gnb = GaussianNB().fit(X_train, y_train)
gnb

### Accuracy

In [None]:
# Accuracy of the naive Bayes model on the training data, then on the held-out data.
train_accuracy = gnb.score(X_train, y_train)
print("Training Accuracy:", train_accuracy)
test_accuracy = gnb.score(X_test, y_test)
print("Testing Accuracy:", test_accuracy)

### Expected Outputs vs Predicted Outputs

In [None]:
# Predict the held-out samples with naive Bayes and tabulate
# [expected, predicted] pairs.
y_pred = gnb.predict(X_test)
values = [list(pair) for pair in zip(y_test, y_pred)]
dataFrame = pd.DataFrame(values, columns=["Expected", "Predicted"])
dataFrame

In [None]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import numpy as np

# Bar chart comparing the accuracy (in %) recorded for each classifier.
fig = figure(num=None, figsize=(16, 9), dpi=80, facecolor='w', edgecolor='k')

objects = ('KNN', 'SVC', 'DT - Gini', 'DT - Entropy', 'Random Forests',  'Gaussian NB', 'Logistic Regression')
y_pos = np.arange(len(objects))
# Hard-coded accuracies, one per entry of `objects` in the same order.
# NOTE(review): these are typed in by hand, not read from the fitted models -
# update them whenever the models are re-run.
performance = [90.47,95.24,76.19, 76.19, 80.95, 76.19, 66.66]
plt.xticks(y_pos, objects, fontsize=14)
plt.yticks([0,20,40,60,80,100], (0,20,40,60,80,100), fontsize=19)

plt.rcParams['xtick.labelsize'] = 20
barlist = plt.bar(y_pos, performance, color = 'g', align='center', alpha=0.5)
# Give the first six bars their own colour; the last bar keeps the green
# passed to plt.bar above.
for bar, bar_color in zip(barlist, ('r', 'b', 'k', 'y', 'm', 'c')):
    bar.set_color(bar_color)

plt.ylabel('Accuracy (%)', size = 24)
plt.xlabel("Classification Algorithms", size = 23)
plt.suptitle('Accuracy of Different Classification Algorithms After Hyperparameter\nOptimization Through the Grid Search Algorithm', size = 27)

# Write each bar's value just above it. No new figure may be opened before this
# loop: a plt.figure(...) call here would make the annotations land on a blank
# second figure instead of on the bar chart.
A = range(len(performance))
for xy in zip(A, performance):
    plt.annotate('%s%%' % xy[1], xy = xy, textcoords='data', ha='center', va='bottom')

plt.show()
