# Loading libraries 

In [1]:
# import libraries 
import pandas as pd
import os 
import numpy as np
from PIL import Image
from sklearn import preprocessing
from pathlib import Path
from sklearn.model_selection import train_test_split
from numpy import mean
from numpy import std

In [2]:
# import ML methods 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

In [3]:
# import libraries for the learning-curve plot
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import learning_curve
from sklearn import datasets
import matplotlib.pyplot as plt

In [4]:
# import confusion-matrix utilities (computation and plotting)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt

# import training set 

In [5]:
# load x train data
def import_x(img_dir=os.path.join('Datasets', 'cartoon_set', 'img'),
             crop_box=(175, 250, 320, 275), brightness_threshold=120):
    """Load the training images, drop the ones with sunglasses and standardise the rest.

    Every file in ``img_dir`` is opened, cropped to ``crop_box`` and averaged.
    Crops whose mean pixel value is below ``brightness_threshold`` are treated
    as sunglasses (dark pixels) and discarded; the others are divided by 255,
    flattened to one row per image and standardised with a ``StandardScaler``.

    Parameters
    ----------
    img_dir : str
        Folder holding the training images.
    crop_box : tuple of int
        ``(left, top, right, bottom)`` box passed to ``Image.crop``.
    brightness_threshold : float
        Minimum mean pixel value of the crop for an image to be kept.

    Returns
    -------
    x_train_not_split : numpy.ndarray
        Standardised 2-D array, one flattened crop per kept image.
    keep_array : list of str
        File names of the kept images, in the same order as the rows.

    Raises
    ------
    ValueError
        If no image passes the brightness filter.

    NOTE(review): the scaler is fitted on every kept image, i.e. before
    ``split_data`` separates training and validation rows, so validation
    statistics leak into the scaling -- confirm this is acceptable.
    """
    kept_crops = []
    # names of the images without sunglasses (kept)
    keep_array = []
    # names of the images with sunglasses (removed)
    remove_array = []
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order (and keep_array) deterministic, which matters because the labels
    # are later matched to these rows by position
    for filename in sorted(os.listdir(img_dir)):
        # the context manager closes the file handle once the crop is copied out
        with Image.open(os.path.join(img_dir, filename)) as img:
            crop = np.asarray(img.crop(crop_box))
        if np.average(crop) >= brightness_threshold:
            # light enough: no block of dark sunglasses pixels in the crop
            kept_crops.append(crop / 255)
            keep_array.append(filename)
        else:
            remove_array.append(filename)
    if not kept_crops:
        raise ValueError('no usable images found in ' + str(img_dir))
    x_train = np.array(kept_crops)
    # flatten every crop into a single feature row
    x_train = np.reshape(x_train, (x_train.shape[0], -1))
    # zero-mean / unit-variance scaling per feature
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train_not_split = scaler.transform(x_train)
    print(len(remove_array),'images containing sunglasses were removed from the training dataset')
    return x_train_not_split, keep_array

In [6]:
# import y label train
def import_y(keep_array, labels_path=os.path.join('Datasets', 'cartoon_set', 'labels.csv')):
    """Return the eye-colour labels of the training images listed in ``keep_array``.

    The labels file is read as tab-separated text whose first line is a header
    (that line is dropped). Eye-colour values of ``-1`` are replaced by ``0``
    before conversion to numbers.

    Parameters
    ----------
    keep_array : list of str
        File names of the images to keep (those without sunglasses).
    labels_path : str
        Location of the tab-separated labels file.

    Returns
    -------
    pandas.Series
        Numeric eye-colour labels, one per kept file, ordered like
        ``keep_array`` so each label lines up with the matching image row.
    """
    columns = ['image_number', 'eye color', 'face shape', 'file name']
    # read the tab-separated fields directly: the old `prefix=` argument of
    # read_csv no longer exists in pandas 2.x; dtype=str keeps the raw text
    df3 = pd.read_csv(labels_path, sep='\t', header=None, names=columns, dtype=str)
    # row 0 is the header line of the file, not a label
    df3 = df3.drop(0)
    # replace the -1 with 0, then make the labels numeric
    df3['eye color'] = pd.to_numeric(df3['eye color'].replace(['-1'], '0'))
    # keep only the images that survived the sunglasses filter
    df_y = df3[df3['file name'].isin(keep_array)]
    # order the labels exactly like keep_array instead of relying on a sort
    # of the image numbers happening to match the order the images were read
    position = {name: pos for pos, name in enumerate(keep_array)}
    ordered_index = df_y['file name'].map(position).sort_values(kind='stable').index
    y_train_not_split = df_y.loc[ordered_index, 'eye color']
    return y_train_not_split

# split data into training and validation set 

In [7]:
#split data 
def split_data(x, y, test_size=0.2, random_state=None):
    """Split the inputs and labels into a training and a validation part.

    Parameters
    ----------
    x, y : array-like
        Inputs and labels with the same number of rows.
    test_size : float
        Fraction of the rows put in the validation part (default 20%).
    random_state : int or None
        Seed forwarded to ``train_test_split``; pass an integer to get the
        same split on every run. ``None`` keeps the original unseeded shuffle.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state)
    return x_train, x_test, y_train, y_test

# load test data 

In [110]:
# import x test 
def import_x_test(img_dir=os.path.join('Datasets', 'cartoon_set_test', 'img'),
                  crop_box=(175, 250, 320, 275), brightness_threshold=120):
    """Load the test images, drop the ones with sunglasses and standardise the rest.

    Every file in ``img_dir`` is opened, cropped to ``crop_box`` and averaged.
    Crops whose mean pixel value is below ``brightness_threshold`` are treated
    as sunglasses (dark pixels) and discarded; the others are divided by 255,
    flattened to one row per image and standardised with a ``StandardScaler``.

    Parameters
    ----------
    img_dir : str
        Folder holding the test images.
    crop_box : tuple of int
        ``(left, top, right, bottom)`` box passed to ``Image.crop``.
    brightness_threshold : float
        Minimum mean pixel value of the crop for an image to be kept.

    Returns
    -------
    x_test_test_data : numpy.ndarray
        Standardised 2-D array, one flattened crop per kept image.
    keep_array_test : list of str
        File names of the kept images, in the same order as the rows.

    Raises
    ------
    ValueError
        If no image passes the brightness filter.

    NOTE(review): a fresh scaler is fitted on the test images themselves, so
    the test features are not scaled with the statistics the model was trained
    on -- consider reusing the scaler fitted on the training data.
    """
    kept_crops = []
    # names of the images without sunglasses (kept)
    keep_array_test = []
    # names of the images with sunglasses (removed)
    remove_array_test = []
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order (and keep_array_test) deterministic, which matters because the
    # labels are later matched to these rows by position
    for filename in sorted(os.listdir(img_dir)):
        # the context manager closes the file handle once the crop is copied out
        with Image.open(os.path.join(img_dir, filename)) as img:
            crop = np.asarray(img.crop(crop_box))
        if np.average(crop) >= brightness_threshold:
            # light enough: no block of dark sunglasses pixels in the crop
            kept_crops.append(crop / 255)
            keep_array_test.append(filename)
        else:
            remove_array_test.append(filename)
    if not kept_crops:
        raise ValueError('no usable images found in ' + str(img_dir))
    x_test_test_data = np.array(kept_crops)
    # flatten every crop into a single feature row
    x_test_test_data = np.reshape(x_test_test_data, (x_test_test_data.shape[0], -1))
    # zero-mean / unit-variance scaling per feature
    scaler = preprocessing.StandardScaler().fit(x_test_test_data)
    x_test_test_data = scaler.transform(x_test_test_data)
    print(len(remove_array_test),'number of images containing sunglasses were removed from the testing dataset')
    return x_test_test_data, keep_array_test

In [131]:
# import y label test
def import_y_test(keep_array_test, labels_path=os.path.join('Datasets', 'cartoon_set_test', 'labels.csv')):
    """Return the eye-colour labels of the test images listed in ``keep_array_test``.

    The labels file is read as tab-separated text whose first line is a header
    (that line is dropped). Eye-colour values of ``-1`` are replaced by ``0``
    before conversion to numbers.

    Parameters
    ----------
    keep_array_test : list of str
        File names of the images to keep (those without sunglasses).
    labels_path : str
        Location of the tab-separated labels file.

    Returns
    -------
    pandas.Series
        Numeric eye-colour labels, one per kept file, ordered like
        ``keep_array_test`` so each label lines up with the matching image row.
    """
    columns = ['image_number', 'eye color', 'face shape', 'file name']
    # read the tab-separated fields directly: the old `prefix=` argument of
    # read_csv no longer exists in pandas 2.x; dtype=str keeps the raw text
    test_df3 = pd.read_csv(labels_path, sep='\t', header=None, names=columns, dtype=str)
    # row 0 is the header line of the file, not a label
    test_df3 = test_df3.drop(0)
    # replace the -1 with 0, then make the labels numeric
    test_df3['eye color'] = pd.to_numeric(test_df3['eye color'].replace(['-1'], '0'))
    # keep only the images that survived the sunglasses filter
    df_y = test_df3[test_df3['file name'].isin(keep_array_test)]
    # order the labels exactly like keep_array_test instead of relying on a
    # sort of the image numbers happening to match the order the images were read
    position = {name: pos for pos, name in enumerate(keep_array_test)}
    ordered_index = df_y['file name'].map(position).sort_values(kind='stable').index
    y_test_test_data = df_y.loc[ordered_index, 'eye color']
    return y_test_test_data

# Create final model 

In [136]:
#function to create final model 
def final_model(x_train,y_train):
    """Fit and return the logistic-regression classifier with C=0.01.

    x_train and y_train are the training inputs and labels handed to ``fit``.
    The fitted estimator is printed and then returned.
    """
    # build the classifier and fit it in one step (fit returns the estimator)
    classifier = LogisticRegression(C= 0.01).fit(x_train, y_train)
    # report which model was fitted
    print('The optimised model used for B2 is', classifier)
    return classifier

# evaluate the final model

In [140]:
# function to evaluate fina lmodel 
def asses_model(models,x_test_test,y_test_test):
    """Print the accuracy of a fitted model and plot its confusion matrix.

    Parameters
    ----------
    models : fitted estimator
        Classifier exposing ``predict``.
    x_test_test : array-like
        Inputs to evaluate on.
    y_test_test : array-like
        True labels for ``x_test_test``.
    """
    # Make predictions
    predictions = models.predict(x_test_test)

    # accuracy_score expects (y_true, y_pred); the value is the same either
    # way, but keep the documented order
    accuracy = accuracy_score(y_test_test, predictions)
    # print final model accuracy as percentage
    print('This is the accuracy for B2',accuracy*100,'%')

    # plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and only fall back to the old helper on
    # installations that do not provide it
    try:
        from sklearn.metrics import ConfusionMatrixDisplay
        draw_confusion_matrix = ConfusionMatrixDisplay.from_estimator
    except (ImportError, AttributeError):
        draw_confusion_matrix = plot_confusion_matrix
    draw_confusion_matrix(models, x_test_test, y_test_test, cmap=plt.cm.Blues)
    # show graph
    plt.show()
    

# Creating and evaluating model 

In [141]:
#function that input training and test data and evaluates 
def B2():
    """Run the whole B2 task: load the data, fit the final model and evaluate it.

    The training images and labels are loaded without the sunglasses images
    and split into a training and a validation part; only the training part is
    used to fit the model, which is then evaluated on the separate test set.
    """
    # training inputs plus the names of the images that were kept
    features_all, kept_train_files = import_x()
    # labels of exactly those kept images
    labels_all = import_y(kept_train_files)
    # the validation part of the split is not used by this function
    x_train, _x_val, y_train, _y_val = split_data(features_all, labels_all)
    # separate test set, filtered the same way
    x_test_test, kept_test_files = import_x_test()
    y_test_test = import_y_test(kept_test_files)
    # fit the final model, then report accuracy and the confusion matrix
    asses_model(final_model(x_train, y_train), x_test_test, y_test_test)


  


In [None]:
# run the B2 function to produce the final model and evaluate it on the test set
B2()

# import training and validation 

In [10]:
# function to import y and x and divide into train and validation set 
def import_X_Y():
    """Load the training data and return it split into training and validation parts.

    Returns the tuple ``(x_train, x_test, y_train, y_test)`` produced by
    ``split_data`` for the images without sunglasses and their labels.
    """
    # inputs and the names of the images that do not have sunglasses
    features, kept_files = import_x()
    # fetch the matching labels and hand both straight to the splitter
    return split_data(features, import_y(kept_files))

In [11]:
# load the sunglasses-free training data once and keep its training/validation split as
# notebook-level variables; the model-selection and tuning cells below read x_train / y_train
x_train,x_test,y_train,y_test = import_X_Y()

1612 images containing sunglasses were removed from the training dataset


# Cross validation for model selection 

In [33]:
# empty results frame with METHOD / MEAN / STD columns
# NOTE(review): no later cell in this notebook appears to use CV_df -- confirm it is still needed
CV_df = pd.DataFrame({"METHOD":[],"MEAN":[],"STD":[]})
# repeated k-fold splitter shared by all model comparisons below:
# 10 folds, repeated 3 times, seeded so every model sees the same folds
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

In [34]:
# Logistic regression
# create the model with its default settings
model = LogisticRegression()
# cross-validated accuracy on the training split, one score per fold of `cv`
scores = cross_val_score(model, x_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
# summarise the fold scores (mean and standard deviation) in a dictionary
CV_df_LR = {'Method':'Logistic regression','MEAN':scores.mean(),'STD':scores.std()}


In [35]:
# Support vector machine (linear)
# create the model with its default settings
model = LinearSVC()
# cross-validated accuracy on the training split, one score per fold of `cv`
scores = cross_val_score(model, x_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
# summarise the fold scores (mean and standard deviation) in a dictionary
CV_df_SV = {'Method':'Support Vector','MEAN':scores.mean(),'STD':scores.std()}

In [36]:

# Decision tree
# create the model with its default settings
model = DecisionTreeClassifier()
# cross-validated accuracy on the training split, one score per fold of `cv`
scores = cross_val_score(model, x_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
# summarise the fold scores (mean and standard deviation) in a dictionary
CV_df_DT = {'Method':'Decision Tree','MEAN':scores.mean(),'STD':scores.std()}

In [37]:
# Random forest
# create the model with its default settings
model = RandomForestClassifier()
# cross-validated accuracy on the training split, one score per fold of `cv`
scores = cross_val_score(model, x_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
# summarise the fold scores (mean and standard deviation) in a dictionary
CV_df_RF = {'Method':'Random Forest','MEAN':scores.mean(),'STD':scores.std()}

In [38]:
# K-nearest neighbours
# create the model with its default settings
model = KNeighborsClassifier()
# cross-validated accuracy on the training split, one score per fold of `cv`
scores = cross_val_score(model, x_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
# summarise the fold scores (mean and standard deviation) in a dictionary
CV_df_KN = {'Method':'K-Nearest Neighbors','MEAN':scores.mean(),'STD':scores.std()}

In [39]:
# compile the per-model summaries into one pandas table, one row per classifier;
# the support-vector result (CV_df_SV) is included so every evaluated model is compared
list_of_dict = [CV_df_LR, CV_df_SV, CV_df_DT, CV_df_RF, CV_df_KN]
# build the data frame from the list of dictionaries
df = pd.DataFrame(list_of_dict)
# display the table of results
df

# Hyperparameter tuning

In [40]:

# candidate values for the inverse regularisation strength C of logistic regression
param_grid = {'C':[100,10,1.0,0.1,0.01]}

# base model whose C is tuned
lr = LogisticRegression()
# exhaustive search over every C value, each scored with 3-fold cross-validation (cv = 3)
grid_search = GridSearchCV(estimator = lr, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)

In [41]:
# fit the grid search on the training split
grid_search.fit(x_train, y_train)
# a bare expression in the middle of a cell is never displayed, so print the best parameters
print('Best parameters:', grid_search.best_params_)
# estimator refitted with the best parameters
best_grid = grid_search.best_estimator_

In [42]:
# display the estimator that was refitted with the best parameters
best_grid

# Making hyperparameter boxplot  

In [43]:
# turn the grid-search results into a table
# one row per candidate parameter setting
results_df = pd.DataFrame(grid_search.cv_results_)
# best-ranked setting first
results_df = results_df.sort_values(by=["rank_test_score"])
# index each row by its parameter values joined with "_" (here just the C value);
# NOTE(review): the index is named "kernel" although the tuned parameter is C -- confirm the intended name
results_df = results_df.set_index(results_df["params"].apply(lambda x: "_".join(str(val) for val in x.values()))).rename_axis("kernel")
# display the parameters, rank, mean score and standard deviation of every setting
results_df[["params", "rank_test_score", "mean_test_score", "std_test_score"]]

In [44]:
# keep the C value and the score of each of the three cross-validation splits
r_df = results_df[['param_C','split0_test_score','split1_test_score' , 'split2_test_score']]

In [45]:
# box plot of the per-split test scores for every C value
# index the scores by C
# NOTE: this rebinds the notebook-level name `df`, which earlier held the cross-validation summary table
df = r_df.set_index('param_C')
# transpose so each C value becomes a column, then draw one box per column
df.T.boxplot()

# Making learning curve plot

In [46]:
# create the data for all the iterations needed for the learning-curve graph
# pipeline holding the optimised model
pipeline = make_pipeline(LogisticRegression(C= 0.01))
# x_train_not_split / y_train_not_split only exist as local variables inside the
# loading functions, so rebuild the full (unsplit) data set from the two parts of
# the notebook-level split instead of relying on names left over in the kernel
x_train_not_split = np.concatenate([x_train, x_test])
y_train_not_split = pd.concat([y_train, y_test])
# training and validation scores for 10 training-set sizes (10% to 100%), 10-fold cross-validation
train_sizes, train_scores, test_scores = learning_curve(estimator=pipeline, X=x_train_not_split, y=y_train_not_split,cv=10, train_sizes=np.linspace(0.1, 1.0, 10),n_jobs=1)

In [47]:
# compute the points of the learning-curve plot
# mean and standard deviation of the training scores for each training-set size (axis=1)
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
# mean and standard deviation of the validation scores for each training-set size (axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

In [48]:
# draw the learning curve on an explicit figure / axes pair
fig, ax = plt.subplots()
# training accuracy against training-set size, with a blue band of +/- one standard deviation
ax.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='Training Accuracy')
ax.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
# validation accuracy, with a green band of +/- one standard deviation
ax.plot(train_sizes, test_mean, color='green', marker='+', markersize=5, linestyle='--', label='Validation Accuracy')
ax.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green')
# title and axis labels
ax.set_title('Learning Curve')
ax.set_xlabel('Training Data Size')
ax.set_ylabel('Model accuracy')
ax.grid()
# legend explaining the two curves
ax.legend(loc='lower right')
# show the graph
plt.show()

# Making Final Model 

# Testing model