In [1]:
import matplotlib.pyplot as plt
import csv
from PIL import Image
import numpy as np
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
import argparse
import pickle

This is the training program.
The main entry point is the trainData() function.
Note: change training_data_directory to the location of the actual training dataset.

Input: the training data
Output: the trained models are dumped (pickled) to disk so they can be used later for testing

We use the training data to train 3 models with the best parameters we found. For each model we show its training accuracy, validation accuracy, fitting duration, confusion matrix, cross-validation scores, and cross-validation accuracy.

In [2]:
import os

# Location of the training images. Set the TRAFFIC_SIGN_TRAINING_DIR environment
# variable to point at your own copy of the dataset; when it is unset, the
# path that was previously hard-coded here is used as the fallback.
training_data_directory = os.environ.get(
    "TRAFFIC_SIGN_TRAINING_DIR",
    "/Users/kaiyuan/Desktop/3023Project/TrafficSignData/Training",
)

In [3]:
import cv2 as cv
from skimage import exposure

def readTrafficSigns(rootpath, num_classes=15):
    '''Read the traffic sign images and labels listed in the per-class annotation files.

    For every class id c in range(num_classes) this opens the semicolon-delimited
    annotation file <rootpath>/<c:05d>/GT-<c:05d>.csv, skips its header row and
    loads every image the file lists. Each image is resized to 32x32 (bicubic),
    converted to a single channel with OpenCV and contrast-equalised with
    skimage's equalize_adapthist (clip_limit=0.1).

    Arguments:
        rootpath:    path to the traffic sign data, for example './TrafficSignData/Training'
        num_classes: number of class sub-directories to read, starting at class 0.
                     Defaults to 15, the value that used to be hard-coded here.
    Returns:
        list of preprocessed images, list of corresponding labels. Labels are the
        raw strings found in the 8th CSV column (they are not converted to int here).
    Raises:
        FileNotFoundError: if an annotation file or a listed image does not exist.
    '''
    images = []  # preprocessed images
    labels = []  # corresponding labels (strings, as read from the CSV)
    for c in range(num_classes):
        class_id = format(c, '05d')
        prefix = rootpath + '/' + class_id + '/'  # subdirectory for class
        # `with` guarantees the annotation file is closed even when loading an
        # image raises part-way through the loop.
        with open(prefix + 'GT-' + class_id + '.csv', newline='') as gtFile:
            gtReader = csv.reader(gtFile, delimiter=';')
            next(gtReader, None)  # skip header; tolerate a completely empty file
            for row in gtReader:
                if not row:
                    # csv.reader yields [] for blank lines (e.g. a trailing newline)
                    continue
                # the 1st column is the filename; close the source file as soon
                # as the resized copy has been produced
                with Image.open(prefix + row[0]) as source_image:
                    # make sure all images have the same size
                    img = source_image.resize((32, 32), Image.BICUBIC)

                img = np.array(img)
                # NOTE(review): PIL normally yields RGB channel order, so
                # COLOR_BGR2GRAY applies the red/blue weights swapped. Left
                # unchanged on purpose: any inference-side preprocessing must
                # match what the models were trained on - confirm before
                # switching to COLOR_RGB2GRAY.
                img = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
                img = exposure.equalize_adapthist(img, clip_limit=0.1)
                images.append(img)
                labels.append(row[7])  # the 8th column is the label
    return images, labels


In [6]:
from sklearn.model_selection import train_test_split

# Fraction of the samples held out as the validation split.
TEST_SIZE = 0.10

def loadData(random_state=0):
    '''Load the training images and split them into training and validation sets.

    Reads the dataset from ``training_data_directory`` via ``readTrafficSigns``,
    flattens every image into one feature row, converts the labels to ``int``
    and holds out ``TEST_SIZE`` of the samples for validation.

    Arguments:
        random_state: seed passed to ``train_test_split``. It defaults to 0 so
                      that every call produces the same split (previously each
                      call drew a different random split, so the models saved
                      by ``trainData`` were fitted on other rows than the ones
                      evaluated in the notebook). Pass ``None`` to get the old
                      unseeded behaviour.
    Returns:
        X_train, X_val, y_train, y_val as numpy arrays.
    '''
    trainImages, trainLabels = readTrafficSigns(training_data_directory)
    print('number of historical data=', len(trainLabels))

    # Model input: one flattened image per row. Model output: the integer class id.
    X = np.array([image.flatten() for image in trainImages])
    Y = np.array([int(label) for label in trainLabels])
    print(f"shape of X : {np.shape(X)}")

    X_train, X_val, y_train, y_val = train_test_split(
        X, Y, test_size=TEST_SIZE, random_state=random_state)
    print(f"shape of X_train : {np.shape(X_train)}")
    print(f"length of training data : {len(X_train)}.")
    print(f"length of validation data : {len(X_val)}.")
    return X_train, X_val, y_train, y_val

In [7]:
# Build the train/validation split once; the evaluation cells further down
# read X_train, X_val, y_train and y_val from the notebook namespace.
X_train, X_val, y_train, y_val = loadData()

number of historical data= 15540
shape of X : (15540, 1024)
shape of X_train : (13986, 1024)
length of training data : 13986.
length of validation data : 1554.


In [8]:
# Random forest, configured with the parameters reported as performing best
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import time

randomForest = RandomForestClassifier(
    n_estimators=40,
    max_features="sqrt",
    random_state=0,
)

# Only the call to fit() sits inside the timed window.
start_time = time.time()
randomForest = randomForest.fit(X_train, y_train)
end_time = time.time()
duration = end_time - start_time

# Rows are the true validation labels, columns the predicted labels.
model_confusion_matrix = confusion_matrix(y_val, randomForest.predict(X_val))

print("RandomForestClassifier, training accuracy: {:.3f}, validation accuracy: {:.3f}".format(
    randomForest.score(X_train, y_train),
    randomForest.score(X_val, y_val),
))
print("Fitting Duration (seconds) :%.2fs" % duration)
print("Confusion Matrix :")
print(model_confusion_matrix)

# 5-fold cross-validation on the training split only.
cv_scores = cross_val_score(randomForest, X_train, y_train, cv=5)
print("Cross-Validation-Scores:", cv_scores)
print("MAX Cross-Validation-Score:", max(cv_scores))
print("MIN Cross-Validation-Score:", min(cv_scores))
print(f"Cross-Validation-Accuracy: {cv_scores.mean():.6f} (+/- {cv_scores.std() * 2:.2f})")

RandomForestClassifier, training accuracy: 1.000, validation accuracy: 0.950
Fitting Duration (seconds) :8.22s
Confusion Matrix :
[[ 15   1   1   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0 129   3   0   1   0   0   1   1   0   0   0   0   0   0]
 [  0   4 141   0   1   4   0   1   1   0   0   0   0   1   0]
 [  0   1   2  85   1   6   0   0   1   1   0   0   0   0   0]
 [  0   0   1   0 133   0   0   0   1   0   0   0   0   0   0]
 [  0   2   5   0   2 115   0   1   4   0   2   0   1   1   0]
 [  0   0   0   0   0   1  34   0   0   0   0   0   1   0   0]
 [  0   1   0   0   0   6   0  86   4   0   0   0   0   0   0]
 [  0   0   1   0   0   0   0   3  90   0   0   0   1   0   0]
 [  0   0   0   0   0   0   0   0   2 108   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0 115   0   0   0   0]
 [  0   0   0   0   0   0   0   1   1   0   0  94   0   0   0]
 [  0   0   1   0   0   0   0   1   0   1   0   0 134   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0  

In [9]:
# SVM (RBF kernel), configured with the parameters reported as performing best
from sklearn import svm

clf = svm.SVC(C=10.5, kernel="rbf")

# Only the call to fit() sits inside the timed window.
start_time = time.time()
clf.fit(X_train, y_train)
end_time = time.time()
duration = end_time - start_time

score = clf.score(X_val, y_val)
# Rows are the true validation labels, columns the predicted labels.
model_confusion_matrix = confusion_matrix(y_val, clf.predict(X_val))

print("SVM, training accuracy: {:.3f}, validation accuracy: {:.3f}".format(
    clf.score(X_train, y_train),
    clf.score(X_val, y_val),
))
print("Fitting Duration (seconds) :%.4fs" % duration)
print("Confusion Matrix :")
print(model_confusion_matrix)

# 5-fold cross-validation on the training split only.
cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
print("Cross-Validation-Scores:", cv_scores)
print("MAX Cross-Validation-Score:", max(cv_scores))
print("MIN Cross-Validation-Score:", min(cv_scores))
print(f"Cross-Validation-Accuracy: {cv_scores.mean():.6f} (+/- {cv_scores.std() * 2:.2f})")



SVM, training accuracy: 1.000, validation accuracy: 0.984
Fitting Duration (seconds) :62.4688s
Confusion Matrix :
[[ 17   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0 130   3   0   0   1   0   0   1   0   0   0   0   0   0]
 [  0   0 152   0   0   1   0   0   0   0   0   0   0   0   0]
 [  0   1   0  95   0   1   0   0   0   0   0   0   0   0   0]
 [  0   0   2   0 132   0   0   0   1   0   0   0   0   0   0]
 [  0   0   1   1   0 128   0   1   0   0   0   0   0   2   0]
 [  0   0   0   0   0   0  36   0   0   0   0   0   0   0   0]
 [  0   1   3   0   0   1   0  92   0   0   0   0   0   0   0]
 [  0   0   1   0   0   0   0   0  94   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0 109   0   0   0   1   0]
 [  0   0   0   0   0   0   0   0   0   0 115   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0  94   2   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0 137   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0 146   0]
 [  

In [10]:
# MLP, configured with the parameters reported as performing best
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(
    solver='sgd',
    alpha=1e-5,
    hidden_layer_sizes=(100,),
    random_state=0,
)

# Only the call to fit() sits inside the timed window.
start_time = time.time()
clf.fit(X_train, y_train)
end_time = time.time()
duration = end_time - start_time

score = clf.score(X_val, y_val)
# Rows are the true validation labels, columns the predicted labels.
model_confusion_matrix = confusion_matrix(y_val, clf.predict(X_val))

print("MLP, training accuracy: {:.3f}, validation accuracy: {:.3f}".format(
    clf.score(X_train, y_train),
    clf.score(X_val, y_val),
))
print("Fitting Duration (seconds) :%.4fs" % duration)
print("Confusion Matrix :")
print(model_confusion_matrix)

# 5-fold cross-validation on the training split only.
cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
print("Cross-Validation-Scores:", cv_scores)
print("MAX Cross-Validation-Score:", max(cv_scores))
print("MIN Cross-Validation-Score:", min(cv_scores))
print(f"Cross-Validation-Accuracy: {cv_scores.mean():.6f} (+/- {cv_scores.std() * 2:.2f})")






MLP, training accuracy: 0.968, validation accuracy: 0.939
Fitting Duration (seconds) :109.8117s
Confusion Matrix :
[[ 15   1   0   0   0   1   0   0   0   0   0   0   0   0   0]
 [  0 119   9   1   0   1   0   0   5   0   0   0   0   0   0]
 [  0   2 140   1   0   2   0   0   1   2   0   1   3   1   0]
 [  0   1   2  88   0   3   0   0   2   0   0   0   1   0   0]
 [  0   1   0   1 132   0   0   0   1   0   0   0   0   0   0]
 [  0   3   1   7   0 118   0   1   1   0   0   0   0   2   0]
 [  0   0   0   0   0   3  33   0   0   0   0   0   0   0   0]
 [  0   1   3   0   0   3   0  86   1   0   1   1   1   0   0]
 [  1   1   3   0   1   0   0   1  88   0   0   0   0   0   0]
 [  0   0   1   0   0   1   0   0   0 107   1   0   0   0   0]
 [  0   0   0   1   0   0   0   0   0   0 114   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0  95   1   0   0]
 [  0   1   1   1   0   0   0   1   0   0   2   1 130   0   0]
 [  0   0   0   0   1   0   0   0   0   0   1   1   0 143   0]
 [ 



Cross-Validation-Scores: [0.93709793 0.92777976 0.92491956 0.93350018 0.93600286]
MAX Cross-Validation-Score: 0.9370979270907791
MIN Cross-Validation-Score: 0.9249195566678584
Cross-Validation-Accuracy: 0.931860 (+/- 0.01)




In [11]:
def trainData():
    '''Train the three final models and pickle each one to the working directory.

    Loads the data with ``loadData()`` and fits, in this order:
      * a random forest      -> "FinalrandomForest.pkl"
      * an SVM (RBF kernel)  -> "finalsvm.pkl"
      * an MLP               -> "neuralnetwork.pkl"

    Only the training part of the split is used for fitting; the validation
    part returned by ``loadData()`` is ignored here. Progress messages are
    printed around every fit. Returns nothing.
    '''
    # Imported locally so that this function does not depend on the MLP
    # evaluation cell having been run first (that cell was the only place
    # importing MLPClassifier).
    from sklearn.neural_network import MLPClassifier

    X_train, _, y_train, _ = loadData()

    # (display name, estimator, output file). The hyper-parameters mirror the
    # ones used by the evaluation cells of this notebook.
    models = [
        (
            "Random Forest",
            RandomForestClassifier(n_estimators=40, random_state=0, max_features="sqrt"),
            "FinalrandomForest.pkl",
        ),
        (
            "SVM",
            # The SVM evaluated earlier in this notebook uses the RBF kernel; the
            # previous code persisted a linear-kernel SVM, i.e. a different model
            # from the one whose scores were reported.
            svm.SVC(kernel="rbf", C=10.5),
            "finalsvm.pkl",
        ),
        (
            "MLP",
            MLPClassifier(solver='sgd', alpha=1e-5,
                          hidden_layer_sizes=(100,), random_state=0),
            "neuralnetwork.pkl",
        ),
    ]

    for label, model, filename in models:
        print("---------------------------")
        print(f"Training {label} Model...")
        model.fit(X_train, y_train)
        with open(filename, "wb") as op_file:
            pickle.dump(model, op_file)
        print(f"Done training {label} Model")
        print("Done dump model.")

    print("---------------------------")
    print("Finish training, can get your model at device.")
    

In [12]:
# Fit the three models and write their pickle files; the file names are
# relative, so they are created in the current working directory.
trainData()

number of historical data= 15540
shape of X : (15540, 1024)
shape of X_train : (13986, 1024)
length of training data : 13986.
length of validation data : 1554.
---------------------------
Training Random Forest Model...
Done training Random Forest Model
Done dump model.
---------------------------
Training SVM Model...
Done training SVM Model
Done dump model.
---------------------------
Training Random Forest Model...
Done training MLP Model
Done dump model.
---------------------------
Finish training, can get your model at device.


