In [1]:
# Import required libraries

import numpy as np
import pandas as pd
import sklearn
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split


In [32]:
# Load the dataset: 500 generated 32x32 images with a background of 0s, each containing
# a 3x3 square of 1s placed at a random position.
# These images stand in for clinical MRI data that cannot be shared for privacy reasons.
# Scoring: the pixel values in each quadrant were summed, and a score was assigned based
# on the maximum quadrant value. The resulting 0-3 score is intended to resemble the 0-4
# integer image quality scores used on the clinical MRI data.
# Layout: each image is flattened into a single Excel row of 1024 columns
# (excel column = image column value + 32 * image row value); the score is in the 1025th column.
DATASET_PATH = "path_to_dataset"  # Replace with path
df = pd.read_excel(DATASET_PATH)
# No cleaning is needed because the data was self-generated.
# With the clinical data, failed scans (wrong timing, corrupted data, mis-labeled scans, etc.)
# would be removed, and the rest sorted by view, scan type, acquisition trajectory, and number
# of temporal frames (which would add a 1024*temporal frame term to the Excel column formula).

In [3]:
# Separate the table into x (pixel values) and y (assigned score),
# then split both into train (n=400) and test (n=100) datasets.
y = df['Score'].to_numpy()
x = df.drop(columns='Score').to_numpy()

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of the training features, then the test features.
for split_features in (x_train, x_test):
    print(split_features.shape)

(400, 1024)
(100, 1024)


In [28]:
# Train SVM, K-nearest neighbors, decision tree, and random forest classifiers to compare methods.
# The estimators that use randomness while fitting (LinearSVC, DecisionTreeClassifier,
# RandomForestClassifier) are given a fixed random_state so that re-running this cell
# produces the same fitted models and therefore the same reported accuracies.
# The seed matches the random_state used for the train/test split.
RANDOM_STATE = 42

# fit() returns the fitted estimator, so construction and fitting are chained.
SVM_Quadrent_model = LinearSVC(dual='auto', random_state=RANDOM_STATE).fit(x_train, y_train)
# I think SVM is a good model for this because, due to the nature of my substitute dataset,
# separating hyperplanes in a grid would work well.

# KNeighborsClassifier takes no random_state parameter, so none is passed.
KNN_Quadrent_model = KNeighborsClassifier().fit(x_train, y_train)
# KNN allows for classification based on similar samples, which works well, and works even
# better the further an image's square is from the quadrant boundaries.

DT_Quadrent_model = DecisionTreeClassifier(random_state=RANDOM_STATE).fit(x_train, y_train)
# I selected decision trees because only 20 of the 1024 pixels can appear in multiple classifications.

RF_Quadrent_model = RandomForestClassifier(random_state=RANDOM_STATE).fit(x_train, y_train)
# Random Forest would be a good model because Decision Trees had worse test accuracy than
# SVM and KNN, while RF decorrelates the trees so they are not too dependent on an early split.

In [29]:
# Score every fitted model on the train and test splits and print the results.
# The fitted classifiers are keyed by the short label used in the printed report;
# dict insertion order fixes the report order (SVM, KNN, DT, RF).
models_by_label = {
    'SVM': SVM_Quadrent_model,
    'KNN': KNN_Quadrent_model,
    'DT': DT_Quadrent_model,
    'RF': RF_Quadrent_model,
}

# Accuracy on the training split: compute all four first, then print them.
train_accuracy = {
    label: classifier.score(x_train, y_train)
    for label, classifier in models_by_label.items()
}
# Keep the individual result names available to the rest of the notebook.
SVM_Train_Accuracy, KNN_Train_Accuracy, DT_Train_Accuracy, RF_Train_Accuracy = train_accuracy.values()
for label, accuracy in train_accuracy.items():
    print(label + ' Train Accuracy = ' + str(accuracy))

# Accuracy on the held-out test split, reported the same way.
test_accuracy = {
    label: classifier.score(x_test, y_test)
    for label, classifier in models_by_label.items()
}
SVM_Test_Accuracy, KNN_Test_Accuracy, DT_Test_Accuracy, RF_Test_Accuracy = test_accuracy.values()
for label, accuracy in test_accuracy.items():
    print(label + ' Test Accuracy = ' + str(accuracy))

SVM Train Accuracy = 1.0
KNN Train Accuracy = 0.9975
DT Train Accuracy = 1.0
RF Train Accuracy = 1.0
SVM Test Accuracy = 0.97
KNN Test Accuracy = 0.98
DT Test Accuracy = 0.9
RF Test Accuracy = 0.98


In [None]:
#Results & Discussion:
#Training Data accuracy was very high for all of the methods. 
#However, DT had consistently ~10% lower test accuracy than SVM and KNN (Multiple trainings resulted in slightly different accuracies)
#I implemented RF to attempt to improve upon DT's test accuracy, and it was more in line with SVM and KNN
#Overall, SVM, KNN, and RF performed well, and DT was good but not to the levels of the other 3
#Since there is perfect delineation in the substitute dataset, there are very high accuracy scores
#The clinical dataset is much more complex and will likely not produce this kind of accuracy
#With this knowledge, I will focus on SVM, KNN, and RF models to use on my clinical dataset