In [1]:
#Name: CS 458 Task 1 Testing File Predictions and Model Analysis
#Authors: Liliana Pacheco, Chantelle Suarez, Yan Tarpley
#Date: December 9, 2019
#Description: This code takes in the training.txt file and testing.txt file. It then evaluates the training data
#and processes the data and evaluates it to find the best possible model. It then makes preditions for the
#testing.txt file 
#NOTE: The model analysis portion of this program was created using Jason Brownlee's article 
#"How To Compare Machine Learning Algorithms in Python with scikit-learn" as a referrence
#Link to the article: https://machinelearningmastery.com/compare-machine-learning-algorithms-python-scikit-learn/

# Load libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings('ignore')

#----------------------------------------------------------------------------------------------#
#                                 May Change File Paths Here                                   #

trainingFile = open(r"C:\Users\lilia_fdv6j62\Documents\training.txt","r+")
testingFile = open(r"C:\Users\lilia_fdv6j62\Documents\testing.txt", "r+")

#----------------------------------------------------------------------------------------------#

#Load Training  dataset 
trainingNames = ['info ID', 'ft ID', 'val']
trainingDF = pd.read_csv(trainingFile, delimiter=" ", names=trainingNames)
trainingData = pd.pivot_table(trainingDF, index=['info ID'], columns=['ft ID'], values=['val'])
trainingData.fillna(0, inplace=True)

#Load Testing Dataset
testingNames = ['info ID', 'ft ID', 'val']
testingDF = pd.read_csv(testingFile, delimiter=" ", names=testingNames)
testingData = pd.pivot_table(testingDF, index=['info ID'], columns=['ft ID'], values=['val'])
testingData.fillna(0, inplace=True)

#Load Training Labels
trainingLabelFile = open(r"C:\Users\lilia_fdv6j62\Documents\label_training.txt", 'r+')
labelNames = ['label']
trainingLabelDF = pd.read_csv(trainingLabelFile, delimiter=" ", names=labelNames)

#Merge Trainning Data and Training Labels
trainingLabelDF.insert(0, 'info ID', range(1, len(trainingLabelDF) + 1))
trainingMerge = pd.merge(trainingLabelDF, trainingData, on="info ID")
cols = list(trainingMerge.columns.to_numpy())
trainingLabelCol = trainingMerge.pop('label')
trainingMerge['label'] = trainingLabelCol

#Process Training Data with PCA
trainingFeatures = list(trainingMerge.columns)
trainingTarget = trainingFeatures.pop()

x = trainingMerge.loc[:, trainingFeatures].values
y = trainingMerge.loc[:,['label']].values
x = StandardScaler().fit_transform(x)

numComponents = 55
pca = PCA(n_components=numComponents)
principalComponents = pca.fit_transform(x)
principalDF = pd.DataFrame(data=principalComponents, columns = range(1,numComponents + 1))

finalDF = pd.concat([principalDF, trainingMerge[['label']]], axis=1)

#Process Testing Data with PCA
testingFeatures = list(testingData.columns)
x2 = testingData.loc[:, testingFeatures].values
x2 = StandardScaler().fit_transform(x2)

testingPCA = PCA(n_components = numComponents)
testingPrincipalComponents  = testingPCA.fit_transform(x2)
testingFinalDF = pd.DataFrame(data = testingPrincipalComponents, columns = range(1, numComponents + 1))

#Split Training Data For Model Analysis
finalData = finalDF.values
xVal = finalData[:,0:numComponents]
yVal= finalData[:,numComponents]
xTrain, xTest, yTrain, yTest = train_test_split(xVal, yVal, test_size=0.20, random_state=1)

#Select Models For Testing
models = []
models.append(('LR ', LogisticRegression(solver='liblinear',multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNC', KNeighborsClassifier()))
models.append(('DTC', DecisionTreeClassifier()))
models.append(('GNB', GaussianNB()))

#Evaluate Each Model
print("\n\n#-----------------------------------------------------------------------------------------------------#")
print("#-------------------------------Model Analysis and Testing With training.txt--------------------------#")
print("#-----------------------------------------------------------------------------------------------------#")
print("     Accuracy AVG Diviation")
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=0)
    cv_results = model_selection.cross_val_score(model, xTrain, yTrain, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    
#Select The Best Model For Testing
print("The best model results in a Logistic Regression Model.")
testModel = LogisticRegression(solver='liblinear',multi_class='ovr')
testModel.fit(xTrain, yTrain)

#Make Sample Predictions
testPredictions = testModel.predict(xTest)
print("\nSample Predictions: ")
print(testPredictions)
print("\nSample Actual:")
print(yTest)

# Evaluate Sample predictions
print("\nSample Accuracy:")
print(accuracy_score(yTest, testPredictions))

print("\nSample Report")
print(classification_report(yTest, testPredictions))

#Make Final Predictions for Test.txt
finalModel = LogisticRegression(solver='liblinear',multi_class='ovr')
finalModel.fit(xVal, yVal)

finalPredictions = finalModel.predict(testingFinalDF)
print("\n\n#-----------------------------------------------------------------------------------------------------#")
print("#-----------------------------------Final Predictions For Test.txt------------------------------------#")
print("#-----------------------------------------------------------------------------------------------------#")
print(finalPredictions)




#-----------------------------------------------------------------------------------------------------#
#-------------------------------Model Analysis and Testing With training.txt--------------------------#
#-----------------------------------------------------------------------------------------------------#
     Accuracy AVG Diviation
LR : 0.981665 (0.008624)
LDA: 0.844544 (0.038576)
KNC: 0.974196 (0.014209)
DTC: 0.890683 (0.034626)
GNB: 0.479256 (0.044345)
The best model results in a Logistic Regression Model.

Sample Predictions: 
[-1.  1.  1.  1. -1. -1.  1. -1. -1.  1.  1. -1. -1.  1.  1.  1. -1.  1.
 -1.  1.  1.  1. -1.  1.  1.  1.  1. -1.  1.  1.  1.  1.  1.  1.  1. -1.
  1.  1.  1.  1. -1.  1.  1.  1.  1.  1.  1. -1.  1.  1.  1. -1.  1. -1.
  1.  1. -1.  1. -1.  1. -1.  1.  1.  1.  1. -1.  1.  1.  1. -1.  1. -1.
  1.  1.  1.  1. -1.  1. -1.  1. -1. -1. -1.  1.  1. -1.  1. -1.  1. -1.
 -1.  1. -1. -1.  1. -1.  1.  1. -1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1. -1.  1. -1.  