# Machine Learning for Engineers Final Project
#### by: Justin May, Jonathan Hong, and Joseph Shenouda

In [1]:
"""
Universal imports
"""
import numpy as np
import pandas as pd
import math
import csv
import pickle
import time
import sys

In [2]:
"""
Handy function to monitor progress of running cells
"""
def progress(count, total, suffix=''):
    bar_len = 60
    filled_len = int(round(bar_len * count / float(total)))

    percents = round(100.0 * count / float(total), 1)
    bar = '=' * filled_len + '-' * (bar_len - filled_len)

    sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', suffix))
    sys.stdout.flush()  # As suggested by Rom Ruben

## Imaging Dataset Classification Problem
[CIFAR-10 Dataset](https://www.cs.toronto.edu/~kriz/cifar.html)

### Preprocessing Step

In [11]:
from sklearn import preprocessing as sklpp
from sklearn import decomposition as skldecomp
from sklearn import metrics

In [14]:
"""
Extracting data from file into numpy array
"""
data = None
labels = []

file = "images/data_batch_1"
with open(file, 'rb') as fo:
    print("extracting file "+file+"...")
    dict = pickle.load(fo, encoding='bytes')
    temp_data = dict[b'data']
    try:
        data = np.concatenate((data, temp_data), axis=0)
    except:
        data = temp_data
    labels = labels + dict[b'labels']

labels = np.asarray(labels)
data = np.asarray(data)
labels = labels.reshape(-1,1)
print("Finished Extracting Features")
print(np.shape(data))
        

extracting file images/data_batch_1...
Finished Extracting Features
(10000, 3072)


## Preprocessing and Feature Learning For Images Dataset
For preprocessing we centered our dataset to 0 mean and then for feature learning we apply PCA to reduce our dimensions from 3072 to a smaller number of features than contains 95% of the variance of our data. We did this in order to speed up our computation time for the other algorithms we would be implementing on the dataset for classification.

In [4]:
# First we create a StandardScaler object to 0 mean the data matrix but preserve the variance
stand_scaler = sklpp.StandardScaler(with_mean = True, with_std = False)
# Fits the data matrix to the StandardScaler object defined ^
centered_ImageData = stand_scaler.fit_transform(data)

In [5]:
# Creates a PCA object that reduces the dimensions of our data matrix keeping 95% of the variance
pca_obj = skldecomp.PCA(n_components = 0.95, svd_solver = 'auto')
dim_reducedImageData = pca_obj.fit_transform(centered_ImageData)
np.save('dim_reducedImageData.npy', dim_reducedImageData)

In [6]:
print('Data has been reduced to {} features after PCA'.format(dim_reducedImageData.shape[1]))

Data has been reduced to 209 features after PCA


### Quadratic Discriminant Analysis (by: Justin May)
I implemeneted the algorithim we learned in class:

first learn the parameters (expected average, prior probabilities, covariance matricies)
define the discriminant functions to find the best labels

In [16]:
"""
Pull pca data from saved
"""
image_data = np.load('dim_reducedImageData.npy')
image_data = np.concatenate((image_data,labels),axis=1)
number_of_parameters = len(image_data[0]) - 1 #209
number_of_total_samples = len(image_data)
"""
K-Fold cross validation 
"""     
final_estimates = []
k = 10
for i in range(1,k+1):
    training_data = np.concatenate((image_data[0:(i-1)*int(number_of_total_samples/k)],image_data[i*int(number_of_total_samples/k):number_of_total_samples]),axis=0)
    testing_data = image_data[(i-1)*int(number_of_total_samples/k):i*int(number_of_total_samples/k)]
    number_of_training_samples = len(training_data)
    """
    Define Parameters 
    """
    mu = [[0 for j in range(number_of_parameters)] for i in range(10)] # initializing means 
    sigmas = [[[0 for k in range(number_of_parameters)] for j in range(number_of_parameters)] for i in range(10)] # initializing variances
    occurences = [0 for i in range(10)]
    pi = [0 for i in range(10)]
        
    """
    Learn mu
    """
    # Find Sums and Occurences
    for image in training_data:
        label = int(image[-1])
        occurences[label] += 1
        mu[label] = np.add(mu[label],image[:-1])
    # Calculate Averages and Prior Probabilities 
    for label,sums in enumerate(mu):
        mu[label] = np.multiply(1/(occurences[label] - 1), sums).reshape(-1,1) #unbaised estimator
        pi[label] = occurences[label] / number_of_total_samples
    """
    Learn sigma^2
    """
    time = 0
    for image in training_data:
        time += 1
        if time%10 == 0:
            progress(time,number_of_training_samples,suffix="training k= "+str(i))
        label = int(image[-1])
        image = image[:-1].reshape(-1,1)
        difference = np.subtract(image,mu[label])
        sigma = difference.dot(difference.T)
        sigmas[label] = np.add(sigmas[label],sigma)
    for label,sigma in enumerate(sigmas):
        sigmas[label] = np.multiply(1/(occurences[label]-1),sigma)
    sys.stdout.flush()
    """
    Method to find the best discriminant score
    """
    def estimateBestLabel(image):
        scores = [0 for _ in range(10)]
        # find score for each label 

        for label in range(10):
            inverted_variance = np.linalg.inv(sigmas[label])
            first_term = -0.5*image.T.dot(inverted_variance).dot(image)
            second_term = image.T.dot(inverted_variance).dot(mu[label])
            third_term = -0.5*mu[label].T.dot(inverted_variance).dot(mu[label])
            (sign, logdet) = np.linalg.slogdet(sigmas[label])
            fourth_term = -0.5*sign*logdet
            fifth_term = math.log(pi[label])
            score = first_term + second_term + third_term + fourth_term + fifth_term
            scores[label] = score[0][0]
        return(scores.index(max(scores)))
    correct = 0
    print("")
    for index, image in enumerate(testing_data):
        if index % 9 == 0:
            progress(index+1,1000,suffix="testing k= "+str(i))
        image = image[:-1].reshape(-1,1)
        if testing_data[index][-1] == estimateBestLabel(image):
            correct += 1
    sys.stdout.flush()
    print(correct/1000)
    final_estimates.append(correct/1000)
print("")
print(final_estimates)


[0.466, 0.498, 0.479, 0.473, 0.464, 0.49, 0.441, 0.469, 0.482, 0.473]


In [70]:
final_estimatesAverage = sum([0.466, 0.498, 0.479, 0.473, 0.464, 0.49, 0.441, 0.469, 0.482, 0.473])/10
final_estimatesAverage

0.4734999999999999

In [17]:
from sklearn.metrics import confusion_matrix

In [18]:
"""
Method to find the best discriminant score
"""
def estimateBestLabel(image):
    scores = [0 for _ in range(10)]
    # find score for each label 

    for label in range(10):
        inverted_variance = np.linalg.inv(sigmas[label])
        first_term = -0.5*image.T.dot(inverted_variance).dot(image)
        second_term = image.T.dot(inverted_variance).dot(mu[label])
        third_term = -0.5*mu[label].T.dot(inverted_variance).dot(mu[label])
        (sign, logdet) = np.linalg.slogdet(sigmas[label])
        fourth_term = -0.5*sign*logdet
        fifth_term = math.log(pi[label])
        score = first_term + second_term + third_term + fourth_term + fifth_term
        scores[label] = score[0][0]
    return(scores.index(max(scores)))
correct = 0
print("")
estimated = []
real = []
for index, image in enumerate(testing_data):
    if index % 9 == 0:
        progress(index+1,1000,suffix="testing k= "+str(i))
    image = image[:-1].reshape(-1,1)
    real_label = testing_data[index][-1]
    estimated_label = estimateBestLabel(image)
    real.append(real_label)
    estimated.append(estimated_label)
sys.stdout.flush()
confusion = confusion_matrix(real, estimated,labels=[0,1,2,3,4,5,6,7,8,9])



In [19]:
"""
Print Confusion Matrix 
"""
image_labels = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship' ,'truck']
confusion = pd.DataFrame(confusion)
confusion.insert(0, "", image_labels, True)
confusion.columns = ["", 'airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship' ,'truck']
print(confusion.to_string(index=False))

             airplane  automobile  bird  cat  deer  dog  frog  horse  ship  truck
   airplane        45           7     4    5     5    3     0      3    16     16
 automobile         4          70     1    3     0    0     2      0     1     26
       bird         8           2    32   16    23   15     5      4     1      9
        cat         2           6     4   42    10   18     6     12     1      8
       deer         8           8     9    4    44    4     0     16     3      7
        dog         1           4     7   13     1   26     4     11     0     10
       frog         0          10     8   12     7    6    50      1     5      9
      horse         4           6     2    4     5    6     2     49     0      8
       ship        13          13     0    0     4    0     2      0    57     12
      truck         5          16     0    2     0    1     0      2     6     58


### K-Nearest Neighbor Classifier(by: Joseph Shenouda)

In [7]:
from sklearn import neighbors as sklneighb

### K-Nearest Neighbors with SciKit Learn
We first create a KNeighborsClassifier that accepts the following parameters
1. **metric**: this defines the method we will use to calculate the distance between a test sample and the training samples to determine the K nearest training samples to the given test
2. **algorithm**: this is set to 'auto' and it tells SciKit which algorithm it should use to calculate the nearest neighbors we left it to the default of auto so that SciKit would give the best choice based on our training data
3. **leaf_size**: Leaf_size is a parameter used for 2 of the possible algorithms from the algorithms parameter BallTree and KDTree we left this at the default because our algorithm was being choosen by SciKit
4. **p** : p is the power parameter for the Minkowski metric in our case since we are doing euclidean distance it will just be 2
5. **weights**: is set to its default value of 'uniform' so that each point in the neighborhood of test sample has the same strength and contributes as equally as any other point to the overall classification
6. **metric_params**: additional keyword arguments for the distance metric used
7. **n_jobs**: number of parallel jobs we set it to the defualt of none because we are not doing parallel

Then we fit the object we just created to our training data and finally we call the KNeighborsClassifier.predict method which accepts the test samples as inputs and returns the labels it classified each sample as it does this in the following steps:
1. It finds the n_neighbors closest training points using the metric defined for the KNeighborsClassifier we defined earlier
2. Then based on the closest n_neighbors it found it assigns the test sample to the label that turned up the most in the n_neighbors closest training samples

In [30]:
"""
A method to compute the kth fold CV of K nearest neighbor
Parameters:
k_iterCV - The current iteration of k fold CV
k_NN -  The number of neighbors for K Nearest Neighbors algo
data - The data matrix to be split into training and test
kCVResults - An array that stores the results of each K-Fold CV result
"""
def kFoldKNN(k_iterCV, k_NN, data, kFoldCVResults, kFolds):
    number_of_total_samples = data.shape[0]
    
    #Splits up the data ito training and testing data for each iteration of K-fold CV
    training_data = np.concatenate((data[0:(k_iterCV-1)*int(number_of_total_samples/kFolds)],data[k_iterCV*int(number_of_total_samples/kFolds):number_of_total_samples]),axis=0)
    training_labels = np.concatenate((labels[0:(k_iterCV-1)*int(number_of_total_samples/kFolds)],labels[k_iterCV*int(number_of_total_samples/kFolds):number_of_total_samples]),axis=0)
    
    testing_data = data[(k_iterCV-1)*int(number_of_total_samples/kFolds):k_iterCV*int(number_of_total_samples/kFolds)]
    test_data_labels = labels[(k_iterCV-1)*int(number_of_total_samples/kFolds):k_iterCV*int(number_of_total_samples/kFolds)]
    
    # KNN implementation from SciKit Learn as explained above 
    neigh = sklneighb.KNeighborsClassifier(metric='euclidean',n_neighbors = k_NN)
    neigh.fit(training_data,training_labels)
    KNNClassifierResults = neigh.predict(testing_data)
    
    # From the SciKit learn library this method takes in the ground truth labels of the test data and the predicted labels then returns
    # the fraction of correctly classified samples
    kFoldCVResults.append(metrics.accuracy_score(test_data_labels,KNNClassifierResults)) 

In [33]:
#3 different K values for KNN
kNNs = [5, 15 , 20, 30, 50]

#A dictionary that stores the K for KNN as keys
#And the list of kFoldCVResults for that KNN as values
kNNResults = {}

#Iterating over each K value for KNN
for kNN in kNNs:
    kFoldResults = []
    #10-fold Cross validation
    for k in range(1,11):
        kFoldKNN(k,kNN, dim_reducedImageData, kFoldResults, 10)
    kNNResults[kNN] = kFoldResults

In [35]:
avgError_kNN5 = sum(kNNResults[5])/10
avgError_kNN15 = sum(kNNResults[15])/10
avgError_kNN20 = sum(kNNResults[20])/10
avgError_kNN30 = sum(kNNResults[30])/10
avgError_kNN50 = sum(kNNResults[50])/10

print('The average error after 10-Fold CV for k = 5 KNN is {}\n'.format(avgError_kNN5))
print('The average error after 10-Fold CV for k = 15 KNN is {}\n'.format(avgError_kNN15))
print('The average error after 10-Fold CV for k = 20 KNN is {}\n'.format(avgError_kNN20))
print('The average error after 10-Fold CV for k = 30 KNN is {}\n'.format(avgError_kNN30))
print('The average error after 10-Fold CV for k = 50 KNN is {}\n'.format(avgError_kNN50))

The average error after 10-Fold CV for k = 5 KNN is 0.3043

The average error after 10-Fold CV for k = 15 KNN is 0.30260000000000004

The average error after 10-Fold CV for k = 20 KNN is 0.3045

The average error after 10-Fold CV for k = 30 KNN is 0.29900000000000004

The average error after 10-Fold CV for k = 50 KNN is 0.2942



### Logistic Regression (by Jonathan Hong)

In [20]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

In [24]:
dim_reducedImageData = np.load('dim_reducedImageData.npy')
kf = KFold(n_splits=10, random_state=None, shuffle=False)
labels = labels.flatten()

In [25]:
# initialize
accuracy = []
models = []
count = 1

# split into training and validation index
for train_index, test_index in kf.split(dim_reducedImageData):
    print("Training Iteration ", count, ": \n")
    print("\tTraining set: [", train_index[0], '...', train_index[-1], "] \tTest set: [", test_index[0], '...', test_index[-1], "]")
    
    # split into training and validation dataset
    X_train, X_test = dim_reducedImageData[train_index], dim_reducedImageData[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    
    # Use Logistic Regression Model - One vs. All because we need to learn 10 classes
    model = LogisticRegression(solver='liblinear', multi_class="ovr", max_iter = 400)
    
    # fit training data to model
    model.fit(X_train, y_train)
    
    # report the score using the test dataset
    score = model.score(X_test, y_test)
    
    # append the results for later
    accuracy.append(score)
    models.append(model)
    print("Score: ", score)
    count +=1
# results    
print('Accuracies:', accuracy)

Training Iteration  1 : 

	Training set: [ 1000 ... 9999 ] 	Test set: [ 0 ... 999 ]
Score:  0.379
Training Iteration  2 : 

	Training set: [ 0 ... 9999 ] 	Test set: [ 1000 ... 1999 ]
Score:  0.379
Training Iteration  3 : 

	Training set: [ 0 ... 9999 ] 	Test set: [ 2000 ... 2999 ]
Score:  0.384
Training Iteration  4 : 

	Training set: [ 0 ... 9999 ] 	Test set: [ 3000 ... 3999 ]
Score:  0.367
Training Iteration  5 : 

	Training set: [ 0 ... 9999 ] 	Test set: [ 4000 ... 4999 ]
Score:  0.348
Training Iteration  6 : 

	Training set: [ 0 ... 9999 ] 	Test set: [ 5000 ... 5999 ]
Score:  0.398
Training Iteration  7 : 

	Training set: [ 0 ... 9999 ] 	Test set: [ 6000 ... 6999 ]
Score:  0.386
Training Iteration  8 : 

	Training set: [ 0 ... 9999 ] 	Test set: [ 7000 ... 7999 ]
Score:  0.374
Training Iteration  9 : 

	Training set: [ 0 ... 9999 ] 	Test set: [ 8000 ... 8999 ]
Score:  0.378
Training Iteration  10 : 

	Training set: [ 0 ... 8999 ] 	Test set: [ 9000 ... 9999 ]
Score:  0.386
Accuracies

In [72]:
logAccuracy_avg = sum([0.379, 0.379, 0.384, 0.367, 0.348, 0.398, 0.386, 0.374, 0.378, 0.386])/10
logAccuracy_avg

0.3779

### Best Iteration @ K=6

In [26]:
indexes = np.arange(10000)
test_index = np.arange(5000,6000)
model = models[5]
predictions = model.predict(dim_reducedImageData[test_index])
confusion = confusion_matrix(labels[test_index], predictions, labels=[0,1,2,3,4,5,6,7,8,9])
image_labels = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship' ,'truck']
confusion = pd.DataFrame(confusion)
confusion.insert(0, "", image_labels, True)
confusion.columns = ["", 'airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship' ,'truck']
print(confusion.to_string(index=False))

             airplane  automobile  bird  cat  deer  dog  frog  horse  ship  truck
   airplane        49           3     5    5     2    1     1      6    25      8
 automobile         7          36     3    6     4    5     7      3    11      9
       bird        11           6    29    8     9    9    14      5     7      2
        cat         6           2    11   20     8   19     8      6     7      7
       deer         3           4    18    7    32    4    11      9     2      2
        dog         2           5    12   18     5   29     7      4     7      6
       frog         1           8     8   16     6   10    41      3     2      2
      horse         4           4     5    7    10    6     4     58     6      7
       ship        15           9     2    2     3    5     1      4    52      7
      truck         8          21     2    6     2    3     4      5    12     52


### Results

#### QDA
From QDA we saw that after cross validation we get accuracy of about 47.3%

#### Average K-NN results
- The average correct classifications after 10-Fold CV for k = 5 KNN is 0.3043
- The average correct classifications after 10-Fold CV for k = 15 KNN is 0.30260000000000004
- The average correct classifications after 10-Fold CV for k = 20 KNN is 0.3045
- The average correct classifications after 10-Fold CV for k = 30 KNN is 0.29900000000000004
- The average correct classifications after 10-Fold CV for k = 50 KNN is 0.2942

The best K-NN which was 15-NN got us accuracy of only 30.3%

#### Logistic Regression Results
From Logistic Regression on average we got an accuracy of 37.79%

 **Therefore from our results we see that the best model for this dataset and problem was Quadratic Discriminant Analysis**

## Text Dataset Classification 
[Tweets for Sentiment Analysis](https://www.kaggle.com/imrandude/twitter-sentiment-analysis?fbclid=IwAR3g6NIHi9alcwDH3BI_qedUknq4xqAf-O7yfy1gMzUAQwQOoanOTK1p5zg)

### Preprocessing Step
Our twitter data preprocessing step consistions of the following:

- Tokenizing words the tweets
- Removing any links, @ mentions, and #tags that don't have semantic meaning outside of twitter
- Lemmatization: remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma
- Lower case all words
- Remove stopwords: a commonly used word (such as “the”, “a”, “an”, “in”) that have little semantic meaning

In [28]:
"""
Reading in Data
"""
df_raw_twitter_data = pd.read_csv('train.csv', header=None, encoding = "ISO-8859-1").values[1:]
df_raw_twitter_data = df_raw_twitter_data[:,1:] # getting rid of index 

labels = df_raw_twitter_data[:,0].reshape(-1,1)

In [29]:
"""
Preprocessing
"""

import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
#nltk.download('stopwords')
#nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer


print(np.shape(df_raw_twitter_data))
corpus = []
print("")
time = 0
for index,[sentiment, tweet] in enumerate(df_raw_twitter_data):
    time += 1
    # Tokenize Words 
    tokens = tweet.split(" ")
    # Remove Links, @ mentions, # tags
    links = ['http', '.com', '#', '@', '&', '~']
    tokens = [w for w in tokens if not any(x in w for x in links)]
    # Tokenize again 
    regex = re.compile('[^a-zA-Z]')
    tokens = [regex.sub('', w) for w in tokens]
    # Lemmatization 
    lemmatizer = WordNetLemmatizer() 
    tokens = [lemmatizer.lemmatize(w) for w in tokens ]
    # Clean Up
    tokens = [w.lower() for w in tokens if len(w) > 0]
    # Remove Stopwords
    stop_words = set(stopwords.words('english'))
    stop_words.add('im')
    tokens = [w for w in tokens if not w in stop_words]
    corpus.append(" ".join(tokens))
    if time % 100 == 0:
        progress(time,99988,suffix="running twitter processing")
sys.stdout.flush()

(99989, 2)


In [30]:
np.save('corpus.npy', corpus)

In [31]:
##############
star = 5000
##############
corpus = np.load('corpus.npy')
vectorizer = CountVectorizer()
vobj = vectorizer.fit_transform(corpus[:star])
vectors = vobj.toarray()
print("\nFinished vectorization")
print(np.shape(vectors))


Finished vectorization
(5000, 7882)


### PCA

In [146]:
# First we create a StandardScaler object to 0 mean the data matrix but preserve the variance
stand_scaler = sklpp.StandardScaler(with_mean = True, with_std = False)
# Fits the data matrix to the StandardScaler object defined ^
centered_textData = stand_scaler.fit_transform(vectors)

In [147]:
# Creates a PCA object that reduces the dimensions of our data matrix keeping 90% of the variance
pca_obj = skldecomp.PCA(n_components = 0.9, svd_solver = 'auto')
dim_reducedTextData = pca_obj.fit_transform(centered_textData)
np.save('twitterDataReduced.npy', dim_reducedTextData)

In [148]:
print('Data has been reduced to {} features after PCA'.format(dim_reducedTextData.shape[1]))

Data has been reduced to 1719 features after PCA


### Support Vector Machines (by Joseph Shenouda)

Support vector machines aim to find a hyperplane in our $p$ dimensional space that maximizes the margin between the hyperplane and the data points closest to it on both sides of the linear boundary mathmatically it solves this optimization problem:
$$\underset{(w,b)}{\arg\min}\frac{1}{n}\sum_{i=1}^{n}max[0, 1-y_{i}(w^{T}x_{i}+b)]+\frac{\lambda}{2}\|w\|^{2}$$ here $y\in\{-1,1\}$ and where $\lambda$ is a regularization parameter that effects the energy of $w$ vector decreasing it as $\lambda$ increases and increasing the energy of $w$ as $\lambda$ decreases. In SciKit Learn they use a similar formulation for the support vector machines which is:
$$\begin{align}\begin{aligned}\min_ {w, b, \zeta} \frac{1}{2} w^T w + C \sum_{i=1}^{n} \zeta_i\\\begin{split}\textrm {subject to } & y_i (w^T \phi (x_i) + b) \geq 1 - \zeta_i,\\
& \zeta_i \geq 0, i=1, ..., n\end{split}\end{aligned}\end{align}$$ here the regularization parameter is now C. If C is lower then the summation can be higher meaning it allows for more errors but if C is high then it ensures that the summation must decrease and thus reduce the amount of missclassifications by the hyper-plane.

SciKit learn uses the LibLinear library for large linear classification

In [149]:
"""
Importing SVM from SciKit Learn
"""
from sklearn import svm as sklSVM

In [150]:
data = np.load('twitterDataReduced.npy')
labels = labels[:5000]
number_of_parameters = len(data[0]) - 1 #5000
number_of_total_samples = len(data)
number_of_labels = 2

In [151]:
"""
Validation step
"""
kFolds = 10
reg_params = [0.001, 0.1, 1, 5 ,10]

regParamResults = {}

for c in reg_params:
    validation_results = []
    for k_iterCV in range(2,12):
        number_of_total_samples = data.shape[0]

        fold_size = int(number_of_total_samples/kFolds)

        #Splits up the data into training and testing data for each iteration of K-fold CV
        if k_iterCV != 11:
            training_data = np.concatenate((data[0:(k_iterCV-2)*fold_size],data[k_iterCV*fold_size:number_of_total_samples]),axis=0)
            training_labels = np.concatenate((labels[0:(k_iterCV-2)*fold_size],labels[k_iterCV*fold_size:number_of_total_samples]),axis=0)
        else:
            training_data = data[fold_size:number_of_total_samples-fold_size]
            training_labels = labels[fold_size:number_of_total_samples-fold_size]


        testing_data = data[(k_iterCV-2)*fold_size:(k_iterCV-1)*fold_size]
        test_labels = labels[(k_iterCV-2)*fold_size:(k_iterCV-1)*fold_size]

        if k_iterCV != 11:
            validation_data = data[(k_iterCV-1)*fold_size:k_iterCV*fold_size]
            validation_labels = labels[(k_iterCV-1)*fold_size:k_iterCV*fold_size]
        else:
            validation_data = data[0:fold_size]
            validation_labels = labels[0:fold_size]
            
        # fit_intercept set to false because the data was already centered from PCA
        # dual set to false to get rid of ConvergenceWarnings
        svm_obj = sklSVM.LinearSVC(C = c, dual = False)

        #ravel converts narray to (n,) shape which SVM uses
        svm_obj.fit(training_data,training_labels.ravel())
        svmResults = svm_obj.predict(validation_data)
        # From the SciKit learn library this method takes in the ground truth labels of the test data and the predicted labels then returns the fraction of correctly classified samples
        validation_results.append(metrics.accuracy_score(validation_labels,svmResults))
    regParamResults[c] = validation_results

# Finding best regularization parameter from tuning on validation sets
validationAvgs = {}
for regs, val_sets in regParamResults.items():
    print('The regularization parameter (C) being used is {}'.format(regs))
    avgCorrect = sum(val_sets)/10
    print('The average # of correct classifications is {}\n'.format(avgCorrect))
    validationAvgs[regs] = avgCorrect

The regularization parameter (C) being used is 0.001
The average # of correct classifications is 0.7112

The regularization parameter (C) being used is 0.1
The average # of correct classifications is 0.7592000000000001

The regularization parameter (C) being used is 1
The average # of correct classifications is 0.7308

The regularization parameter (C) being used is 5
The average # of correct classifications is 0.7152000000000001

The regularization parameter (C) being used is 10
The average # of correct classifications is 0.7078



#### From these results we can see that C = 0.1 gives us the best results based on our validation sets.

In [152]:
"""
K-Fold cross validation
"""
SVM_k_FoldResults = []
kFolds = 10
for k_iterCV in range(2,12):
    number_of_total_samples = data.shape[0]

    fold_size = int(number_of_total_samples/kFolds)

    #Splits up the data ito training and testing data for each iteration of K-fold CV
    if k_iterCV != 11:
        training_data = np.concatenate((data[0:(k_iterCV-2)*fold_size],data[k_iterCV*fold_size:number_of_total_samples]),axis=0)
        training_labels = np.concatenate((labels[0:(k_iterCV-2)*fold_size],labels[k_iterCV*fold_size:number_of_total_samples]),axis=0)
    else:
        training_data = data[fold_size:number_of_total_samples-fold_size]
        training_labels = labels[fold_size:number_of_total_samples-fold_size]


    testing_data = data[(k_iterCV-2)*fold_size:(k_iterCV-1)*fold_size]
    test_labels = labels[(k_iterCV-2)*fold_size:(k_iterCV-1)*fold_size]

    if k_iterCV != 11:
        validation_data = data[(k_iterCV-1)*fold_size:k_iterCV*fold_size]
        validation_labels = labels[(k_iterCV-1)*fold_size:k_iterCV*fold_size]
    else:
        validation_data = data[0:fold_size]
        validation_labels = labels[0:fold_size]
    
    # fit_intercept set to false because the data was already centered from PCA
    # dual set to false to get rid of ConvergenceWarnings
    svm_obj = sklSVM.LinearSVC(C = 0.1, dual = False)

    #ravel converts narray to (n,) shape which SVM uses
    svm_obj.fit(training_data,training_labels.ravel())
    svmResults = svm_obj.predict(testing_data)
    # From the SciKit learn library this method takes in the ground truth labels of the test data and the predicted labels then returns the fraction of correctly classified samples
    SVM_k_FoldResults.append(metrics.accuracy_score(validation_labels,svmResults))

In [153]:
avgResults = sum(SVM_k_FoldResults)/10

In [154]:
avgResults

0.6388

#### After running the K-Fold cross validation with SVM we find that the best regularization parameter is C = 0.1 and our model has accuracy of about 63.88% on average

### Quadratic Discriminant Analysis (by Justin May)

### QDA Implementation
With this QDA implementation I chose to go with sklearn. Sklearn's behind the scenes optimizations are simply 100x better than mine. Literally, it took around 43 minutes on average to train on each k with n=5000 and p=1719. I probably spent around 14+ hours playing around with optimizing my code and finally just tried sklearn's qda implentation. It finished in less than 5 minutes.

In [34]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
import warnings
warnings.filterwarnings('ignore',category=UserWarning)
"""
Pull pca data from saved
"""
data = np.load('twitterDataReduced.npy')
labels = labels[:5000]
number_of_parameters = len(data[0]) - 1 #5000
number_of_total_samples = len(data)
number_of_labels = 2
"""
K-Fold cross validation 
"""     
final_estimates = []
k = 10
for i in range(1,11):
    training_data = np.concatenate((data[0:(i-1)*int(number_of_total_samples/k)],data[i*int(number_of_total_samples/k):number_of_total_samples]),axis=0)
    training_labels = np.concatenate((labels[0:(i-1)*int(number_of_total_samples/k)],labels[i*int(number_of_total_samples/k):number_of_total_samples]),axis=0)
    testing_data = data[(i-1)*int(number_of_total_samples/k):i*int(number_of_total_samples/k)]
    number_of_training_samples = len(training_data)
    clf = QDA()
    print("Training on k="+str(i))
    clf.fit(training_data, training_labels)
    print("")
    correct = 0
    for index, image in enumerate(testing_data):
        if index % 1 == 0:
            progress(index+1,int(star/10),suffix="testing k= "+str(i))
        if int(testing_data[index][-1]) == int(clf.predict([image])[0]):
            correct += 1
    sys.stdout.flush()
    print("")
    final_estimates.append(correct/int(star/10))
print(final_estimates)

Training on k=1

Training on k=2

Training on k=3

Training on k=4

Training on k=5

Training on k=6

Training on k=7

Training on k=8

Training on k=9

Training on k=10

[1.0, 0.988, 0.988, 0.994, 0.976, 0.99, 0.98, 0.976, 0.958, 0.962]


In [75]:
final_estimatesAvg = sum([1.0, 0.988, 0.988, 0.994, 0.976, 0.99, 0.98, 0.976, 0.958, 0.962])/10
final_estimatesAvg

0.9812

### Logistic Regression (by: Jonathan Hong)

In [35]:
dim_reducedTweetsData = np.load('twitterDataReduced.npy')
kf = KFold(n_splits=10, random_state=None, shuffle=False)

In [36]:
# initialize
accuracy = []
models = []
count = 1


# split into training and validation index
for train_index, test_index in kf.split(dim_reducedTweetsData):
    print("Training Iteration ", count, ": \n")
    print("\tTraining set: [", train_index[0], '...', train_index[-1], "] \tTest set: [", test_index[0], '...', test_index[-1], "]")
    
    # split into training and validation dataset
    X_train, X_test = dim_reducedTweetsData[train_index], dim_reducedTweetsData[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    
    # Use Logistic Regression Model - Binary Case
    model = LogisticRegression(solver='liblinear', max_iter = 400)
    
    # fit training data to model
    model.fit(X_train, y_train)
    
    # report the score using the test dataset
    score = model.score(X_test, y_test)
    
    # append the results for later
    accuracy.append(score)
    models.append(model)
    print("Score: ", score)
    count +=1
# results    
print('Accuracies:', accuracy)

Training Iteration  1 : 

	Training set: [ 500 ... 4999 ] 	Test set: [ 0 ... 499 ]
Score:  0.732
Training Iteration  2 : 

	Training set: [ 0 ... 4999 ] 	Test set: [ 500 ... 999 ]
Score:  0.706
Training Iteration  3 : 

	Training set: [ 0 ... 4999 ] 	Test set: [ 1000 ... 1499 ]
Score:  0.766
Training Iteration  4 : 

	Training set: [ 0 ... 4999 ] 	Test set: [ 1500 ... 1999 ]
Score:  0.8
Training Iteration  5 : 

	Training set: [ 0 ... 4999 ] 	Test set: [ 2000 ... 2499 ]
Score:  0.744
Training Iteration  6 : 

	Training set: [ 0 ... 4999 ] 	Test set: [ 2500 ... 2999 ]
Score:  0.816
Training Iteration  7 : 

	Training set: [ 0 ... 4999 ] 	Test set: [ 3000 ... 3499 ]
Score:  0.82
Training Iteration  8 : 

	Training set: [ 0 ... 4999 ] 	Test set: [ 3500 ... 3999 ]
Score:  0.766
Training Iteration  9 : 

	Training set: [ 0 ... 4999 ] 	Test set: [ 4000 ... 4499 ]
Score:  0.738
Training Iteration  10 : 

	Training set: [ 0 ... 4499 ] 	Test set: [ 4500 ... 4999 ]
Score:  0.742
Accuracies: [0.7

In [76]:
final_estimatesAvg = sum([0.732, 0.706, 0.766, 0.8, 0.744, 0.816, 0.82, 0.766, 0.738, 0.742])/10
final_estimatesAvg

0.763

### Best iteration at K = 7

In [38]:
indexes = np.arange(5000)
test_index = np.arange(3000,3500)
model = models[6]
predictions = model.predict(dim_reducedTweetsData[test_index])
confusion = confusion_matrix(labels[test_index], predictions, labels=['1','0'])
sentiment_labels = ['Positive', 'Negative']
confusion = pd.DataFrame(confusion)
confusion.insert(0, "", sentiment_labels, True)
confusion.columns = ["", 'Positive', 'Negative']
print(confusion.to_string(index=False))

           Positive  Negative
 Positive        45        51
 Negative        39       365


#### Results

##### QDA
On average after K-Fold cross validation we get accuracy of about 98.12%

#### Logistic Regression
Results after K-fold: 76.3%

#### Support Vector Machines
Results after K-fold: 63.88%

**The best model for this dataset was also QDA**

# Clustering

In [51]:
"""
Opening File, creating nparray 
Data is already 0 mean 1 variance 
"""
features = open('human_activity_features_train_data.txt','r')
human_activity_data = []
for feature in features:
    feature = np.array([float(w) for w in features.readline().split(" ") if len(w) > 0])
    human_activity_data.append(feature)
features.close()
human_activity_data = np.array(human_activity_data)
print("features: ",np.shape(human_activity_data))

labels = open('human_activity_labels_train_data.txt','r')
human_activity_labels = []
for label in labels:
    human_activity_labels.append(int(label))
labels.close()
human_activity_labels = human_activity_labels[:3676]
print("labels: ",np.shape(human_activity_labels))

features:  (3676, 561)
labels:  (3676,)


In [43]:
from sklearn import cluster as skCluster

## PCA

In [52]:
# First we create a StandardScaler object to 0 mean the data matrix but preserve the variance
stand_scaler = sklpp.StandardScaler(with_mean = True, with_std = False)
# Fits the data matrix to the StandardScaler object defined ^
centered_HARData = stand_scaler.fit_transform(human_activity_data)

In [53]:
# Creates a PCA object that reduces the dimensions of our data matrix keeping 90% of the variance
pca_obj = skldecomp.PCA(n_components = 0.95, svd_solver = 'auto')
dim_reducedHARData = pca_obj.fit_transform(centered_HARData)

In [54]:
dim_reducedHARData.shape

(3676, 67)

In [55]:
np.save('human_activity_data.npy', dim_reducedHARData)

### K- Means Clustering (by Joseph Shenouda)
I implemented K-means clustering using the SciKit Learn library. The **KMeans** function takes in the following important parameters:
- n_cluster - The number of clusters to make from the given data
- init - The initialization method used for finding the initial centroid in the K-means algorithm, I left this as SciKit's default of k-means++ instead of random for optimization
- n_init - The number of times k-means is rerun with different seeds this is import because the more runs we do the better chances we have of finding the global minimum as opposed to the local minima
- max_iter - max number of iterations done
- tol - Tolerance level used to declare convergence and stop the algo, Left this as default

The rest of the algorithm works like the method discussed in class

In [157]:
kmeans_obj = skCluster.KMeans(n_clusters = 6,init = 'random', n_init = 10)
kmeansCluster = kmeans_obj.fit(dim_reducedHARData)

predicted_clusters = kmeansCluster.labels_
predicted_results = [[0 for j in range(6)] for i in range(6)]

for index,cluster in enumerate(predicted_clusters):
    predicted_results[cluster][human_activity_labels[index]-1] += 1   
for i in range(6):
    Sum = sum(predicted_results[i])
    predicted_results[i] = [(x/Sum)*100 for x in predicted_results[i]]
    print("Cluster ",i,"best predicted label is ",predicted_results[i].index(max(predicted_results[i])),"with ",'%.3f'%(max(predicted_results[i])),"% with cluster size", Sum," account for ",'%.3f'%(Sum/3676*100), "% of the data")

Cluster  0 best predicted label is  5 with  96.124 % with cluster size 1290  account for  35.092 % of the data
Cluster  1 best predicted label is  2 with  43.047 % with cluster size 1431  account for  38.928 % of the data
Cluster  2 best predicted label is  4 with  47.768 % with cluster size 896  account for  24.374 % of the data
Cluster  3 best predicted label is  4 with  50.588 % with cluster size 1870  account for  50.871 % of the data
Cluster  4 best predicted label is  2 with  64.628 % with cluster size 376  account for  10.229 % of the data
Cluster  5 best predicted label is  1 with  54.197 % with cluster size 1489  account for  40.506 % of the data


### Gaussian Mixtures (by Jonathan Hong)

### Gaussian Mixture Models

GMM is similar to the K means clustering algorithm, but estimates using a mixture of multiple Gaussian distributions and takes into account the means and covariances. K means puts a hypersphere around the cluster centers, but this isn't the best clustering technique if your data is a different shape or if any boundaries are overlapping. So GMM can estimate ellipsoidal shapes using the mean and covariances. It uses the Expectation-Maximization Algorithm, which to iteratively estimate the Maximum Likelihood probabilities of our data belonging to each cluster. This means we have a soft clustering classification because we are given broken down probabilties of a sample for each cluster. These resulting probabilities, also known as responsibilities, tell us how likely our points were estimated to be a part of each cluster.

In [61]:
from sklearn.mixture import GaussianMixture

In [62]:
human_activity_data = np.load('human_activity_data.npy')
gm_model = GaussianMixture(n_components = 6, random_state = 7)
gm_model.fit(human_activity_data)
pred = gm_model.predict(human_activity_data)

In [65]:
probs = gm_model.predict_proba(human_activity_data)
print('Predicted Probabilities: Samples(row) Clusters(columns)')
print(probs[:20].round(3))

Predicted Probabilities: Samples(row) Clusters(columns)
[[0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0.]]


In [64]:
predicted_clusters = pred
predicted_results = [[0 for j in range(6)] for i in range(6)]
for index,cluster in enumerate(predicted_clusters):
    predicted_results[cluster][human_activity_labels[index]-1] += 1   
for i in range(6):
    Sum = sum(predicted_results[i])
    predicted_results[i] = [(x/Sum)*100 for x in predicted_results[i]]
    print("Cluster ",i,"best predicted label is ",predicted_results[i].index(max(predicted_results[i])),"with ",'%.3f'%(max(predicted_results[i])),"% with cluster size", Sum," account for ",'%.3f'%(Sum/3676*100), "% of the data")

Cluster  0 best predicted label is  5 with  27.594 % with cluster size 424  account for  11.534 % of the data
Cluster  1 best predicted label is  3 with  23.973 % with cluster size 730  account for  19.859 % of the data
Cluster  2 best predicted label is  4 with  26.612 % with cluster size 605  account for  16.458 % of the data
Cluster  3 best predicted label is  0 with  23.109 % with cluster size 727  account for  19.777 % of the data
Cluster  4 best predicted label is  5 with  24.504 % with cluster size 1008  account for  27.421 % of the data
Cluster  5 best predicted label is  2 with  28.022 % with cluster size 182  account for  4.951 % of the data


### Hierarchical/Agglomerative Clustering (by: Justin May)

This type of clustering breaks all data points down into centroids and groups them one by one until it reaches the specified number of clusters, k. The linkage policy determines grouping, which are:

- simple: closest distance between clusters
- complete: farthest distance between clusters
- average: average distance between clusters
- ward: sum of squared differences Our implementation is using euclidean distance

Hierarchical/Agglomerative is deterministic and⁠—as compared to k-means⁠—is slow. Complete, Average, and Ward linkage policies yield a $n^{3}$ runtime. Simple linkage yields $n^{2}$ runtime with clever optimizations, which is why we are using sklearn.

We are using k=6 because of our a-priori knoweldge that there are 6 groups:

1. WALKING,
2. WALKING_UPSTAIRS,
3. WALKING_DOWNSTAIRS,
4. SITTING,
5. STANDING,
6. LAYING;

In [56]:
from sklearn.cluster import AgglomerativeClustering

In [57]:
human_activity_data = np.load('human_activity_data.npy')
clustering = AgglomerativeClustering(n_clusters = 6, linkage='single').fit(human_activity_data)
predicted_clusters = clustering.labels_
predicted_results = [[0 for j in range(6)] for i in range(6)]
for index,cluster in enumerate(predicted_clusters):
    predicted_results[cluster][human_activity_labels[index]-1] += 1   
for i in range(6):
    Sum = sum(predicted_results[i])
    predicted_results[i] = [(x/Sum)*100 for x in predicted_results[i]]
    print("Cluster ",i,"best predicted label is ",predicted_results[i].index(max(predicted_results[i])),"with ",'%.3f'%(max(predicted_results[i])),"% with cluster size", Sum," account for ",'%.3f'%(Sum/3676*100), "% of the data")

Cluster  0 best predicted label is  4 with  18.801 % with cluster size 3670  account for  99.837 % of the data
Cluster  1 best predicted label is  5 with  100.000 % with cluster size 1  account for  0.027 % of the data
Cluster  2 best predicted label is  3 with  50.000 % with cluster size 2  account for  0.054 % of the data
Cluster  3 best predicted label is  5 with  100.000 % with cluster size 1  account for  0.027 % of the data
Cluster  4 best predicted label is  5 with  100.000 % with cluster size 1  account for  0.027 % of the data
Cluster  5 best predicted label is  5 with  100.000 % with cluster size 1  account for  0.027 % of the data


In [58]:

human_activity_data = np.load('human_activity_data.npy')
clustering = AgglomerativeClustering(n_clusters = 6, linkage='complete').fit(human_activity_data)
predicted_clusters = clustering.labels_
predicted_results = [[0 for j in range(6)] for i in range(6)]
for index,cluster in enumerate(predicted_clusters):
    predicted_results[cluster][human_activity_labels[index]-1] += 1   
for i in range(6):
    Sum = sum(predicted_results[i])
    predicted_results[i] = [(x/Sum)*100 for x in predicted_results[i]]
    print("Cluster ",i,"best predicted label is ",predicted_results[i].index(max(predicted_results[i])),"with ",'%.3f'%(max(predicted_results[i])),"% with cluster size", Sum," account for ",'%.3f'%(Sum/3676*100), "% of the data")


Cluster  0 best predicted label is  2 with  28.571 % with cluster size 63  account for  1.714 % of the data
Cluster  1 best predicted label is  0 with  27.500 % with cluster size 360  account for  9.793 % of the data
Cluster  2 best predicted label is  5 with  23.553 % with cluster size 2021  account for  54.978 % of the data
Cluster  3 best predicted label is  5 with  100.000 % with cluster size 9  account for  0.245 % of the data
Cluster  4 best predicted label is  3 with  35.294 % with cluster size 17  account for  0.462 % of the data
Cluster  5 best predicted label is  3 with  23.466 % with cluster size 1206  account for  32.807 % of the data


In [59]:
human_activity_data = np.load('human_activity_data.npy')
clustering = AgglomerativeClustering(n_clusters = 6, linkage='average').fit(human_activity_data)
predicted_clusters = clustering.labels_
predicted_results = [[0 for j in range(6)] for i in range(6)]
for index,cluster in enumerate(predicted_clusters):
    predicted_results[cluster][human_activity_labels[index]-1] += 1   
for i in range(6):
    Sum = sum(predicted_results[i])
    predicted_results[i] = [(x/Sum)*100 for x in predicted_results[i]]
    print("Cluster ",i,"best predicted label is ",predicted_results[i].index(max(predicted_results[i])),"with ",'%.3f'%(max(predicted_results[i])),"% with cluster size", Sum," account for ",'%.3f'%(Sum/3676*100), "% of the data")

Cluster  0 best predicted label is  5 with  100.000 % with cluster size 2  account for  0.054 % of the data
Cluster  1 best predicted label is  0 with  41.463 % with cluster size 41  account for  1.115 % of the data
Cluster  2 best predicted label is  3 with  22.640 % with cluster size 1568  account for  42.655 % of the data
Cluster  3 best predicted label is  2 with  37.037 % with cluster size 27  account for  0.734 % of the data
Cluster  4 best predicted label is  5 with  38.462 % with cluster size 13  account for  0.354 % of the data
Cluster  5 best predicted label is  5 with  23.506 % with cluster size 2025  account for  55.087 % of the data


In [60]:
human_activity_data = np.load('human_activity_data.npy')
clustering = AgglomerativeClustering(n_clusters = 6, linkage='ward').fit(human_activity_data)
predicted_clusters = clustering.labels_
predicted_results = [[0 for j in range(6)] for i in range(6)]
for index,cluster in enumerate(predicted_clusters):
    predicted_results[cluster][human_activity_labels[index]-1] += 1   
for i in range(6):
    Sum = sum(predicted_results[i])
    predicted_results[i] = [(x/Sum)*100 for x in predicted_results[i]]
    print("Cluster ",i,"best predicted label is ",predicted_results[i].index(max(predicted_results[i])),"with ",'%.3f'%(max(predicted_results[i])),"% with cluster size", Sum," account for ",'%.3f'%(Sum/3676*100), "% of the data")

Cluster  0 best predicted label is  5 with  26.484 % with cluster size 657  account for  17.873 % of the data
Cluster  1 best predicted label is  0 with  25.472 % with cluster size 742  account for  20.185 % of the data
Cluster  2 best predicted label is  3 with  24.784 % with cluster size 811  account for  22.062 % of the data
Cluster  3 best predicted label is  0 with  26.875 % with cluster size 640  account for  17.410 % of the data
Cluster  4 best predicted label is  5 with  25.371 % with cluster size 741  account for  20.158 % of the data
Cluster  5 best predicted label is  2 with  23.529 % with cluster size 85  account for  2.312 % of the data
