# Images

# Preprocessing Step

In [1]:
import sys


def progress(count, total, suffix=''):
    bar_len = 60
    filled_len = int(round(bar_len * count / float(total)))

    percents = round(100.0 * count / float(total), 1)
    bar = '=' * filled_len + '-' * (bar_len - filled_len)

    sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', suffix))
    sys.stdout.flush()  # As suggested by Rom Ruben

In [2]:
import pickle 
import numpy as np
import pandas as pd
import csv
import math
from sklearn import preprocessing as sklpp
from sklearn import decomposition as skldecomp

In [3]:
"""
Extracting data into a file
Data are all in the same domain, no need to normalize
"""
data = None
labels = []

file = "images/data_batch_1"
with open(file, 'rb') as fo:
    print("extracting file "+file+"...")
    dict = pickle.load(fo, encoding='bytes')
    temp_data = dict[b'data']
    try:
        data = np.concatenate((data, temp_data), axis=0)
    except:
        data = temp_data
    labels = labels + dict[b'labels']
labels = np.array(labels)
labels = labels.reshape(-1,1)
print("Finished Extracting Features")
print(np.shape(data))
file.close()

extracting file images/data_batch_1...
Finished Extracting Features
(10000, 3072)


## Preprocessing and Feature Learning For Images Dataset
For preprocessing we centered our dataset to 0 mean and then for feature learning we apply PCA to reduce our dimensions from 3072 to a smaller number of features than contains 95% of the variance of our data.

In [11]:
# First we create a StandardScaler object to 0 mean the data matrix but preserve the variance
stand_scaler = sklpp.StandardScaler(with_mean = True, with_std = False)
# Fits the data matrix to the StandardScaler object defined ^
centered_ImageData = stand_scaler.fit_transform(data)

In [15]:
# Creates a PCA object that reduces the dimensions of our data matrix keeping 95% of the variance
pca_obj = skldecomp.PCA(n_components = 0.95, svd_solver = 'auto')
dim_reducedImageData = pca_obj.fit_transform(centered_ImageData)
np.save('dim_reducedImageData.npy', dim_reducedImageData)

In [16]:
print('Data has been reduced to {} features after PCA'.format(dim_reducedImageData.shape[1]))

Data has been reduced to 209 features after PCA


# QDA: Justin

In [4]:
"""
Pull pca data from saved
"""
data = np.load('dim_reducedImageData.npy')
data = np.concatenate((data,labels),axis=1)
number_of_parameters = len(data[0]) - 1 #209
number_of_total_samples = len(data)
"""
K-Fold cross validation 
"""     
final_estimates = []
k = 10
for i in range(1,k+1):
    training_data = np.concatenate((data[0:(i-1)*int(number_of_total_samples/k)],data[i*int(number_of_total_samples/k):number_of_total_samples]),axis=0)
    testing_data = data[(i-1)*int(number_of_total_samples/k):i*int(number_of_total_samples/k)]
    number_of_training_samples = len(training_data)
    """
    Define Parameters 
    """
    mu = [[0 for j in range(number_of_parameters)] for i in range(10)] # initializing means 
    sigmas = [[[0 for k in range(number_of_parameters)] for j in range(number_of_parameters)] for i in range(10)] # initializing variances
    occurences = [0 for i in range(10)]
    pi = [0 for i in range(10)]
        
    """
    Learn mu
    """
    # Find Sums and Occurences
    for image in training_data:
        label = int(image[-1])
        occurences[label] += 1
        mu[label] = np.add(mu[label],image[:-1])
    # Calculate Averages and Prior Probabilities 
    for label,sums in enumerate(mu):
        mu[label] = np.multiply(1/(occurences[label] - 1), sums).reshape(-1,1) #unbaised estimator
        pi[label] = occurences[label] / number_of_total_samples
    """
    Learn sigma^2
    """
    time = 0
    for image in training_data:
        time += 1
        if time%10 == 0:
            progress(time,number_of_training_samples,suffix="training k= "+str(i))
        label = int(image[-1])
        image = image[:-1].reshape(-1,1)
        difference = np.subtract(image,mu[label])
        sigma = difference.dot(difference.T)
        sigmas[label] = np.add(sigmas[label],sigma)
    for label,sigma in enumerate(sigmas):
        sigmas[label] = np.multiply(1/(occurences[label]-1),sigma)
    sys.stdout.flush()
    """
    Method to find the best discriminant score
    """
    def estimateBestLabel(image):
        scores = [0 for _ in range(10)]
        # find score for each label 

        for label in range(10):
            inverted_variance = np.linalg.inv(sigmas[label])
            first_term = -0.5*image.T.dot(inverted_variance).dot(image)
            second_term = image.T.dot(inverted_variance).dot(mu[label])
            third_term = -0.5*mu[label].T.dot(inverted_variance).dot(mu[label])
            (sign, logdet) = np.linalg.slogdet(sigmas[label])
            fourth_term = -0.5*sign*logdet
            fifth_term = math.log(pi[label])
            score = first_term + second_term + third_term + fourth_term + fifth_term
            scores[label] = score[0][0]
        return(scores.index(max(scores)))
    correct = 0
    print("")
    for index, image in enumerate(testing_data):
        if index % 9 == 0:
            progress(index+1,1000,suffix="testing k= "+str(i))
        image = image[:-1].reshape(-1,1)
        if testing_data[index][-1] == estimateBestLabel(image):
            correct += 1
    sys.stdout.flush()
    print(correct/1000)
    final_estimates.append(correct/1000)
print("")
print(final_estimates)


[0.466, 0.498, 0.479, 0.473, 0.464, 0.49, 0.441, 0.469, 0.482, 0.473]


### Results 
% Correct for each k

[0.466, 0.498, 0.479, 0.473, 0.464, 0.49, 0.441, 0.469, 0.482, 0.473]

# Twitter Data

# Preprocessing Step

In [22]:

import nltk
import time
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
#nltk.download('stopwords')
#nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
"""
Reading in Data
"""
df_raw_twitter_data = pd.read_csv('train.csv', header=None, encoding = "ISO-8859-1").values[1:]
df_raw_twitter_data = df_raw_twitter_data[:,1:] # getting rid of index 

labels = df_raw_twitter_data[:,0]

In [52]:
"""
Preprocessing
"""
print(np.shape(df_raw_twitter_data))
corpus = []
print("")
time = 0
for index,[sentiment, tweet] in enumerate(df_raw_twitter_data):
    time += 1
    # Tokenize Words 
    tokens = tweet.split(" ")
    # Remove Links, @ mentions, # tags
    links = ['http', '.com', '#', '@', '&', '~']
    tokens = [w for w in tokens if not any(x in w for x in links)]
    # Tokenize again 
    regex = re.compile('[^a-zA-Z]')
    tokens = [regex.sub('', w) for w in tokens]
    # Lemmatization 
    lemmatizer = WordNetLemmatizer() 
    tokens = [lemmatizer.lemmatize(w) for w in tokens ]
    # Clean Up
    tokens = [w.lower() for w in tokens if len(w) > 0]
    # Remove Stopwords
    stop_words = set(stopwords.words('english'))
    stop_words.add('im')
    tokens = [w for w in tokens if not w in stop_words]
    corpus.append(" ".join(tokens))
    if time % 100 == 0:
        progress(time,99988,suffix="running twitter processing")
sys.stdout.flush()


(99989, 2)


In [53]:
##############
star = 5000
##############
corpus = np.load('corpus.npy')
vectorizer = CountVectorizer()
vobj = vectorizer.fit_transform(corpus[:star])
vectors = vobj.toarray()
print("\nFinished vectorization")
print(np.shape(vectors))
np.save('vectors.npy', vectors)


Finished vectorization
(5000, 7882)


### PCA

In [8]:
# First we create a StandardScaler object to 0 mean the data matrix but preserve the variance
stand_scaler = sklpp.StandardScaler(with_mean = True, with_std = False)
# Fits the data matrix to the StandardScaler object defined ^
centered_ImageData = stand_scaler.fit_transform(vectors)

In [9]:
# Creates a PCA object that reduces the dimensions of our data matrix keeping 95% of the variance
pca_obj = skldecomp.PCA(n_components = 0.90, svd_solver = 'auto')
dim_reducedImageData = pca_obj.fit_transform(centered_ImageData)
np.save('twitterDataReduced.npy', dim_reducedImageData)

In [21]:
print('Data has been reduced to {} features after PCA'.format(dim_reducedImageData.shape[1]))

Data has been reduced to 1719 features after PCA


## QDA: Justin

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
"""
Pull pca data from saved
"""
#data = np.load('twitterDataReduced.npy')
data = vectors
labels = labels[:5000]
number_of_parameters = len(data[0]) - 1 #5000
number_of_total_samples = len(data)
number_of_labels = 2
"""
K-Fold cross validation 
"""     
final_estimates = []
k = 10
for i in range(1,11):
    training_data = np.concatenate((data[0:(i-1)*int(number_of_total_samples/k)],data[i*int(number_of_total_samples/k):number_of_total_samples]),axis=0)
    training_labels = np.concatenate((labels[0:(i-1)*int(number_of_total_samples/k)],labels[i*int(number_of_total_samples/k):number_of_total_samples]),axis=0)
    testing_data = data[(i-1)*int(number_of_total_samples/k):i*int(number_of_total_samples/k)]
    number_of_training_samples = len(training_data)
    clf = QDA()
    print("Training on k="+str(i))
    clf.fit(training_data, training_labels)
    print("")
    correct = 0
    for index, image in enumerate(testing_data):
        if index % 1 == 0:
            progress(index+1,int(star/10),suffix="testing k= "+str(i))
        if int(testing_data[index][-1]) == int(clf.predict([image])[0]):
            correct += 1
    sys.stdout.flush()
    print("")
    final_estimates.append(correct/int(star/10))
print(final_estimates)


Training on k=1


In [8]:
"""
Pull pca data from saved
"""
data = np.load('twitterDataReduced.npy')
data = np.concatenate((data,labels[:star]),axis=1)
number_of_parameters = len(data[0]) - 1 #5000
number_of_total_samples = len(data)
number_of_labels = 2
"""
K-Fold cross validation 
"""     
final_estimates = []
k = 10
def QDA(i):
    training_data = np.concatenate((data[0:(i-1)*int(number_of_total_samples/k)],data[i*int(number_of_total_samples/k):number_of_total_samples]),axis=0)
    testing_data = data[(i-1)*int(number_of_total_samples/k):i*int(number_of_total_samples/k)]
    number_of_training_samples = len(training_data)
    """
    Define Parameters 
    """
    mu = [[0 for j in range(number_of_parameters)] for i in range(number_of_labels)] # initializing means 
    sigmas = [[[0 for k in range(number_of_parameters)] for j in range(number_of_parameters)] for i in range(number_of_labels)] # initializing variances
    occurences = [0 for i in range(10)]
    pi = [0 for i in range(10)]
        
    """
    Learn mu
    """
    # Find Sums and Occurences
    for image in training_data:
        label = int(image[-1])
        occurences[label] += 1
        mu[label] = np.add(mu[label],image[:-1])
    # Calculate Averages and Prior Probabilities 
    for label,sums in enumerate(mu):
        mu[label] = np.multiply(1/(occurences[label] - 1), sums).reshape(-1,1) #unbaised estimator
        pi[label] = occurences[label] / number_of_total_samples
    """
    Learn sigma^2
    """
    time = 0
    for image in training_data:
        time += 1
        if time%10 == 0:
            progress(time,number_of_training_samples,suffix="training k= "+str(i))
        label = int(image[-1])
        image = image[:-1].reshape(-1,1)
        difference = np.subtract(image,mu[label])
        sigma = difference.dot(difference.T)
        sigmas[label] = np.add(sigmas[label],sigma)
    for label,sigma in enumerate(sigmas):
        sigmas[label] = np.multiply(1/(occurences[label]-1),sigma).astype(np.float32)
    sys.stdout.flush()
    """
    Method to find the best discriminant score
    """
    def estimateBestLabel(image):
        scores = [0 for _ in range(number_of_labels)]
        # find score for each label 

        for label in range(number_of_labels):
            inverted_variance = np.linalg.inv(sigmas[label])
            first_term = -0.5*image.T.dot(inverted_variance).dot(image)
            second_term = image.T.dot(inverted_variance).dot(mu[label])
            third_term = -0.5*mu[label].T.dot(inverted_variance).dot(mu[label])
            (sign, logdet) = np.linalg.slogdet(sigmas[label])
            fourth_term = -0.5*sign*logdet
            fifth_term = math.log(pi[label])
            score = first_term + second_term + third_term + fourth_term + fifth_term
            scores[label] = score[0][0]
        return(scores.index(max(scores)))
    correct = 0
    print("")
    for index, image in enumerate(testing_data):
        if index % 1 == 0:
            progress(index+1,int(star/10),suffix="testing k= "+str(i))
        image = image[:-1].reshape(-1,1)
        if int(testing_data[index][-1]) == int(estimateBestLabel(image)):
            correct += 1
    sys.stdout.flush()
    
    print(correct/(int(star/10)))
    final_estimates.append(correct/int(star/10))
print("")
print(final_estimates)


[]


In [None]:
"""*
K = 1 & 2
"""
start = time.time()
QDA(1)
end = time.time()
print((end-start)/60)

start = time.time()
QDA(2)
end = time.time()
print((end-start)/60)

In [None]:
"""*
K = 3 & 4
"""
start = time.time()
QDA(3)
end = time.time()
print((end-start)/60)

start = time.time()
QDA(4)
end = time.time()
print((end-start)/60)

In [9]:
"""*
K = 5 & 6
"""
start = time.time()
QDA(5)
end = time.time()
print((end-start)/60)

start = time.time()
QDA(6)
end = time.time()
print((end-start)/60)

41.58777117729187
41.84258098602295


In [20]:
"""*
K = 7 & 8
"""
start = time.time()
QDA(7)
end = time.time()
print((end-start)/60)
start = time.time()

start = time.time()
QDA(8)
end = time.time()
print((end-start)/60)
start = time.time()

41.69866884152095
39.27590667009353


In [13]:
"""*
K = 9 & 10
"""
start = time.time()
QDA(9)
end = time.time()
print((end-start)/60)

start = time.time()
QDA(10)
end = time.time()
print((end-start)/60)

[------------------------------------------------------------] 0.6% ...testing k= 9

KeyboardInterrupt: 

### Results
[============================================================] 100.0% ...training k= 1
0.656========================================================] 100.0% ...testing k= 1
[============================================================] 100.0% ...training k= 2
0.474========================================================] 100.0% ...testing k= 2
-2413.8574686050415
[============================================================] 100.0% ...training k= 3
0.654========================================================] 100.0% ...testing k= 3
43.156752200921375
[============================================================] 100.0% ...training k= 4
0.354========================================================] 100.0% ...testing k= 4
43.58165429830551
[============================================================] 100.0% ...training k= 5
0.55=========================================================] 100.0% ...testing k= 5
41.879090480009715
[============================================================] 100.0% ...training k= 6
[==========================================================--] 96.4% ...testing k= 6
[============================================================] 100.0% ...training k= 7
0.428========================================================] 100.0% ...testing k= 7
42.374602814515434
[============================================================] 100.0% ...training k= 8
0.654========================================================] 100.0% ...testing k= 8
42.34428945382436
[============================================================] 100.0% ...training k= 9
0.37=========================================================] 100.0% ...testing k= 9
42.841961006323494
[==================================================----------] 82.7% ...training k= 10

# Human Activity Clustering

### Preprocessing

In [None]:
"""
Opening File, creating nparray 
Data is already 0 mean 1 variance 
"""
features = open('human_activity_features_train_data.txt','r')
human_activity_data = None
for feature in features:
    feature = [float(w) for w in features.readline().split(" ") if len(w) > 0]
    try:
        human_activity_data = np.concatenate((data, feature), axis=0)
    except:
        human_activity_data = feature
features.close()
print(np.shape(human_activity_data))

### PCA

In [None]:
# First we create a StandardScaler object to 0 mean the data matrix but preserve the variance
stand_scaler = sklpp.StandardScaler(with_mean = True, with_std = False)
# Fits the data matrix to the StandardScaler object defined ^
centered_ImageData = stand_scaler.fit_transform(human_activity_data)

In [None]:
# Creates a PCA object that reduces the dimensions of our data matrix keeping 95% of the variance
pca_obj = skldecomp.PCA(n_components = 0.90, svd_solver = 'auto')
dim_reducedImageData = pca_obj.fit_transform(centered_ImageData)
np.save('dim_reducedImageData.npy', dim_reducedImageData)

### Hierarchical/Agglomerative Clustering (Justin)

This type of clustering breaks all data points down into centroids and groups them one by one until it reaches the specified number of clusters, k. The linkage policy determines grouping, which are:
 - simple: closest distance between clusters 
 - complete: farthest distance between clusters 
 - average: average distance between clusters 
 - ward: sum of squared differences
Our implementation is using euclidean distance 

Hierarchical/Agglomerative is deterministic and⁠—as compared to k-means⁠—is slow. Complete, Average, and Ward linkage policies yield a $n^{3}$ runtime. Simple linkage yields $n^{2}$ runtime with clever optimizations, which is why we are using sklearn. 

We are using k=6 because of our a-priori knoweldge that there are 6 groups:
 1. WALKING,
 2. WALKING_UPSTAIRS,
 3. WALKING_DOWNSTAIRS,
 4. SITTING,
 5. STANDING,
 6. LAYING;


In [None]:
from sklearn.cluster import AgglomerativeClustering
clustering = AgglomerativeClustering(n_clusters = 6, linkage='single').fit(human_activity_data)