# Project 2 -- BMLCO: Solving Real ML Problems

## CS260 Winter 2018
## Machine Learning Algorithms
## Dr. Yutao He

## Due: March 25th, 2018

### Student Name: Jennifer MacDonald
### SID: 604501712

### 1. Study the Data Set

In [340]:
# Standard scientific Python imports
import numpy as np

# Import performance metrics
from sklearn import metrics

# Import classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

# Import feature selctor and cross validator
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import cross_val_score

In [341]:
# The "Smartphone-Based Recognition of Human Activities and Postural Transitions 
# Data Set"  dataset
X_train = np.genfromtxt('HAPT Data Set/Train/X_train.txt')
y_train = np.genfromtxt('HAPT Data Set/Train/y_train.txt')

X_test = np.genfromtxt('HAPT Data Set/Test/X_test.txt')
y_test = np.genfromtxt('HAPT Data Set/Test/y_test.txt')

In [342]:
# The data that we are interested in are 10929 instances from 30 volunteers 
#recorded on a smart device. Each instance consists of measurements of 42 different
# attributes. The combination of these attributes can be used to build a model that 
# can predict six basic activities that include three static postures (standing, 
# sitting, lying) and three dynamic activities (walking, walking downstairs, and 
# walking upstairs). Data about transitions between the postures or activities were
# also recorded.

In [343]:
# Here is a look at the first 4 instances from the training set, as well as the
# first 4 results.
print("Training Data Sample: \n", X_train[0:4])
print("Test Data Sample: \n", y_train[0:4])

Training Data Sample: 
 [[ 0.04357967 -0.00597022 -0.03505434 ..., -0.84155851  0.17991281
  -0.05171842]
 [ 0.03948004 -0.00213128 -0.02906736 ..., -0.8450924   0.18026111
  -0.04743634]
 [ 0.03997778 -0.00515272 -0.02265071 ..., -0.84923013  0.18060956
  -0.04227136]
 [ 0.03978456 -0.01180878 -0.02891578 ..., -0.84894659  0.18190709
  -0.04082622]]
Test Data Sample: 
 [ 5.  5.  5.  5.]


### 2. Select the feature set

In [344]:
# Here is the length of each inner array corresponding to the attributes
# measured.
print("Number of Attributes: \n", len(X_train[0]))

Number of Attributes: 
 561


##### Feature Selection

In [345]:
# 561 attributes seems like a lot. We can narrow it down to the 17 most
# important features using Variance threshold feature selection to remove the
# features with low variance. In this case, we're removing all features whose 
# variance isn't at least 65%.

# sel = VarianceThreshold(threshold=(.65 * (1 - .65)))
# X_train = sel.fit_transform(X_train)
# X_test = sel.fit_transform(X_test)

In [346]:
# Here is the new length of each inner array corresponding to the attributes
# measured.

# print("Number of Attributes in Training Data After Feature Selection: \n", len(X_train[0]))
# print("Number of Attributes in Testing Data After Feature Selection: \n", len(X_test[0]))

In [347]:
# Here is the length of the outer array corresponding the number of instances
# in the training and testing sets.
print("Number of Instances in the Training Data Set: \n", len(X_train))
print("Number of Instances in the Testing Data Set: \n", len(X_test))

Number of Instances in the Training Data Set: 
 7767
Number of Instances in the Testing Data Set: 
 3162


### 3. Develop the model

In [348]:
# Create a classifier: a KNN classifier 
knn = KNeighborsClassifier(n_neighbors=6)

In [349]:
# Create a classifier: a Naive Bayes classifier 
nb = GaussianNB()

In [350]:
# Create a classifier: a Decision Trees classifier 
dt = DecisionTreeClassifier()

In [351]:
# Create a classifiers: a linear SVM classifier generalized to the multi-class
# case through a One-Vs-All (OVA) approach. This model was also selected in
# addition to the project's required classifiers because there is a published
# study that used this classifier on the same data set, and I'm interested to
# see if my results mirror the one in their paper.
lsvc = LinearSVC(multi_class="ovr")

### 4. Train the model

#### KNN Classifier

In [352]:
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=6, p=2,
           weights='uniform')

#### Naive Bayes Classifier 

In [353]:
nb.fit(X_train, y_train)

GaussianNB(priors=None)

#### Decision Trees Classifier

In [354]:
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

#### Linear SVM Classifier

In [355]:
lsvc.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

### 5. Validate and evaluate the models

#### KNN Classifier

In [357]:
expected_knn = y_test
predicted_knn = knn.predict(X_test)

##### Accuracy

In [358]:
knn_accuracy = np.sum(predicted_knn==expected_knn)/len(expected_knn)
print(knn_accuracy)

0.885515496521


##### Cross Validation Accuracy, Check for Over/Underfitting

In [359]:
knn_scores = cross_val_score(knn, X_train, y_train)
print("%0.2f (+/- %0.2f)" % (knn_scores.mean(), knn_scores.std() * 2))

0.87 (+/- 0.02)


##### Confusion Matrix

In [360]:
knn_confusion_matrix = metrics.confusion_matrix(predicted_knn, expected_knn)
print(knn_confusion_matrix)

[[487  46  53   0   0   0   1   0   0   0   2   1]
 [  1 419  51   4   2   1   3   0   0   0   4   0]
 [  8   6 316   0   0   0   0   0   0   0   1   0]
 [  0   0   0 431  59   2   1   0   0   0   0   0]
 [  0   0   0  73 495   2   0   0   0   0   1   0]
 [  0   0   0   0   0 540   0   0   0   0   1   0]
 [  0   0   0   0   0   0  18   0   0   0   2   0]
 [  0   0   0   0   0   0   0  10   0   0   0   0]
 [  0   0   0   0   0   0   0   0  29   0  15   1]
 [  0   0   0   0   0   0   0   0   0  22   0  13]
 [  0   0   0   0   0   0   0   0   3   1  23   2]
 [  0   0   0   0   0   0   0   0   0   2   0  10]]


#### Naive Bayes Classifier

In [361]:
expected_nb = y_test
predicted_nb = nb.predict(X_test)

##### Accuracy

In [362]:
nb_accuracy = np.sum(predicted_nb==expected_nb)/len(expected_nb)
print(nb_accuracy)

0.747311827957


##### Cross Validation Accuracy, Check for Over/Underfitting

In [363]:
nb_scores = cross_val_score(nb, X_train, y_train)
print("%0.2f (+/- %0.2f)" % (nb_scores.mean(), nb_scores.std() * 2))

0.71 (+/- 0.07)


##### Confusion Matrix

In [364]:
nb_confusion_matrix = metrics.confusion_matrix(predicted_nb, expected_nb)
print(nb_confusion_matrix)

[[416   8  80   0   0   0   0   0   0   0   0   0]
 [ 38 442  83   0   0   0   1   0   0   0   0   0]
 [ 42  11 257   0   0   0   1   0   0   0   0   0]
 [  0   0   0 457 311  62   0   0   0   0   0   0]
 [  0   0   0  35 220   0   0   0   0   0   1   0]
 [  0   0   0   1   1 467   0   0   0   0   0   0]
 [  0   9   0   8  22   0  15   0   0   0   1   1]
 [  0   0   0   5   0   0   2   9   1   0   0   0]
 [  0   0   0   1   0   0   3   0  24   0  18   0]
 [  0   0   0   1   0  11   0   1   0  21   2  15]
 [  0   1   0   0   2   1   1   0   7   1  27   3]
 [  0   0   0   0   0   4   0   0   0   3   0   8]]


#### Decision Trees Classifier

In [365]:
expected_dt = y_test
predicted_dt = dt.predict(X_test)

##### Accuracy

In [366]:
dt_accuracy = np.sum(predicted_dt==expected_dt)/len(expected_dt)
print(dt_accuracy)

0.810562934851


##### Cross Validation Accuracy, Check for Over/Underfitting

In [367]:
dt_scores = cross_val_score(dt, X_train, y_train)
print("%0.2f (+/- %0.2f)" % (dt_scores.mean(), dt_scores.std() * 2))

0.82 (+/- 0.01)


##### Confusion Matrix

In [368]:
dt_confusion_matrix = metrics.confusion_matrix(predicted_dt, expected_dt)
print(dt_confusion_matrix)

[[376  53  34   0   0   0   0   0   0   0   1   1]
 [109 357  77   0   0   1   1   0   1   0   1   0]
 [ 11  58 309   0   0   1   0   0   0   1   0   0]
 [  0   0   0 407  86   0   0   1   1   1   1   1]
 [  0   0   0  97 468   0   1   0   1   0   1   0]
 [  0   0   0   0   0 539   0   0   0   0   0   0]
 [  0   0   0   3   1   1  17   1   0   0   4   1]
 [  0   0   0   0   0   0   2   8   0   0   0   0]
 [  0   0   0   0   1   0   0   0  22   0   8   0]
 [  0   0   0   1   0   0   0   0   0  13   0  10]
 [  0   3   0   0   0   2   2   0   7   1  33   0]
 [  0   0   0   0   0   1   0   0   0   9   0  14]]


#### Linear SVM Classifier

In [369]:
expected_lsvc = y_test
predicted_lsvc = lsvc.predict(X_test)

##### Accuracy

In [370]:
lsvc_accuracy = np.sum(predicted_lsvc==expected_lsvc)/len(expected_lsvc)
print(lsvc_accuracy)

0.944339025933


##### Cross Validation Accuracy, Check for Over/Underfitting

In [371]:
# Adding a 10-fold cross-validation score, similar to the experiment 
# parameters in the paper
lsvc_scores = cross_val_score(lsvc, X_train, y_train, cv=10)
print("%0.2f (+/- %0.2f)" % (lsvc_scores.mean(), lsvc_scores.std() * 2))

0.94 (+/- 0.10)


##### Confusion Matrix 

In [372]:
svm_confusion_matrix = metrics.confusion_matrix(predicted_svm, expected_svm)
print(svm_confusion_matrix)

[[492  19  13   0   0   0   0   0   0   0   2   0]
 [  0 450  29   3   0   1   6   1   0   0   5   1]
 [  4   2 378   0   0   0   0   0   0   0   0   0]
 [  0   0   0 435  40   0   2   1   0   0   1   0]
 [  0   0   0  70 516   0   0   0   0   0   1   0]
 [  0   0   0   0   0 544   0   0   0   0   2   0]
 [  0   0   0   0   0   0  15   8   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0  20   1  11   0]
 [  0   0   0   0   0   0   0   0   0  23   0  20]
 [  0   0   0   0   0   0   0   0  12   0  27   3]
 [  0   0   0   0   0   0   0   0   0   1   0   3]]
