In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# `pandas.io.json.json_normalize` was deprecated in pandas 1.0 and is missing
# from newer releases; fall back to the top-level function so this cell still
# imports and the `json_normalize` name stays bound either way.
try:
    from pandas.io.json import json_normalize
except ImportError:
    from pandas import json_normalize
from pymongo import MongoClient
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import  train_test_split

%matplotlib inline

In [None]:
# Connection string for the course cluster. When the COURSE_CLUSTER_URI
# environment variable is set it takes precedence, so a different cluster or
# different credentials can be used without editing them into the notebook.
# The literal below is only the fallback used when the variable is absent.
course_cluster_uri = os.environ.get(
    "COURSE_CLUSTER_URI",
    "mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin",
)
course_client = MongoClient(course_cluster_uri)

In [None]:
# Data source:
# Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.
# Handle to the `wine` collection inside the `coursera-agg` database.
wine = course_client.get_database('coursera-agg').get_collection('wine')

In [None]:
# Single-stage aggregation: a $project that excludes the `_id` field from
# every returned document.
pipeline = [{"$project": {"_id": 0}}]

In [None]:
# Run the aggregation and pull every resulting document into memory.
cursor = wine.aggregate(pipeline)
docs = list(cursor)
# Flatten the documents into a DataFrame. `pd.json_normalize` (pandas >= 1.0)
# is the supported entry point; the older `pandas.io.json.json_normalize`
# alias is deprecated.
df = pd.json_normalize(docs)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Feature matrix: every column except 'Alcohol', as a float64 ndarray.
X = df.drop(columns=['Alcohol']).values.astype(np.float64)

In [None]:
# Standardise each feature column (axis 0): subtract the mean and divide by
# the standard deviation. The keyword values spelled out here are the
# function's defaults.
X = preprocessing.scale(X, axis=0, with_mean=True, with_std=True, copy=True)

In [None]:
# np.cov treats each row as one variable, so hand it the transposed matrix
# (one row per feature) to get the feature-by-feature covariance.
cov_matrix = np.cov(np.transpose(X), rowvar=True)

In [None]:
# `cov_matrix` comes from np.cov and is therefore symmetric, so use the
# symmetric/Hermitian solver. `eigh` always returns real eigenvalues (in
# ascending order) with the matching eigenvectors as columns, whereas the
# general-purpose `eig` can return complex values when round-off leaves the
# matrix slightly asymmetric, which would break the sort that follows.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

In [None]:
# Print the eigenvalues one per line, in the order the solver returned them.
for eigenvalue in eigenvalues:
    print(eigenvalue)

In [None]:
# Pair each eigenvalue with its eigenvector. Eigenvectors are stored as
# columns, so iterate over the transpose to get one vector per pair.
eigen_map = [(value, vector) for value, vector in zip(eigenvalues, eigenvectors.T)]

In [None]:
# Order the (eigenvalue, eigenvector) pairs from largest to smallest
# eigenvalue; only the eigenvalue takes part in the comparison.
eigen_map = sorted(eigen_map, key=lambda pair: pair[0], reverse=True)

In [None]:
# Split the sorted pairs back into two parallel lists.
sorted_eigenvalues = [value for value, _ in eigen_map]
sorted_eigenvectors = [vector for _, vector in eigen_map]

In [None]:
# Display the eigenvalues in descending order (rendered as the cell's output).
sorted_eigenvalues

In [None]:
# Tabulate the sorted eigenvectors: one row per component, one column per
# original feature. Leaving the frame as the cell's last expression uses the
# notebook's rich table rendering instead of the wrapped plain-text output of
# print(). The column labels are taken straight from the column index, so no
# copy of the data is made just to read them.
pd.DataFrame(sorted_eigenvectors, columns=df.columns.drop('Alcohol'))

In [None]:
# Percentage of total variance attributed to each component (largest first),
# followed by the running total of those percentages.
eigenvalue_sum = sum(eigenvalues)
var_exp = [
    100 * (eigenvalue / eigenvalue_sum)
    for eigenvalue in sorted_eigenvalues
]
cum_var_exp = np.asarray(var_exp).cumsum()

In [None]:
# Number of feature columns, i.e. every column except 'Alcohol'.
dims = df.drop(columns=['Alcohol']).shape[1]

In [None]:
# Cumulative percentage of variance explained versus number of components.
# plt.subplots() already starts a fresh figure; calling plt.clf() first would
# create an additional empty figure that gets rendered as blank output.
fig, ax = plt.subplots()

# Component counts are 1-based: cum_var_exp[0] is the variance explained by a
# single component, so it belongs at x=1 rather than x=0.
ax.plot(range(1, dims + 1), cum_var_exp, '-o')

ax.set_xlabel('Number of Components')
ax.set_ylabel('Percent of Variance Explained')
ax.set_title('Cumulative Explained Variance')

plt.show()

In [None]:
# The two eigenvectors with the largest eigenvalues.
ev1, ev2 = sorted_eigenvectors[0], sorted_eigenvectors[1]

In [None]:
# Projection matrix of shape (dims, 2): each leading eigenvector is turned
# into a column and the columns are placed side by side.
eigen_matrix = np.concatenate(
    [vector.reshape(dims, 1) for vector in (ev1, ev2)],
    axis=1,
)

In [None]:
# Display the (dims, 2) projection matrix built from the two leading eigenvectors.
eigen_matrix

In [None]:
# Project the standardised samples onto the two leading eigenvectors.
Y = X @ eigen_matrix

In [None]:
# Scatter of the samples in the plane of the two hand-computed components.
# plt.subplots() already starts a fresh figure; calling plt.clf() first would
# create an additional empty figure that gets rendered as blank output.
fig, ax = plt.subplots()
ax.scatter(Y[:, 0], Y[:, 1], alpha=0.2)
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('Projection onto the first two principal components (manual)')
plt.show()

In [None]:
# Repeat the two-component projection with scikit-learn's PCA as a cross-check
# of the manual eigen-decomposition. PCA is imported in the notebook's import
# cell together with the other dependencies.
pca = PCA(n_components=2)
Y_sklearn = pca.fit_transform(X)

In [None]:
# Scatter of the samples in the plane of the two scikit-learn PCA components.
# plt.subplots() already starts a fresh figure; calling plt.clf() first would
# create an additional empty figure that gets rendered as blank output.
fig, ax = plt.subplots()
ax.scatter(Y_sklearn[:, 0], Y_sklearn[:, 1], alpha=0.2)
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('Projection onto the first two principal components (scikit-learn)')
plt.show()

In [None]:
# Target vector: the 'Alcohol' column as a NumPy array.
y = df.loc[:, 'Alcohol'].values

In [None]:
# Split the data into training and test sets for a logistic regression on the
# original (un-projected) feature columns. The seed is pinned so the split,
# and therefore the reported accuracy, is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Alcohol']),
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Baseline model fitted on the original feature columns; random_state is pinned to 0.
classifier = LogisticRegression(random_state=0)

In [None]:
# Train the baseline model on the training split.
classifier.fit(X=X_train, y=y_train)

In [None]:
# NOTE: despite its name, y_pred holds the mean accuracy returned by score()
# on the held-out split, not a vector of predictions.
y_pred = classifier.score(X=X_test, y=y_test)
y_pred

In [None]:
# Now with PCA applied: split the two-component projection instead of the raw
# features. The seed is pinned for reproducibility, and the test fraction is
# 0.25 (the same as the baseline split) so both accuracies are measured on
# equally sized hold-out sets.
X_train, X_test, y_train, y_test = train_test_split(
    Y_sklearn,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Same model configuration as the baseline, trained on the PCA-projected split.
classifier_with_pca = LogisticRegression(random_state=0)
classifier_with_pca.fit(X=X_train, y=y_train)

In [None]:
# NOTE: despite its name, y_pred holds the mean accuracy returned by score()
# for the PCA-based model, not a vector of predictions.
y_pred = classifier_with_pca.score(X=X_test, y=y_test)
y_pred