# Midterm with Scikit-Learn
* Due Date: July 18, 2018
* by: Jose Medina

### Import the Libraries:
1. numpy
2. matplotlib.pyplot
3. pandas
4. seaborn

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer

import csv
from collections import Counter

import string
import pprint




# Problem 1: 
For this problem, you are asked to use the Python algorithms developed in class for the Principal Component Analysis (PCA) implementation to investigate dimensionality reduction of the “breast cancer” dataset provided by Scikit-Learn.
1. You should implement PCA in two different ways and in each case show the Eigen values and Eigen vectors:
    1. using the eigenvector decomposition of the covariance matrix
    2. using Singular Value Decomposition (SVD) method  

2. Provide a plot that shows the level of data variance retained as a function of dimensionality reduction.
3. Using the first two Eigen vectors, show a two-dimensional plot of the dataset with the reduced number of features. The points on the plot should be color-coded based on their values.
4. How many Eigen vectors should be used to retain 95% of the data variance?


### Read the Breast Cancer Data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score

# Fetch the breast cancer dataset bundled with scikit-learn
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()

# Inspect the available fields and the dataset's own description
print('Keys = \n', cancer.keys(), '\n')
print('Description = \n', cancer['DESCR'])

# Feature matrix and target labels used by the following cells
X, y = cancer.data, cancer.target

### Some info from the data

In [None]:
# Dimensions of the feature matrix
print("Data Size: ", np.shape(X))

# Dimensions of the label vector
print("Number of Labels :", np.shape(y))


### Split the data and labels into training and test sets

In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=99,
)

# Report the size of each partition
for _label, _part in (
    ('X_train shape = ', X_train),
    ('X_test shape = ', X_test),
    ('y_train shape = ', y_train),
    ('y_test shape = ', y_test),
):
    print(_label, _part.shape)

In [None]:
# Tabular view of the features, with one named column per feature
df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
df.head(n=2)

In [None]:
# Labels of the first 25 samples
print(cancer.target[:25])

# Names of the target classes
print(cancer.target_names)

# Scikit-learn implementation. 

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the feature table, then standardize the table with it
sc = StandardScaler().fit(df)
sc_data = sc.transform(df)

In [None]:
from sklearn.decomposition import PCA

# Fit a two-component PCA on the standardized data, then project the data onto it
pca = PCA(n_components=2).fit(sc_data)
xpca = pca.transform(sc_data)

# Compare the data shape before and after the reduction
sc_data.shape, xpca.shape


In [None]:
# Scatter the samples on the two principal components, colored by target label
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(xpca[:, 0], xpca[:, 1], c=cancer.target, cmap='plasma')
ax.set_xlabel('PC-1')
ax.set_ylabel('PC-2')
plt.show()


In [None]:
# Show the eigenvectors found by the fitted PCA: one row per retained
# component, with one entry per input feature
pca.components_

In [None]:
# Label each component's weights with the feature names and draw them as a heatmap
df_components = pd.DataFrame(data=pca.components_, columns=cancer.feature_names)

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(df_components, cmap='plasma', ax=ax)
plt.show()

## Own Implementation of a Dimensional Reduction

In [None]:
# Transpose the standardized data so that features are rows and samples are columns.
# NOTE: this rebinds X, which previously held the raw (unscaled) feature matrix.
X = np.transpose(sc_data)

### Compute the eigen values and the eigen vectors:

In [None]:
# Covariance matrix of the standardized data. X holds one feature per row and
# one sample per column, so X @ X.T is the feature-by-feature scatter matrix;
# dividing by (number of samples - 1) turns it into the sample covariance.
n_samples = X.shape[1]
sigma = np.dot(X, X.T) / (n_samples - 1)

# sigma is symmetric, so use eigh rather than eig: it returns real eigenvalues
# and orthonormal eigenvectors (the columns of W). eigh yields the eigenvalues
# in ascending order, so they still need sorting into descending order.
v, W = np.linalg.eigh(sigma)
print('Eigenvalues: v=\n', v, '\n')
print('Eigenvectors W=\n', W)


### Sort the eigen values in descending order, and correspondingly the eigen vectors:

In [None]:
n = X.shape[0]  # Number of features

# Indices that order the eigenvalues from largest to smallest
v_indices = np.flip(np.argsort(v))[:n]

# Reorder the eigenvalues and the matching eigenvector columns together
v, W = v[v_indices], W[:, v_indices]
print(v, '\n', W)


#### Provide a plot that shows the level of data variance retained as a function of dimensionality reduction.

In [None]:
# Fraction of the total variance retained by the first k eigenvectors
sv = np.cumsum(v) / np.sum(v)

# sv[0] is the variance kept by ONE eigenvector, so the x axis must start at 1
component_counts = np.arange(1, len(sv) + 1)
plt.step(component_counts, sv)
plt.axhline(0.95, color='gray', linestyle='--')  # 95% level asked about in the assignment
plt.xlabel('Number of eigenvectors kept')
plt.ylabel('Fraction of variance retained')
plt.grid(True)
plt.show()

# Around 10 eigenvectors are needed to retain 95% of the data variance

In [None]:
# Show both shapes together; only the last expression of a cell is displayed,
# so two separate expression statements would hide the first one
W.shape, X.shape

In [None]:
# Keep the first two columns of W as the reduced basis wr
wr = W[:, :2]
print('wr=\n', wr)

# Project the samples (the rows of X.T) onto the reduced basis wr
Xr = X.T @ wr

### Using the first two Eigen vectors, show a two-dimensional plot of the dataset with the reduced number of features. The points on the plot should be color-coded based on their values.

In [None]:
# Scatter the projected samples on the two retained components, colored by target label
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(Xr[:, 0], Xr[:, 1], c=cancer.target, cmap='plasma')
ax.set_xlabel('PC-1')
ax.set_ylabel('PC-2')
plt.show()

### SVD

#### Compute the eigen values and the eigen vectors:

In [None]:
# Singular-value decomposition
U, s, VT = np.linalg.svd(X)
print('Eigen-vectors: U=\n', U, U.shape,'\n')
print('Eigen-values: s=', s, s.shape)
print('Eigen-vectors: VT=', VT.shape,'\n')


#### Let’s take the first two columns (Principal Components) of U and call it Ur


In [None]:
# Keep the first two columns of U as the reduced basis Ur. The sign flip does
# not change the subspace they span; presumably it is there to match the
# orientation of the earlier plots -- TODO confirm.
Ur = np.negative(U[:, :2])
print('Ur=\n', Ur)

#### Project the data X onto the reduced matrix Ur


In [None]:
# Project the samples (the rows of X.T) onto the reduced basis Ur
Xr = X.T @ Ur

In [None]:
# The variance captured along each singular vector is proportional to the
# SQUARE of its singular value, so accumulate s**2 rather than s.
squared_s = s ** 2
sv = np.cumsum(squared_s) / np.sum(squared_s)

# sv[0] is the variance kept by ONE singular vector, so the x axis must start at 1
component_counts = np.arange(1, len(sv) + 1)
plt.step(component_counts, sv)
plt.axhline(0.95, color='gray', linestyle='--')  # 95% level asked about in the assignment
plt.xlabel('Number of singular vectors kept')
plt.ylabel('Fraction of variance retained')
plt.grid(True)
plt.show()

# Around 10 eigenvectors are needed to retain 95% of the data variance

In [None]:
# Scatter the SVD-projected samples on the two retained components, colored by target label
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(Xr[:, 0], Xr[:, 1], c=cancer.target, cmap='plasma')
ax.set_xlabel('PC-1')
ax.set_ylabel('PC-2')
plt.show()