# Unsupervised Machine Learning
## Asset Index Review

The Living Standards Measurement Study (LSMS) is a World Bank household survey program used for collecting household-level data.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Reading Data

In [None]:
# Load the asset table and index its rows by the (household_id, ea_id) pair.
df = pd.read_csv("../Data/Analysis/assets.csv").set_index(["household_id", "ea_id"])

In [None]:
# Preview the first four rows of the asset table.
df.head(4)

In [None]:
# (number of rows, number of asset columns)
df.shape

## 1. Principal component analysis (PCA)

Check PCA documentation [here](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html)

- Principal component analysis (PCA) is a mathematical procedure that transforms a number of (possibly) correlated variables into a (smaller) number of uncorrelated variables called principal components.
- The first principal component accounts for as much of the variability in the data as possible, and each succeeding component accounts for as much of the remaining variability as possible 
- PCA is a dimensionality reduction or data compression method. The goal is dimension reduction and there is no guarantee that the dimensions are interpretable

### 1.1 Standardize Data 

In [None]:
# `assets` is just another name for `df` (no copy is made).
assets = df
# Z-score each column: subtract the column mean, then divide by the column
# standard deviation (pandas' default, i.e. the sample standard deviation).
assets_std = assets.sub(assets.mean()).div(assets.std())

In [None]:
# Preview the first four rows of the standardized table.
assets_std.head(4)

### 1.2 Fit the model

In [None]:
K = 20  # number of components (deliberately large to start with)

# Fit the PCA on the standardized assets; missing values are replaced by 0 first.
pca_model = PCA(n_components=K).fit(assets_std.fillna(0))

# Apply the dimensionality reduction, i.e. express every row in the K-d
# coordinate system built from the asset variables. The result is only
# displayed here, not stored.
pca_model.transform(assets_std.fillna(0))


### 1.3 Check explained variance and eigen values

#### Variance:

In [None]:
# Fraction of the total variance explained by each fitted component.
pca_model.explained_variance_ratio_

In [None]:
# Cumulative share of variance explained by the first 1..K components.
plt.plot(range(1, K + 1), np.cumsum(pca_model.explained_variance_ratio_));
plt.xlabel('Numbers of components');
plt.ylabel('Explained variance (Cumulative)');

#### Eigen Values:

In [None]:
# Variance explained by each component (the eigenvalues plotted in the next cell).
pca_model.explained_variance_

In [None]:
# Eigenvalue of each of the K components, in order.
plt.plot(range(1, K + 1), pca_model.explained_variance_);
# Dashed reference line at eigenvalue = 1
# (presumably the "keep eigenvalues above 1" rule of thumb -- confirm).
plt.axhline(y=1, linestyle="--", color='black');
plt.xlabel('Numbers of components');
plt.ylabel('Eigen Values');

### 1.4 Define number of components

In [None]:
# Refit with the chosen number of components, on the same zero-filled standardized data.
K = 6
pca_model = PCA(n_components=K).fit(assets_std.fillna(0))

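Factor loadings (factor or component coefficients): The factor loadings, also called component loadings in PCA, are the correlation coefficients between the variables (rows) and factors (columns). Note that scikit-learn's `components_` attribute holds the unit-length principal axes (eigenvectors); for standardized data, multiplying each axis by the square root of its eigenvalue (`explained_variance_`) gives the loadings in the correlation sense.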

### 1.5 Compute components coefficients

In [None]:
# One row per component, one column per input variable (integer labels).
load_scores = pd.DataFrame(pca_model.components_)

# Show variables as rows and components as columns. Rows are labelled with the
# asset column names (the model was fitted on `assets_std`, so the order
# matches) so each coefficient can be read off for a named asset.
pd.DataFrame(pca_model.components_.T, index=assets_std.columns)

In [None]:
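# Sanity check (uncomment to run): the squared coefficients of the first
# component should sum to 1, because each principal axis has unit length.
#sum(load_scores.T[0]**2)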

### 1.6 Apply PCA to the data (dimensionality reduction)

For visualization purposes in this notebook, we will compute only TWO components.


Here is Prof. Jared's code, which makes it easier to interact with the PCA model:

In [None]:
class PCA_Model:
    """Standardize selected columns of a DataFrame and run scikit-learn PCA on them.

    Typical use: construct, call ``fit()``, then ``describe()``, ``gen_data()``
    or ``graph()``.

    Attributes set by the constructor:
        ncomps: value handed to ``PCA`` as the number of components.
        cols: column labels of ``data`` that enter the PCA.
        data: the DataFrame that was passed in (used by ``graph`` for colouring).
        scaled_data: ``StandardScaler`` output for ``data[cols]``.
    """

    def __init__(self, ncomps, data, cols):
        """Store the inputs and standardize ``data[cols]``.

        Args:
            ncomps: number of components, forwarded to ``PCA`` in ``fit``.
            data: DataFrame containing at least the columns in ``cols``.
            cols: labels of the columns to analyse.
        """
        self.ncomps = ncomps
        self.cols = cols
        self.data = data
        self.scaled_data = StandardScaler().fit_transform(data[cols].values)

    def fit(self):
        """Fit a ``PCA`` with ``ncomps`` components and keep it in ``self.PCA_obj``."""
        self.PCA_obj = PCA(self.ncomps).fit(self.scaled_data)

    def describe(self):
        """Print and return the explained variance and the component coefficients.

        Requires ``fit()`` to have been called.

        Returns:
            (explvar, loadscores): ``explvar`` is a one-row DataFrame with the
            percentage of variance explained by each component; ``loadscores``
            has one row per component and one column per entry of ``cols``.
        """
        ratios = self.PCA_obj.explained_variance_ratio_
        # Components are labelled from 0, matching the integer columns of gen_data().
        labels = ["Component " + str(x) for x in range(self.ncomps)]

        explvar = pd.DataFrame(ratios[np.newaxis, :] * 100,
                               columns=labels,
                               index=["% Explained Variance"])
        loadscores = pd.DataFrame(self.PCA_obj.components_,
                                  columns=self.cols,
                                  index=labels)

        print("Explains {0:0.2f}% of the variance".format(round(sum(ratios), 2) * 100))
        print(explvar)
        print(loadscores)
        return explvar, loadscores

    def gen_data(self):
        """Return the standardized data projected onto the fitted components."""
        return self.PCA_obj.transform(self.scaled_data)

    def graph(self, alpha=.6, color=None, cm=None):
        """Scatter-plot the projected data (2-D when ``ncomps == 2``, otherwise 3-D).

        ``color`` and ``cm`` used to be read from undefined names, which raised
        ``NameError`` unless matching globals existed; they are now explicit,
        optional arguments.

        Args:
            alpha: point transparency for the 2-D plot.
            color: optional label of a column in ``self.data`` used to colour
                the points. When omitted, points are drawn in a single colour.
            cm: optional colormap used together with ``color``.

        Returns:
            The matplotlib axes the points were drawn on.

        Raises:
            ValueError: if the 3-D branch is taken with fewer than three components.
        """
        scores = pd.DataFrame(self.gen_data())
        frames = [scores]
        if color is not None:
            # Drop the original index so the colour column lines up row-by-row
            # with the freshly created (0..n-1 indexed) component scores.
            frames.insert(0, self.data[[color]].reset_index(drop=True))
        self.graphdata = pd.concat(frames, axis=1)

        if self.ncomps == 2:
            plot_kwargs = {"kind": "scatter", "x": 0, "y": 1, "alpha": alpha}
            if color is not None:
                plot_kwargs.update(c=color, colormap=cm)
            return self.graphdata.plot(**plot_kwargs)

        if scores.shape[1] < 3:
            raise ValueError(
                "graph() needs 2 components for a 2-D plot or at least 3 for a "
                "3-D plot; got {0}".format(scores.shape[1])
            )

        scatter_kwargs = {}
        if color is not None:
            scatter_kwargs = {"c": self.graphdata[color], "cmap": cm}
        ax = plt.axes(projection='3d')
        # Only the first three components are drawn.
        ax.scatter3D(self.graphdata[0],
                     self.graphdata[1],
                     self.graphdata[2],
                     **scatter_kwargs)
        return ax

In [None]:
# Two-component model over every asset column; missing values are set to 0
# before PCA_Model standardizes the data.
pca2 = PCA_Model(ncomps=2, data=assets.fillna(0), cols=assets.columns)
pca2.fit()

In [None]:
# Wrap the projected data in a DataFrame: integer columns 0 and 1 (one per
# component) and a fresh 0..n-1 row index.
components=pd.DataFrame(pca2.gen_data())

In [None]:
# Every row of the data placed in the plane of the first two components.
plt.scatter(x=components[0], y=components[1], s=10, alpha=0.5);
plt.ylabel('Second Component');
plt.xlabel('First Component');

## 2. K-Means

 - An algorithm to determine implicit grouping in data.
 - It minimizes the within-cluster squared Euclidean distances.

In [None]:
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import datasets
# `sklearn.datasets.samples_generator` was deprecated in scikit-learn 0.22 and
# removed in 0.24; `make_blobs` is exported from `sklearn.datasets` directly.
from sklearn.datasets import make_blobs

### First - We will review the class example 

### 2.1 Read/Create Data

In [None]:
# This code creates "clustered" data already! (So we know the answer in advance)
X, y_true = make_blobs(n_samples=300, centers=4,
                       cluster_std=0.60, random_state=0)

# Where:
# X = The generated samples.
# y_true = The integer labels for cluster membership of each sample.

In [None]:
# Display the generated sample array.
X

In [None]:
# Raw samples: first feature on the x-axis, second on the y-axis.
plt.scatter(x=X[:, 0], y=X[:, 1], s=20);

### 2.2 Assign K and run algorithm

In [None]:
from sklearn.cluster import KMeans

# Deliberately ask for 2 clusters, although we know that is NOT the right number.
kmeans = KMeans(n_clusters=2).fit(X)
# Cluster label assigned to every sample.
y_kmeans = kmeans.predict(X)

In [None]:
# Same scatter as before, coloured by the predicted cluster label.
plt.scatter(x=X[:, 0], y=X[:, 1], c=y_kmeans, s=50, cmap='viridis');


Here is Prof. Jared's code that makes the interaction with K-Means easier:

In [None]:
class Kmeans_Model:
    """K-Means helper that always works on a standardized copy of the input."""

    def __init__(self, data, num_clusters):
        """Standardize ``data`` and remember the requested number of clusters."""
        scaler = StandardScaler()
        self.df = scaler.fit_transform(data)
        self.K = num_clusters

    def fit(self):
        """Fit ``KMeans`` with ``K`` clusters and keep it in ``self.model``."""
        estimator = KMeans(self.K)
        self.model = estimator.fit(self.df)

    def predict(self):
        """Label every standardized row, cache the labels and return them."""
        labels = self.model.predict(self.df)
        self.prediction = labels
        return labels

    def sil_score(self):
        """Return the silhouette score of the fitted clustering.

        ``sample_size=10000`` asks scikit-learn to score a random subset of
        that size instead of every row.
        """
        labels = self.model.predict(self.df)
        return silhouette_score(self.df, labels, sample_size=10000)
    
def score_by_k(data,K):
    """Fit a ``Kmeans_Model`` with ``K`` clusters on ``data`` and return its silhouette score."""
    clusterer = Kmeans_Model(data, K)
    clusterer.fit()
    silhouette = clusterer.sil_score()
    return silhouette

def inertia_by_k(data,K):
    """Fit a ``Kmeans_Model`` with ``K`` clusters on ``data`` and return the fitted KMeans inertia."""
    clusterer = Kmeans_Model(data, K)
    clusterer.fit()
    fitted_kmeans = clusterer.model
    return fitted_kmeans.inertia_

### How do we know the "optimal" number of clusters? 


### Heuristics for selection: Silhouette Scores

Silhouette scores near +1 indicate that the sample is far away from the neighboring clusters (i.e., they are closest to their assigned cluster). A value of 0 indicates that the sample is on or very close to the decision boundary between two neighboring clusters and negative values indicate that those samples might have been assigned to the wrong cluster.



In [None]:
K = 20
# Silhouette score for every cluster count from 2 up to K - 1.
sil_scores = [score_by_k(X, n_clusters) for n_clusters in range(2, K)]
plt.plot(range(2, K), sil_scores);
plt.xlabel('Numbers of clusters K');
plt.ylabel('Average Silhouette Width');
# Mark 4 clusters, the number of centers the sample data was generated with.
plt.axvline(x=4, color="black", linestyle="--");

### Inertia: the function being minimized (within-cluster sum of squared distances)

 - Inertia can be recognized as a measure of how internally coherent clusters are. Lower values are better and zero is optimal.
 - Criteria: Choose the K at the "elbow"

In [None]:
# Inertia for every cluster count from 2 up to K - 1 (K comes from the silhouette cell).
init_scores = [inertia_by_k(X, n_clusters) for n_clusters in range(2, K)]
plt.plot(range(2, K), init_scores);
plt.xlabel('Numbers of clusters K');
plt.ylabel('Sum of squared distances');
# Mark 4 clusters, the number of centers the sample data was generated with.
plt.axvline(x=4, color="black", linestyle="--");

## Assets example 
### Run K-means over the two components that we created

In [None]:
# Preview the first five rows of the two-component table.
components.head(5)

### Let's look at the heuristics

In [None]:
K = 20
# Silhouette score on the two PCA components for 2 up to K - 1 clusters.
sil_scores = [score_by_k(components, n_clusters) for n_clusters in range(2, K)]
plt.plot(range(2, K), sil_scores);
# Dashed marker at 3 clusters.
plt.axvline(x=3, color="black", linestyle="--");
plt.xlabel('Numbers of clusters K');
plt.ylabel('Average Silhouette Width');


In [None]:
# Inertia on the two PCA components for 2 up to K - 1 clusters.
init_scores = [inertia_by_k(components, n_clusters) for n_clusters in range(2, K)]
plt.plot(range(2, K), init_scores);
plt.xlabel('Numbers of clusters K');
plt.ylabel('Sum of squared distances');
# Dashed marker at 3 clusters.
plt.axvline(x=3, color="black", linestyle="--");

### Visualization of Clusters

In [None]:
# Cluster the two PCA components into 3 groups.
# NOTE(review): score_by_k / inertia_by_k standardize their input inside
# Kmeans_Model, while this fit uses the raw components -- confirm that the
# difference is intended.
kmeans = KMeans(n_clusters=3).fit(components)
y_kmeans = kmeans.predict(components)
centers = kmeans.cluster_centers_

# Points coloured by assigned cluster, with the cluster centers drawn in black on top.
plt.scatter(x=components[0], y=components[1], c=y_kmeans, s=10, cmap='Pastel2')
plt.scatter(x=centers[:, 0], y=centers[:, 1], c='black', s=20);
plt.xlabel('First Component');
plt.ylabel('Second Component');
