# DNSC 6303-10: PROGRAMMING FOR ANALYTICS - S_05

# What is Cluster Analysis?

+ Finding groups of objects such that the objects in a group will be similar (or related) to one another and different from (or
unrelated to) the objects in other groups

![pic1.jpg](attachment:pic1.jpg)

# How does it work?

![pic2.jpg](attachment:pic2.jpg)

+ Euclidean: The Euclidean distance is the straight-line distance between two points, i.e. the shortest distance between them

In [None]:
import warnings
# Install a blanket "ignore" filter: from here on no warning of any category
# is shown, presumably to keep the notebook output uncluttered.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — consider narrowing the filter; confirm this is intended.
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline

In [None]:
# Task 1: Load "Income Data.csv" (path relative to the working directory)
# into the customers DataFrame used by the first case study.
customers_df = pd.read_csv("Income Data.csv")

In [None]:
# Quick glimpse of the data: display the first five rows.
customers_df.head(n=5)

In [None]:
# Task 2: Find the max value of the income and age


In [None]:
# Task 3: Find the min value of the income and age


In [None]:
# Task 4: scatter plot of income against age.
# fit_reg=False turns the regression line off, so only the points are drawn.
# x/y are passed by keyword and the facet size via `height`: current seaborn
# releases reject positional x/y and no longer accept the old `size` argument.
sn.lmplot(x="age", y="income", data=customers_df, fit_reg=False, height=4)
plt.title("Fig: Customer Segments Based on Income and Age");

## K-means Clustering

In [None]:
# Task 5: Create clusters
from sklearn.cluster import KMeans
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
# Fit on the raw (unscaled) age/income columns only, so that re-running this
# cell after label columns have been added to customers_df does not feed
# earlier cluster labels back in as features.
# random_state pins the centroid initialisation so the labels are reproducible
# from run to run.
clusters = KMeans(n_clusters=3, random_state=42)
clusters.fit(customers_df[["age", "income"]])

In [None]:
# Attach each row's cluster label from the fitted `clusters` model as a new
# "clusterid" column (overwrites the column if it already exists).
customers_df["clusterid"] = clusters.labels_

In [None]:
# Show the first five rows, now including the cluster label column.
customers_df.iloc[:5]

In [None]:
# Column-wise maxima; the spread between columns motivates the scaling below.
print(customers_df.max(axis=0))

### Normalizing the features

In [None]:
# Task 6: Preprocessing / need for standardization
from sklearn.preprocessing import StandardScaler
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
# Learn the scaling parameters from age and income, then apply them to the
# same two columns; the result is a NumPy array, not a DataFrame.
scaler = StandardScaler().fit(customers_df.loc[:, ["age", "income"]])
scaled_customers_df = scaler.transform(customers_df.loc[:, ["age", "income"]])
# Peek at the first five standardized rows.
scaled_customers_df[:5]

In [None]:
# Task 7: Create clusters after standardization
from sklearn.cluster import KMeans
# Three clusters fitted on the standardized features; the fixed seed keeps the
# labels reproducible. Labels go into their own "clusterid_new" column.
clusters_new = KMeans(n_clusters=3, random_state=42).fit(scaled_customers_df)
customers_df["clusterid_new"] = clusters_new.labels_

In [None]:
# Task 8: Check the centers of the clusters
# Displays the centroid coordinates of whichever model `clusters` is bound to.
# NOTE(review): this reads `clusters`, not the `clusters_new` model fitted on
# the standardized data in Task 7 — confirm which model is intended.
clusters.cluster_centers_

In [None]:
# Task 9: Cluster Centers and Interpreting the Clusters

In [None]:
# Per-cluster mean and standard deviation of age and income.
# The columns are selected with a list ([[...]]): selecting several columns
# from a GroupBy with bare comma-separated keys is no longer supported by
# current pandas releases.
customers_df.groupby("clusterid")[["age", "income"]].agg(["mean", "std"]).reset_index()

## CASE 2 - Creating Product Segments - Beer Dataset

In [None]:
# Task 10: Read the "beer.csv" file
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

%matplotlib inline
# Load the beer data set (path relative to the working directory).
beer_df = pd.read_csv("beer.csv")

In [None]:
# Task 11: Quick glimpse of the data
# Bare expression as the cell's last line so the notebook renders the table.
beer_df

In [None]:
# Task 12: Preprocessing / need for standardization
from sklearn.preprocessing import StandardScaler
# Standardize the four numeric beer attributes; the result is a NumPy array.
scaler = StandardScaler().fit(beer_df.loc[:, ["calories", "sodium", "alcohol", "cost"]])
scaled_beer_df = scaler.transform(beer_df.loc[:, ["calories", "sodium", "alcohol", "cost"]])

#### NO. of Clusters - Using Dendrogram
+ A dendrogram is a diagram that shows the hierarchical relationship between objects. 

In [None]:
# Task 13: Create Dendrogram
# Light cubehelix colormap for the heatmap cells.
cmap = sn.cubehelix_palette(rot=-0.3, light=1, as_cmap=True)
# Clustered heatmap of the standardized beer features.
sn.clustermap(scaled_beer_df, figsize=(8, 8), linewidths=0.2, cmap=cmap)
plt.title("Fig: Dendrogram of Beer Dataset");

# Task 14: Finding Optimal Number of Clusters using Elbow Method

In [None]:
# Elbow method: fit K-means for k = 1..9 on the standardized beer features and
# record each model's inertia_ (within-cluster sum of squared distances).
cluster_range = range(1, 10)
cluster_errors = []
for num_clusters in cluster_range:
    # Fixed seed: without it every run starts from different random centroids,
    # so the curve (and the apparent elbow) can change between runs.
    # NOTE: this rebinds the notebook-level name `clusters` on every pass.
    clusters = KMeans(n_clusters=num_clusters, random_state=42)
    clusters.fit(scaled_beer_df)
    cluster_errors.append(clusters.inertia_)
plt.figure(figsize=(6, 4))
plt.plot(cluster_range, cluster_errors, marker="o")
# Label the axes so the plot can be read without the surrounding code.
plt.xlabel("Number of clusters (k)")
plt.ylabel("Inertia")
plt.title("Fig : Elbow Diagram");

In [None]:
# Task 15: Normalizing Features - Rescaling the dataset

In [None]:
# Re-standardize the four numeric beer attributes with a fresh scaler.
scaler = StandardScaler().fit(beer_df.loc[:, ["calories", "sodium", "alcohol", "cost"]])
scaled_beer_df = scaler.transform(beer_df.loc[:, ["calories", "sodium", "alcohol", "cost"]])

In [None]:
# Task 16: Creating Clusters

In [None]:
# Number of clusters used for the beer segmentation.
k = 3
# Seeded K-means on the standardized features; labels are stored per beer.
clusters = KMeans(n_clusters=k, random_state=42).fit(scaled_beer_df)
beer_df["clusterid"] = clusters.labels_

# Task 17: Interpreting the Clusters

##### Cluster 0

In [None]:
# Beers assigned to K-means cluster 0.
beer_df.loc[beer_df["clusterid"] == 0]

##### Cluster 1

In [None]:
# Beers assigned to K-means cluster 1.
beer_df.loc[beer_df["clusterid"] == 1]

##### Cluster 2

In [None]:
# Beers assigned to K-means cluster 2.
beer_df.loc[beer_df["clusterid"] == 2]

# Hierarchical clustering

In [None]:
# Task 18: Use AgglomerativeClustering
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Agglomerative (hierarchical) clustering into three groups on the
# standardized beer features; labels go into their own "h_clusterid" column.
h_clusters = AgglomerativeClustering(n_clusters=3).fit(scaled_beer_df)
beer_df["h_clusterid"] = h_clusters.labels_

In [None]:
# Beers assigned to hierarchical cluster 0.
beer_df.loc[beer_df["h_clusterid"] == 0]

In [None]:
# Beers assigned to hierarchical cluster 1.
beer_df.loc[beer_df["h_clusterid"] == 1]

In [None]:
# Beers assigned to hierarchical cluster 2.
beer_df.loc[beer_df["h_clusterid"] == 2]

In [None]:
# Task 19: Find the largest clusters
# Row count per hierarchical cluster label; the biggest count marks the
# largest cluster.
beer_df.groupby("h_clusterid").size()

# In-class Practice: Read the "Country clusters"

In [None]:
# Task 20: Load the 'Country clusters.csv'
# Read the country data (path relative to the working directory); later cells
# use its "Latitude" and "Longitude" columns.
data = pd.read_csv("Country clusters.csv")
# Bare expression as the cell's last line so the notebook renders the table.
data

In [None]:
# Task 21:  Normalizing Features - Rescaling the dataset

In [None]:
# Standardize the two geographic coordinates; the result is a NumPy array.
scaler = StandardScaler().fit(data.loc[:, ["Latitude", "Longitude"]])
scalerd_country_df = scaler.transform(data.loc[:, ["Latitude", "Longitude"]])

In [None]:
# Task 22: Creating Clusters
# Number of clusters used for the country segmentation.
k = 3
# Seeded K-means on the standardized coordinates; labels are stored per row.
clusters = KMeans(n_clusters=k, random_state=42).fit(scalerd_country_df)
data["clusterid"] = clusters.labels_

In [None]:
# Task 24: Find the largest cluster — rows assigned to cluster 0.
# (The original note names cluster 0 as the largest; verify with a size count.)
data.loc[data["clusterid"] == 0]

In [None]:
# Countries assigned to cluster 1.
data.loc[data["clusterid"] == 1]

In [None]:
# Countries assigned to cluster 2.
data.loc[data["clusterid"] == 2]

# Task 25: Task for BB Week05 Blog: map 'Language' to numbers ('English': 0, 'French': 1, 'German': 2), re-run the clustering (k=3), and plot the data using the longitude and the latitude

![pic3.jpg](attachment:pic3.jpg)