# Clustering on Vehicle Dataset using Agglomerative Hierarchical Clustering

### Importing Required Libraries

In [1]:
%matplotlib notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Loading Data from CSV File

In [2]:
df = pd.read_csv(r'C:\Users\user\Desktop\Data Science\cars_clus.csv')
df.head()

Unnamed: 0,manufact,model,sales,resale,type,price,engine_s,horsepow,wheelbas,width,length,curb_wgt,fuel_cap,mpg,lnsales,partition
0,Acura,Integra,16.919,16.36,0.0,21.500,1.8,140.0,101.2,67.3,172.4,2.639,13.2,28.0,2.828,0.0
1,Acura,TL,39.384,19.875,0.0,28.400,3.2,225.0,108.1,70.3,192.9,3.517,17.2,25.0,3.673,0.0
2,Acura,CL,14.114,18.225,0.0,$null$,3.2,225.0,106.9,70.6,192.0,3.47,17.2,26.0,2.647,0.0
3,Acura,RL,8.588,29.725,0.0,42.000,3.5,210.0,114.6,71.4,196.6,3.85,18.0,22.0,2.15,0.0
4,Audi,A4,20.397,22.255,0.0,23.990,1.8,150.0,102.6,68.2,178.0,2.998,16.4,27.0,3.015,0.0


### Data Cleaning

In [3]:
df[[ 'sales', 'resale', 'type', 'price', 'engine_s',
       'horsepow', 'wheelbas', 'width', 'length', 'curb_wgt', 'fuel_cap',
       'mpg', 'lnsales']] = df[['sales', 'resale', 'type', 'price', 'engine_s',
       'horsepow', 'wheelbas', 'width', 'length', 'curb_wgt', 'fuel_cap',
       'mpg', 'lnsales']].apply(pd.to_numeric, errors='coerce')
df = df.dropna()
df = df.reset_index(drop=True)

### Features Selection

In [4]:
fdf = df[['engine_s',  'horsepow', 'wheelbas', 'width', 'length', 'curb_wgt', 'fuel_cap', 'mpg']]
fdf.head()

Unnamed: 0,engine_s,horsepow,wheelbas,width,length,curb_wgt,fuel_cap,mpg
0,1.8,140.0,101.2,67.3,172.4,2.639,13.2,28.0
1,3.2,225.0,108.1,70.3,192.9,3.517,17.2,25.0
2,3.5,210.0,114.6,71.4,196.6,3.85,18.0,22.0
3,1.8,150.0,102.6,68.2,178.0,2.998,16.4,27.0
4,2.8,200.0,108.7,76.1,192.0,3.561,18.5,22.0


### Normalizing Data

In [5]:
from sklearn.preprocessing import MinMaxScaler
X = fdf.values
X_n = MinMaxScaler().fit_transform(X)

### Agglomerative Hierarchical Clustering Algorithm Model

In [6]:
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial import distance_matrix
dm = distance_matrix(X_n, X_n)
ahc = AgglomerativeClustering(n_clusters = 6, linkage = 'complete')
ahc.fit(X_n)

AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
                        connectivity=None, distance_threshold=None,
                        linkage='complete', memory=None, n_clusters=6,
                        pooling_func='deprecated')

### Assigning Label to Data Points

In [7]:
df['Cluster']= ahc.labels_
df.head()

Unnamed: 0,manufact,model,sales,resale,type,price,engine_s,horsepow,wheelbas,width,length,curb_wgt,fuel_cap,mpg,lnsales,partition,Cluster
0,Acura,Integra,16.919,16.36,0.0,21.5,1.8,140.0,101.2,67.3,172.4,2.639,13.2,28.0,2.828,0.0,1
1,Acura,TL,39.384,19.875,0.0,28.4,3.2,225.0,108.1,70.3,192.9,3.517,17.2,25.0,3.673,0.0,2
2,Acura,RL,8.588,29.725,0.0,42.0,3.5,210.0,114.6,71.4,196.6,3.85,18.0,22.0,2.15,0.0,2
3,Audi,A4,20.397,22.255,0.0,23.99,1.8,150.0,102.6,68.2,178.0,2.998,16.4,27.0,3.015,0.0,1
4,Audi,A6,18.78,23.555,0.0,33.95,2.8,200.0,108.7,76.1,192.0,3.561,18.5,22.0,2.933,0.0,2


### Plotting Dendrogram

In [8]:
from scipy.cluster import hierarchy
plt.figure(figsize = (18,50))
Z = hierarchy.linkage(dm, 'complete')
def llf(id):
    return '[%s %s %s]' % (df['manufact'][id], df['model'][id], int(float(df['type'][id])) )   
dendro = hierarchy.dendrogram(Z,  leaf_label_func=llf, leaf_rotation=0, leaf_font_size =12, orientation = 'right')

<IPython.core.display.Javascript object>

  This is separate from the ipykernel package so we can avoid doing imports until


### Conclusions from the Model

In [9]:
cdf = df.groupby(['Cluster', 'type'])
cdf = cdf.mean()
cdf

Unnamed: 0_level_0,Unnamed: 1_level_0,sales,resale,price,engine_s,horsepow,wheelbas,width,length,curb_wgt,fuel_cap,mpg,lnsales,partition
Cluster,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,1.0,158.433333,19.454167,29.024667,4.483333,211.666667,124.2,78.516667,207.816667,4.5295,27.75,16.166667,4.1815,0.0
1,0.0,53.191234,15.117447,20.306128,2.246809,146.531915,102.491489,68.857447,179.110638,2.805404,14.980851,27.021277,3.220085,0.0
1,1.0,63.7018,13.572,17.0092,2.58,145.0,99.24,67.64,167.92,2.9412,16.94,22.2,4.0708,0.0
2,0.0,47.725,18.596111,27.750593,3.303704,203.111111,108.851852,72.381481,195.651852,3.496074,17.759259,24.214815,3.491852,0.0
2,1.0,78.349273,17.056364,26.265364,3.345455,182.090909,108.663636,70.845455,184.1,3.821364,20.4,20.181818,3.819364,0.0
3,0.0,31.677,26.9225,42.8704,4.41,256.5,113.99,75.17,205.9,3.9568,20.24,21.5,2.8775,0.0
3,1.0,90.832571,13.884286,21.527714,3.071429,160.571429,118.814286,73.585714,201.242857,3.796429,21.857143,21.428571,4.143286,0.0
4,0.0,21.855,5.16,9.235,1.0,55.0,93.1,62.6,149.4,1.895,10.3,45.0,3.084,0.0
5,0.0,7.391333,51.098333,66.01,6.233333,365.666667,99.9,73.533333,177.833333,3.57,19.733333,19.333333,1.332,0.0
