In [154]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cluster import MeanShift, estimate_bandwidth

In [155]:
# Load the Titanic passenger spreadsheet (path is relative to the notebook's
# working directory) and preview the first rows.
titanic_data = pd.read_excel('titanic.xls')
titanic_data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [156]:
# Remove identifier / free-text columns that are not used as clustering features.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`): passing
# `axis` positionally was deprecated in pandas 1.x and raises a TypeError in
# pandas 2.0+. Rebinding the name (instead of inplace=True) keeps the statement
# chainable and yields the same resulting frame.
titanic_data = titanic_data.drop(
    columns=['name', 'ticket', 'cabin', 'body', 'home.dest', 'boat']
)
titanic_data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,1,1,female,29.0,0,0,211.3375,S
1,1,1,male,0.9167,1,2,151.55,S
2,1,0,female,2.0,1,2,151.55,S
3,1,0,male,30.0,1,2,151.55,S
4,1,0,female,25.0,1,2,151.55,S


In [157]:
# Turn the textual `sex` column into integer codes. LabelEncoder numbers the
# classes in sorted order, so 'female' -> 0 and 'male' -> 1 (see the preview).
le = preprocessing.LabelEncoder()
# Learn the classes first, then map the column through the fitted encoder.
le.fit(titanic_data["sex"].astype(str))
titanic_data['sex'] = le.transform(titanic_data["sex"].astype(str))
titanic_data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,1,1,0,29.0,0,0,211.3375,S
1,1,1,1,0.9167,1,2,151.55,S
2,1,0,0,2.0,1,2,151.55,S
3,1,0,1,30.0,1,2,151.55,S
4,1,0,0,25.0,1,2,151.55,S


In [158]:
# One-hot encode the port of embarkation into embarked_C / embarked_Q / embarked_S.
# The dtype is pinned to uint8 (the historical default) so the indicator columns
# stay numeric 0/1 on every pandas version; pandas 2.0+ would otherwise emit
# boolean True/False columns.
titanic_data = pd.get_dummies(titanic_data, columns=['embarked'], dtype=np.uint8)
titanic_data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S
0,1,1,0,29.0,0,0,211.3375,0,0,1
1,1,1,1,0.9167,1,2,151.55,0,0,1
2,1,0,0,2.0,1,2,151.55,0,0,1
3,1,0,1,30.0,1,2,151.55,0,0,1
4,1,0,0,25.0,1,2,151.55,0,0,1


In [159]:
# Check for missing values: report how many rows contain at least one NaN.
# (A bare expression that is not the last statement of a cell is evaluated and
# thrown away, so the count is printed explicitly.)
print(
    f"Rows with missing values: {titanic_data.isnull().any(axis=1).sum()}"
    f" of {len(titanic_data)}"
)

# MeanShift cannot handle NaN, so drop every incomplete row.
# Note: the index is not reset, so it keeps gaps where rows were removed.
titanic_data = titanic_data.dropna()

In [160]:
# Bandwidth is the only hyperparameter we set for MeanShift.
# Smaller bandwidths give tall, skinny kernels;
# larger bandwidths give short, fat kernels.
# The value 30 is a rounded version of the estimate_bandwidth() result
# (about 31.85) computed in the next cell.
clf = MeanShift(bandwidth=30)

# Every column of the frame (including `survived`) is used as a feature.
# NOTE(review): features are not scaled, so wide-range columns such as `fare`
# presumably dominate the distances - confirm this is intended.
clf.fit(titanic_data)

MeanShift(bandwidth=30, bin_seeding=False, cluster_all=True, max_iter=300,
          min_bin_freq=1, n_jobs=None, seeds=None)

In [161]:
# Data-driven bandwidth estimate for the same feature matrix; this is the
# value the hand-picked bandwidth of 30 was rounded from.
estimate_bandwidth(titanic_data)

31.849187320670215

In [162]:
# Cluster label assigned to each row by the fitted model.
labels = clf.labels_
# The distinct labels are 0..5, so a bandwidth of 30 produces 6 clusters;
# every point is assigned to one of them (cluster_all=True).
np.unique(labels)

[4 2 2 ... 0 0 0]


array([0, 1, 2, 3, 4, 5])

In [163]:
# Add a column recording which cluster each passenger (row) belongs to.
data_length = len(titanic_data)
# Guard against stale labels (e.g. the model was fitted on a different frame).
assert len(labels) == data_length, "labels and titanic_data have different lengths"

# Assigning an array is positional (row i receives labels[i]), so it is
# unaffected by the index gaps left by dropna() and needs no Python loop.
# The float cast keeps the dtype the column had when it was NaN-initialised
# and filled row by row.
titanic_data['cluster_group'] = np.asarray(labels, dtype=float)
titanic_data.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S,cluster_group
0,1,1,0,29.0,0,0,211.3375,0,0,1,4.0
1,1,1,1,0.9167,1,2,151.55,0,0,1,2.0
2,1,0,0,2.0,1,2,151.55,0,0,1,2.0
3,1,0,1,30.0,1,2,151.55,0,0,1,2.0
4,1,0,0,25.0,1,2,151.55,0,0,1,2.0


In [164]:
# Summary statistics for every numeric column, now including cluster_group.
titanic_data.describe()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S,cluster_group
count,1045.0,1045.0,1045.0,1045.0,1045.0,1045.0,1045.0,1045.0,1045.0,1045.0,1045.0
mean,2.206699,0.408612,0.628708,29.851834,0.503349,0.421053,36.68608,0.202871,0.047847,0.747368,0.36555
std,0.841542,0.491813,0.483382,14.389201,0.912471,0.840052,55.732533,0.40233,0.213544,0.434729,0.816708
min,1.0,0.0,0.0,0.1667,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,21.0,0.0,0.0,8.05,0.0,0.0,0.0,0.0
50%,2.0,0.0,1.0,28.0,0.0,0.0,15.75,0.0,0.0,1.0,0.0
75%,3.0,1.0,1.0,39.0,1.0,1.0,35.5,0.0,0.0,1.0,0.0
max,3.0,1.0,1.0,80.0,8.0,6.0,512.3292,1.0,1.0,1.0,5.0


In [165]:
# Per-cluster profile: the mean of every feature within each cluster, plus a
# 'Counts' column holding the number of passengers in that cluster.
# .assign aligns the group sizes to the means on the cluster_group index.
titanic_cluster_data = (
    titanic_data.groupby(['cluster_group']).mean()
    .assign(Counts=titanic_data.groupby(['cluster_group']).size())
)
titanic_cluster_data


Unnamed: 0_level_0,pclass,survived,sex,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S,Counts
cluster_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0.0,2.514963,0.339152,0.67581,27.883105,0.425187,0.346633,15.326999,0.130923,0.058603,0.810474,802
1.0,1.280488,0.597561,0.52439,36.746951,0.79878,0.554878,64.737935,0.365854,0.018293,0.603659,164
2.0,1.0,0.72093,0.395349,33.881784,0.581395,0.744186,133.555521,0.55814,0.0,0.44186,43
3.0,1.0,0.75,0.3125,35.0625,1.4375,1.9375,259.824212,0.625,0.0,0.375,16
4.0,1.0,0.625,0.3125,39.03125,0.375,0.375,217.392975,0.5625,0.0,0.4375,16
5.0,1.0,1.0,0.5,41.0,0.0,0.5,512.3292,1.0,0.0,0.0,4
