# Mean shift to Titanic Dataset

### Imports

In [2]:
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np
from sklearn.cluster import MeanShift
from sklearn import preprocessing, cross_validation
import pandas as pd

In [21]:
df = pd.read_excel('titanic.xls')
orignal_df = pd.DataFrame.copy(df)

In [22]:
def handle_non_numerical_data(df):
    columns = df.columns.values
    for column in columns:
        text_digit_vals = {}
        
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_content = df[column].values.tolist()
            unique_elements = set(column_content)
            #print(unique_elements)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x+=1
                    
            def convert_to_int(key):
                return text_digit_vals[key]
            df[column] = list(map(convert_to_int, df[column]))
        
handle_non_numerical_data(df)

df.replace('NaN',-99999,inplace = True)
df.fillna(0,inplace = True)
df.drop(['body','name'],1,inplace = True)


In [23]:
x = np.array(df.drop(['survived'],1).astype(float))
x = preprocessing.scale(x)
y = np.array(df['survived'])

In [26]:
df.head(2)

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,home.dest
0,1,1,0,29.0,0,0,737,211.3375,51,2,1,231
1,1,1,1,0.9167,1,2,515,151.55,157,2,7,290


In [24]:
clf = MeanShift()
clf.fit(x)

MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, min_bin_freq=1,
     n_jobs=1, seeds=None)

In [25]:
labels = clf.labels_
cluster_centers = clf.cluster_centers_

In [32]:
orignal_df['clusters'] = np.nan

In [33]:
for i in range(len(x)):
    orignal_df['clusters'].iloc[i] = labels[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [35]:
n_clusters = len(np.unique(labels))
n_clusters

5

In [40]:
new_df = orignal_df[orignal_df['clusters'] == 0]
len(new_df[new_df['survived'] == 1])/len(orignal_df[orignal_df['clusters'] == 0])

0.3734076433121019

In [42]:
new_df = orignal_df[orignal_df['clusters'] == 2]
len(new_df[new_df['survived'] == 1])/len(new_df)

0.6486486486486487

In [49]:
for i in range(n_clusters):
    new_df = orignal_df[orignal_df['clusters'] == i]
    print('cluster: ' + str(i) + ' :'+str(len(new_df[new_df['survived'] == 1])/len(new_df)))

cluster: 0 :0.3734076433121019
cluster: 1 :1.0
cluster: 2 :0.6486486486486487
cluster: 3 :0.1
cluster: 4 :1.0


In [51]:
orignal_df[orignal_df['clusters'] == 1].describe() 

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,clusters
count,4.0,4.0,4.0,4.0,4.0,4.0,0.0,4.0
mean,1.0,1.0,41.25,0.0,0.5,449.84065,,1.0
std,0.0,0.0,11.176612,0.0,0.57735,124.9771,,0.0
min,1.0,1.0,35.0,0.0,0.0,262.375,,1.0
25%,1.0,1.0,35.75,0.0,0.0,449.84065,,1.0
50%,1.0,1.0,36.0,0.0,0.5,512.3292,,1.0
75%,1.0,1.0,41.5,0.0,1.0,512.3292,,1.0
max,1.0,1.0,58.0,0.0,1.0,512.3292,,1.0


In [52]:
orignal_df[orignal_df['clusters'] == 2].describe() 

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,clusters
count,37.0,37.0,37.0,37.0,37.0,37.0,3.0,37.0
mean,1.0,0.648649,36.943695,1.0,1.567568,202.423311,117.666667,2.0
std,0.0,0.483978,18.316574,0.912871,1.014904,56.788376,19.857828,0.0
min,1.0,0.0,0.9167,0.0,0.0,79.2,96.0,2.0
25%,1.0,0.0,24.0,0.0,1.0,151.55,109.0,2.0
50%,1.0,1.0,36.0,1.0,2.0,211.5,122.0,2.0
75%,1.0,1.0,50.0,1.0,2.0,262.375,128.5,2.0
max,1.0,1.0,67.0,3.0,4.0,263.0,135.0,2.0


In [53]:
orignal_df[orignal_df['clusters'] == 3].describe() 

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,clusters
count,10.0,10.0,8.0,10.0,10.0,10.0,2.0,10.0
mean,3.0,0.1,39.875,0.8,6.0,42.70375,234.5,3.0
std,0.0,0.316228,1.552648,0.421637,1.632993,15.590194,130.814755,0.0
min,3.0,0.0,38.0,0.0,5.0,29.125,142.0,3.0
25%,3.0,0.0,39.0,1.0,5.0,31.303125,188.25,3.0
50%,3.0,0.0,39.5,1.0,5.0,35.5375,234.5,3.0
75%,3.0,0.0,40.25,1.0,6.0,46.9,280.75,3.0
max,3.0,1.0,43.0,1.0,9.0,69.55,327.0,3.0


In [54]:
orignal_df[orignal_df['clusters'] == 4].describe() 

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,clusters
count,2.0,2.0,2.0,2.0,2.0,2.0,0.0,2.0
mean,1.0,1.0,40.0,0.0,0.0,387.3521,,4.0
std,0.0,0.0,7.071068,0.0,0.0,176.74431,,0.0
min,1.0,1.0,35.0,0.0,0.0,262.375,,4.0
25%,1.0,1.0,37.5,0.0,0.0,324.86355,,4.0
50%,1.0,1.0,40.0,0.0,0.0,387.3521,,4.0
75%,1.0,1.0,42.5,0.0,0.0,449.84065,,4.0
max,1.0,1.0,45.0,0.0,0.0,512.3292,,4.0


In [55]:
orignal_df[orignal_df['clusters'] == 0].describe() 

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,clusters
count,1256.0,1256.0,995.0,1256.0,1256.0,1255.0,116.0,1256.0
mean,2.333599,0.373408,29.472111,0.484076,0.305732,26.342406,160.655172,0.0
std,0.817743,0.483902,14.221775,1.046977,0.657533,31.450235,98.290351,0.0
min,1.0,0.0,0.1667,0.0,0.0,0.0,1.0,0.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958,69.75,0.0
50%,3.0,0.0,28.0,0.0,0.0,13.5,160.5,0.0
75%,3.0,1.0,38.0,1.0,0.0,28.5,256.5,0.0
max,3.0,1.0,80.0,8.0,4.0,227.525,328.0,0.0
