# Titanic (Mean Shift)

In [1]:
import matplotlib.pyplot as plt
from matplotlib import style
style.use("ggplot")
import numpy as np
from sklearn.cluster import MeanShift
from sklearn import preprocessing
import pandas as pd

In [2]:
df = pd.read_excel("titanic.xls")
original_df = pd.DataFrame.copy(df)  # Copy of the df.
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [3]:
df.drop(["body", "name"], 1, inplace=True)
df.convert_objects(convert_numeric=True)
df.fillna(0, inplace=True)

  from ipykernel import kernelapp as app


In [4]:
# Handling non-numerical data
def handle_non_numerical_data(df):
    columns = df.columns.values
    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]
        
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x+=1
            df[column] = list(map(convert_to_int, df[column]))
    return df
df = handle_non_numerical_data(df)
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,home.dest
0,1,1,0,29.0,0,0,739,211.3375,117,3,1,205
1,1,1,1,0.9167,1,2,503,151.55,109,3,20,267
2,1,0,0,2.0,1,2,503,151.55,109,3,0,267
3,1,0,1,30.0,1,2,503,151.55,109,3,0,267
4,1,0,0,25.0,1,2,503,151.55,109,3,0,267


In [5]:
X = np.array(df.drop(["survived"], 1).astype(float))
X = preprocessing.scale(X)
y = np.array(df["survived"])

clf = MeanShift()
clf.fit(X)

labels = clf.labels_
cluster_centers = clf.cluster_centers_

original_df["cluster_group"] = np.nan
# Iter through the labels and populate the values of the cluster_group column.
for i in range(len(X)):
    original_df["cluster_group"].iloc[i] = labels[i]  # iloc[i] is the row in the df (i is the number), column: cluster_group = labels[i]
    
n_clustters_ = len(np.unique(labels))

survival_rates = {}
for i in range(n_clustters_):
    temp_df = original_df[(original_df["cluster_group"] == float(i))]
    survival_cluster = temp_df[(temp_df["survived"] == 1)]
    survival_rate = len(survival_cluster) / len(temp_df)
    survival_rates[i] = survival_rate

for cluster in survival_rates:
    print("Cluster " + str(cluster + 1) + " - survival rate: " + str(survival_rates[cluster]))

Cluster 1 - survival rate: 0.37850467289719625
Cluster 2 - survival rate: 0.8666666666666667
Cluster 3 - survival rate: 0.1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [6]:
print("Description of cluster 1\n")
print(original_df[(original_df["cluster_group"] == 0)].describe())

Description of cluster 1

            pclass     survived          age        sibsp        parch  \
count  1284.000000  1284.000000  1023.000000  1284.000000  1284.000000   
mean      2.304517     0.378505    29.647768     0.493769     0.327103   
std       0.831911     0.485203    14.361612     1.047191     0.676183   
min       1.000000     0.000000     0.166700     0.000000     0.000000   
25%       2.000000     0.000000    21.000000     0.000000     0.000000   
50%       3.000000     0.000000    28.000000     0.000000     0.000000   
75%       3.000000     1.000000    38.000000     1.000000     0.000000   
max       3.000000     1.000000    80.000000     8.000000     4.000000   

              fare        body  cluster_group  
count  1283.000000  119.000000         1284.0  
mean     29.786801  159.571429            0.0  
std      39.497472   97.302914            0.0  
min       0.000000    1.000000            0.0  
25%       7.895800   71.000000            0.0  
50%      14.000000 

In [7]:
print("Description of cluster 2\n")
print(original_df[(original_df["cluster_group"] == 1)].describe())

Description of cluster 2

       pclass   survived        age      sibsp      parch        fare  body  \
count    15.0  15.000000  15.000000  15.000000  15.000000   15.000000   0.0   
mean      1.0   0.866667  40.466667   0.733333   1.600000  327.132227   NaN   
std       0.0   0.351866  16.370124   0.798809   1.404076  115.711466   NaN   
min       1.0   0.000000  13.000000   0.000000   0.000000  247.520800   NaN   
25%       1.0   1.000000  31.000000   0.000000   0.500000  262.375000   NaN   
50%       1.0   1.000000  36.000000   1.000000   1.000000  262.375000   NaN   
75%       1.0   1.000000  54.000000   1.000000   2.500000  387.664600   NaN   
max       1.0   1.000000  64.000000   2.000000   4.000000  512.329200   NaN   

       cluster_group  
count           15.0  
mean             1.0  
std              0.0  
min              1.0  
25%              1.0  
50%              1.0  
75%              1.0  
max              1.0  


In [8]:
print("Description of cluster 3\n")
print(original_df[(original_df["cluster_group"] == 2)].describe())

Description of cluster 3

       pclass   survived        age      sibsp      parch       fare  \
count    10.0  10.000000   8.000000  10.000000  10.000000  10.000000   
mean      3.0   0.100000  39.875000   0.800000   6.000000  42.703750   
std       0.0   0.316228   1.552648   0.421637   1.632993  15.590194   
min       3.0   0.000000  38.000000   0.000000   5.000000  29.125000   
25%       3.0   0.000000  39.000000   1.000000   5.000000  31.303125   
50%       3.0   0.000000  39.500000   1.000000   5.000000  35.537500   
75%       3.0   0.000000  40.250000   1.000000   6.000000  46.900000   
max       3.0   1.000000  43.000000   1.000000   9.000000  69.550000   

             body  cluster_group  
count    2.000000           10.0  
mean   234.500000            2.0  
std    130.814755            0.0  
min    142.000000            2.0  
25%    188.250000            2.0  
50%    234.500000            2.0  
75%    280.750000            2.0  
max    327.000000            2.0  


In [9]:
cluster_0 = original_df[(original_df["cluster_group"] == 0)]
cluster_0_fc = cluster_0[(cluster_0["pclass"] == 1)]
cluster_0_fc.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,cluster_group
count,308.0,308.0,269.0,308.0,308.0,308.0,35.0,308.0
mean,1.0,0.607143,39.087051,0.422078,0.305195,75.839029,162.828571,0.0
std,0.0,0.48918,14.470383,0.596215,0.607531,56.879199,82.652172,0.0
min,1.0,0.0,0.9167,0.0,0.0,0.0,16.0,0.0
25%,1.0,0.0,28.0,0.0,0.0,30.5,109.5,0.0
50%,1.0,1.0,39.0,0.0,0.0,57.8646,166.0,0.0
75%,1.0,1.0,49.0,1.0,0.0,90.0,233.0,0.0
max,1.0,1.0,80.0,3.0,2.0,263.0,307.0,0.0
