# Titanic (Mean Shift)

In [12]:
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np
from sklearn.cluster import MeanShift
from sklearn import preprocessing
import pandas as pd
style.use("ggplot")

In [2]:
df = pd.read_excel("titanic.xls")
original_df = pd.DataFrame.copy(df)  # Copy of the df.
df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [3]:
df.drop(["body", "name"], 1, inplace=True)
df.fillna(0, inplace=True)

## Handling non-numerical data

In [4]:
def handle_non_numerical_data(df):
    columns = df.columns.values
    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]
        
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x+=1
            df[column] = list(map(convert_to_int, df[column]))
    return df

df = handle_non_numerical_data(df)
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,home.dest
0,1,1,1,29.0,0,0,746,211.3375,100,1,1,163
1,1,1,0,0.9167,1,2,522,151.55,6,1,9,33
2,1,0,1,2.0,1,2,522,151.55,6,1,0,33
3,1,0,0,30.0,1,2,522,151.55,6,1,0,33
4,1,0,1,25.0,1,2,522,151.55,6,1,0,33


In [5]:
X = np.array(df.drop(["survived"], 1).astype(float))
X = preprocessing.scale(X)
y = np.array(df["survived"])

clf = MeanShift()
clf.fit(X)

labels = clf.labels_
cluster_centers = clf.cluster_centers_

original_df["cluster_group"] = np.nan
# Iter through the labels and populate the values of the "cluster_group" column.
for i in range(len(X)):
    original_df["cluster_group"].iloc[i] = labels[i]  # iloc[i] is the row in the df (i is the number), column: cluster_group = labels[i]
    
n_clustters_ = len(np.unique(labels))

survival_rates = {}
for i in range(n_clustters_):
    temp_df = original_df[(original_df["cluster_group"] == float(i))]
    survival_cluster = temp_df[(temp_df["survived"] == 1)]
    survival_rate = len(survival_cluster) / len(temp_df)
    survival_rates[i] = survival_rate

for cluster in survival_rates:
    print("Cluster " + str(cluster + 1) + " - survival rate: " + str(survival_rates[cluster]))

Cluster 1 - survival rate: 0.38424821002386633
Cluster 2 - survival rate: 0.11538461538461539
Cluster 3 - survival rate: 0.8125
Cluster 4 - survival rate: 0.1


In [6]:
print("Description of cluster 1\n")
print(original_df[(original_df["cluster_group"] == 0)].describe())

Description of cluster 1

            pclass     survived          age        sibsp        parch  \
count  1257.000000  1257.000000  1004.000000  1257.000000  1257.000000   
mean      2.291169     0.384248    30.026062     0.388226     0.291965   
std       0.833911     0.486611    14.206585     0.707414     0.638375   
min       1.000000     0.000000     0.166700     0.000000     0.000000   
25%       2.000000     0.000000    21.000000     0.000000     0.000000   
50%       3.000000     0.000000    28.000000     0.000000     0.000000   
75%       3.000000     1.000000    38.625000     1.000000     0.000000   
max       3.000000     1.000000    80.000000     4.000000     4.000000   

              fare        body  cluster_group  
count  1256.000000  118.000000         1257.0  
mean     29.251877  160.355932            0.0  
std      39.273562   97.339175            0.0  
min       0.000000    1.000000            0.0  
25%       7.895800   72.750000            0.0  
50%      13.500000 

In [7]:
print("Description of cluster 2\n")
print(original_df[(original_df["cluster_group"] == 1)].describe())

Description of cluster 2

       pclass   survived        age      sibsp  parch       fare  body  \
count    26.0  26.000000  18.000000  26.000000   26.0  26.000000   1.0   
mean      3.0   0.115385   8.861111   5.615385    2.0  47.253365  67.0   
std       0.0   0.325813   4.898396   1.812775    0.0  18.500701   NaN   
min       3.0   0.000000   1.000000   4.000000    2.0   7.925000  67.0   
25%       3.0   0.000000   5.000000   4.000000    2.0  31.387500  67.0   
50%       3.0   0.000000   9.000000   5.000000    2.0  46.900000  67.0   
75%       3.0   0.000000  12.500000   8.000000    2.0  69.550000  67.0   
max       3.0   1.000000  17.000000   8.000000    2.0  69.550000  67.0   

       cluster_group  
count           26.0  
mean             1.0  
std              0.0  
min              1.0  
25%              1.0  
50%              1.0  
75%              1.0  
max              1.0  


In [8]:
print("Description of cluster 3\n")
print(original_df[(original_df["cluster_group"] == 2)].describe())

Description of cluster 3

       pclass   survived        age    sibsp      parch        fare  body  \
count    16.0  16.000000  16.000000  16.0000  16.000000   16.000000   0.0   
mean      1.0   0.812500  39.437500   0.6875   1.562500  322.156513   NaN   
std       0.0   0.403113  16.342047   0.7932   1.364734  113.545839   NaN   
min       1.0   0.000000  13.000000   0.0000   0.000000  247.520800   NaN   
25%       1.0   1.000000  26.250000   0.0000   0.750000  262.375000   NaN   
50%       1.0   1.000000  36.000000   0.5000   1.000000  262.375000   NaN   
75%       1.0   1.000000  52.000000   1.0000   2.250000  325.332300   NaN   
max       1.0   1.000000  64.000000   2.0000   4.000000  512.329200   NaN   

       cluster_group  
count           16.0  
mean             2.0  
std              0.0  
min              2.0  
25%              2.0  
50%              2.0  
75%              2.0  
max              2.0  


In [9]:
cluster_0 = original_df[(original_df["cluster_group"] == 0)]
cluster_0_fc = cluster_0[(cluster_0["pclass"] == 1)]
cluster_0_fc.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,cluster_group
count,307.0,307.0,268.0,307.0,307.0,307.0,35.0,307.0
mean,1.0,0.609121,39.143346,0.423453,0.302932,75.279805,162.828571,0.0
std,0.0,0.488744,14.467914,0.596699,0.607221,56.117554,82.652172,0.0
min,1.0,0.0,0.9167,0.0,0.0,0.0,16.0,0.0
25%,1.0,0.0,28.0,0.0,0.0,30.5,109.5,0.0
50%,1.0,1.0,39.0,0.0,0.0,57.75,166.0,0.0
75%,1.0,1.0,49.25,1.0,0.0,90.0,233.0,0.0
max,1.0,1.0,80.0,3.0,2.0,263.0,307.0,0.0
