In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, model_selection
from sklearn.cluster import KMeans, MeanShift
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')

# KMeans Clustering

In [2]:
df = pd.read_excel('../Machine-Learning/Datasets/titanic.xls')

In [3]:
# df.info

In [4]:
df.columns.values

array(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch',
       'ticket', 'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype=object)

In [5]:
df.drop(['name', 'body'], 1, inplace=True)
df.fillna(0, inplace = True)

In [6]:
# df.copy().apply(pd.to_numeric, errors='ignore')

In [7]:
def handling_missing_data(df):
    columns = df.columns.values
    for column in columns:
        text_to_digits = {}
        
        def convert_to_int(val):
            return text_to_digits[val]
        
        if df[column].dtype != np.int64 and df[column].dtype != np.float64 : 
            column_content = df[column].values.tolist()
            unique_values = set(column_content)
            
            x = 0
            for unique in unique_values:
                if unique not in text_to_digits:
                    text_to_digits[unique] = x
                    x += 1
            df[column] = list(map(convert_to_int, df[column]))
            
    return df

In [8]:
df = handling_missing_data(df)

In [9]:
df.drop(['boat'], 1, inplace=True)

In [10]:
accuracy = []     
for i in range(5):    
    X = np.array(df.drop(['survived'], 1).astype(float))
    X = preprocessing.scale(X)
    y = np.array(df['survived'])

    clf = KMeans(n_clusters =2)
    clf.fit(X)

    correct = 0
    for i in range(len(X)):
        predict_on = np.array(X[i].astype(float))
        predict_on = predict_on.reshape(-1, len(predict_on))
        prediction = clf.predict(predict_on)    
        if prediction[0] == y[i]:
            correct += 1
        
    accuracy.append(100 * (correct / len(X)))
   

accuracy

[68.52559205500381,
 31.550802139037433,
 68.44919786096256,
 31.550802139037433,
 68.44919786096256]

# MeanShift Clustering

In [11]:
df = pd.read_excel('../Machine-Learning/Datasets/titanic.xls')

original_df = df.copy()
df.drop(['body','name'], 1, inplace=True)
df.fillna(0,inplace=True)

def handle_non_numerical_data(df):
    
    # handling non-numerical data: must convert.
    columns = df.columns.values

    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]

        #print(column,df[column].dtype)
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            
            column_contents = df[column].values.tolist()
            #finding just the uniques
            unique_elements = set(column_contents)
            # great, found them. 
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    # creating dict that contains new
                    # id per unique string
                    text_digit_vals[unique] = x
                    x+=1
            # now we map the new "id" vlaue
            # to replace the string. 
            df[column] = list(map(convert_to_int,df[column]))

    return df

df = handle_non_numerical_data(df)
df.drop(['boat', 'home.dest'], 1, inplace=True)

   
X = np.array(df.drop(['survived'], 1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])

clf = MeanShift()
clf.fit(X)

labels = clf.labels_


In [12]:
labels = clf.labels_
custers_centers = clf.cluster_centers_
original_df['cluster'] = np.NaN
for i in range(len(X)):
    original_df['cluster'].iloc[i] = labels[i] 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [13]:
n_clusters = np.unique(labels)
survival_rates = {}
for i in n_clusters:
    temp_df = original_df[ original_df['cluster'] == float(i) ]
    survival_cluster = temp_df[ temp_df['survived'] == 1]
    survival_rate = len(survival_cluster) / len(temp_df)
    survival_rates[i] = survival_rate
    
survival_rates

{0: 0.37480190174326466, 1: 1.0, 2: 0.1, 3: 0.6451612903225806}

In [14]:
print(original_df[original_df['cluster'] == 0].describe())
original_df[original_df['cluster'] == 1]

            pclass     survived          age        sibsp        parch  \
count  1262.000000  1262.000000  1001.000000  1262.000000  1262.000000   
mean      2.327258     0.374802    29.537213     0.484152     0.309826   
std       0.820940     0.484264    14.288395     1.045053     0.659138   
min       1.000000     0.000000     0.166700     0.000000     0.000000   
25%       2.000000     0.000000    21.000000     0.000000     0.000000   
50%       3.000000     0.000000    28.000000     0.000000     0.000000   
75%       3.000000     1.000000    38.000000     1.000000     0.000000   
max       3.000000     1.000000    80.000000     8.000000     4.000000   

              fare        body  cluster  
count  1261.000000  116.000000   1262.0  
mean     26.938662  160.637931      0.0  
std      32.638522   98.297012      0.0  
min       0.000000    1.000000      0.0  
25%       7.895800   69.750000      0.0  
50%      13.775000  160.500000      0.0  
75%      29.000000  256.500000      0.0

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,cluster
35,1,1,"Bowen, Miss. Grace Scott",female,45.0,0,0,PC 17608,262.375,,C,4,,"Cooperstown, NY",1.0
49,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36.0,0,1,PC 17755,512.3292,B51 B53 B55,C,3,,"Austria-Hungary / Germantown, Philadelphia, PA",1.0
50,1,1,"Cardeza, Mrs. James Warburton Martinez (Charlo...",female,58.0,0,1,PC 17755,512.3292,B51 B53 B55,C,3,,"Germantown, Philadelphia, PA",1.0
66,1,1,"Chaudanson, Miss. Victorine",female,36.0,0,0,PC 17608,262.375,B61,C,4,,,1.0
183,1,1,"Lesurer, Mr. Gustave J",male,35.0,0,0,PC 17755,512.3292,B101,C,3,,,1.0
302,1,1,"Ward, Miss. Anna",female,35.0,0,0,PC 17755,512.3292,,C,3,,,1.0
