In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from matplotlib import style
style.use("ggplot")

# KMeans Clustering

In [28]:
titanicDf = pd.read_excel("/Users/jaskiratsinghp/Desktop/PersonalStuff/Datasets/titanic.xls")

titanicDf_copy = pd.DataFrame.copy(titanicDf)

'''
1.) Pclass: Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
2.) survival: Survival (0 = No; 1 = Yes) THESE ARE THE LABELS.
3.) name: Name
4.) sex: Sex
5.) age: Age
6.) sibsp: Number of Siblings/Spouses Aboard
7.) parch: Number of Parents/Children Aboard
8.) ticket: Ticket Number
9.) fare: Passenger Fare (British pound)
10.) cabin: Cabin
11.) embarked: Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
12.) boat: Lifeboat
13.) body: Body Identification Number
14.) home.dest: Home/Destination
'''
titanicDf.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [29]:
titanicDf.drop(["name" , "body"] , axis = 1 , inplace = True)

titanicDf.fillna(0 , inplace = True)

# titanicDf.convert_objects(convert_numeric=True)

titanicDf.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,home.dest
0,1,1,female,29.0,0,0,24160,211.3375,B5,S,2,"St Louis, MO"
1,1,1,male,0.9167,1,2,113781,151.55,C22 C26,S,11,"Montreal, PQ / Chesterville, ON"
2,1,0,female,2.0,1,2,113781,151.55,C22 C26,S,0,"Montreal, PQ / Chesterville, ON"
3,1,0,male,30.0,1,2,113781,151.55,C22 C26,S,0,"Montreal, PQ / Chesterville, ON"
4,1,0,female,25.0,1,2,113781,151.55,C22 C26,S,0,"Montreal, PQ / Chesterville, ON"


In [30]:
titanicDf.dtypes

pclass         int64
survived       int64
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
home.dest     object
dtype: object

In [31]:
titanicDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
pclass       1309 non-null int64
survived     1309 non-null int64
sex          1309 non-null object
age          1309 non-null float64
sibsp        1309 non-null int64
parch        1309 non-null int64
ticket       1309 non-null object
fare         1309 non-null float64
cabin        1309 non-null object
embarked     1309 non-null object
boat         1309 non-null object
home.dest    1309 non-null object
dtypes: float64(2), int64(4), object(6)
memory usage: 122.8+ KB


### Now we have null values in columns "cabin" , "embarked" and "home.dest" and we are putting 0 where we have null values so in this column we have some string values and some integers now.

### So we can't perform label encoding on them either it should be all string or all integers.

In [32]:
# from sklearn.preprocessing import LabelEncoder

# label_encoder = LabelEncoder()

# titanicDf["sex"] = label_encoder.fit_transform(titanicDf["sex"])
# titanicDf["cabin"] = label_encoder.fit_transform(titanicDf["cabin"])
# titanicDf["embarked"] = label_encoder.fit_transform(titanicDf["embarked"])
# titanicDf["home.dest"] = label_encoder.fit_transform(titanicDf["home.dest"])



In [33]:

def handle_non_numerical_data(titanicDf):
    
    columns = titanicDf.columns.values
    
    for column in columns:
        
        text_digit_vals = {}
        
        def convert_to_int(val):
            
            return text_digit_vals[val]
        
        if titanicDf[column].dtype != np.int64 and titanicDf[column].dtype != np.float64:
            
            column_contents = titanicDf[column].values.tolist()
            unique_elements = set(column_contents)
            
            x = 0
            
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    
                    text_digit_vals[unique] = x
                    
                    x += 1
            
            titanicDf[column] = list(map(convert_to_int , titanicDf[column]))
            
    return titanicDf
                

In [34]:
titanicDf = handle_non_numerical_data(titanicDf)

print(titanicDf.head())

   pclass  survived  sex      age  sibsp  parch  ticket      fare  cabin  \
0       1         1    0  29.0000      0      0     749  211.3375    182   
1       1         1    1   0.9167      1      2     515  151.5500    160   
2       1         0    0   2.0000      1      2     515  151.5500    160   
3       1         0    1  30.0000      1      2     515  151.5500    160   
4       1         0    0  25.0000      1      2     515  151.5500    160   

   embarked  boat  home.dest  
0         2     2        140  
1         2    27         11  
2         2     0         11  
3         2     0         11  
4         2     0         11  


In [35]:
titanicDf.dtypes

pclass         int64
survived       int64
sex            int64
age          float64
sibsp          int64
parch          int64
ticket         int64
fare         float64
cabin          int64
embarked       int64
boat           int64
home.dest      int64
dtype: object

In [36]:
titanicDf.drop(["sex" , "boat"] , axis = 1 , inplace = True)

X = np.array(titanicDf.drop(["survived"] , axis = 1).astype(float))
X = preprocessing.scale(X)

y = np.array(titanicDf["survived"])

In [37]:
kmeansClassifier = KMeans(n_clusters = 2)
kmeansClassifier.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [38]:
correct = 0

for i in range(len(X)):
    
    predict_me = np.array(X[i].astype(float))
    predict_me = predict_me.reshape(-1 , len(predict_me))
    
    prediction = kmeansClassifier.predict(predict_me)
    
    if prediction[0] == y[i]:
        correct += 1
        
print("Accuracy: " , correct / len(X))

Accuracy:  0.6860198624904508


In [39]:
X[0].shape

(9,)

In [40]:
X[0].reshape(-1 , len(X[0])).shape

(1, 9)

In [41]:
prediction

array([0], dtype=int32)

# Mean Shift Clustering

In [43]:
meanShiftClassifier = MeanShift()
meanShiftClassifier.fit(X)

labels = meanShiftClassifier.labels_
cluster_centers = meanShiftClassifier.cluster_centers_


titanicDf_copy["cluster_group"] = np.nan

for i in range(len(X)):
    
    titanicDf_copy["cluster_group"].iloc[i] = labels[i]

survival_rates = {}

n_clusters_ = len(np.unique(labels))

for i in range (n_clusters_):
    
    temp_df = titanicDf_copy[ (titanicDf_copy["cluster_group"] == float(i)) ]
    
    survival_cluster = temp_df[ (temp_df["survived"] == 1) ]
    
    survival_rate = len(survival_cluster) / len(temp_df)
    
    survival_rates[i] = survival_rate

print(survival_rates)
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


{0: 0.3776387802971071, 1: 0.8, 2: 0.1}


In [55]:
## We can see that there are 3 groups now:
## In group 1 there is 37% Survival Rate
## In group 2 there is 80% Survival Rate
## In group3 there is 10% Survival Rate

print("\n\n\n\n" + "\033[1m" + "Dataframe for Group 1:" + "\033[0m" + "\n")

print(titanicDf_copy[ (titanicDf_copy["cluster_group"] == 1) ].head())

print("\n\n\n\n" + "\033[1m" + "Description:" + "\033[0m" + "\n")

print(titanicDf_copy[ (titanicDf_copy["cluster_group"] == 1) ].describe())





[1mDataframe for Group 1:[0m

    pclass  survived                                               name  \
16       1         0                           Baxter, Mr. Quigg Edmond   
17       1         1    Baxter, Mrs. James (Helene DeLaudeniere Chaput)   
35       1         1                           Bowen, Miss. Grace Scott   
49       1         1                 Cardeza, Mr. Thomas Drake Martinez   
50       1         1  Cardeza, Mrs. James Warburton Martinez (Charlo...   

       sex   age  sibsp  parch    ticket      fare        cabin embarked boat  \
16    male  24.0      0      1  PC 17558  247.5208      B58 B60        C  NaN   
17  female  50.0      0      1  PC 17558  247.5208      B58 B60        C    6   
35  female  45.0      0      0  PC 17608  262.3750          NaN        C    4   
49    male  36.0      0      1  PC 17755  512.3292  B51 B53 B55        C    3   
50  female  58.0      0      1  PC 17755  512.3292  B51 B53 B55        C    3   

    body                  

In [56]:
print("\n\n\n\n" + "\033[1m" + "Dataframe for Group 2:" + "\033[0m" + "\n")

print(titanicDf_copy[ (titanicDf_copy["cluster_group"] == 2) ].head())

print("\n\n\n\n" + "\033[1m" + "Description:" + "\033[0m" + "\n")

print(titanicDf_copy[ (titanicDf_copy["cluster_group"] == 2) ].describe())





[1mDataframe for Group 2:[0m

     pclass  survived                                               name  \
629       3         0                        Andersson, Mr. Anders Johan   
632       3         0  Andersson, Mrs. Anders Johan (Alfrida Konstant...   
644       3         0         Asplund, Mr. Carl Oscar Vilhelm Gustafsson   
646       3         1  Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...   
831       3         0                     Goodwin, Mr. Charles Frederick   

        sex   age  sibsp  parch   ticket     fare cabin embarked boat   body  \
629    male  39.0      1      5   347082  31.2750   NaN        S  NaN    NaN   
632  female  39.0      1      5   347082  31.2750   NaN        S  NaN    NaN   
644    male  40.0      1      5   347077  31.3875   NaN        S  NaN  142.0   
646  female  38.0      1      5   347077  31.3875   NaN        S   15    NaN   
831    male  40.0      1      6  CA 2144  46.9000   NaN        S  NaN    NaN   

                          

In [58]:
print("\n\n\n\n" + "\033[1m" + "Dataframe for Group 0:" + "\033[0m" + "\n")

print(titanicDf_copy[ (titanicDf_copy["cluster_group"] == 0) ].head())

print("\n\n\n\n" + "\033[1m" + "Description:" + "\033[0m" + "\n")

print(titanicDf_copy[ (titanicDf_copy["cluster_group"] == 0) ].describe())





[1mDataframe for Group 0:[0m

   pclass  survived                                             name     sex  \
0       1         1                    Allen, Miss. Elisabeth Walton  female   
1       1         1                   Allison, Master. Hudson Trevor    male   
2       1         0                     Allison, Miss. Helen Loraine  female   
3       1         0             Allison, Mr. Hudson Joshua Creighton    male   
4       1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   

       age  sibsp  parch  ticket      fare    cabin embarked boat   body  \
0  29.0000      0      0   24160  211.3375       B5        S    2    NaN   
1   0.9167      1      2  113781  151.5500  C22 C26        S   11    NaN   
2   2.0000      1      2  113781  151.5500  C22 C26        S  NaN    NaN   
3  30.0000      1      2  113781  151.5500  C22 C26        S  NaN  135.0   
4  25.0000      1      2  113781  151.5500  C22 C26        S  NaN    NaN   

                         h

In [60]:
## Now lets check that in cluster_0 how many people we have with pclass = 1.

cluster_0 = titanicDf_copy[ (titanicDf_copy["cluster_group"] == 0) ]

cluster_0_fc = cluster_0[ (cluster_0["pclass"] == 1) ]

cluster_0_fc.describe()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,body,cluster_group
count,303.0,303.0,264.0,303.0,303.0,303.0,35.0,303.0
mean,1.0,0.607261,39.380366,0.389439,0.280528,72.80165,162.828571,0.0
std,0.0,0.489168,14.44195,0.521226,0.578749,52.134338,82.652172,0.0
min,1.0,0.0,0.9167,0.0,0.0,0.0,16.0,0.0
25%,1.0,0.0,29.0,0.0,0.0,30.5,109.5,0.0
50%,1.0,1.0,39.0,0.0,0.0,57.0,166.0,0.0
75%,1.0,1.0,50.0,1.0,0.0,89.1042,233.0,0.0
max,1.0,1.0,80.0,2.0,2.0,227.525,307.0,0.0
