### Imports and handling non-numeric data

In [1]:
# https://pythonprogramming.net/static/downloads/machine-learning-data/titanic.xls
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np
from sklearn.cluster import KMeans
from sklearn import preprocessing, model_selection
import pandas as pd

'''
Features of titanic.xls dataset. Want to predict survival based on the other features
Pclass Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
survival Survival (0 = No; 1 = Yes)
name Name
sex Sex
age Age
sibsp Number of Siblings/Spouses Aboard
parch Number of Parents/Children Aboard
ticket Ticket Number
fare Passenger Fare (British pound)
cabin Cabin
embarked Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
boat Lifeboat
body Body Identification Number
home.dest Home/Destination
'''

df = pd.read_excel('titanic.xls')
# print(df.head())
df.drop(['body', 'name'], 1, inplace=True)  # body id number might only be applicable for dead, so remove since predicting death
df.apply(pd.to_numeric, errors='ignore')
df.fillna(0, inplace=True)
# print(df.head())

# python_prog's way of encoding categorial to numerical
# don't use, better to do one hot encoding instead since this way of changing to nums will imply a distance between the vars
def handle_non_numerical_data(df):
    columns = df.columns.values  # get col labels
    
    for column in columns:
        text_digit_vals = {}
        
        # based on the dictionary defined below, return the value from the key
        def convert_to_int(val):
            return text_digit_vals[val]
        
        # if type is not numeric, want to convert it
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            x = 0
            for unique in unique_elements:
                # grow a dictionary
                # if the value is not already in our growing dictionary, add it to dict with a growing num
                # e.g. female: 0, male: 1, other: 2
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x += 1
            df[column] = list(map(convert_to_int, df[column]))
            
    return df

df = handle_non_numerical_data(df)
print(df.head())

   pclass  survived  sex      age  sibsp  parch  ticket      fare  cabin  \
0       1         1    0  29.0000      0      0     741  211.3375    158   
1       1         1    1   0.9167      1      2     496  151.5500     76   
2       1         0    0   2.0000      1      2     496  151.5500     76   
3       1         0    1  30.0000      1      2     496  151.5500     76   
4       1         0    0  25.0000      1      2     496  151.5500     76   

   embarked  boat  home.dest  
0         0     2         47  
1         0    23        306  
2         0     0        306  
3         0     0        306  
4         0     0        306  


### Run K Means on dataset

In [7]:
# drop the survived column (axis=1), and then convert remaining to np array as float
# https://www.quora.com/How-is-a-Pandas-DataFrame-different-from-a-2D-NumPy-array
X = np.array(df.drop(['survived'], 1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])
# print(X[:10])
# print(y[:10])

clf = KMeans(n_clusters=2)
clf.fit(X)

correct = 0 
for i in range(len(X)):
    # take the first set of features (pclass, sex, age etc)
    predict_me = np.array(X[i].astype(float))
    # properly declare the array with two []
    predict_me = predict_me.reshape(-1, len(predict_me))
    # takes an array, returns what cluster predicted, out of n_clusters
    # https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans.predict
    # the cluster number assigned is random, so correct or 1 - correct is the right amt
    prediction = clf.predict(predict_me)
    # seems using prediction w/o [0] would work as well
    if prediction[0] == y[i]:
        correct += 1

print(correct / len(X))

# test_prediction on a sample datapoint. Has to be rescaled since X was rescaled
# still either the test_prediction value or 1 - test_prediction for accuracy
test_prediction = clf.predict([[-1.54609786, -1.34499549, 0.29131302, -0.47908676, -0.4449995, 1.02141698, 3.44242751, 3.04022184, -0.63853537, -0.42886555, -0.48546501]])
print(test_prediction)

0.7051184110007639
[1]
