# Titanic - Machine Learning from Disaster

## Imports

In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import OrdinalEncoder

## Global config

In [2]:
# Path of the CSV file that is loaded as the training table below
train_csv = "encoded_data.csv"

## Load data

In [3]:
# Read the CSV into a DataFrame, using the PassengerId column as the row index
train_data = pd.read_csv(train_csv, index_col='PassengerId')

### Data dictionary

#### Categorical features

- Survived - Survival - 0 = No, 1 = Yes - **Target column: the value to be predicted**
- Sex - Sex 	
- Embarked - Port of Embarkation - C = Cherbourg, Q = Queenstown, S = Southampton

#### Ordinal

- Pclass - Ticket class - 1 = 1st, 2 = 2nd, 3 = 3rd

#### Numerical features

##### Discrete

- SibSp - # of siblings / spouses aboard the Titanic 	
- Parch - # of parents / children aboard the Titanic 

##### Continuous

- Age - Age in years 		
- Fare - Passenger fare 

#### Mixed/Error-prone/Unused

- PassengerId - Id of person
- Name - Name of Passenger
- Ticket - Ticket number 
- Cabin - Cabin number

In [4]:
# Name of the target column; passed to Divider as its important_column
y_column = 'Survived'

In [19]:
import os
import random
import json
import numpy as np
import pandas as pd

# Disable pandas' chained-assignment (SettingWithCopy) warnings for the whole
# session; this is a global pandas option, not something local to Divider
pd.options.mode.chained_assignment = None

class Divider:
    '''
    Table divider class

    Splits one table into a tree of smaller tables. Every split moves a group
    of columns into a child table that gets a shuffled primary key column
    named 'PK<level><index>', while the parent table keeps a matching foreign
    key column named 'FK<level><index>'.

    The index levels of the input table must be named, because their names
    are used as grouping keys while the tables are split.

    result - list of (level, index, table) tuples of the last division
    connections - list of (parent table name, FK column, child table name,
                  PK column) tuples of the last division
    index - running counter used to number tables and key columns
    '''

    # Strategy names accepted by divide()
    _STRATEGIES = ('random_same_pk_fk', 'correlation')

    def __init__(self, input_table, important_column):
        '''
        Constructor for divider

        input_table - table to apply division on
        important_column - y_column used for predictions
        '''
        self.input_table = input_table
        self.important_column = important_column
        self._reset()

    def _reset(self):
        '''
        Clears the results, connections and counter of a previous division
        '''
        self.result = []
        self.connections = []
        self.index = 0

    def get_result(self):
        '''
        Returns the result of calculations
        '''
        return self.result

    @staticmethod
    def _shuffled_keys(n_rows):
        '''
        Returns the numbers 0..n_rows-1 in random order as a numpy array
        n_rows - number of keys to generate
        '''
        keys = list(range(n_rows))
        random.shuffle(keys)
        return np.array(keys, dtype=int)

    def _split_off(self, input_table, columns, keys, level, base_index):
        '''
        Moves columns out of input_table into a new child table

        input_table - parent table (not modified)
        columns - columns to move into the child table
        keys - one key value per row, shared by the child PK and parent FK
        level - recursion level of the parent table
        base_index - index number of the parent table

        Returns (parent table, child table). The parent has the columns
        removed and the new FK appended to its index; the child is indexed
        by the new PK. The connection between both is recorded.
        '''
        PK_name = 'PK' + str(level + 1) + str(self.index)
        FK_name = 'FK' + str(level + 1) + str(self.index)

        # Explicit copy, so adding the PK never writes through to the parent
        child_table = input_table[columns].copy()
        child_table[PK_name] = keys
        child_table = child_table.set_index(PK_name)

        # Grouping by the old index plus the FK appends the FK to the index.
        # Taking the mean of the groups turns the values into floats.
        parent_table = input_table.assign(**{FK_name: keys})
        group_keys = list(parent_table.index.names) + [FK_name]
        parent_table = parent_table.groupby(group_keys).mean()
        parent_table = parent_table.drop(columns, axis=1)

        self.connections.append(('table' + str(level) + str(base_index), FK_name,
                                 'table' + str(level + 1) + str(self.index), PK_name))

        return parent_table, child_table

    def _store_base(self, input_table, base_index_cols, level, base_index):
        '''
        Restores the original index of a base table, which turns the FK index
        levels into normal columns, and appends the table to the result
        '''
        input_table = input_table.reset_index()
        input_table = input_table.set_index(base_index_cols)
        self.result.append((level, base_index, input_table))

    def random_same_pk_fk(self, input_table, level):
        '''
        Recursively cluster randomly the columns
        input_table - table to apply the clustering on
        level - level of recursion
        '''

        input_table = input_table.copy()

        # A single-column table reached through recursion is a leaf that the
        # caller has already stored. A single-column table at level 0 still
        # has to be divided, otherwise nothing at all would be produced.
        if len(input_table.columns) == 1 and level > 0:
            return

        # Set up base table variables
        base_index = self.index
        self.index += 1
        base_index_cols = list(input_table.index.names)

        # Split off random groups of columns until none are left
        while len(input_table.columns) > 0:

            # New table number and shuffled key values
            self.index += 1
            keys = self._shuffled_keys(len(input_table))

            # Randomly pick between 1 and (number of columns - 1) columns,
            # or the last remaining column
            n_columns = len(input_table.columns)
            if n_columns > 1:
                n_picked = np.random.randint(1, n_columns)
                picked_columns = input_table.sample(n=n_picked, axis='columns').columns
            else:
                picked_columns = input_table.columns

            input_table, recursed_table = self._split_off(
                input_table, picked_columns, keys, level, base_index)

            # A single-column child is a leaf and is stored right away
            if len(recursed_table.columns) == 1:
                self.result.append((level + 1, self.index, recursed_table))

            # Apply clustering recursively on smaller table
            self.random_same_pk_fk(recursed_table, level + 1)

        self._store_base(input_table, base_index_cols, level, base_index)

    def correlation(self, input_table, important_column, level, n_splits=3):
        '''
        Recursively cluster most correlated columns to an "important_column"
        input_table - table to apply the clustering on
        important_column - column of interest, most likely to be Y
        level - level of recursion
        n_splits - number of correlation quantiles used as thresholds

        Raises ValueError if n_splits is smaller than 1
        '''

        if n_splits < 1:
            raise ValueError('n_splits must be at least 1, got ' + str(n_splits))

        input_table = input_table.copy()

        # Return if no columns
        if len(input_table.columns) == 0:
            return

        # Set up base table variables
        base_index = self.index
        self.index += 1
        base_index_cols = list(input_table.index.names)

        # Absolute Spearman correlation between all columns; the important
        # column is kept as a row only
        corr = input_table.corr(method='spearman').abs()
        corr = corr.drop([important_column], axis=1)

        # Thresholds are quantiles of the correlations with the important
        # column, from the highest quantile down to quantile 0
        target_corr = corr.loc[important_column]
        if target_corr.empty:
            # Nothing to compare against; the loop below stops immediately
            thresholds = [np.nan] * n_splits
        else:
            thresholds = [target_corr.quantile(1 - (i + 1) / n_splits)
                          for i in range(n_splits)]

        # Apply clustering for every cluster of columns
        for threshold in thresholds:
            # New table number and shuffled key values
            self.index += 1
            keys = self._shuffled_keys(len(input_table))

            # Break if no columns
            if len(corr.columns) == 0 or len(input_table.columns) == 0:
                break

            # Pick all remaining columns with correlation above the threshold
            target_corr = corr.loc[important_column]
            corr_columns = target_corr[target_corr >= threshold].index.tolist()
            if len(corr_columns) == 0:
                continue

            # The most correlated remaining column becomes the important
            # column of the child table; it is always one of corr_columns
            new_important = target_corr.idxmax()

            input_table, recursed_table = self._split_off(
                input_table, corr_columns, keys, level, base_index)
            corr = corr.drop(corr_columns, axis=1)

            # Apply clustering recursively
            self.correlation(recursed_table, new_important, level + 1, n_splits)

        self._store_base(input_table, base_index_cols, level, base_index)

    def divide(self, strategy, path):
        '''
        Function used to divide the table
        strategy - strategy of division: 'random_same_pk_fk' or 'correlation'
        path - path to save output

        Writes into path:
        table<level><index>.csv - one file per resulting table
        connections.csv - one "parent table,FK,child table,PK" row per link
        tables.json - list of [table name, PK columns]; the list is JSON
                      encoded twice, so readers have to decode it twice

        Raises ValueError if the strategy is not known
        '''

        # Reject unknown strategies before anything is created or reset
        if strategy not in self._STRATEGIES:
            raise ValueError('Unknown strategy ' + repr(strategy)
                             + ', expected one of ' + str(self._STRATEGIES))

        # Create output folder
        os.makedirs(path, exist_ok=True)

        # Initialise fresh result and connections lists
        self._reset()

        # Pick strategy
        if strategy == 'random_same_pk_fk':
            # Grouping moves the important column into the index, so it is
            # never picked as a column to split off
            group_keys = list(self.input_table.index.names) + [self.important_column]
            input_table = self.input_table.groupby(group_keys).mean()
            self.random_same_pk_fk(input_table, 0)
        else:
            self.correlation(self.input_table, self.important_column, 0)

        # Sort result by recursion level and index
        self.result.sort(key=lambda x: (x[0], x[1]))

        # Print results, save every table to a file and collect
        # (table name, PK columns) pairs
        all_tables = []
        print('Level, Index, Primary Key, Columns')
        for (el, col, table) in self.result:
            table_name = 'table' + str(el) + str(col)
            print(el, " ", col, " ", table.index.names, " ", table.columns, "\n")
            table.to_csv(os.path.join(path, table_name + '.csv'))
            all_tables.append((table_name, list(table.index.names)))

        # Save connections to file
        np.savetxt(os.path.join(path, 'connections.csv'), self.connections,
                   delimiter=',', fmt='%s')

        # Save tables names with their PK to file. The double encoding is
        # kept on purpose: existing readers call json.loads(json.load(f)).
        all_tables = json.dumps(all_tables)
        with open(os.path.join(path, 'tables.json'), 'w') as outfile:
            json.dump(all_tables, outfile)

In [20]:
# Initialise divider on the loaded training table, with y_column as the
# important (target) column
dv = Divider(train_data, y_column)

In [22]:
# Apply division with the correlation strategy; the resulting table CSV files,
# connections.csv and tables.json are written to the 'output' folder
dv.divide(strategy = "correlation", path = 'output')

Level, Index, Primary Key, Columns
0   0   ['PassengerId']   Index(['FK12', 'FK111', 'FK117', 'Survived'], dtype='object') 

1   2   ['PK12']   Index(['FK24', 'FK28', 'Sex'], dtype='object') 

1   11   ['PK111']   Index(['FK213', 'Embarked'], dtype='object') 

1   17   ['PK117']   Index(['FK219', 'FK223', 'SibSp'], dtype='object') 

2   4   ['PK24']   Index(['Fare'], dtype='object') 

2   8   ['PK28']   Index(['Pclass'], dtype='object') 

2   13   ['PK213']   Index(['Parch'], dtype='object') 

2   19   ['PK219']   Index(['Age'], dtype='object') 

2   23   ['PK223']   Index(['Title'], dtype='object') 



In [23]:
from numpy import genfromtxt

# Load the list of [table name, primary key columns] pairs from tables.json.
# The file holds a JSON string that itself contains JSON, so it is decoded twice.
with open('output/tables.json') as json_file:
    encoded_tables = json.load(json_file)
read_tables = json.loads(encoded_tables)
print(read_tables)

[['table00', ['PassengerId']], ['table12', ['PK12']], ['table111', ['PK111']], ['table117', ['PK117']], ['table24', ['PK24']], ['table28', ['PK28']], ['table213', ['PK213']], ['table219', ['PK219']], ['table223', ['PK223']]]


In [24]:
# Load every listed table from its CSV file into a dict keyed by table name,
# using the primary key column(s) of the table as the DataFrame index
read_tables_content = {
    table: pd.read_csv('output/' + table + '.csv', index_col=pk)
    for table, pk in read_tables
}

print(read_tables_content)

{'table00':              FK12  FK111  FK117  Survived
PassengerId                              
1             378    118    537         0
2             309    647    871         1
3             143    100    396         1
4             356    501     67         1
5             234    654    679         0
...           ...    ...    ...       ...
887           447    309    847         0
888           428    524     17         1
889           492    761     89         0
890           763    669    296         1
891           381     42    125         0

[891 rows x 4 columns], 'table12':       FK24  FK28  Sex
PK12                 
0       54   486  0.0
1      122   812  0.0
2      606   370  1.0
3      468   172  0.0
4      839    42  0.0
...    ...   ...  ...
886    734   792  0.0
887     52   509  0.0
888    326   291  1.0
889    586   855  1.0
890    778   559  1.0

[891 rows x 3 columns], 'table111':        FK213  Embarked
PK111                 
0        274       0.0
1        848  

In [25]:
# Test table reading: load the level-0 table with PassengerId as its index
# NOTE(review): the variable is called table01 but the file read is table00.csv
table01 = pd.read_csv('output/table00.csv', index_col='PassengerId')
table01

Unnamed: 0_level_0,FK12,FK111,FK117,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,378,118,537,0
2,309,647,871,1
3,143,100,396,1
4,356,501,67,1
5,234,654,679,0
...,...,...,...,...
887,447,309,847,0
888,428,524,17,1
889,492,761,89,0
890,763,669,296,1


In [26]:
# Read all connections and add them to a list as
# ((parent table, FK column), (child table, PK column)) relations.
# genfromtxt returns a 1-D array when the file holds a single row (and an empty
# one for an empty file), so the data is reshaped to one 4-field row per connection.
connection_rows = genfromtxt('output/connections.csv', delimiter=',', dtype='str').reshape(-1, 4)
# tolist() turns the numpy strings into plain Python strings
read_connections = [((table1, fk), (table2, pk)) for table1, fk, table2, pk in connection_rows.tolist()]
print(read_connections)

[(('table00', 'FK12'), ('table12', 'PK12')), (('table12', 'FK24'), ('table24', 'PK24')), (('table12', 'FK28'), ('table28', 'PK28')), (('table00', 'FK111'), ('table111', 'PK111')), (('table111', 'FK213'), ('table213', 'PK213')), (('table00', 'FK117'), ('table117', 'PK117')), (('table117', 'FK219'), ('table219', 'PK219')), (('table117', 'FK223'), ('table223', 'PK223'))]


In [27]:
# Example of joining tables by walking the relations list: each parent table is
# merged with its child table on the parent's FK column and the child's PK
for (parent_name, fk_column), (child_name, pk_column) in read_connections:
    parent_table = read_tables_content[parent_name]
    child_table = read_tables_content[child_name]
    joined = parent_table.merge(child_table, left_on=fk_column, right_on=pk_column)
    print(joined)

     FK12  FK111  FK117  Survived  FK24  FK28  Sex
0     378    118    537         0   743   175  1.0
1     309    647    871         1   513   744  0.0
2     143    100    396         1   112   800  0.0
3     356    501     67         1   319   633  0.0
4     234    654    679         0   312   271  1.0
..    ...    ...    ...       ...   ...   ...  ...
886   447    309    847         0   752    33  1.0
887   428    524     17         1   604   884  0.0
888   492    761     89         0   377   807  0.0
889   763    669    296         1   187   516  1.0
890   381     42    125         0   569   297  1.0

[891 rows x 7 columns]
     FK24  FK28  Sex      Fare
0      54   486  0.0   78.2667
1     122   812  0.0   13.0000
2     606   370  1.0   55.9000
3     468   172  0.0   25.4667
4     839    42  0.0    7.2250
..    ...   ...  ...       ...
886   734   792  0.0  120.0000
887    52   509  0.0   11.1333
888   326   291  1.0  135.6333
889   586   855  1.0    6.4375
890   778   559  1.0   