# Titanic - Machine Learning from Disaster

## Imports

In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import OrdinalEncoder

## Global config

In [2]:
# Path of the CSV file that is loaded as the training table below
train_csv = "encoded_data.csv"

## Load data

In [3]:
# Load the training table and use the PassengerId column as its row index
train_data = pd.read_csv(train_csv, index_col='PassengerId')

### Data dictionary

#### Categorical features

- Survived - Survival - 0 = No, 1 = Yes - **Target variable: indicates whether the passenger survived**
- Sex - Sex 	
- Embarked - Port of Embarkation - C = Cherbourg, Q = Queenstown, S = Southampton

#### Ordinal

- Pclass - Ticket class - 1 = 1st, 2 = 2nd, 3 = 3rd

#### Numerical features

##### Discrete

- SibSp - # of siblings / spouses aboard the Titanic 	
- Parch - # of parents / children aboard the Titanic 

##### Continuous

- Age - Age in years 		
- Fare - Passenger fare 

#### Mixed/Error-prone/Unused

- PassengerId - Id of the passenger (not used as a feature; it serves as the row index of the loaded table)
- Name - Name of Passenger
- Ticket - Ticket number 
- Cabin - Cabin number

In [4]:
# Name of the target column; it is handed to the Divider as its important column
y_column = 'Survived'

In [5]:
import os
import random
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Set warnings to not be displayed
pd.options.mode.chained_assignment = None

class Divider:
    '''
    Table divider class
    '''

    def __init__(self, input_table, important_column):
        '''
        Constructor for divider

        input_table - table to apply division on
        important_column - y_column used for predictions
        '''
        self.result = []
        self.input_table = input_table.copy()
        self.important_column = important_column
        self.connections = []
        self.index = 0
    
    def get_result(self):
        '''
        Returns the list of (level, index, table) tuples collected so far

        The list itself is returned, not a copy
        '''
        return self.result

    def random_shrink(self, input_table, level, onehot=False, overlap=False):
        '''
        Recursively cluster randomly the columns
        level - level of recursion
        '''

        input_table = input_table.copy()
        
        # Return if no columns
        if len(input_table.columns) <= 1:
            return
        
        # Set up base table variables
        base_index = self.index
        self.index += 1
        base_index_cols = input_table.index.names
        
        # Apply clustering for every cluster of columns
        while len(input_table.columns) > 0:

            # Randomly shuffle
            self.index += 1
            mylist = np.array(range(0, len(input_table)))
            random.shuffle(mylist)

            # Randomly pick n_splits columns
            picked_columns = []
            if len(input_table.columns) > 1:
                picked_columns = input_table.sample(n=np.random.randint(1, len(input_table.columns)), axis='columns').columns
            else:
                picked_columns = input_table.columns       
            
            # Set new column names
            PK_name = 'PK' + str(level + 1) + str(self.index)
            FK_name = 'FK' + str(level + 1) + str(self.index)
            
            # Add new PK
            recursed_table = input_table[picked_columns]
            recursed_table.loc[:, PK_name] = mylist
            recursed_table.set_index(PK_name, inplace = True)

            # Check if table size can be reduced
            unique_recursed_table = recursed_table.drop_duplicates()
            if len(input_table.columns) + len(input_table.index.names) > 2 and len(unique_recursed_table) < len(recursed_table):

                # Add new FK and remove columns associated with it
                old_index = list(input_table.index.names)
                input_table = (input_table.reset_index().merge(unique_recursed_table.reset_index(), on=list(picked_columns.values), how='left')
                .rename(columns={PK_name:FK_name})
                .groupby(old_index + [FK_name]).mean()
                .drop(picked_columns, axis = 1))
                
                # Set recursed table to have reduced element count
                recursed_table = unique_recursed_table
            
            
            # Add the connection to a list
            self.connections.append(('table' + str(level) + str(base_index), FK_name,  
                                     'table' + str(level + 1) + str(self.index), PK_name))
            
            # Append new FK table to result list
            if len(recursed_table.columns) == 1:
                # Check if you need to apply oneHotEncoding
                if onehot is True and len(recursed_table[recursed_table.columns[0]].unique()) < 7:
                    oneHotEncoder = OneHotEncoder()
                    encoded_col = pd.DataFrame(oneHotEncoder.fit_transform(recursed_table[[recursed_table.columns[0]]]).toarray())

                    # Concatenate the 2 tables
                    recursed_table = pd.concat([recursed_table.reset_index(), encoded_col], axis=1, copy=False, join='inner')

                    # Readd the index column
                    recursed_table.loc[:, PK_name] = mylist
                    recursed_table.set_index(PK_name, inplace = True)
                
                # Append single-column table to result
                self.result.append((level + 1, self.index, recursed_table))
                continue
            elif overlap is True:
                #Perform overlaping with probability
                p = 0.3
                if np.random.rand() <= p:
                    picked_column = recursed_table.sample(n=1, axis='columns').columns
                    input_table[picked_column] = recursed_table[picked_column].copy()
            
            # Apply clustering recursively on smaller table
            self.random_shrink(recursed_table, level + 1, onehot=onehot, overlap=overlap)
        
        # Reset the index and set FK columns as normal columns
        input_table = input_table.reset_index()
        input_table.set_index(base_index_cols, inplace = True)
        self.result.append((level, base_index, input_table))
    

    def random_same_pk_fk(self, input_table, level, onehot=False, overlap=False):
        '''
        Recursively cluster randomly the columns
        level - level of recursion
        '''

        input_table = input_table.copy()
        
        # Return if no columns
        if len(input_table.columns) <= 1:
            return
        
        # Set up base table variables
        base_index = self.index
        self.index += 1
        base_index_cols = input_table.index.names
        
        # Apply clustering for every cluster of columns
        while len(input_table.columns) > 0:

            # Randomly shuffle
            self.index += 1
            mylist = np.array(range(0, len(input_table)))
            random.shuffle(mylist)

            # Randomly pick n_splits columns
            picked_columns = []
            if len(input_table.columns) > 1:
                picked_columns = input_table.sample(n=np.random.randint(1, len(input_table.columns)), axis='columns').columns
            else:
                picked_columns = input_table.columns       
            
            # Set new column names
            PK_name = 'PK' + str(level + 1) + str(self.index)
            FK_name = 'FK' + str(level + 1) + str(self.index)
            
            # Add new PK
            recursed_table = input_table[picked_columns]
            recursed_table.loc[:, PK_name] = mylist
            recursed_table.set_index(PK_name, inplace = True)
            
            # Add new FK and remove columns associated with it
            input_table.loc[:, FK_name] = mylist
            input_table = input_table.groupby(input_table.index.names + [FK_name]).mean()
            input_table = input_table.drop(picked_columns, axis = 1)
            
            # Add the connection to a list
            self.connections.append(('table' + str(level) + str(base_index), FK_name,  
                                     'table' + str(level + 1) + str(self.index), PK_name))
            
            # Append new FK table to result list
            if len(recursed_table.columns) == 1:
                # Check if you need to apply oneHotEncoding
                if onehot is True and len(recursed_table[recursed_table.columns[0]].unique()) < 7:
                    oneHotEncoder = OneHotEncoder()
                    encoded_col = pd.DataFrame(oneHotEncoder.fit_transform(recursed_table[[recursed_table.columns[0]]]).toarray())

                    # Concatenate the 2 tables
                    recursed_table = pd.concat([recursed_table.reset_index(), encoded_col], axis=1, copy=False, join='inner')

                    # Readd the index column
                    recursed_table.loc[:, PK_name] = mylist
                    recursed_table.set_index(PK_name, inplace = True)
                
                # Append single-column table to result
                self.result.append((level + 1, self.index, recursed_table))
                continue
            elif overlap is True:
                #Perform overlaping with probability
                p = 0.3
                if np.random.rand() <= p:
                    picked_column = recursed_table.sample(n=1, axis='columns').columns
                    input_table[picked_column] = recursed_table[picked_column].copy()
            
            # Apply clustering recursively on smaller table
            self.random_same_pk_fk(recursed_table, level + 1, onehot=onehot, overlap=overlap)
        
        # Reset the index and set FK columns as normal columns
        input_table = input_table.reset_index()
        input_table.set_index(base_index_cols, inplace = True)
        self.result.append((level, base_index, input_table))
    
    def correlation(self, input_table, important_column, level):
        '''
        Recursively cluster most correlated columns to an "important_column"
        important_column - colum of interest, most likely to be Y
        input_table - table to apply the clustering on
        '''

        input_table = input_table.copy()

        n_splits = 3
        
        # Return if no columns
        if len(input_table.columns) == 0:
            return
        
        # Set up base table variables
        base_index = self.index
        self.index += 1
        base_index_cols = input_table.index.names

        # Calculate correlation between columns and most important column
        corr = abs(input_table.corr(method='spearman'))
        corr = corr.drop([important_column], axis = 1)
        
        # Calculate quantiles based on correlation and n_splits
        quantiles = []
        for i in range(n_splits):
            quantile = 1 - (i+1) / n_splits
            quantiles.append(corr.loc[[important_column]].T.quantile(quantile)[0])
        
        # Apply clustering for every cluster of columns
        for threshold in quantiles:
            # Randomly shuffle
            self.index += 1
            mylist = np.array(range(0, len(input_table)))
            random.shuffle(mylist)

            # Break if no columns
            if len(corr.columns) == 0 or len(input_table.columns) == 0:
                break
        
            # Pick the new important column
            new_important = corr.loc[[important_column]].idxmax(axis=1)[0]
            # Pick all columns with correlation above quantile threshold
            corr_columns = [col for col in corr.loc[[important_column]].columns if corr.loc[[important_column]][col][0] >= threshold]
           
            if len(corr_columns) == 0:
                continue

             # Set new column names
            PK_name = 'PK' + str(level + 1) + str(self.index)
            FK_name = 'FK' + str(level + 1) + str(self.index)
            
            # Add new PK
            recursed_table = input_table[corr_columns]
            recursed_table.loc[:, PK_name] = mylist
            recursed_table.set_index(PK_name, inplace = True)
            
            # Add new FK and remove columns associated with it
            input_table.loc[:, FK_name] = mylist
            input_table = input_table.groupby(input_table.index.names + [FK_name]).mean()
            input_table = input_table.drop(corr_columns, axis = 1)
            corr = corr.drop(corr_columns, axis = 1)
            
            # Add the connection to a list
            self.connections.append(('table' + str(level) + str(base_index), FK_name,  
                                     'table' + str(level + 1) + str(self.index), PK_name))
            
            # Apply clustering recursively
            self.correlation(recursed_table, new_important, level + 1)

        # Reset the index and set FK columns as normal columns
        input_table = input_table.reset_index()
        input_table.set_index(base_index_cols, inplace = True)
        self.result.append((level, base_index, input_table))

    
    def divide(self, strategy, path, onehot=False, overlap=False):
        '''
        Function used to divide the table
        strategy - strategy of division
        path - path to save output
        '''

        # Create output folder
        os.makedirs(path, exist_ok=True) 
        
        # Initialise fresh result and connections lists
        self.__init__(self.input_table, self.important_column)
        
        # Pick strategy
        if strategy == 'random':
            input_table = self.input_table.groupby(self.input_table.index.names + [self.important_column]).mean()
            self.random_same_pk_fk(input_table, 0, onehot=onehot, overlap=overlap)
        elif strategy == 'correlation':
            self.correlation(self.input_table, self.important_column, 0)
        elif strategy == 'shrink':
            input_table = self.input_table.groupby(self.input_table.index.names + [self.important_column]).mean()
            self.random_shrink(input_table, 0, onehot=onehot, overlap=overlap)
        
        # Sort result by recursion level and index
        self.result.sort(key=lambda x: (x[0], x[1]))

        # Print results and save every table to a file
        print('Level, Index, Primary Key, Columns')
        for (el, col, table) in self.result:
            print(el, " ", col, " ", table.index.names, " ", table.columns, "\n")
            table.to_csv(path + '/table' + str(el) + str(col) + '.csv')

        # Initialise set of tuples in the form (table name, PK column)
        all_tables = []

        # Iterate over tables and fill data
        for (el, col, table) in self.result:
            # Add tables to set
            all_tables.append((str('table' + str(el) + str(col)), table.index.names))

        # Save connections to file
        np.savetxt(path + "/connections.csv", self.connections, delimiter=',', fmt='%s')
        
        # Save tables names with their PK to file
        all_tables = json.dumps(all_tables)
        with open(path + '/tables.json', 'w') as outfile:
            json.dump(all_tables, outfile)

In [6]:
# Initialise the divider on the training table, with y_column as its important column
dv = Divider(train_data, y_column)

In [7]:
# Apply division: every call resets the divider's results and writes its files into `path`
# The other strategies / options are kept below, commented out, for reference
# dv.divide(strategy = "correlation", path = 'output')
# dv.divide(strategy = "random", path = 'output')
# dv.divide(strategy = "random", path = 'output', onehot=True)
# dv.divide(strategy='random', path='output', overlap=True)
dv.divide(strategy='shrink', path = 'output')

Level, Index, Primary Key, Columns
0   0   ['PassengerId', 'Survived']   Index(['FK12', 'FK113'], dtype='object') 

1   2   ['PK12']   Index(['FK24', 'FK25', 'FK29', 'FK210', 'FK211', 'FK212'], dtype='object') 

1   13   ['PK113']   Index(['Age'], dtype='object') 

2   4   ['PK24']   Index(['Fare'], dtype='object') 

2   5   ['PK25']   Index(['FK37', 'FK38'], dtype='object') 

2   9   ['PK29']   Index(['Sex'], dtype='object') 

2   10   ['PK210']   Index(['Parch'], dtype='object') 

2   11   ['PK211']   Index(['Pclass'], dtype='object') 

2   12   ['PK212']   Index(['Embarked'], dtype='object') 

3   7   ['PK37']   Index(['SibSp'], dtype='object') 

3   8   ['PK38']   Index(['Title'], dtype='object') 



In [8]:
from numpy import genfromtxt

# Load the list of [table name, primary-key columns] pairs saved by the divider
# The file holds a JSON string that itself contains JSON, hence the two decoding steps
read_tables = []
with open('output/tables.json') as json_file:
    encoded_tables = json.load(json_file)
    read_tables = json.loads(encoded_tables)
print(read_tables)

[['table00', ['PassengerId', 'Survived']], ['table12', ['PK12']], ['table113', ['PK113']], ['table24', ['PK24']], ['table25', ['PK25']], ['table29', ['PK29']], ['table210', ['PK210']], ['table211', ['PK211']], ['table212', ['PK212']], ['table37', ['PK37']], ['table38', ['PK38']]]


In [9]:
# Load every saved table into a DataFrame indexed by its primary-key column(s),
# keyed by table name
read_tables_content = {}
for table, pk in read_tables:
    read_tables_content[table] = pd.read_csv(f'output/{table}.csv', index_col=pk)

print(read_tables_content)

{'table00':                       FK12  FK113
PassengerId Survived             
1           0          338    566
2           1          247    402
3           1          453    446
4           1          140    843
5           0          522    843
...                    ...    ...
887         0          815    703
888         1          261    238
889         0          887    424
890         1          107    446
891         0          866    303

[891 rows x 2 columns], 'table12':       FK24  FK25  FK29  FK210  FK211  FK212
PK12                                       
0      380   337   459    235    175    468
4      164   302   459    444    227    468
5      394   478   424    444    175    462
9      401   309   459    444    245    462
11      33   254   424    433    245    468
...    ...   ...   ...    ...    ...    ...
886     14   478   424    444    245    462
887     45   337   459    235    245    468
888    115    47   424    433    245    468
889    452   302   459    

In [10]:
# Test table reading: load the base table file back, indexed by PassengerId only
# NOTE(review): the variable is called table01 although the file read is table00.csv
table01 = pd.read_csv('output/table00.csv', index_col='PassengerId')
table01

Unnamed: 0_level_0,Survived,FK12,FK113
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,338,566
2,1,247,402
3,1,453,446
4,1,140,843
5,0,522,843
...,...,...,...
887,0,815,703
888,1,261,238
889,0,887,424
890,1,107,446


In [11]:
# Read all ((parent table, FK), (child table, PK)) relations and add them to a list
# reshape(-1, 4) keeps one row per relation even when the file holds a single line
# (genfromtxt then returns a 1-D array of four fields) or no line at all
connection_rows = genfromtxt('output/connections.csv', delimiter=',', dtype='str').reshape(-1, 4)
read_connections = [((row[0], row[1]), (row[2], row[3])) for row in connection_rows]
print(read_connections)

[(('table00', 'FK12'), ('table12', 'PK12')), (('table12', 'FK24'), ('table24', 'PK24')), (('table12', 'FK25'), ('table25', 'PK25')), (('table25', 'FK37'), ('table37', 'PK37')), (('table25', 'FK38'), ('table38', 'PK38')), (('table12', 'FK29'), ('table29', 'PK29')), (('table12', 'FK210'), ('table210', 'PK210')), (('table12', 'FK211'), ('table211', 'PK211')), (('table12', 'FK212'), ('table212', 'PK212')), (('table00', 'FK113'), ('table113', 'PK113'))]


In [12]:
# Show, for every relation, the parent table joined with its child table
for (parent_name, fk_name), (child_name, pk_name) in read_connections:
    parent = read_tables_content[parent_name]
    child = read_tables_content[child_name]
    print(parent.merge(child, left_on=fk_name, right_on=pk_name))

     FK12  FK113  FK24  FK25  FK29  FK210  FK211  FK212
0     338    566    41   254   424    444    245    468
1     247    402   135   309   459    444    175    462
2     453    446   216   395   459    444    245    468
3     453    703   216   395   459    444    245    468
4     453    424   216   395   459    444    245    468
..    ...    ...   ...   ...   ...    ...    ...    ...
886   870    566   373   395   459    444    245    468
887   836    436   358   302   459    469    245    416
888   261    238   176   395   459    444    175    468
889   887    424    45   337   459    235    245    468
890   107    446   176   478   424    444    175    462

[891 rows x 8 columns]
     FK24  FK25  FK29  FK210  FK211  FK212      Fare
0     380   337   459    235    175    468  151.5500
1     380   395   459    444    175    468  151.5500
2     380   309   459    235    175    468  151.5500
3     380    56   424    235    175    468  151.5500
4     164   302   459    444    227    

In [13]:
# Rebuild the original table by following the relations list in order
# Start from the parent table of the first relation
joined_table = read_tables_content[read_connections[0][0][0]]

for ((table1, index1), (table2, index2)) in read_connections:
    # Remember the current index so it can be restored after the merge
    old_index = list(joined_table.index.names)

    # Replace the FK column by the columns of the child table it points to
    merged = joined_table.reset_index().merge(read_tables_content[table2], left_on=index1, right_on=index2)
    joined_table = merged.groupby(old_index).mean().drop([index1], axis=1)

# The rebuilt table must match the training data once rows and columns are ordered alike
print(joined_table.reset_index().sort_index().sort_index(axis=1).equals(train_data.reset_index().sort_index().sort_index(axis=1)))

True
