# Titanic - Machine Learning from Disaster

## Imports

In [22]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import OrdinalEncoder

## Global config

In [23]:
# CSV file holding the training data that is loaded into train_data
train_csv = "encoded_data.csv"

## Load data

In [24]:
# Read the training CSV into a DataFrame indexed by the PassengerId column
train_data = pd.read_csv(train_csv, index_col='PassengerId')

### Data dictionary

#### Categorical features

- Survived - Survival - 0 = No, 1 = Yes - **Used for evaluating whether a person survived**
- Sex - Sex of the passenger
- Embarked - Port of Embarkation - C = Cherbourg, Q = Queenstown, S = Southampton

#### Ordinal

- Pclass - Ticket class - 1 = 1st, 2 = 2nd, 3 = 3rd

#### Numerical features

##### Discrete

- SibSp - # of siblings / spouses aboard the Titanic 	
- Parch - # of parents / children aboard the Titanic 

##### Continuous

- Age - Age in years 		
- Fare - Passenger fare 

#### Mixed/Error-prone/Unused

- PassengerId - Unique id of the passenger (not a feature; used as the DataFrame index when the data is loaded)
- Name - Name of Passenger
- Ticket - Ticket number 
- Cabin - Cabin number

In [25]:
# Name of the target column; handed to Divider as its important_column
y_column = 'Survived'

In [26]:
import os
import random
import json
import numpy as np
import pandas as pd

# Globally silence pandas' chained-assignment (SettingWithCopy) warning
pd.options.mode.chained_assignment = None

class Divider:
    '''
    Table divider class

    Splits one table into a tree of smaller tables. Every child table gets a
    randomly shuffled surrogate primary key column (PK...) and its parent
    keeps a foreign key column (FK...) holding the same values, so rows can
    be matched again by joining along the recorded connections.

    Attributes:
    input_table - table to apply division on
    important_column - y_column used for predictions
    result - list of (level, index, table) tuples built by the last division
    connections - list of (parent table name, FK column, child table name,
                  PK column) tuples built by the last division
    index - running counter that numbers the generated tables and keys
    '''

    def __init__(self, input_table, important_column):
        '''
        Constructor for divider

        input_table - table to apply division on
        important_column - y_column used for predictions
        '''
        self.input_table = input_table
        self.important_column = important_column
        self._reset()

    def _reset(self):
        '''
        Clears everything produced by a previous division
        '''
        self.result = []
        self.connections = []
        self.index = 0

    def get_result(self):
        '''
        Returns the result of calculations
        '''
        return self.result

    def random_same_pk_fk(self, input_table, level):
        '''
        Recursively cluster randomly the columns

        Random groups of columns are moved into child tables until the table
        has no columns left; every child with more than one column is split
        again the same way. The passed table itself is not modified.

        input_table - table to split; its index levels are used as grouping
                      keys by name, so they are expected to be named
        level - level of recursion
        '''

        # A table with a single column cannot be split any further
        if len(input_table.columns) == 1:
            return

        # Set up base table variables
        base_index = self.index
        self.index += 1
        base_index_cols = list(input_table.index.names)

        # Keep splitting off random groups of columns until none are left
        while len(input_table.columns) > 0:
            self.index += 1

            # Random permutation of the row positions, used as the key values
            # shared by the new PK column and the matching FK column
            key_values = np.arange(len(input_table))
            random.shuffle(key_values)

            # Randomly pick between 1 and (number of columns - 1) columns;
            # a lone remaining column is taken as it is
            n_columns = len(input_table.columns)
            if n_columns > 1:
                n_picked = np.random.randint(1, n_columns)
                picked_columns = input_table.sample(n=n_picked, axis='columns').columns
            else:
                picked_columns = input_table.columns

            # Set new column names (level and index digits are concatenated)
            pk_name = 'PK' + str(level + 1) + str(self.index)
            fk_name = 'FK' + str(level + 1) + str(self.index)

            # Child table: the picked columns, indexed by the new PK only.
            # assign() returns a new frame, so input_table stays untouched.
            recursed_table = (
                input_table[picked_columns]
                .assign(**{pk_name: key_values})
                .set_index(pk_name)
            )

            # Parent table: add the FK, move it into the index by grouping on
            # all current index levels plus the FK, then drop the moved columns.
            # The remaining values pass through .mean() in this step.
            input_table = input_table.assign(**{fk_name: key_values})
            input_table = input_table.groupby(list(input_table.index.names) + [fk_name]).mean()
            input_table = input_table.drop(picked_columns, axis=1)

            # Add the connection to a list
            self.connections.append(('table' + str(level) + str(base_index), fk_name,
                                     'table' + str(level + 1) + str(self.index), pk_name))

            # A single-column child is final: the recursive call below returns
            # immediately for it, so it has to be stored here
            if len(recursed_table.columns) == 1:
                self.result.append((level + 1, self.index, recursed_table))

            # Apply clustering recursively on smaller table
            self.random_same_pk_fk(recursed_table, level + 1)

        # Restore the original index levels; the FK levels become normal columns
        input_table = input_table.reset_index()
        input_table = input_table.set_index(base_index_cols)
        self.result.append((level, base_index, input_table))

    def divide(self, strategy, path):
        '''
        Function used to divide the table

        Writes one table<level><index>.csv per produced table, plus
        connections.csv (parent table, FK column, child table, PK column) and
        tables.json (table names with their primary key columns) into path.

        strategy - strategy of division; only 'random_same_pk_fk' is available
        path - path to save output

        Raises ValueError for an unknown strategy, before anything is written
        '''

        # Fail early instead of silently writing empty output files
        if strategy != 'random_same_pk_fk':
            raise ValueError('Unknown division strategy: {!r}'.format(strategy))

        # Create output folder
        os.makedirs(path, exist_ok=True)

        # Start from fresh result and connections lists
        self._reset()

        # Move the target column into the index so it stays in the base table
        # and is never picked for a child table
        group_keys = list(self.input_table.index.names) + [self.important_column]
        keyed_table = self.input_table.groupby(group_keys).mean()
        self.random_same_pk_fk(keyed_table, 0)

        # Sort result by recursion level and index
        self.result.sort(key=lambda x: (x[0], x[1]))

        # Print results, save every table to a file and collect
        # (table name, PK columns) pairs
        print('Level, Index, Primary Key, Columns')
        all_tables = []
        for (el, col, table) in self.result:
            table_name = 'table' + str(el) + str(col)
            print(el, " ", col, " ", table.index.names, " ", table.columns, "\n")
            table.to_csv(os.path.join(path, table_name + '.csv'))
            all_tables.append((table_name, list(table.index.names)))

        # Save connections to file
        np.savetxt(os.path.join(path, 'connections.csv'), self.connections, delimiter=',', fmt='%s')

        # Save tables names with their PK to file. The list is encoded twice
        # on purpose (a JSON string containing JSON): existing readers decode
        # the file with json.loads(json.load(...)).
        with open(os.path.join(path, 'tables.json'), 'w') as outfile:
            json.dump(json.dumps(all_tables), outfile)

In [27]:
# Initialise the divider on the training data, with y_column as the target column
dv = Divider(train_data, y_column)

In [28]:
# Apply the random PK/FK division and write the generated files into the 'output' folder
dv.divide(strategy = "random_same_pk_fk", path = 'output')

Level, Index, Primary Key, Columns
0   0   ['PassengerId', 'Survived']   Index(['FK12', 'FK113', 'FK114', 'FK115'], dtype='object') 

1   2   ['PK12']   Index(['FK24', 'FK212'], dtype='object') 

1   13   ['PK113']   Index(['Pclass'], dtype='object') 

1   14   ['PK114']   Index(['Sex'], dtype='object') 

1   15   ['PK115']   Index(['Parch'], dtype='object') 

2   4   ['PK24']   Index(['FK36', 'FK310', 'FK311'], dtype='object') 

2   12   ['PK212']   Index(['Embarked'], dtype='object') 

3   6   ['PK36']   Index(['FK48', 'FK49'], dtype='object') 

3   10   ['PK310']   Index(['SibSp'], dtype='object') 

3   11   ['PK311']   Index(['Fare'], dtype='object') 

4   8   ['PK48']   Index(['Title'], dtype='object') 

4   9   ['PK49']   Index(['Age'], dtype='object') 



In [29]:
from numpy import genfromtxt

# Load the table manifest: a list of [table name, primary key columns] pairs.
# The file holds a JSON string that itself contains JSON, so it is decoded twice.
read_tables = []
with open('output/tables.json') as json_file:
    manifest_text = json.load(json_file)
    read_tables = json.loads(manifest_text)
print(read_tables)

[['table00', ['PassengerId', 'Survived']], ['table12', ['PK12']], ['table113', ['PK113']], ['table114', ['PK114']], ['table115', ['PK115']], ['table24', ['PK24']], ['table212', ['PK212']], ['table36', ['PK36']], ['table310', ['PK310']], ['table311', ['PK311']], ['table48', ['PK48']], ['table49', ['PK49']]]


In [30]:
# Load every listed table from its CSV file into a DataFrame, using the
# table's primary key column(s) as the index; keyed by table name
read_tables_content = {}
for table, pk in read_tables:
    csv_path = 'output/' + table + '.csv'
    read_tables_content[table] = pd.read_csv(csv_path, index_col=pk)

print(read_tables_content)

{'table00':                       FK12  FK113  FK114  FK115
PassengerId Survived                           
1           0          370    317    289    266
2           1          685    493    154    145
3           1          299    558    875    831
4           1           34    524    215    573
5           0           56     20     44    163
...                    ...    ...    ...    ...
887         0          331    790    259    128
888         1          861    781    685    167
889         0          199    263    484    265
890         1          823    292    684    422
891         0          271    455    142    109

[891 rows x 4 columns], 'table12':       FK24  FK212
PK12             
0      299    566
1      324    257
2      154    318
3      745    392
4        2    560
...    ...    ...
886    565    826
887    138    619
888    333    716
889    817    786
890    873    614

[891 rows x 2 columns], 'table113':        Pclass
PK113        
317       2.0
493       0.0
5

In [31]:
# Test table reading: load the base table with only PassengerId as the index,
# so Survived stays a regular column here (tables.json lists both as the key)
table01 = pd.read_csv('output/table00.csv', index_col='PassengerId')
table01

Unnamed: 0_level_0,Survived,FK12,FK113,FK114,FK115
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,370,317,289,266
2,1,685,493,154,145
3,1,299,558,875,831
4,1,34,524,215,573
5,0,56,20,44,163
...,...,...,...,...,...
887,0,331,790,259,128
888,1,861,781,685,167
889,0,199,263,484,265
890,1,823,292,684,422


In [32]:
# Load the connections file and turn every row
# (parent table, FK column, child table, PK column)
# into a ((parent table, FK), (child table, PK)) pair
connection_rows = genfromtxt('output/connections.csv', delimiter=',', dtype='str')
read_connections = [((row[0], row[1]), (row[2], row[3])) for row in connection_rows]
print(read_connections)

[(('table00', 'FK12'), ('table12', 'PK12')), (('table12', 'FK24'), ('table24', 'PK24')), (('table24', 'FK36'), ('table36', 'PK36')), (('table36', 'FK48'), ('table48', 'PK48')), (('table36', 'FK49'), ('table49', 'PK49')), (('table24', 'FK310'), ('table310', 'PK310')), (('table24', 'FK311'), ('table311', 'PK311')), (('table12', 'FK212'), ('table212', 'PK212')), (('table00', 'FK113'), ('table113', 'PK113')), (('table00', 'FK114'), ('table114', 'PK114')), (('table00', 'FK115'), ('table115', 'PK115'))]


In [33]:
# Example join: for every recorded relation, merge the parent table with its
# child table by matching the parent's FK column against the child's PK
for (table1, index1), (table2, index2) in read_connections:
    parent_table = read_tables_content[table1]
    child_table = read_tables_content[table2]
    print(parent_table.merge(child_table, left_on=index1, right_on=index2))

     FK12  FK113  FK114  FK115  FK24  FK212
0     370    317    289    266   716    532
1     685    493    154    145   600    120
2     299    558    875    831   547    119
3      34    524    215    573    12    115
4      56     20     44    163   284      2
..    ...    ...    ...    ...   ...    ...
886   331    790    259    128   139    692
887   861    781    685    167   467    762
888   199    263    484    265   155    632
889   823    292    684    422   108    166
890   271    455    142    109   870    691

[891 rows x 6 columns]
     FK24  FK212  FK36  FK310  FK311
0     299    566   530    273    127
1     324    257   340    295    226
2     154    318   121    702    524
3     745    392   374    579    818
4       2    560   384    711      3
..    ...    ...   ...    ...    ...
886   565    826    86    831    356
887   138    619   177    533    735
888   333    716   641    297    113
889   817    786   341    646    501
890   873    614   148    577    255

[89