# Training and testing an ANN for the Titanic dataset

## Importing required libraries

In [174]:
import pandas as pd
import numpy as np

## Defining constants

In [175]:
# Columns that do not participate in the learning

class TicketFeatures:
    """Column names for the raw ticket field and the features derived from it."""

    # Raw column as it appears in the loaded CSV.
    TICKET = "Ticket"

    # Columns added during ticket preparation.
    TICKET_PARSED = "TicketParsed"
    TICKET_PREFIX = "TicketPrefix"
    TICKET_NUMBER = "TicketNumber"

class CabinFeatures:
    """Column names for the raw cabin field and the features derived from it."""

    # Raw column as it appears in the loaded CSV.
    CABIN = "Cabin"

    # Names reserved for cabin-derived columns.
    CABIN_PARSED = "CabinParsed"
    DECK_CODE = "DeckCode"
    CABIN_NUMBER = "CabinNumber"
    
# Columns excluded from learning.
# NOTE(review): PassengerId is moved into the index later in the notebook, so
# confirm that the code which drops these columns tolerates it not being a column.
columns_to_drop = [
    "PassengerId",
    "Name",
]

## Loading the Titanic dataset

In [176]:
# Read the Titanic training data; the path is relative to the notebook location.
titanic_df = pd.read_csv(
    '../datasets/titanic/train.csv',
)

# Preview the first rows.
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Data description

Feature description:

 1. Survived - Survival (0 = No; 1 = Yes). Not included in the test.csv file.
 2. Pclass - Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
 3. Name - Name
 4. Sex - Sex
 5. Age - Age
 6. SibSp - Number of Siblings/Spouses Aboard
 7. Parch - Number of Parents/Children Aboard
 8. Ticket - Ticket Number
 9. Fare - Passenger Fare
 10. Cabin - Cabin
 11. Embarked - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)

## Data Structures

In [177]:
class Ticket:
    """A ticket split into an optional textual prefix and a numeric part.

    Either part may be ``np.nan``: ``parse`` leaves the prefix as NaN for purely
    numeric tickets and leaves the number as NaN for the special ``'LINE'`` ticket.
    """

    def __init__(self, prefix, number):
        self.__prefix = prefix
        self.__number = number

    @staticmethod
    def _is_missing(value):
        """Return True for ``None`` and for any float NaN (not only the ``np.nan`` object)."""
        # NaN is the only float that is not equal to itself; an identity check
        # against np.nan misses NaN values that are a different float object.
        return value is None or (isinstance(value, float) and value != value)

    def __str__(self):
        # A missing prefix is rendered as an empty string.
        ticket = '' if self._is_missing(self.__prefix) else self.__prefix

        return f'Ticket {ticket} {self.__number} no. {self.__number}'

    @property
    def prefix(self):
        """Textual part of the ticket, or NaN when the ticket has none."""
        return self.__prefix

    @prefix.setter
    def prefix(self, value):
        self.__prefix = value

    @property
    def number(self):
        """Numeric part of the ticket, or NaN when the ticket has none."""
        return self.__number

    @number.setter
    def number(self, value):
        self.__number = value

    @staticmethod
    def parse(ticket):
        """Build a ``Ticket`` from a raw, space-separated ticket string.

        Supported layouts (after splitting on a single space):
          * ``'LINE'``            -> prefix ``'LINE'``, number NaN
          * ``'<number>'``        -> prefix NaN
          * ``'<prefix> <number>'``
          * ``'<p1> <p2> <number>'`` -> prefix ``'<p1> <p2>'``

        Args:
            ticket: raw ticket string.

        Returns:
            A ``Ticket``, or ``None`` when the numeric part cannot be converted
            to ``int`` (a message is printed in that case). Callers that access
            attributes on the result must handle ``None``.

        Raises:
            Exception: when the string splits into more than three parts.
        """
        # Declared as a staticmethod so it can be called both as Ticket.parse(s)
        # and on an instance without the instance being passed as ``ticket``.
        try:
            ticket_number = np.nan
            ticket_prefix = np.nan
            ticket_splited = ticket.split(" ")

            if ticket == 'LINE':
                # 'LINE' tickets carry no number at all.
                ticket_prefix = ticket
            elif len(ticket_splited) == 1:
                ticket_number = int(ticket_splited[0])
            elif len(ticket_splited) == 2:
                ticket_number = int(ticket_splited[1])
                ticket_prefix = str(ticket_splited[0])
            elif len(ticket_splited) == 3:
                ticket_number = int(ticket_splited[2])
                ticket_prefix = f'{ticket_splited[0]} {ticket_splited[1]}'
            else:
                # Not a ValueError, so this propagates to the caller.
                raise Exception('Unsupported ticket format.')

            return Ticket(ticket_prefix, ticket_number)
        except ValueError as e:
            print(f'Unable parse the "{ticket}" ticket. Details: {e}')
            return None

## Data preparation

In [178]:
# Use PassengerId as the row index.
# Guarded so that re-running this cell is a no-op: once PassengerId has been
# moved into the index it is no longer a column, and an unguarded set_index
# call would raise KeyError.
if 'PassengerId' in titanic_df.columns:
    titanic_df.set_index('PassengerId', inplace=True)

### Ticket feature preparation

In [179]:
# Count the rows whose raw ticket value is missing.
null_ticket_filter = titanic_df[TicketFeatures.TICKET].isnull()

# Each True in the mask is one missing ticket, so the mask sum is the row count.
null_tickets_number = int(null_ticket_filter.sum())

print(f'NULL tickets number is: {null_tickets_number}')

NULL tickets number is: 0


In [180]:
# Turn every raw ticket string into a parsed Ticket object.
titanic_df[TicketFeatures.TICKET_PARSED] = (
    titanic_df[TicketFeatures.TICKET].apply(Ticket.parse)
)

# Preview the parsed tickets.
titanic_df[TicketFeatures.TICKET_PARSED].head()

PassengerId
1             Ticket A/5 21171 no. 21171
2              Ticket PC 17599 no. 17599
3    Ticket STON/O2. 3101282 no. 3101282
4              Ticket  113803 no. 113803
5              Ticket  373450 no. 373450
Name: TicketParsed, dtype: object

In [181]:
# Create the TicketNumber feature from the numeric part of each parsed ticket.
titanic_df[TicketFeatures.TICKET_NUMBER] = (
    titanic_df[TicketFeatures.TICKET_PARSED]
    .apply(lambda parsed: parsed.number)
)

titanic_df[TicketFeatures.TICKET_NUMBER].head()

PassengerId
1      21171.0
2      17599.0
3    3101282.0
4     113803.0
5     373450.0
Name: TicketNumber, dtype: float64

In [182]:
# Create the TicketPrefix feature from the textual part of each parsed ticket.
titanic_df[TicketFeatures.TICKET_PREFIX] = (
    titanic_df[TicketFeatures.TICKET_PARSED]
    .apply(lambda parsed: parsed.prefix)
)

titanic_df[TicketFeatures.TICKET_PREFIX].head()

PassengerId
1         A/5
2          PC
3    STON/O2.
4         NaN
5         NaN
Name: TicketPrefix, dtype: object

In [183]:
# Print ticket feature statistics: how many rows have, and how many lack,
# a parsed prefix and a parsed number.

# Rows with a prefix: each True in the mask is one such row.
filt = titanic_df[TicketFeatures.TICKET_PREFIX].notnull()
not_null_ticket_prefixes_number = int(filt.sum())

# Rows with a number.
filt = titanic_df[TicketFeatures.TICKET_NUMBER].notnull()
not_null_ticket_numbers_number = int(filt.sum())

# The NULL counts are the remainder of the total row count.
print(f'Not NULL ticket prefixes number is: {not_null_ticket_prefixes_number}')
print(f'NULL ticket prefixes number is: {len(titanic_df) - not_null_ticket_prefixes_number}')
print(f'Not NULL ticket numbers number is: {not_null_ticket_numbers_number}')
print(f'NULL ticket numbers number is: {len(titanic_df) - not_null_ticket_numbers_number}')

Not NULL ticket prefixes number is: 230
NULL ticket prefixes number is: 661
Not NULL ticket numbers number is: 887
NULL ticket numbers number is: 4


### Cabin feature preparation

In [185]:
# Inspect the raw Cabin column before preparing it.
titanic_df.loc[:, CabinFeatures.CABIN]

PassengerId
1       NaN
2       C85
3       NaN
4      C123
5       NaN
       ... 
887     NaN
888     B42
889     NaN
890    C148
891     NaN
Name: Cabin, Length: 891, dtype: object