In [1]:
# Imports needed for the script
import numpy as np
import pandas as pd
import re
from sklearn import tree

In [2]:
# Loading the data
dataset = pd.read_csv('titanic_dataset/train.csv')

In [3]:
dataset['Has_Cabin'] = dataset["Cabin"].apply(lambda x: 0 if type(x) == float else 1)

In [4]:
dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

In [5]:
dataset['IsAlone'] = 0
dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

In [6]:
dataset['Embarked'] = dataset['Embarked'].fillna('S')
dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

In [7]:
age_avg = dataset['Age'].mean()
age_std = dataset['Age'].std()
age_null_count = dataset['Age'].isnull().sum()
age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)

# Next line has been improved to avoid warning
dataset.loc[np.isnan(dataset['Age']), 'Age'] = age_null_random_list
dataset['Age'] = dataset['Age'].astype(int)

In [8]:
dataset.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Has_Cabin,FamilySize,IsAlone
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S,0,2,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C,1,2,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S,0,1,1


In [9]:
dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

In [10]:
# Define function to extract titles from passenger names
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

dataset['Title'] = dataset['Name'].apply(get_title)

# Group all non-common titles into one single grouping "Rare"
dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

In [11]:
dataset.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Has_Cabin,FamilySize,IsAlone,Title
0,1,0,3,"Braund, Mr. Owen Harris",1,22,1,0,A/5 21171,7.25,,S,0,2,0,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38,1,0,PC 17599,71.2833,C85,C,1,2,0,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",0,26,0,0,STON/O2. 3101282,7.925,,S,0,1,1,Miss


In [12]:
# Mapping titles
title_mapping = {"Mr": 1, "Master": 2, "Mrs": 3, "Miss": 4, "Rare": 5}
dataset['Title'] = dataset['Title'].map(title_mapping)
dataset['Title'] = dataset['Title'].fillna(0)

# Mapping Embarked
dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    
# Mapping Fare
dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] 						        = 0
dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
dataset.loc[ dataset['Fare'] > 31, 'Fare'] 							        = 3
dataset['Fare'] = dataset['Fare'].astype(int)
    
# Mapping Age
dataset.loc[ dataset['Age'] <= 16, 'Age'] 					       = 0
dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
dataset.loc[ dataset['Age'] > 64, 'Age'] ;

In [13]:
dataset.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Has_Cabin,FamilySize,IsAlone,Title
0,1,0,3,"Braund, Mr. Owen Harris",1,1,1,0,A/5 21171,0,,0,0,2,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,PC 17599,3,C85,1,1,2,0,3
2,3,1,3,"Heikkinen, Miss. Laina",0,1,0,0,STON/O2. 3101282,1,,0,0,1,1,4


In [14]:
# Feature selection: remove variables no longer containing relevant information
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
dataset = dataset.drop(drop_elements, axis = 1)

In [15]:
dataset.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,Fare,Embarked,Has_Cabin,FamilySize,IsAlone,Title
0,0,3,1,1,0,0,0,0,2,0,1
1,1,1,0,2,0,3,1,1,2,0,3
2,1,3,0,1,0,1,0,0,1,1,4


In [16]:
# Create Numpy arrays of train, test and target (Survived) dataframes to feed into our models
y_train = dataset['Survived']
x_train = dataset.drop(['Survived'], axis=1).values 

# Create Decision Tree with max_depth = 3
decision_tree = tree.DecisionTreeClassifier(max_depth = 3)
decision_tree.fit(x_train, y_train)



DecisionTreeClassifier(max_depth=3)

In [17]:
inputs_df = test_dataset.loc[test_dataset['PassengerId'] == '920']
inputs_dict = inputs_df.set_index('PassengerId').T.to_dict('dict')
inputs_dict = inputs_dict[inputs['PassengerId']]

NameError: name 'test_dataset' is not defined

In [18]:
test_dataset = pd.read_csv('titanic_dataset/test.csv')

In [19]:
inputs_df = test_dataset.loc[test_dataset['PassengerId'] == '920']

In [20]:
print(inputs_df)

Empty DataFrame
Columns: [PassengerId, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked]
Index: []


In [21]:
inputs_df = test_dataset.loc[test_dataset['PassengerId'] == 920]

In [22]:
print(inputs_df)

    PassengerId  Pclass                     Name   Sex   Age  SibSp  Parch  \
28          920       1  Brady, Mr. John Bertram  male  41.0      0      0   

    Ticket  Fare Cabin Embarked  
28  113054  30.5   A21        S  


In [23]:
print(inputs_df.T)

                                  28
PassengerId                      920
Pclass                             1
Name         Brady, Mr. John Bertram
Sex                             male
Age                             41.0
SibSp                              0
Parch                              0
Ticket                        113054
Fare                            30.5
Cabin                            A21
Embarked                           S


In [24]:
print(inputs_df.set_index('PassengerId'))

             Pclass                     Name   Sex   Age  SibSp  Parch  \
PassengerId                                                              
920               1  Brady, Mr. John Bertram  male  41.0      0      0   

             Ticket  Fare Cabin Embarked  
PassengerId                               
920          113054  30.5   A21        S  


In [25]:
print(inputs_df.set_index('PassengerId').T)

PassengerId                      920
Pclass                             1
Name         Brady, Mr. John Bertram
Sex                             male
Age                             41.0
SibSp                              0
Parch                              0
Ticket                        113054
Fare                            30.5
Cabin                            A21
Embarked                           S


In [26]:
print(inputs_df.set_index('PassengerId').T.to_dict('dict'))

{920: {'Pclass': 1, 'Name': 'Brady, Mr. John Bertram', 'Sex': 'male', 'Age': 41.0, 'SibSp': 0, 'Parch': 0, 'Ticket': '113054', 'Fare': 30.5, 'Cabin': 'A21', 'Embarked': 'S'}}


In [28]:
inputs = {
		 'PassengerId': 920,
		#  'Pclass': args['Pclass'],
		#  'Name': args['Name'],
		#  'Sex': args['Sex'],
		#  'Age': args['Age'],
		#  'SibSp': args['SibSp'],
		#  'Parch': args['Parch'],
		#  'Ticket': args['Ticket'],
		#  'Fare': args['Fare'],
		#  'Cabin': args['Cabin'],
		#  'Embarked': args['Embarked'],
		}
inputs_dict = inputs_df.set_index('PassengerId').T.to_dict('dict')

In [29]:
inputs_dict

{920: {'Pclass': 1,
  'Name': 'Brady, Mr. John Bertram',
  'Sex': 'male',
  'Age': 41.0,
  'SibSp': 0,
  'Parch': 0,
  'Ticket': '113054',
  'Fare': 30.5,
  'Cabin': 'A21',
  'Embarked': 'S'}}

In [30]:
inputs_dict = inputs_dict[inputs['PassengerId']]

In [31]:
inputs_dict

{'Pclass': 1,
 'Name': 'Brady, Mr. John Bertram',
 'Sex': 'male',
 'Age': 41.0,
 'SibSp': 0,
 'Parch': 0,
 'Ticket': '113054',
 'Fare': 30.5,
 'Cabin': 'A21',
 'Embarked': 'S'}

In [32]:
inputs_dict['Has_Cabin'] = (lambda x: 0 if type(x) == float else 1)(inputs_dict['Cabin'])

In [33]:
inputs_dict

{'Pclass': 1,
 'Name': 'Brady, Mr. John Bertram',
 'Sex': 'male',
 'Age': 41.0,
 'SibSp': 0,
 'Parch': 0,
 'Ticket': '113054',
 'Fare': 30.5,
 'Cabin': 'A21',
 'Embarked': 'S',
 'Has_Cabin': 1}

In [34]:
inputs_dict['FamilySize'] = inputs_dict['SibSp'] + inputs_dict['Parch'] + 1

In [35]:
inputs_dict

{'Pclass': 1,
 'Name': 'Brady, Mr. John Bertram',
 'Sex': 'male',
 'Age': 41.0,
 'SibSp': 0,
 'Parch': 0,
 'Ticket': '113054',
 'Fare': 30.5,
 'Cabin': 'A21',
 'Embarked': 'S',
 'Has_Cabin': 1,
 'FamilySize': 1}

In [36]:
inputs_dict['IsAlone'] = 1 if inputs_df['FamilySize'] == 1 else 0
inputs_dict

KeyError: 'FamilySize'

In [37]:
inputs_dict['IsAlone'] = 1 if inputs_dict['FamilySize'] == 1 else 0
inputs_dict

{'Pclass': 1,
 'Name': 'Brady, Mr. John Bertram',
 'Sex': 'male',
 'Age': 41.0,
 'SibSp': 0,
 'Parch': 0,
 'Ticket': '113054',
 'Fare': 30.5,
 'Cabin': 'A21',
 'Embarked': 'S',
 'Has_Cabin': 1,
 'FamilySize': 1,
 'IsAlone': 1}

In [39]:
inputs_dict['Age'] = int(inputs_dict['Age'])
if inputs_dict['Sex'] == 'female':
    inputs_dict['Sex'] = 0 
elif inputs_dict['Sex'] == 'male':
    inputs_dict['Sex'] = 1
    
inputs_dict

{'Pclass': 1,
 'Name': 'Brady, Mr. John Bertram',
 'Sex': 1,
 'Age': 41,
 'SibSp': 0,
 'Parch': 0,
 'Ticket': '113054',
 'Fare': 30.5,
 'Cabin': 'A21',
 'Embarked': 'S',
 'Has_Cabin': 1,
 'FamilySize': 1,
 'IsAlone': 1}

In [40]:
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

inputs_dict['Title'] = get_title(inputs_dict['Name'])
inputs_dict

{'Pclass': 1,
 'Name': 'Brady, Mr. John Bertram',
 'Sex': 1,
 'Age': 41,
 'SibSp': 0,
 'Parch': 0,
 'Ticket': '113054',
 'Fare': 30.5,
 'Cabin': 'A21',
 'Embarked': 'S',
 'Has_Cabin': 1,
 'FamilySize': 1,
 'IsAlone': 1,
 'Title': 'Mr'}

In [41]:
inputs_dict['Title'] = 'Rare' if inputs_dict in ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']

SyntaxError: invalid syntax (1021595976.py, line 1)

In [42]:
inputs_dict['Title'] = 'Rare' if inputs_dict['Title'] in ['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'] else inputs_dict['Title']

In [43]:
inputs_dict['Title'] = inputs_dict['Title'].replace('Mlle', 'Miss')
inputs_dict['Title'] = inputs_dict['Title'].replace('Ms', 'Miss')
inputs_dict['Title'] = inputs_dict['Title'].replace('Mme', 'Mrs')

In [44]:
# Mapping titles
title_mapping = {"Mr": 1, "Master": 2, "Mrs": 3, "Miss": 4, "Rare": 5}
inputs_dict['Title'] = title_mapping[inputs_dict['Title']]

In [45]:
inputs_dict

{'Pclass': 1,
 'Name': 'Brady, Mr. John Bertram',
 'Sex': 1,
 'Age': 41,
 'SibSp': 0,
 'Parch': 0,
 'Ticket': '113054',
 'Fare': 30.5,
 'Cabin': 'A21',
 'Embarked': 'S',
 'Has_Cabin': 1,
 'FamilySize': 1,
 'IsAlone': 1,
 'Title': 1}

In [46]:
embarked_mapping = {'S': 0, 'C': 1, 'Q': 2}
inputs_dict['Embarked'] = embarked_mapping[inputs_dict['Embarked']]
inputs_dict

{'Pclass': 1,
 'Name': 'Brady, Mr. John Bertram',
 'Sex': 1,
 'Age': 41,
 'SibSp': 0,
 'Parch': 0,
 'Ticket': '113054',
 'Fare': 30.5,
 'Cabin': 'A21',
 'Embarked': 0,
 'Has_Cabin': 1,
 'FamilySize': 1,
 'IsAlone': 1,
 'Title': 1}

In [48]:

if inputs_dict['Age'] >= 64: 
    inputs_dict['Age'] = 4
elif inputs_dict['Age'] >= 48: 
    inputs_dict['Age'] = 3
elif inputs_dict['Age'] >= 32: 
    inputs_dict['Age'] = 2
elif inputs_dict['Age'] >= 16: 
    inputs_dict['Age'] = 1
else:
    inputs_dict['Age'] = 0

inputs_dict

{'Pclass': 1,
 'Name': 'Brady, Mr. John Bertram',
 'Sex': 1,
 'Age': 2,
 'SibSp': 0,
 'Parch': 0,
 'Ticket': '113054',
 'Fare': 30.5,
 'Cabin': 'A21',
 'Embarked': 0,
 'Has_Cabin': 1,
 'FamilySize': 1,
 'IsAlone': 1,
 'Title': 1}

In [49]:
del inputs_dict['Name']
inputs_dict

{'Pclass': 1,
 'Sex': 1,
 'Age': 2,
 'SibSp': 0,
 'Parch': 0,
 'Ticket': '113054',
 'Fare': 30.5,
 'Cabin': 'A21',
 'Embarked': 0,
 'Has_Cabin': 1,
 'FamilySize': 1,
 'IsAlone': 1,
 'Title': 1}

In [50]:
del inputs_dict['Name']

KeyError: 'Name'

In [52]:

del inputs_dict['Ticket']
del inputs_dict['Cabin']
del inputs_dict['SibSp']
inputs_dict

{'Pclass': 1,
 'Sex': 1,
 'Age': 2,
 'Parch': 0,
 'Fare': 30.5,
 'Embarked': 0,
 'Has_Cabin': 1,
 'FamilySize': 1,
 'IsAlone': 1,
 'Title': 1}

In [53]:
if inputs_dict['Fare'] >= 31: 
    inputs_dict['Fare'] = 3
elif inputs_dict['Fare'] >= 14.454: 
    inputs_dict['Fare'] = 2
elif inputs_dict['Fare'] >= 7.91: 
    inputs_dict['Fare'] = 1
else:
    inputs_dict['Fare'] = 0

inputs_dict

{'Pclass': 1,
 'Sex': 1,
 'Age': 2,
 'Parch': 0,
 'Fare': 2,
 'Embarked': 0,
 'Has_Cabin': 1,
 'FamilySize': 1,
 'IsAlone': 1,
 'Title': 1}

In [55]:
inputs_df_prcsd = pd.DataFrame(inputs_dict, index=[0])

In [56]:
inputs_df_prcsd

Unnamed: 0,Pclass,Sex,Age,Parch,Fare,Embarked,Has_Cabin,FamilySize,IsAlone,Title
0,1,1,2,0,2,0,1,1,1,1


In [57]:
dataset.loc[dataset['PassengerId'] == '440']

KeyError: 'PassengerId'

In [58]:
dataset.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,Parch,Fare,Embarked,Has_Cabin,FamilySize,IsAlone,Title
0,0,3,1,1,0,0,0,0,2,0,1
1,1,1,0,2,0,3,1,1,2,0,3
2,1,3,0,1,0,1,0,0,1,1,4


In [61]:
dataset1 = pd.read_csv('titanic_dataset/train.csv')
dataset1.loc[dataset1['PassengerId'] == 440]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
439,440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31.0,0,0,C.A. 18723,10.5,,S


In [62]:
dataset1.loc[dataset1['PassengerId'] == 440]['Survived']

439    0
Name: Survived, dtype: int64

In [69]:
dataset1.loc[dataset1['PassengerId'] == 440]['Survived'].astype(int)

439    0
Name: Survived, dtype: int64

In [70]:
1 + 1

2

In [72]:
dataset1.loc[dataset1['PassengerId'] == 440]['Survived'].values

array([0])

In [73]:
dataset1.loc[dataset1['PassengerId'] == 440]['Survived'].values[0]

0