# This Notebook explores the Titanic dataset and tries to predict if a passenger died.

### Loading and Viewing the datasets. 

In [3]:
# Load libraries for EDA.
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [4]:
# Read the Titanic train and test CSV files from the relative data directory.
path = "../../../Data/titanic/"
train, test = (pd.read_csv(path + file_name) for file_name in ("train.csv", "test.csv"))

Let's peek at the two datasets and observe their features.

In [6]:
# Preview the first rows of the training set.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


The attributes have the following meaning:

- PassengerId: a unique identifier for each passenger
- Survived: that's the target, 0 means the passenger did not survive, while 1 means he/she survived.
- Pclass: passenger class.
- Name, Sex, Age: self-explanatory
- SibSp: how many siblings & spouses of the passenger aboard the Titanic.
- Parch: how many children & parents of the passenger aboard the Titanic.
- Ticket: ticket id
- Fare: price paid (in pounds)
- Cabin: passenger's cabin number
- Embarked: where the passenger embarked the Titanic

In [8]:
# Preview the first rows of the test set (it has no "Survived" column).
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [9]:
# Explicitly make the passenger id the index.
# Guarded so the cell can be re-run: once "PassengerId" is the index it is no
# longer a column, and a second set_index call would raise KeyError.
if train.index.name != "PassengerId":
    train = train.set_index("PassengerId")
if test.index.name != "PassengerId":
    test = test.set_index("PassengerId")

### Data Cleaning.

In [11]:
# Inspect column dtypes and non-null counts of the training set.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [12]:
# Inspect column dtypes and non-null counts of the test set.
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Ticket    418 non-null    object 
 7   Fare      417 non-null    float64
 8   Cabin     91 non-null     object 
 9   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB


In [13]:
def percent_null(data):
    """
    Report every column of ``data`` that contains missing values.

    For each such column one line is printed with the number of missing
    entries and their share of all rows (in percent, rounded to 2 decimals).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Names of the columns with at least one missing value, in column order.
    """
    cols_with_null = list()
    n_rows = len(data)
    # Count nulls directly. Checking that value_counts() of the null mask has
    # two entries would skip columns that are *entirely* null (only ``True``).
    null_counts = data.isnull().sum()
    for col, null_count in null_counts.items():
        if null_count == 0:
            continue
        null_count_percent = np.round((null_count / n_rows) * 100, 2)
        cols_with_null.append(col)
        print(f"Column {col} has {null_count} missing values which is {null_count_percent}%")
    return cols_with_null
# Report the missing-value summary for each split and keep the names of the
# affected columns; they are passed to fill_missing later on.
print("Train set has the following information missing:")
cols_with_null_train = percent_null(train)
print(f"\n {'-'*50} \n")  # visual separator between the two reports
print("Test set has the following information missing:")
cols_with_null_test = percent_null(test)

Train set has the following information missing:
Column Age has 177 missing values which is 19.87%
Column Cabin has 687 missing values which is 77.1%
Column Embarked has 2 missing values which is 0.22%

 -------------------------------------------------- 

Test set has the following information missing:
Column Age has 86 missing values which is 20.57%
Column Fare has 1 missing values which is 0.24%
Column Cabin has 327 missing values which is 78.23%


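We will replace missing values using statistics computed within each passenger class and sex group: the mean for numerical columns and the mode for categorical columns.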

In [15]:
def fill_missing(data, null_cols):
    """
    Impute missing values per (Pclass, Sex) group.

    Numerical columns are filled with the group mean, every other column with
    the group mode. When a group has no observed value at all for a column,
    the column-wide mean/mode is used instead; if the whole column is empty
    the values are left missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. Must contain the "Pclass" and "Sex" columns. It is
        not modified; a filled copy is returned.
    null_cols : iterable of str
        Names of the columns to impute.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the requested columns imputed.
    """
    group_keys = ["Pclass", "Sex"]
    # Work on a copy so the caller's frame is never mutated as a side effect.
    filled = data.copy()

    def first_mode(values):
        # mode() is empty when every value is null; indexing it would raise.
        modes = values.mode()
        return modes.iloc[0] if len(modes) else np.nan

    for col in null_cols:
        # Dispatch on "is numeric" rather than "is object" so string and
        # categorical dtypes other than object also take the mode branch.
        if pd.api.types.is_numeric_dtype(data[col]):
            statistic = pd.Series.mean
        else:
            statistic = first_mode
        fallback = statistic(data[col])

        def fill_group(values, statistic=statistic, fallback=fallback):
            fill_value = statistic(values)
            if pd.isna(fill_value):
                # Group has no observed value: fall back to the whole column.
                fill_value = fallback
            if pd.isna(fill_value):
                return values
            return values.fillna(fill_value)

        filled[col] = data.groupby(group_keys)[col].transform(fill_group)
    return filled
# Impute each split separately, restricted to the columns that were found to
# contain nulls in that split.
train = fill_missing(train, cols_with_null_train)
test = fill_missing(test, cols_with_null_test)

In [16]:
# Re-inspect the training set after imputation to check the non-null counts.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       891 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     891 non-null    object 
 10  Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [17]:
# Re-inspect the test set after imputation to check the non-null counts.
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 418 entries, 892 to 1309
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Name      418 non-null    object 
 2   Sex       418 non-null    object 
 3   Age       418 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Ticket    418 non-null    object 
 7   Fare      418 non-null    float64
 8   Cabin     418 non-null    object 
 9   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 35.9+ KB


### Let's do some light Exploratory Data Analysis.

In [19]:
# Reduces the number of features: SibSp and Parch are collapsed into a single
# "Related" column holding their sum.
# assign/drop build new frames instead of mutating the existing ones in place,
# so frames shown by earlier cells are not silently altered.
train = train.assign(Related=train["SibSp"] + train["Parch"]).drop(columns=["SibSp", "Parch"])
test = test.assign(Related=test["SibSp"] + test["Parch"]).drop(columns=["SibSp", "Parch"])

In [20]:
# Count survivors (1) and non-survivors (0) and report the overall survival rate.
survival = train["Survived"].value_counts()
print(
    f"{survival[1]} people survived whereas {survival[0]} people died "
    f"representing a {(survival[1]/len(train))*100:.2f}% survival rate"
)

342 people survived whereas 549 people died representing a 38.38% survival rate


In [21]:
# Number of passengers of each sex in the training set.
train["Sex"].value_counts()

Sex
male      577
female    314
Name: count, dtype: int64

In [22]:
# Mean passenger age in the training set.
train["Age"].mean()

29.318642716644145

In [23]:
# Mean fare paid in the training set.
train["Fare"].mean()

32.204207968574636

In [24]:
# Per passenger class: head count, mean age, mean fare and number of survivors.
pclass_group = train.groupby("Pclass").agg(
    pclass_count=("Pclass", "count"),
    pclass_age=("Age", "mean"),
    pclass_fare=("Fare", "mean"),
    pclass_survived=("Survived", "sum"),
)
# Survivors as a percentage of the class head count, rounded to a whole number.
pclass_group["pclass_survival_rate"] = (
    pclass_group["pclass_survived"].div(pclass_group["pclass_count"]).mul(100).round(0)
)
pclass_group

Unnamed: 0_level_0,pclass_count,pclass_age,pclass_fare,pclass_survived,pclass_survival_rate
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,216,38.378866,84.154687,136,63.0
2,184,29.907295,20.662183,87,47.0
3,491,25.112288,13.67555,119,24.0


In [25]:
# Per embarkation value: head count, mean age, mean fare and number of survivors.
embarked_group = train.groupby("Embarked").agg(
    embarked_count=("Embarked", "count"),
    embarked_age=("Age", "mean"),
    embarked_fare=("Fare", "mean"),
    embarked_survived=("Survived", "sum"),
)
# Survivors as a percentage of the group head count, rounded to a whole number.
embarked_group["embarked_survival_rate"] = (
    embarked_group["embarked_survived"].div(embarked_group["embarked_count"]).mul(100).round(0)
)
embarked_group

Unnamed: 0_level_0,embarked_count,embarked_age,embarked_fare,embarked_survived,embarked_survival_rate
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C,168,30.461519,59.954144,93,55.0
Q,77,25.690425,13.27603,30,39.0
S,646,29.45389,27.243651,219,34.0


Based on the EDA, the average fare for the trip was about 32 pounds and the average age was about 29 years. As expected, 1st-class passengers paid more than passengers in the other classes, but the margin is very large. Age also appears to be related to passenger class: older passengers tended to travel in the better classes.

People in higher passenger classes seemed to have a better survival rate.

### Preprocessing

In [28]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [29]:
def extract_salutation(name):
    """
    Return the salutation (e.g. "Mr", "Mrs") contained in a passenger name.

    The name is expected to look like "Surname, Salutation. Given names": the
    text between the first comma and the following period is returned with
    surrounding whitespace removed.
    """
    after_surname = name.split(",")[1]
    title, _, _ = after_surname.partition(".")
    return title.strip()
# Derive a "Salutation" column from each passenger's name in both splits.
train["Salutation"] = train["Name"].map(extract_salutation)
test["Salutation"] = test["Name"].map(extract_salutation)

In [None]:
# Drop the raw "Name" column from both splits.
# Rebinding instead of inplace=True avoids mutating frames behind the reader's
# back, and errors="ignore" keeps the cell re-runnable once the column is gone.
train = train.drop(columns=["Name"], errors="ignore")
test = test.drop(columns=["Name"], errors="ignore")

In [30]:
# Feature columns grouped by how they are handled later in the notebook:
# categorical_cols are one-hot encoded, numeric_cols are used as they are.
categorical_cols = ["Sex", "Ticket", "Cabin", "Embarked", "Salutation"]
numeric_cols = ["Pclass", "Age", "Related", "Fare"]
# NOTE(review): "AgeBucket" is engineered in this notebook but appears in
# neither list — confirm whether it was meant to be used as a feature.

In [31]:
# Checks whether the new Salutation column carries valuable information about survival:
# per salutation, head count, mean age, mean fare and number of survivors.
salutation_group = train.groupby("Salutation").agg(
    salutation_count=("Salutation", "count"),
    salutation_age=("Age", "mean"),
    salutation_fare=("Fare", "mean"),
    salutation_survived=("Survived", "sum"),
)
# Survivors as a percentage of the group head count, rounded to a whole number.
salutation_group["salutation_survival_rate"] = (
    salutation_group["salutation_survived"].div(salutation_group["salutation_count"]).mul(100).round(0)
)
salutation_group

Unnamed: 0_level_0,salutation_count,salutation_age,salutation_fare,salutation_survived,salutation_survival_rate
Salutation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capt,1,70.0,71.0,0,0.0
Col,2,58.0,31.025,1,50.0
Don,1,40.0,27.7208,0,0.0
Dr,7,41.897341,49.168457,3,43.0
Jonkheer,1,38.0,0.0,0,0.0
Lady,1,48.0,39.6,1,100.0
Major,2,48.5,28.525,1,50.0
Master,40,6.767509,34.703125,23,57.0
Miss,182,21.916526,43.797873,127,70.0
Mlle,2,24.0,59.4021,2,100.0


In [32]:
# Bucket ages into 15-year bins, each labelled by its lower edge (0, 15, 30, ...).
train["AgeBucket"] = train["Age"].floordiv(15).mul(15)
test["AgeBucket"] = test["Age"].floordiv(15).mul(15)

In [33]:
# Checks whether the new AgeBucket column carries valuable information about survival:
# mean survival per bucket (ascending), joined with the number of passengers in it.
group = (
    train.groupby("AgeBucket")[["Survived"]]
    .mean()
    .sort_values(by="Survived")
)
count = train["AgeBucket"].value_counts()
agebucket_survival_count_merge = group.merge(count, how="left", on="AgeBucket")

In [34]:
# Display the per-bucket survival rate next to the bucket size.
agebucket_survival_count_merge

Unnamed: 0_level_0,Survived,count
AgeBucket,Unnamed: 1_level_1,Unnamed: 2_level_1
60.0,0.24,25
15.0,0.331081,444
45.0,0.404494,89
30.0,0.42126,254
0.0,0.576923,78
75.0,1.0,1


In [44]:
# Display the training frame, including the engineered columns.
train

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,Related,Salutation,AgeBucket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.00,A/5 21171,7.2500,F G73,S,1,Mr,15.0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.00,PC 17599,71.2833,C85,C,1,Mrs,30.0
3,1,3,"Heikkinen, Miss. Laina",female,26.00,STON/O2. 3101282,7.9250,G6,S,0,Miss,15.0
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.00,113803,53.1000,C123,S,1,Mrs,30.0
5,0,3,"Allen, Mr. William Henry",male,35.00,373450,8.0500,F G73,S,0,Mr,30.0
...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.00,211536,13.0000,F2,S,0,Rev,15.0
888,1,1,"Graham, Miss. Margaret Edith",female,19.00,112053,30.0000,B42,S,0,Miss,15.0
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,21.75,W./C. 6607,23.4500,G6,S,3,Miss,15.0
890,1,1,"Behr, Mr. Karl Howell",male,26.00,111369,30.0000,C148,C,0,Mr,15.0


In [35]:
# Learn the categories from the training set only, then encode both splits as
# dense arrays. Categories unseen during fitting are ignored at transform time.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
one_hot_train = one_hot_encoder.fit_transform(train[categorical_cols]).toarray()
one_hot_test = one_hot_encoder.transform(test[categorical_cols]).toarray()

ValueError: Found unknown categories ['Johansson, Mr. Nils', 'McCarthy, Miss. Catherine Katie""', 'Samaan, Mr. Elias', 'Moubarek, Mrs. George (Omine Amenia" Alexander)"', 'Katavelas, Mr. Vassilios (Catavelas Vassilios")"', 'Carrau, Mr. Jose Pedro', 'Omont, Mr. Alfred Fernand', 'Risien, Mrs. Samuel (Emma)', 'Canavan, Mr. Patrick', 'Geiger, Miss. Amalie', 'Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genevieve Fosdick)', 'Williams, Mr. Richard Norris II', 'Frolicher-Stehli, Mrs. Maxmillian (Margaretha Emerentia Stehli)', 'Khalil, Mr. Betros', 'Drew, Master. Marshall Brines', 'Kink-Heilmann, Mr. Anton', 'Kink, Miss. Maria', 'Kiernan, Mr. John', 'Ryerson, Master. John Borie', 'Ware, Mr. William Jeffery', 'Ford, Mr. Edward Watson', 'Aks, Master. Philip Frank', 'Brown, Miss. Edith Eileen', 'Rowe, Mr. Alfred G', 'Hyman, Mr. Abraham', 'Mulvihill, Miss. Bertha E', 'Riihivouri, Miss. Susanna Juhantytar Sanni""', 'Hiltunen, Miss. Marta', 'Carver, Mr. Alfred John', 'Jefferys, Mr. Ernest Wilfred', 'Hocking, Miss. Ellen Nellie""', 'Harder, Mrs. George Achilles (Dorothy Annan)', 'Asplund, Master. Carl Edgar', 'Giles, Mr. Ralph', 'Ware, Mr. Frederick', 'Keeping, Mr. Edwin', "McNamee, Mrs. Neal (Eileen O'Leary)", 'White, Mrs. John Stuart (Ella Holmes)', 'Bjorklund, Mr. Ernst Herbert', 'Davies, Mr. Joseph', 'Goodwin, Mr. Charles Frederick', 'Daher, Mr. Shedid', 'Thomas, Mr. Charles P', 'Pokrnic, Mr. Tome', 'Clarke, Mr. Charles Valentine', 'Linehan, Mr. Michael', 'Everett, Mr. Thomas James', 'McCoy, Miss. Alicia', 'Kink-Heilmann, Mrs. Anton (Luise Heilmann)', 'Andersson, Miss. Ida Augusta Margareta', 'Aldworth, Mr. Charles Augustus', 'Beauchamp, Mr. Henry James', 'Touma, Miss. Maria Youssef', 'Ryerson, Mr. Arthur Larned', 'Davies, Mr. Evan', 'Compton, Mr. Alexander Taylor Jr', 'Burns, Miss. Mary Delia', 'Wenzel, Mr. Linhart', 'Pulbaum, Mr. Franz', 'Salomon, Mr. Abraham L', 'Vendel, Mr. Olof Edvin', 'Makinen, Mr. Kalle Edvard', 'Walcroft, Miss. Nellie', 'Stanton, Mr. 
Samuel Ward', 'Foley, Mr. Joseph', 'Robins, Mr. Alexander A', 'Cardeza, Mrs. James Warburton Martinez (Charlotte Wardle Drake)', 'Hagardon, Miss. Kate', 'de Brito, Mr. Jose Joaquim', 'Andrew, Mr. Frank Thomas', 'Howard, Miss. May Elizabeth', 'Franklin, Mr. Thomas Parham', 'Spencer, Mr. William Augustus', 'Hee, Mr. Ling', 'Louch, Mr. Charles Alexander', 'Buckley, Miss. Katherine', 'Mahon, Miss. Bridget Delia', 'Spector, Mr. Woolf', 'Ryerson, Mrs. Arthur Larned (Emily Maria Borie)', 'Lithman, Mr. Simon', 'Brobeck, Mr. Karl Rudolf', 'Becker, Miss. Ruth Elizabeth', 'Kennedy, Mr. John', 'Bradley, Miss. Bridget Delia', 'Fillbrook, Mr. Joseph Charles', 'Saether, Mr. Simon Sivertsen', 'Lyntakoff, Mr. Stanko', 'Nilsson, Mr. August Ferdinand', 'Peacock, Miss. Treasteall', 'Warren, Mr. Charles William', 'Lefebre, Mrs. Frank (Frances)', 'Pokrnic, Mr. Mate', 'Chaudanson, Miss. Victorine', 'Franklin, Mr. Charles (Charles Fardon)', 'Peter, Master. Michael J', 'Cacic, Miss. Manda', 'Herman, Mr. Samuel', 'Straus, Mr. Isidor', 'Keane, Mr. Daniel', 'Karlsson, Mr. Julius Konrad Eugen', 'Lines, Mrs. Ernest H (Elizabeth Lindsey James)', 'McCrae, Mr. Arthur Gordon', 'Drapkin, Miss. Jennie', 'Riordan, Miss. Johanna Hannah""', 'Birnbaum, Mr. Jakob', 'Andersson, Mr. Johan Samuel', 'Nesson, Mr. Israel', 'Widener, Mrs. George Dunton (Eleanor Elkins)', 'Thomson, Mr. Alexander Morrison', 'Minkoff, Mr. Lazar', 'Conlon, Mr. Thomas Henry', 'Roth, Miss. Sarah A', 'Fortune, Mrs. Mark (Mary McDougald)', 'Angheloff, Mr. Minko', 'Sage, Miss. Ada', 'Enander, Mr. Ingvar', 'Sage, Master. William Henry', 'Olsen, Master. Artur Karl', 'Whabee, Mrs. George Joseph (Shawneene Abi-Saab)', 'Denbury, Mr. Herbert', 'Rice, Master. Albert', 'Baimbrigge, Mr. Charles Robert', 'Duran y More, Miss. Florentina', 'Giles, Mr. Edgar', 'Rosenbaum, Miss. Edith Louise', 'Cook, Mrs. (Selena Rogers)', 'Samaan, Mr. Hanna', 'Thomas, Mr. John', 'del Carlo, Mrs. Sebastiano (Argenia Genovesi)', 'Maguire, Mr. 
John Edward', 'Mallet, Mrs. Albert (Antoinette Magnin)', 'Saade, Mr. Jean Nassr', 'Weisz, Mr. Leopold', 'Dulles, Mr. William Crothers', 'Rothschild, Mr. Martin', 'Assam, Mr. Ali', 'Douglas, Mrs. Frederick Charles (Mary Helene Baxter)', 'Holthen, Mr. Johan Martin', 'Stokes, Mr. Philip Joseph', 'Sandstrom, Miss. Beatrice Irene', 'Cribb, Miss. Laura Alice', 'Petersen, Mr. Marius', 'Duquemin, Mr. Joseph', 'Payne, Mr. Vivian Ponsonby', 'Murphy, Miss. Nora', 'McCrie, Mr. James Matthew', 'Compton, Mrs. Alexander Taylor (Mary Eliza Ingersoll)', 'Doyle, Miss. Elizabeth', 'West, Miss. Barbara J', 'Drew, Mr. James Vivian', 'Klasen, Miss. Gertrud Emilia', 'Flegenheim, Mrs. Alfred (Antoinette)', 'Becker, Mrs. Allen Oliver (Nellie E Baumgardner)', 'Dintcheff, Mr. Valtcho', 'Cor, Mr. Bartol', 'Gracie, Col. Archibald IV', 'Douglas, Mrs. Walter Donald (Mahala Dutton)', 'Wheeler, Mr. Edwin Frederick""', 'Oxenham, Mr. Percy Thomas', 'Mock, Mr. Philipp Edmund', 'Jefferys, Mr. Clifford Thomas', 'Baccos, Mr. Raffull', 'Peacock, Master. Alfred Edward', 'Spinner, Mr. Henry John', 'Olsson, Mr. Oscar Wilhelm', 'Matinoff, Mr. Nicola', 'Lundstrom, Mr. Thure Edvin', 'Watt, Miss. Bertha J', 'Cotterill, Mr. Henry Harry""', 'Davison, Mr. Thomas Henry', 'Karun, Mr. Franz', 'Dodge, Mrs. Washington (Ruth Vidaver)', 'Asplund, Master. Filip Oscar', "O'Connor, Mr. Patrick", 'Abelseth, Miss. Karen Marie', 'Badman, Miss. Emily Louisa', 'Hirvonen, Mrs. Alexander (Helga E Lindqvist)', 'Henriksson, Miss. Jenny Lovisa', 'Johansson Palmquist, Mr. Oskar Leander', 'Hellstrom, Miss. Hilda Maria', 'Swane, Mr. George', 'Abbott, Master. Eugene Joseph', 'Dean, Mrs. Bertram (Eva Georgetta Light)', 'Rheims, Mr. George Alexander Lucien', 'Collett, Mr. Sidney C Stuart', 'Ashby, Mr. John', 'Wittevrongel, Mr. Camille', 'Willer, Mr. Aaron (Abi Weller")"', 'Niklasson, Mr. Samuel', 'Krekorian, Mr. Neshan', 'McGowan, Miss. Katherine', 'Midtsjo, Mr. Karl Albert', 'Lingane, Mr. John', 'Warren, Mr. Frank Manley', 'Ryan, Mr. 
Edward', 'Zakarian, Mr. Mapriededer', 'Crosby, Mrs. Edward Gifford (Catherine Elizabeth Halstead)', 'Ware, Mrs. John James (Florence Louise Long)', 'Hilliard, Mr. Herbert Henry', 'Caldwell, Mr. Albert Francis', 'Peruschitz, Rev. Joseph Maria', 'Allison, Mr. Hudson Joshua Creighton', 'Fox, Mr. Patrick', 'Straus, Mrs. Isidor (Rosalie Ida Blun)', 'Aronsson, Mr. Ernst Axel Algot', 'Cacic, Mr. Jego Grga', 'Minahan, Mrs. William Edward (Lillian E Thorpe)', 'Laroche, Miss. Louise', 'Maybery, Mr. Frank Hubert', 'Thomas, Mrs. Alexander (Thamine Thelma")"', 'Thomas, Mr. Tannous', 'Gale, Mr. Harry', 'Abrahamsson, Mr. Abraham August Johannes', 'Nancarrow, Mr. William Henry', 'Carlsson, Mr. Carl Robert', 'Bowen, Miss. Grace Scott', 'Svensson, Mr. Johan Cervin', 'Wilson, Miss. Helen Alice', 'Strilic, Mr. Ivan', 'Howard, Mr. Benjamin', 'Schabert, Mrs. Paul (Emma Mock)', 'Daly, Miss. Margaret Marcella Maggie""', 'MacKay, Mr. George William', 'Karnes, Mrs. J Frank (Claire Bennett)', 'Johnston, Master. William Arthur Willie""', 'Davies, Mr. John Samuel', 'Brady, Mr. John Bertram', 'Finoli, Mr. Luigi', 'Smith, Mr. Lucien Philip', 'Vartanian, Mr. David', 'Oliva y Ocana, Dona. Fermina', 'Chronopoulos, Mr. Demetrios', 'Bryhl, Miss. Dagmar Jenny Ingeborg ', 'Evans, Miss. Edith Corse', 'Schmidt, Mr. August', 'Willard, Miss. Constance', 'Sage, Mr. John George', 'Lennon, Miss. Mary', 'Bird, Miss. Ellen', 'Wick, Mr. George Dennick', 'Astor, Col. John Jacob', 'Andersen, Mr. Albert Karvin', 'Dyker, Mrs. Adolf Fredrik (Anna Elisabeth Judith Andersson)', 'Foley, Mr. William', 'Wilkes, Mrs. James (Ellen Needs)', 'Rasmussen, Mrs. (Lena Jacobsen Solvang)', 'Lamb, Mr. John Joseph', 'Sadowitz, Mr. Harry', 'Loring, Mr. Joseph Holland', 'Mangiavacchi, Mr. Serafino Emilio', 'Frauenthal, Mr. Isaac Gerald', 'Ilieff, Mr. Ylio', 'Phillips, Miss. Alice Frances Louisa', 'Salander, Mr. Karl Johan', 'Bentham, Miss. Lilian W', 'Case, Mr. Howard Brown', 'Pedersen, Mr. Olaf', 'Klasen, Mrs. 
(Hulda Kristina Eugenia Lofqvist)', 'Wirz, Mr. Albert', 'Marvin, Mrs. Daniel Warner (Mary Graham Carmichael Farquarson)', 'de Messemaeker, Mr. Guillaume Joseph', "O'Keefe, Mr. Patrick", 'Parker, Mr. Clifford Richard', 'Dibden, Mr. William', 'Angle, Mr. William A', 'Clark, Mrs. Walter Miller (Virginia McDowell)', 'Delalic, Mr. Redjo', 'Ostby, Miss. Helene Ragnhild', 'Cavendish, Mrs. Tyrell William (Julia Florence Siegel)', 'Asplund, Mr. Carl Oscar Vilhelm Gustafsson', 'Peacock, Mrs. Benjamin (Edith Nile)', 'Torfa, Mr. Assad', 'Wells, Mrs. Arthur Henry (Addie" Dart Trevaskis)"', 'Stengel, Mrs. Charles Emil Henry (Annie May Morris)', 'Bucknell, Mrs. William Robert (Emma Eliza Ward)', 'Malachard, Mr. Noel', 'Snyder, Mr. John Pillsbury', 'Danbom, Master. Gilbert Sigvard Emanuel', 'Colbert, Mr. Patrick', 'Chaffee, Mrs. Herbert Fuller (Carrie Constance Toogood)', 'Sincock, Miss. Maude', 'Brandeis, Mr. Emil', 'Barry, Miss. Julia', 'Hipkins, Mr. William Edward', 'Khalil, Mrs. Betros (Zahie Maria" Elias)"', 'Ware, Mr. John James', 'Rosenshine, Mr. George (Mr George Thorne")"', 'Coutts, Mrs. William (Winnie Minnie" Treanor)"', 'Candee, Mrs. Edward (Helen Churchill Hungerford)', 'Snyder, Mrs. John Pillsbury (Nelle Stevenson)', 'Sweet, Mr. George Frederick', "O'Donoghue, Ms. Bridget", 'Hocking, Mr. Samuel James Metcalfe', 'Sap, Mr. Julius', 'Miles, Mr. Frank', 'Wiklund, Mr. Karl Johan', 'Daniels, Miss. Sarah', 'Hays, Mr. Charles Melville', 'Hansen, Mrs. Claus Peter (Jennie L Howard)', 'Kimball, Mrs. Edwin Nelson Jr (Gertrude Parsons)', 'Demetri, Mr. Marinko', 'Herman, Miss. Kate', 'Peltomaki, Mr. Nikolai Johannes', 'Abrahim, Mrs. Joseph (Sophie Halaut Easu)', 'Boulos, Master. Akar', 'Hold, Mrs. Stephen (Annie Margaret Hill)', 'Botsford, Mr. William Hull', 'Brown, Mrs. John Murray (Caroline Lane Lamson)', 'Earnshaw, Mrs. Boulton (Olive Potter)', 'Storey, Mr. Thomas', 'Dodge, Dr. Washington', 'Lockyer, Mr. Edward', 'Assaf, Mr. Gerios', 'Gibson, Mrs. 
Leonard (Pauline C Boeson)', 'Fleming, Miss. Honora', 'Stengel, Mr. Charles Emil Henry', 'McCaffry, Mr. Thomas Francis', 'Carr, Miss. Jeannie', 'Ford, Mr. Arthur', 'Oreskovic, Miss. Jelka', 'Dean, Miss. Elizabeth Gladys Millvina""', 'Clark, Mr. Walter Miller', 'Jones, Mr. Charles Cresson', 'Ismay, Mr. Joseph Bruce', 'Kenyon, Mr. Frederick R', 'van Billiard, Master. James William', 'Goldsmith, Mr. Nathan', 'Widener, Mr. George Dunton', 'Fortune, Miss. Ethel Flora', 'Spedden, Master. Robert Douglas', 'Nourney, Mr. Alfred (Baron von Drachstedt")"', 'Asplund, Mr. Johan Charles', 'Assaf Khalil, Mrs. Mariana (Miriam")"', 'van Billiard, Master. Walter John', 'Cornell, Mrs. Robert Clifford (Malvina Helen Lamson)', 'Greenfield, Mrs. Leo David (Blanche Strouse)', 'Chevre, Mr. Paul Romaine', 'Nasr, Mr. Mustafa', 'Reynolds, Mr. Harold J', 'Moore, Mr. Clarence Bloomfield', 'Rogers, Mr. Reginald Harry', 'Mardirosian, Mr. Sarkis', 'Lane, Mr. Patrick', 'Veal, Mr. James', 'Elias, Mr. Joseph', 'Caram, Mr. Joseph', 'Palsson, Master. Paul Folke', 'Pallas y Castello, Mr. Emilio', 'Lindell, Mrs. Edvard Bengtsson (Elin Gerda Persson)', 'Mahon, Mr. John', 'Nakid, Mrs. Said (Waika Mary" Mowad)"', 'Nieminen, Miss. Manta Josefina', 'Deacon, Mr. Percy William', 'Shine, Miss. Ellen Natalia', 'Gilbert, Mr. William', 'Chisholm, Mr. Roderick Robert Crispin', 'Rosblom, Miss. Salli Helena', 'Cumings, Mr. John Bradley', 'Head, Mr. Christopher', 'Pearce, Mr. Ernest', 'Corbett, Mrs. Walter H (Irene Colvin)', 'Kreuchen, Miss. Emilie', 'Myles, Mr. Thomas Francis', 'Tucker, Mr. Gilbert Milligan Jr', 'Portaluppi, Mr. Emilio Ilario Giuseppe', 'McNeill, Miss. Bridget', 'Larsson-Rondberg, Mr. Edvard A', 'Buckley, Mr. Daniel', 'Dyker, Mr. Adolf Fredrik', 'Braf, Miss. Elin Ester Maria', 'Julian, Mr. Henry Forbes', 'Christy, Mrs. (Alice Frances)', 'Wells, Master. Ralph Lester', 'Tenglin, Mr. Gunnar Isidor', 'Naughton, Miss. Hannah', 'Bonnell, Miss. Caroline', 'Ilmakangas, Miss. Ida Livija', 'Bowenur, Mr. 
Solomon', 'Corey, Mrs. Percy C (Mary Phyllis Elizabeth Miller)', 'Zakarian, Mr. Ortin', 'Chapman, Mrs. John Henry (Sara Elizabeth Lawry)', 'Lahtinen, Rev. William', 'Quick, Miss. Winifred Vera', 'Dennis, Mr. William', 'Phillips, Mr. Escott Robert', 'Goodwin, Miss. Jessie Allis', 'Shaughnessy, Mr. Patrick', 'Borebank, Mr. John James', 'Gibson, Miss. Dorothy Winifred', 'Nilsson, Miss. Berta Olivia', 'Johnston, Mrs. Andrew G (Elizabeth Lily" Watson)"', 'Dika, Mr. Mirko', 'Crafton, Mr. John Bertram', 'Faunthorpe, Mr. Harry', 'Wright, Miss. Marion', 'Harbeck, Mr. William H', 'Karlsson, Mr. Einar Gervasius', 'Abelseth, Mr. Olaus Jorgensen', 'Lindeberg-Lind, Mr. Erik Gustaf (Mr Edward Lingrey")"', 'Beattie, Mr. Thomson', 'Spedden, Mr. Frederic Oakley', 'Davidson, Mrs. Thornton (Orian Hays)', 'Smyth, Miss. Julia', 'Touma, Master. Georges Youssef', 'Sage, Mrs. John (Annie Bullen)', 'Cor, Mr. Ivan', 'Vander Planke, Mr. Julius', 'Davies, Mrs. John Morgan (Elizabeth Agnes Mary White) ', 'Smith, Mrs. Lucien Philip (Mary Eloise Hughes)', 'Guest, Mr. Robert', 'Howard, Mrs. Benjamin (Ellen Truelove Arman)', 'Jonsson, Mr. Nils Hilding', 'Lindstrom, Mrs. Carl Johan (Sigrid Posse)', 'Betros, Master. Seman', 'Lundin, Miss. Olga Elida', 'Ovies y Rodriguez, Mr. Servando'] in column 0 during transform

In [None]:
# Target vector: the "Survived" column of the training set as a NumPy array.
y_train = train["Survived"].to_numpy()

In [None]:
# Assemble the feature matrices: numeric columns first, one-hot columns after.
X_train = np.hstack([train[numeric_cols].to_numpy(), one_hot_train])
X_test = np.hstack([test[numeric_cols].to_numpy(), one_hot_test])

In [None]:
# Sanity check: print the shapes of the train and test feature matrices.
print(X_train.shape)
print(X_test.shape)

In [None]:
# Standardise the features. The scaler is fitted on the training data only and
# the *same* fitted transformation is applied to the test data; refitting on the
# test set would scale it with different statistics than the model was trained on.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Build some ML models.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV

Let's use grid search to search for hyperparameters.

In [None]:
# Fix the seed so repeated runs give the same model: LinearSVC uses its random
# state when solving the dual problem.
svm_clf = LinearSVC(random_state=42)

In [None]:
# Hyperparameter grid for LinearSVC: the regularisation parameter C and the
# choice of dual/primal formulation.
params = [{"C": [1, 2, 3, 5], "dual": ["auto", False]}]

In [None]:
# Search the whole grid, scoring each candidate by cross-validated accuracy.
grid_search = GridSearchCV(
    estimator=svm_clf,
    param_grid=params,
    scoring="accuracy",
    verbose=3,
)
grid_search.fit(X_train_scaled, y_train)

In [None]:
# Hyperparameter combination that achieved the best score in the grid search.
print(grid_search.best_params_)

In [None]:
# Print the mean cross-validated score of every candidate in the grid.
cvres = grid_search.cv_results_
# The loop variable is deliberately not called `params`: that name holds the
# hyperparameter grid and would otherwise be overwritten by the last candidate,
# breaking a re-run of the grid-search cell.
for mean_test_score, candidate_params in zip(cvres["mean_test_score"], cvres["params"]):
    print(f"mean_test_score: {mean_test_score} and params: {candidate_params}")

In [None]:
# Highest mean cross-validated accuracy among all candidates.
print(cvres["mean_test_score"].max())

In [None]:
# Predict survival for the scaled test features using the fitted grid search.
predictions = grid_search.predict(X_test_scaled)

In [None]:
# Build the submission file. Passenger ids are taken from the test frame's
# index (set to "PassengerId" earlier) instead of a hard-coded id range, so
# every prediction stays paired with the row it was computed from.
df = pd.DataFrame({"PassengerId": test.index, "Survived": predictions})
df.to_csv("titanic_svm.csv", index=False)