In [13]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# TITANIC MACHINE LEARNING FROM DISASTER - Kyle Graupe

In [14]:
import numpy as np
from IPython.display import display

In [15]:
# READ DATA INTO PANDAS DATAFRAME
train_input = pd.read_csv("/kaggle/input/titanic/train.csv")
test_input = pd.read_csv("/kaggle/input/titanic/test.csv")

# VISUALIZE TRAIN AND TEST DATAFRAMES
print("\n================")
print("TRAIN INFO: ")
display(train_input.info())
print("\n================")
print("TEST INFO: ")
display(test_input.info())

print("\n================")
print("TRAIN HEAD: train_input.head(20)")
display(train_input.head(20))
print("\n================")
print("TEST HEAD: ")
display(test_input.head(20))

print("\n================")
print(f"TRAIN SHAPE: {train_input.shape}")
print("\n================")
print(f"TEST SHAPE: {test_input.shape}")

print("\n================")
print(f"TRAIN COLUMNS: \n {train_input.columns}")
print("\n================")
print(f"TEST COLUMNS: \n{test_input.columns}")


In [16]:
# COPY INPUT DATA
train = train_input.copy()
test = test_input.copy()

In [17]:
# FEATURE SELECTION: DROP PASSENGER ID AND NAME

# CREATE A LIST OF COLUMN NAMES TO DROP
drop_list = ["PassengerId", "Name"]

# FOR LOOP TO DROP COLUMNS BY NAME
for i in drop_list:
    train = train.drop(i, axis=1)
    test = test.drop(i, axis=1)
    
# CHECK THE OUTPUT
print("\n================")
print("TRAIN INFO: ")
print(train.info())
print("\n================")
print("TEST INFO: ")
print(test.info())

In [18]:
# FEATURE ENCODING

# WORK WITH ENCODING COPY
train_enc = train.copy()
test_enc = test.copy()

# CREATE ONE-HOT DATAFRAMES
embarked_OH = pd.get_dummies(train_enc.Embarked, prefix='OH_Embarked')
sex_OH = pd.get_dummies(train_enc.Sex, prefix='OH_Sex')

embarked_test_OH = pd.get_dummies(test_enc.Embarked, prefix='OH_Embarked')
sex_test_OH = pd.get_dummies(test_enc.Sex, prefix='OH_Sex')

# CHECK THE OUTPUT
print("\n================")
print("ONE HOT CHECK: ")
display(embarked_OH.head(10))
display(sex_OH.head(10))

# ADD ONE-HOT ENCODED COLUMNS TO "train_enc" DATAFRAME
train_enc["OH_C"] = embarked_OH["OH_Embarked_C"]
train_enc["OH_Q"] = embarked_OH["OH_Embarked_Q"]
train_enc["OH_S"] = embarked_OH["OH_Embarked_S"]

train_enc["OH_female"] = sex_OH["OH_Sex_female"]
train_enc["OH_male"] = sex_OH["OH_Sex_male"]

# ADD ONE-HOT ENCODED COLUMNS TO "test_enc" DATAFRAME
test_enc["OH_C"] = embarked_test_OH["OH_Embarked_C"]
test_enc["OH_Q"] = embarked_test_OH["OH_Embarked_Q"]
test_enc["OH_S"] = embarked_test_OH["OH_Embarked_S"]

test_enc["OH_female"] = sex_test_OH["OH_Sex_female"]
test_enc["OH_male"] = sex_test_OH["OH_Sex_male"]

# DROP NON-ENCODED COLUMNS
train_enc = train_enc.drop("Embarked", axis=1)
train_enc = train_enc.drop("Sex", axis=1)

test_enc = test_enc.drop("Embarked", axis=1)
test_enc = test_enc.drop("Sex", axis=1)

# CHECK THE OUTPUT
print("\n================")
print("COLUMN CHECK: ")
print(train_enc.columns)
print("\n================")
print("DATAFRAME CHECK: ")
display(train_enc.head(20))

On further iteration, implement Cabin/Fare relationship algorithm. The cabin object may give insight into the survivability based on the location of the passenger (assuming they were in their cabin). The Cabin column only has 91 non-null values, but maybe the Fare and Ticket columns can help to locate the Cabin number.

For now, Cabin and Ticket will be dropped. Later iterations, we can feature extract some information from these columns, but we will ignore them for now. 

In [19]:
# FEATURE EXTRACTION AND ENCODING

# WORK WITH COPY
train_ext = train_enc.copy()
test_ext = test_enc.copy()

# THIS COLUMN TRACKS IF THE PASSENGER HAS SIBLINGS, A SPOUSE, A PARENT, OR A CHILD ON BOARD WITH THEM (DOES NOT ACCOUNT
# FOR GRANDPARENTS -> SOURCE OF BIAS)
# CREATE NEW COLUMN: 1 FOR LONE TRAVELER, 0 FOR IMMEDIATE FAMILY ON BOARD
train_ext["No Immediate Family"] = 0
test_ext["No Immediate Family"] = 0

# ITERATE THROUGH DATAFRAME ROWS. IF NO. OF SIBLINGS/SPOUSE AND PARENTS/CHILDREN EQUALS ZERO, THEN THEY HAVE NO IMMEDIATE
# FAMILY ON BOARD. SET VALUES TO 1 IN THIS INSTANCE.
for i in range(len(train_ext)):
    if train_ext.loc[i, "SibSp"] == 0 and train_ext.loc[i,"Parch"] == 0:
        train_ext.loc[i, "No Immediate Family"] = 1
        
for u in range(len(test_ext)):
    if test_ext.loc[u, "SibSp"] == 0 and test_ext.loc[u,"Parch"] == 0:
        test_ext.loc[u, "No Immediate Family"] = 1

# NUMBER OF LONE TRAVELERS
print("\n================")
print("NO IMMEDIATE FAMILY VALUE COUNTS: ")
print(train_ext["No Immediate Family"].value_counts())
print("1 = NO IMMEDIATE FAMILY ON BOARD, 0 = IMMEDIATE FAMILY ON BOARD")

train_ext["Lone Male"] = 0
train_ext["Lone Female"] = 0

# ALGORITHM TO DETERMINE IF INDIVIDUAL MALES ARE TRAVELING WITHOUT IMMEDIATE FAMILY
for j in range(len(train_ext)):
    if train_ext.loc[j, "No Immediate Family"] == 1 and train_ext.loc[j, "OH_male"] == 1:
        train_ext.loc[j, "Lone Male"] = 1

# ALGORITHM TO DETERMINE IF INDIVIDUAL FEMALES ARE TRAVELING WITHOUT IMMEDIATE FAMILY
for k in range(len(train_ext)):
    if train_ext.loc[k, "No Immediate Family"] == 1 and train_ext.loc[k, "OH_female"] == 1:
        train_ext.loc[k, "Lone Female"] = 1

    
test_ext["Lone Male"] = 0
test_ext["Lone Female"] = 0

# ALGORITHM TO DETERMINE IF INDIVIDUAL MALES ARE TRAVELING WITHOUT IMMEDIATE FAMILY
for z in range(len(test_ext)):
    if test_ext.loc[z, "No Immediate Family"] == 1 and test_ext.loc[z, "OH_male"] == 1:
        test_ext.loc[z, "Lone Male"] = 1

# ALGORITHM TO DETERMINE IF INDIVIDUAL FEMALES ARE TRAVELING WITHOUT IMMEDIATE FAMILY
for l in range(len(test_ext)):
    if test_ext.loc[l, "No Immediate Family"] == 1 and test_ext.loc[l, "OH_female"] == 1:
        test_ext.loc[l, "Lone Female"] = 1

print("\n================")
print("LONE MALE VALUE COUNTS: ")
print(train_ext["Lone Male"].value_counts())
print("1 = LONE MALE, 0 = IMMEDIATE FAMILE ON BOARD")

print("\n================")
print("LONE FEMALE VALUE COUNTS: ")
print(train_ext["Lone Female"].value_counts())
print("1 = LONE FEMALE, 0 = IMMEDIATE FAMILE ON BOARD")

# CONVERT COLUMNS TO STRING FOR WRANGLING
train_ext["Cabin"] = train_ext["Cabin"].astype('str')
test_ext["Cabin"] = test_ext["Cabin"].astype('str')

# CREATE COLUMN FOR CABIN DECK LEVEL
train_ext["Deck"] = ''
test_ext["Deck"] = ''

# SET DECK LEVELS
for h in range(len(train_ext)):
    if train_ext.loc[h, "Cabin"][0] == "A":
        train_ext.loc[h, "Deck"] = "A"
    elif train_ext.loc[h, "Cabin"][0] == "B":
        train_ext.loc[h, "Deck"] = "B"
    elif train_ext.loc[h, "Cabin"][0] == "C":
        train_ext.loc[h, "Deck"] = "C"
    elif train_ext.loc[h, "Cabin"][0] == "D":
        train_ext.loc[h, "Deck"] = "D"
    elif train_ext.loc[h, "Cabin"][0] == "E":
        train_ext.loc[h, "Deck"] = "E"
    elif train_ext.loc[h, "Cabin"][0] == "F":
        train_ext.loc[h, "Deck"] = "F"
    elif train_ext.loc[h, "Cabin"][0] == "G":
        train_ext.loc[h, "Deck"] = "G"

# SET DECK LEVELS
for f in range(len(test_ext)):
    if test_ext.loc[f, "Cabin"][0] == "A":
        test_ext.loc[f, "Deck"] = "A"
    elif test_ext.loc[f, "Cabin"][0] == "B":
        test_ext.loc[f, "Deck"] = "B"
    elif test_ext.loc[f, "Cabin"][0] == "C":
        test_ext.loc[f, "Deck"] = "C"
    elif test_ext.loc[f, "Cabin"][0] == "D":
        test_ext.loc[f, "Deck"] = "D"
    elif test_ext.loc[f, "Cabin"][0] == "E":
        test_ext.loc[f, "Deck"] = "E"
    elif test_ext.loc[f, "Cabin"][0] == "F":
        test_ext.loc[f, "Deck"] = "F"
    elif test_ext.loc[f, "Cabin"][0] == "G":
        test_ext.loc[f, "Deck"] = "G"

# CREATE DATAFRAME FOR ONE-HOT ENCODING OF DECK LEVELS
deck_OH = pd.get_dummies(train_ext.Deck, prefix='OH_Deck')
deck_test_OH = pd.get_dummies(test_ext.Deck, prefix='OH_Deck')

# CHECK THE OUTPUT
print("\n================")
print("ONE HOT CHECK: ")
display(deck_OH.head(10))

# ADD ONE-HOT ENCODED DECK DATAFRAMES TO MASTER WORKING DATAFRAME
train_ext["A Deck"] = deck_OH["OH_Deck_A"]
train_ext["B Deck"] = deck_OH["OH_Deck_B"]
train_ext["C Deck"] = deck_OH["OH_Deck_C"]
train_ext["D Deck"] = deck_OH["OH_Deck_D"]
train_ext["E Deck"] = deck_OH["OH_Deck_E"]
train_ext["F Deck"] = deck_OH["OH_Deck_F"]
train_ext["G Deck"] = deck_OH["OH_Deck_G"]
train_ext["No Deck Indicated"] = deck_OH["OH_Deck_"]

# ADD ONE-HOT ENCODED DECK DATAFRAMES TO MASTER WORKING DATAFRAME
test_ext["A Deck"] = deck_test_OH["OH_Deck_A"]
test_ext["B Deck"] = deck_test_OH["OH_Deck_B"]
test_ext["C Deck"] = deck_test_OH["OH_Deck_C"]
test_ext["D Deck"] = deck_test_OH["OH_Deck_D"]
test_ext["E Deck"] = deck_test_OH["OH_Deck_E"]
test_ext["F Deck"] = deck_test_OH["OH_Deck_F"]
test_ext["G Deck"] = deck_test_OH["OH_Deck_G"]
test_ext["No Deck Indicated"] = deck_test_OH["OH_Deck_"]

# GET TICKET CLASS ONE-HOT ENCODED DATAFRAME
class_OH = pd.get_dummies(train_ext.Pclass, prefix='OH_Class')
class_test_OH = pd.get_dummies(test_ext.Pclass, prefix='OH_Class')

# CHECK THE OUTPUT
print("\n================")
print("ONE HOT CHECK: ")
display(class_OH.head(10))

# ADD ONE-HOT ENCODED TICKET CLASS DATAFRAMES TO MASTER WORKING DATAFRAME
train_ext["First Class"] = class_OH["OH_Class_1"]
train_ext["Second Class"] = class_OH["OH_Class_2"]
train_ext["Third Class"] = class_OH["OH_Class_3"]

# ADD ONE-HOT ENCODED TICKET CLASS DATAFRAMES TO MASTER WORKING DATAFRAME
test_ext["First Class"] = class_test_OH["OH_Class_1"]
test_ext["Second Class"] = class_test_OH["OH_Class_2"]
test_ext["Third Class"] = class_test_OH["OH_Class_3"]

print("\n================")
print("TRAIN_EXT INFO: ")
print(train_ext.info())

print("\n================")
print("TRAIN_EXT HEAD: ")
display(train_ext.head(10))
print("\n================")
print("COLUMNS: ")
print(train_ext.columns)


In [20]:
# EXTRACT CABIN NUMBER

# WORK WITH COPY
train_cab = train_ext.copy()
test_cab = test_ext.copy()

# ISOLATE CABIN NUMBERS
train_cab["Cabin"] = train_cab["Cabin"].str.replace('A', '')
train_cab["Cabin"] = train_cab["Cabin"].str.replace('B', '')
train_cab["Cabin"] = train_cab["Cabin"].str.replace('C', '')
train_cab["Cabin"] = train_cab["Cabin"].str.replace('D', '0')
train_cab["Cabin"] = train_cab["Cabin"].str.replace('E', '')
train_cab["Cabin"] = train_cab["Cabin"].str.replace('F', '')
train_cab["Cabin"] = train_cab["Cabin"].str.replace('G', '')
train_cab["Cabin"] = train_cab["Cabin"].str.replace('T', '0')

# ISOLATE CABIN NUMBERS
test_cab["Cabin"] = test_cab["Cabin"].str.replace('A', '')
test_cab["Cabin"] = test_cab["Cabin"].str.replace('B', '')
test_cab["Cabin"] = test_cab["Cabin"].str.replace('C', '')
test_cab["Cabin"] = test_cab["Cabin"].str.replace('D', '0')
test_cab["Cabin"] = test_cab["Cabin"].str.replace('E', '')
test_cab["Cabin"] = test_cab["Cabin"].str.replace('F', '0')
test_cab["Cabin"] = test_cab["Cabin"].str.replace('G', '')
test_cab["Cabin"] = test_cab["Cabin"].str.replace('T', '0')

# SECOND WORKING COPY
train_loc = train_cab.copy()
test_loc = test_cab.copy()

# CREATE NEW COLUMN FOR PORT VS STARBOARD CABIN CLASSIFICATION: PORT = 1, STBD = 0
train_cab["Cabin P/S"] = 1
test_cab["Cabin P/S"] = 1

# TRAIN SET

# HARD CODE STRING REPLACEMENT
for p in range(len(train_loc)):
    if train_loc.loc[p, "Cabin"] == "nan":
        train_loc.loc[p, "Cabin"] = "0"
    if train_loc.loc[p, "Cabin"] == ' ':
        train_loc.loc[p, "Cabin"] == '0'

# FIND CABIN NUMBER
for q in range(len(train_loc)):
    a = train_loc.loc[q, "Cabin"].split()
    b = [int(x) for x in a]
    train_loc.loc[q, "Cabin"] = b[0]

# BINARY PORT/STARBOARD CLASSIFICATION
for r in range(len(train_loc)):
    if train_loc.loc[r, "Cabin"] % 2 == 0:
        train_cab.loc[r, "Cabin P/S"] = 0

# HARD CODE STRING REPLACEMENT
for n in range(len(train_cab)):
    if train_cab.loc[n, "Cabin"] == "nan":
        train_cab.loc[n, "Cabin"] = "0"
    if train_cab.loc[n, "Cabin"] == ' ':
        train_cab.loc[n, "Cabin"] == '0'

# FIND AVERAGE OF CABIN NUMBERS IN CASE THEY WERE BOUGHT TOGETHER
for o in range(len(train_cab)):
    a = train_cab.loc[o, "Cabin"].split()
    b = [int(x) for x in a]
    num_sum = sum(b)
    avg = num_sum/len(b)
    train_cab.loc[o, "Cabin"] = avg
    
# TEST SET 

# HARD CODE STRING REPLACEMENT
for aa in range(len(test_loc)):
    if test_loc.loc[aa, "Cabin"] == "nan":
        test_loc.loc[aa, "Cabin"] = "0"
    if test_loc.loc[aa, "Cabin"] == ' ':
        test_loc.loc[aa, "Cabin"] == '0'


# for q in range(len(train_loc)):
#     a = train_loc.loc[q, "Cabin"].split()
#     b = [int(x) for x in a]
#     train_loc.loc[q, "Cabin"] = b[0]

# FIND CABIN NUMBER OF FIRST INSTANCE (IN CASE IN LIST OF CABIN NUMBERS)
for bb in range(len(test_loc)):
    a_1 = test_loc.loc[bb, "Cabin"].split()
    b_1 = [int(x_1) for x_1 in a_1]
    test_loc.loc[bb, "Cabin"] = b_1[0]

# DETERMINE IF CABIN IS ON PORT OR STARBOARD SIDE. SHIPS HAVE EVEN NUMBER COMPARTMENTS FOR STARBOARD, ODD FOR PORT
for cc in range(len(test_loc)):
    if test_loc.loc[cc, "Cabin"] % 2 == 0:
        test_cab.loc[cc, "Cabin P/S"] = 0

# HARDCODE STRING REPLACEMENT
for dd in range(len(test_cab)):
    if test_cab.loc[dd, "Cabin"] == "nan":
        test_cab.loc[dd, "Cabin"] = "0"
    if test_cab.loc[dd, "Cabin"] == ' ':
        test_cab.loc[dd, "Cabin"] == '0'

# FIND AVERAGE LOCATION OF CABIN
for ee in range(len(test_cab)):
    a_2 = test_cab.loc[ee, "Cabin"].split()
    b_2 = [int(x_2) for x_2 in a_2]
    num_sum_2 = sum(b_2)
    avg_2 = num_sum_2/len(b_2)
    test_cab.loc[ee, "Cabin"] = avg_2

print("\n================")
display(train_cab.head(10))

In [21]:
# EXTRACT DATA FROM SIBLINGS/SPOUSES AND PARENTS/CHILDREN

In [22]:
# DROP IRRELEVANT COLUMNS (THOSE THAT HAVE BEEN ENCODED)

# WORK WITH COPY
train_drop = train_cab.copy()
test_drop = test_cab.copy()

# CREATE LIST OF COLUMNS TO DROP
drop_list_2 = ["Pclass", "Ticket", "Cabin", "Deck"]

# ITERATE AND DROP COLUMNS SPECIFIED ABOVE
for t in drop_list_2:
    train_drop = train_drop.drop(t, axis=1)
    test_drop = test_drop.drop(t, axis=1)

print("\n================")
print("TRAIN HEAD: ")
display(train_drop.head())
print("\n================")
print("TRAIN INFO: ")
print(train_drop.info())
print("\n================")
print("TEST HEAD: ")
display(test_drop.head())
print("\n================")
print("TEST INFO: ")
print(test_drop.info())

In [23]:
from sklearn.impute import SimpleImputer
# REPLACE MISSING VALUES FOR AGE

# WORK WITH COPY
train_imp = train_drop.copy()
test_imp = test_drop.copy()

# CONVERT AGE COLUMN TO FLOATS
train_imp["Age"] = train_imp["Age"].astype(float)
test_imp["Age"] = test_imp["Age"].astype(float)

print("\n================")
print("TRAIN DATATYPES: ")
print(train_imp.dtypes)
print("\n================")
print("TEST DATATYPES: ")
print(test_imp.dtypes)

# INSTANTIATE SIMPLEIMPUTER
imputer = SimpleImputer(missing_values=np.nan, strategy='median')

# CONVERT IMPUTED DATA BACK INTO PANDAS DATAFRAME
train_imp = pd.DataFrame(imputer.fit_transform(train_imp), columns=train_drop.columns, index=train_drop.index)
test_imp = pd.DataFrame(imputer.fit_transform(test_imp), columns=test_drop.columns, index=test_drop.index)

train_imp["Minor"] = 0

for iter in range(len(train_imp)):
    if train_imp.loc[iter, "Age"] <= 12.0:
        train_imp.loc[iter, "Minor"] = 1
        
test_imp["Minor"] = 0

for iter_minor in range(len(test_imp)):
    if test_imp.loc[iter_minor, "Age"] <= 12.0:
        test_imp.loc[iter_minor, "Minor"] = 1
        
# train_imp["Senior Citizen"] = 0

# for iter_1 in range(len(train_imp)):
#     if train_imp.loc[iter_1, "Age"] >= 60:
#         train_imp.loc[iter_1, "Senior Citizen"] = 1
        
# train_imp["Lone Minor"] = 0

# for iter_2 in range(len(train_imp)):
#     if train_imp.loc[iter_2, "Age"] <= 12.0 and train_imp.loc[iter_2, "SibSp"] == 0 and train_imp.loc[iter_2, "Parch"] == 0:
#         train_imp.loc[iter_2, "Lone Minor"] = 1
        
# train_imp["Lone Male Senior Citizen"] = 0

# for iter_3 in range(len(train_imp)):
#     if train_imp.loc[iter_3, "Age"] >= 60 and train_imp.loc[iter_3, "Lone Male"] == 1:
#         train_imp.loc[iter_3, "Lone Male Senior Citizen"] = 1
        
# train_imp["Lone Female Senior Citizen"] = 0

# for iter_4 in range(len(train_imp)):
#     if train_imp.loc[iter_4, "Age"] >= 60 and train_imp.loc[iter_4, "Lone Female"] == 1:
#         train_imp.loc[iter_4, "Lone Female Senior Citizen"] = 1
        
train_imp["First Class Woman"] = 0
train_imp["First Class Child"] = 0
train_imp["First Class Man"] = 0

for iter_0 in range(len(train_imp)):
    if train_imp.loc[iter_0, "OH_female"] == 1 and train_imp.loc[iter_0, "Age"] >= 12.0 and train_imp.loc[iter_0, "First Class"] == 1:
        train_imp.loc[iter_0, "First Class Woman"] = 1
        
for iter_1 in range(len(train_imp)):
    if train_imp.loc[iter_1, "Age"] <= 12 and train_imp.loc[iter_1, "First Class"] == 1:
        train_imp.loc[iter_1, "First Class Child"] = 1
        
for iter_2 in range(len(train_imp)):
    if train_imp.loc[iter_2, "OH_male"] == 1 and train_imp.loc[iter_2, "Age"] >= 12.0 and train_imp.loc[iter_2, "First Class"] == 1:
        train_imp.loc[iter_2, "First Class Man"] = 1
    
# *************

test_imp["First Class Woman"] = 0
test_imp["First Class Child"] = 0
test_imp["First Class Man"] = 0

for iter_a in range(len(test_imp)):
    if test_imp.loc[iter_a, "OH_female"] == 1 and test_imp.loc[iter_a, "Age"] >= 12.0 and test_imp.loc[iter_a, "First Class"] == 1:
        test_imp.loc[iter_a, "First Class Woman"] = 1
        
for iter_b in range(len(test_imp)):
    if test_imp.loc[iter_b, "Age"] <= 12 and test_imp.loc[iter_b, "First Class"] == 1:
        test_imp.loc[iter_b, "First Class Child"] = 1
        
for iter_c in range(len(test_imp)):
    if test_imp.loc[iter_c, "OH_male"] == 1 and test_imp.loc[iter_c, "Age"] >= 12.0 and test_imp.loc[iter_c, "First Class"] == 1:
        test_imp.loc[iter_c, "First Class Man"] = 1

# **********

train_imp["Second Class Woman"] = 0
train_imp["Second Class Child"] = 0
train_imp["Second Class Man"] = 0

for iter_3 in range(len(train_imp)):
    if train_imp.loc[iter_3, "OH_female"] == 1 and train_imp.loc[iter_3, "Age"] >= 12.0 and train_imp.loc[iter_3, "Second Class"] == 1:
        train_imp.loc[iter_3, "Second Class Woman"] = 1

for iter_4 in range(len(train_imp)):
    if train_imp.loc[iter_4, "Age"] <= 12.0 and train_imp.loc[iter_4, "Second Class"] == 1:
        train_imp.loc[iter_4, "Second Class Child"] = 1

for iter_5 in range(len(train_imp)):
    if train_imp.loc[iter_5, "OH_male"] == 1 and train_imp.loc[iter_5, "Age"] >= 12.0 and train_imp.loc[iter_5, "Second Class"] == 1:
        train_imp.loc[iter_5, "Second Class Man"] = 1
        
# *************

test_imp["Second Class Woman"] = 0
test_imp["Second Class Child"] = 0
test_imp["Second Class Man"] = 0

for iter_d in range(len(test_imp)):
    if test_imp.loc[iter_d, "OH_female"] == 1 and test_imp.loc[iter_d, "Age"] >= 12.0 and test_imp.loc[iter_d, "Second Class"] == 1:
        test_imp.loc[iter_d, "Second Class Woman"] = 1
        
for iter_e in range(len(test_imp)):
    if test_imp.loc[iter_e, "Age"] <= 12 and test_imp.loc[iter_e, "Second Class"] == 1:
        test_imp.loc[iter_e, "Second Class Child"] = 1
        
for iter_f in range(len(test_imp)):
    if test_imp.loc[iter_f, "OH_male"] == 1 and test_imp.loc[iter_f, "Age"] >= 12.0 and test_imp.loc[iter_f, "Second Class"] == 1:
        test_imp.loc[iter_f, "Second Class Man"] = 1
        
# *************
        
train_imp["Third Class Woman"] = 0
train_imp["Third Class Child"] = 0
train_imp["Third Class Man"] = 0

for iter_6 in range(len(train_imp)):
    if train_imp.loc[iter_6, "OH_female"] == 1 and train_imp.loc[iter_6, "Age"] >= 12.0 and train_imp.loc[iter_6, "Third Class"] == 1:
        train_imp.loc[iter_6, "Third Class Woman"] = 1
        
for iter_7 in range(len(train_imp)):
    if train_imp.loc[iter_7, "Age"] <= 12.0 and train_imp.loc[iter_7, "Third Class"] == 1:
        train_imp.loc[iter_7, "Third Class Child"] = 1
        
for iter_8 in range(len(train_imp)):
    if train_imp.loc[iter_8, "OH_male"] == 1 and train_imp.loc[iter_8, "Age"] >= 12.0 and train_imp.loc[iter_8, "Third Class"] == 1:
        train_imp.loc[iter_8, "Third Class Man"] = 1

# **************

test_imp["Third Class Woman"] = 0
test_imp["Third Class Child"] = 0
test_imp["Third Class Man"] = 0

for iter_g in range(len(test_imp)):
    if test_imp.loc[iter_g, "OH_female"] == 1 and test_imp.loc[iter_g, "Age"] >= 12.0 and test_imp.loc[iter_g, "Third Class"] == 1:
        test_imp.loc[iter_g, "Third Class Woman"] = 1
        
for iter_h in range(len(test_imp)):
    if test_imp.loc[iter_h, "Age"] <= 12 and test_imp.loc[iter_h, "Third Class"] == 1:
        test_imp.loc[iter_h, "Third Class Child"] = 1
        
for iter_i in range(len(test_imp)):
    if test_imp.loc[iter_i, "OH_male"] == 1 and test_imp.loc[iter_i, "Age"] >= 12.0 and test_imp.loc[iter_i, "Third Class"] == 1:
        test_imp.loc[iter_i, "Third Class Man"] = 1

print("\n================")
print("SECOND CLASS CHILD COUNTS: ")
print(train_imp["Second Class Child"].value_counts())

print("\n================")
print("TRAIN HEAD: ")
display(train_imp.head(10))
print("\n================")
print("TRAIN INFO: ")
print(train_imp.info())

print("\n================")
print("TEST HEAD: ")
display(test_imp.head(10))
print("\n================")
print("TEST INFO: ")
print(test_imp.info())

In [24]:
from sklearn.preprocessing import MinMaxScaler

# SCALE NON-BINARY DATA

# WORK WITH COPY
train_sca = train_imp.copy()
test_sca = test_imp.copy()

# INSTANTIATE SCALER
scaler = MinMaxScaler()

train_sca[["Age", "SibSp", "Parch", "Fare"]] = scaler.fit_transform(train_sca[["Age", "SibSp", "Parch", "Fare"]])
test_sca[["Age", "SibSp", "Parch", "Fare"]] = scaler.fit_transform(test_sca[["Age", "SibSp", "Parch", "Fare"]])

print("\n================")
print("TRAIN HEAD: ")
display(train_sca.head(10))

print("\n================")
print("TEST HEAD: ")
display(test_sca.head(10))


In [25]:
correlation_sorted = train_sca.corr()
correlation_grouped = train_sca.corr()["Survived"][:]
print("\n================")
print("GROUPED CORRELATION: ")
print(correlation_grouped)
print("\n================")
print("SORTED CORRELATION: ")
print(correlation_sorted["Survived"].sort_values(ascending=False))


# FEATURE DROPPING

In [26]:
train_fe = train_sca.copy()
test_fe = test_sca.copy()

# DROP UNCORRELATED COLUMNS
final_drop_list = ["Parch", "SibSp", "Age", "First Class", "Second Class", "Third Class", "OH_female", "OH_male", 
                  "A Deck", "B Deck", "C Deck", "D Deck", "E Deck", "F Deck", "G Deck"]

for drop_iter in final_drop_list:
    train_fe = train_fe.drop(drop_iter, axis=1)
    test_fe = test_fe.drop(drop_iter, axis=1)

print("\n================")
print(f"FINAL TRAIN COLUMNS: \n{train_fe.columns}")
print("\n================")
print(f"FINAL TEST COLUMNS: \n{test_fe.columns}")

In [27]:
import tensorflow as tf
import tensorflow.keras.models
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.utils import shuffle



SPLIT TRAIN, DEV, TEST, BEFORE DATA PREP

In [28]:
# SPLIT TRAIN, DEV, TEST SETS
train_fin = train_fe.copy()
test_fin = test_fe.copy()
train_fin.to_csv("train_csv_final.csv", index=False)
test_fin.to_csv("test_csv_final.csv", index=False)
features_0 = train_fin.loc[:, "Fare": "Third Class Man"]
labels_0 = train_fin.loc[:, : "Survived"].astype(int)

features, labels = shuffle(features_0, labels_0)

# print(features)
# print(labels)

x_train, x_rem, y_train, y_rem = train_test_split(features, labels, train_size=0.8)

test_size = 0.5

x_dev, x_test, y_dev, y_test = train_test_split(x_rem,y_rem, test_size=0.5)

In [29]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)
{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [30]:
import time

start = time.time()
# # Use the random grid to search for best hyperparameters
# # First create the base model to tune
# rf = RandomForestClassifier()
# # Random search of parameters, using 3 fold cross validation, 
# # search across 100 different combinations, and use all available cores
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# # Fit the random search model
# rf_random.fit(x_train, y_train.values.ravel())

# rf_best = rf_random.best_params_
end = time.time()

print("\n================")
print(f"Model training run time: {end-start}")


{'n_estimators': 2000, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 30, 'bootstrap': False}

In [31]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

rf_use = RandomForestClassifier(n_estimators=2000, min_samples_split=2, min_samples_leaf=1, max_features='auto',
                               max_depth=5, bootstrap=False)

rf_use.fit(x_train, y_train.values.ravel())

y_pred = rf_use.predict(x_train)
acc_score = accuracy_score(y_train, y_pred)
confusion_matrix = confusion_matrix(y_train, y_pred)
f1 = f1_score(y_train, y_pred)
print("\n================")
print(f"ACCURACY SCORE ON TRAIN SET: \n{acc_score}")
print("\n================")
print(f"CONFUSION MATRIX ON TRAIN SET: \n{confusion_matrix}")
print("\n================")
print(f"F1 SCORE ON TRAIN SET: \n{f1}")



# rf_full = cross_val_score(rf_use, features, labels.values.ravel(), cv=10)
# rf_cv_train = cross_val_score(rf_use, x_train, y_train.values.ravel(), cv=10)
# rf_cv_dev = cross_val_score(rf_use, x_dev, y_dev.values.ravel(), cv=10)
# rf_cv_test = cross_val_score(rf_use, x_test, y_test.values.ravel(), cv=10)

# print("\n================")
# print("MEAN SCORES OF TUNED RANDOM FOREST CLASSIFIER, FULL (k = 10): ")
# print(mean(rf_full))

# print("\n================")
# print("MEAN SCORES OF TUNED RANDOM FOREST CLASSIFIER, TRAIN (k = 10): ")
# print(mean(rf_cv_train))

# print("\n================")
# print("MEAN SCORES OF TUNED RANDOM FOREST CLASSIFIER, DEV (k = 10): ")
# print(mean(rf_cv_dev))

# print("\n================")
# print("MEAN SCORES OF TUNED RANDOM FOREST CLASSIFIER, TEST (k = 10): ")
# print(mean(rf_cv_test))

**Added features increased the training accuracy of the random forest, but also increased variance. Drop uncorrelated values and make sure to add the newly engineered features to the test set. ******

In [32]:
out = rf_use.predict(test_fin)
out_df = pd.DataFrame(out, columns=["Survived"], index=test_fin.index)

test_temp = test_input.copy()
out_df["PassengerId"] = test_temp["PassengerId"]

out_df.to_csv("submission prediction 15.csv", index=False)

In [33]:
# from sklearn.metrics import accuracy_score
# # from sklearn.metrics import classification_matrix

# nn = create_baseline()
# nn.fit(x_train, y_train.values.ravel(), verbose=0)
# y_pred = nn.predict(x_train)
# # y_pred = int(y_pred)
# # print(y_pred.dtype)
# y_pred = pd.DataFrame(y_pred, columns=["Survived"], index=x_train.index).astype(int)
# print(y_pred.dtypes)



# acc_score = accuracy_score(y_train, y_pred)
# # class_matrix = classification_matrix(y_train, y_pred)

# train_loss, train_acc = nn.evaluate(x_train, y_train)

# print(train_loss)
# print(train_acc)
# print(acc_score)
# # print(class_matrix)

In [34]:
# train_loss, train_acc = nn.evaluate(x_train, y_train)
# dev_loss, dev_acc = nn.evaluate(x_dev, y_dev)

# print("\n================")
# print("TRAIN ACC, TRAIN LOSS: ["  + str(train_acc) + ", " + str(train_loss) + "]")

# print("\n================")
# print("TRAIN ACC, TRAIN LOSS: ["  + str(dev_acc) + ", " + str(dev_loss) + "]")

out = estimator.predict(test_fin)
out_df = pd.DataFrame(out, columns=["Survived"], index=test_fin.index)

test_temp = test_input.copy()
out_df["PassengerId"] = test_temp["PassengerId"]

out_df.to_csv("submission prediction 12.csv", index=False)

In [None]:
# rf_use.fit(features, labels.values.ravel())
# output = rf_use.predict(test_fin)
# output_df = pd.DataFrame(output, columns=["Survived"], index=test_fin.index)
# #
# test_temp = test_input.copy()
# print(test_temp.columns)
# output_df["PassengerId"] = test_temp["PassengerId"]
# print(output_df.columns)

In [None]:
# output_df.to_csv('submission predictions 9.csv', index=False)