In [1]:
#load packages
# Every import for this cell is gathered up front (standard library first,
# then third-party); the version report below prints in the same order and
# with the same text as before.

# standard library
import random    # misc libraries
import sys       # access to system parameters https://docs.python.org/3/library/sys.html
import time      # misc libraries
import warnings  # used below to silence library warnings

# third-party
import IPython
import matplotlib           # collection of functions for scientific and publication-ready visualization
import numpy as np          # foundational package for scientific computing
import pandas as pd         # data processing and analysis modeled after R dataframes with SQL like features
import scipy as sp          # collection of functions for scientific computing and advanced mathematics
import sklearn              # collection of machine learning algorithms
from IPython import display # pretty printing of dataframes in Jupyter notebook

# report the interpreter and library versions in use
print("Python version: {}".format(sys.version))
print("pandas version: {}".format(pd.__version__))
print("matplotlib version: {}".format(matplotlib.__version__))
print("NumPy version: {}".format(np.__version__))
print("SciPy version: {}".format(sp.__version__))
print("IPython version: {}".format(IPython.__version__))
print("scikit-learn version: {}".format(sklearn.__version__))

#ignore warnings
# NOTE: this silences *all* warnings for the rest of the session, including
# deprecation warnings raised by the libraries above.
warnings.filterwarnings('ignore')
print('-' * 25)


Python version: 3.12.6 | packaged by conda-forge | (main, Sep 11 2024, 04:55:15) [Clang 17.0.6 ]
pandas version: 2.2.2
matplotlib version: 3.9.2
NumPy version: 1.26.4
SciPy version: 1.13.1
IPython version: 8.27.0
scikit-learn version: 1.5.1
-------------------------


In [2]:
#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.plotting import scatter_matrix

#Configure Visualization Defaults
#%matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8

In [3]:
# Load the train and test CSV files (paths are relative to the notebook's
# working directory).
train_data = pd.read_csv('../data/train.csv')
test_data  = pd.read_csv('../data/test.csv')

# By using a list w/ references, we can clean both datasets at once:
# in-place edits made through this list are visible via train_data/test_data.
data_cleaner = [train_data, test_data]

#preview data
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
train_data.info()

# Only the last expression of a cell is rendered automatically, so the train
# preview has to be displayed explicitly. `display` is the IPython.display
# module imported in the first cell, hence display.display(...).
display.display(train_data.head())
test_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
# Print the per-column count of missing values for each dataset, followed by
# a short separator line.
null_reports = (
    ('Train columns with null values:\n', train_data),
    ('Test/Validation columns with null values:\n', test_data),
)
for heading, frame in null_reports:
    print(heading, frame.isna().sum())
    print("-" * 10)

Train columns with null values:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
----------
Test/Validation columns with null values:
 PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
----------


In [5]:
###COMPLETING: complete or delete missing values in train and test/validation dataset

# Columns excluded from the analysis; defined once instead of on every loop pass.
drop_column = ['Cabin', 'Ticket', 'Name']

for dataset in data_cleaner:
    # Each dataset is imputed with its *own* statistics (train median for
    # train, test median for test).
    #
    # The filled Series is assigned back to the column. The previous form,
    # dataset[col].fillna(..., inplace=True), is chained in-place assignment:
    # it is deprecated and, with pandas Copy-on-Write, modifies only a
    # temporary and leaves `dataset` untouched.

    #complete missing age with median
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())

    #complete embarked with mode
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])

    #complete missing fare with median
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

    #delete the cabin feature/column and others previously stated to exclude in train dataset
    # errors='ignore' plus the PassengerId guard keep this cell re-runnable:
    # on a second execution the columns are already gone / already the index.
    dataset.drop(columns=drop_column, errors='ignore', inplace=True)
    if 'PassengerId' in dataset.columns:
        dataset.set_index(keys=['PassengerId'], drop=True, inplace=True)

# Confirm that no missing values remain in either dataset.
print(train_data.isnull().sum())
print("-"*10)
print(test_data.isnull().sum())

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64
----------
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


In [6]:
###CREATE: Feature Engineering for train and test/validation dataset
for dataset in data_cleaner:
    #Discrete variables
    # Family size = siblings/spouses + parents/children + the passenger.
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    dataset['IsAlone'] = 1 #initialize to yes/1 is alone
    # now update to no/0 if family size is greater than 1.
    # A single .loc[row_mask, column] assignment writes into `dataset` itself.
    # The previous dataset['IsAlone'].loc[mask] = 0 was chained indexing,
    # which assigns into an intermediate Series and does not update the frame
    # under pandas Copy-on-Write.
    dataset.loc[dataset['FamilySize'] > 1, 'IsAlone'] = 0

#preview data again
train_data.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,3,male,22.0,1,0,7.25,S,2,0
2,1,1,female,38.0,1,0,71.2833,C,2,0
3,1,3,female,26.0,0,0,7.925,S,1,1
4,1,1,female,35.0,1,0,53.1,S,2,0
5,0,3,male,35.0,0,0,8.05,S,1,1


In [7]:
#CONVERT: convert objects to category using Label Encoder for train and test/validation dataset

#code categorical data
label = LabelEncoder()

# Explicit encoding for the 'Sex' feature that is actually fed to the model.
columns_map = {'Sex': {'male': 0, 'female': 1}}

for dataset in data_cleaner:
    # NOTE: Sex_Code is not listed in data_x below, so it never reaches the
    # model. LabelEncoder numbers classes in sorted order (female=0, male=1),
    # i.e. the opposite of columns_map - do not mix the two encodings.
    dataset['Sex_Code'] = label.fit_transform(dataset['Sex'])

    # Map the labels and pin the dtype explicitly. Relying on replace() to
    # silently downcast the object column to integers is deprecated; if 'Sex'
    # stayed object-typed, pd.get_dummies below would one-hot encode it into
    # Sex_0/Sex_1 instead of keeping a single numeric column.
    # replace() leaves already-encoded 0/1 values alone, so the line is safe
    # to execute more than once.
    dataset['Sex'] = dataset['Sex'].replace(columns_map['Sex']).astype('int64')


#define y variable aka target/outcome
Target = ['Survived']

#define x variables for original features aka feature selection
data_x = ['Sex','Pclass', 'Embarked','SibSp', 'Parch', 'Age', 'Fare', 'FamilySize', 'IsAlone']
data_xy =  Target + data_x

#encode embarking location to categories
# NOTE: these assignments rebind train_data/test_data to new frames; the
# entries of data_cleaner keep pointing at the pre-encoding frames.
train_data = pd.get_dummies(train_data[data_xy])

# Align the test columns to the training feature columns (same set, same
# order). A category level missing from the test file would otherwise produce
# a frame with fewer columns than the model was trained on; absent dummy
# columns are filled with False.
test_data = pd.get_dummies(test_data[data_x]).reindex(
    columns=train_data.columns.drop(Target), fill_value=False
)


In [8]:
# Show the first rows of train_data to inspect the current column layout.
train_data.head()

Unnamed: 0_level_0,Survived,Sex,Pclass,SibSp,Parch,Age,Fare,FamilySize,IsAlone,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,0,3,1,0,22.0,7.25,2,0,False,False,True
2,1,1,1,1,0,38.0,71.2833,2,0,True,False,False
3,1,1,3,0,0,26.0,7.925,1,1,False,False,True
4,1,1,1,1,0,35.0,53.1,2,0,False,False,True
5,0,0,3,0,0,35.0,8.05,1,1,False,False,True


In [9]:
# Show the first rows of test_data to inspect the current column layout.
test_data.head()

Unnamed: 0_level_0,Sex,Pclass,SibSp,Parch,Age,Fare,FamilySize,IsAlone,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
892,0,3,0,0,34.5,7.8292,1,1,False,True,False
893,1,3,1,0,47.0,7.0,2,0,False,False,True
894,0,2,0,0,62.0,9.6875,1,1,False,True,False
895,0,3,0,0,27.0,8.6625,1,1,False,False,True
896,1,3,1,1,22.0,12.2875,3,0,False,False,True


In [10]:
# Post-cleaning sanity check: null counts and schema for BOTH datasets.
# DataFrame.info() prints its own report and returns None, so it is called
# directly rather than through print() (which would add a stray "None").
print('Train columns with null values: \n', train_data.isnull().sum())
print("-"*10)
train_data.info()
print("-"*10)

# This section previously re-inspected train_data (copy-paste slip), so the
# test/validation frame was never actually checked.
print('Test/Validation columns with null values: \n', test_data.isnull().sum())
print("-"*10)
test_data.info()
print("-"*10)

# Summary statistics of the training frame (rendered as the cell output).
train_data.describe(include = 'all')

Train columns with null values: 
 Survived      0
Sex           0
Pclass        0
SibSp         0
Parch         0
Age           0
Fare          0
FamilySize    0
IsAlone       0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64
----------
<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 1 to 891
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Sex         891 non-null    int64  
 2   Pclass      891 non-null    int64  
 3   SibSp       891 non-null    int64  
 4   Parch       891 non-null    int64  
 5   Age         891 non-null    float64
 6   Fare        891 non-null    float64
 7   FamilySize  891 non-null    int64  
 8   IsAlone     891 non-null    int64  
 9   Embarked_C  891 non-null    bool   
 10  Embarked_Q  891 non-null    bool   
 11  Embarked_S  891 non-null    bool   
dtypes: bool(3), float64(2), int64(7)
memory usage: 72.2 KB
None
----------


Unnamed: 0,Survived,Sex,Pclass,SibSp,Parch,Age,Fare,FamilySize,IsAlone,Embarked_C,Embarked_Q,Embarked_S
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891,891,891
unique,,,,,,,,,,2,2,2
top,,,,,,,,,,False,False,True
freq,,,,,,,,,,723,814,646
mean,0.383838,0.352413,2.308642,0.523008,0.381594,29.361582,32.204208,1.904602,0.602694,,,
std,0.486592,0.47799,0.836071,1.102743,0.806057,13.019697,49.693429,1.613459,0.489615,,,
min,0.0,0.0,1.0,0.0,0.0,0.42,0.0,1.0,0.0,,,
25%,0.0,0.0,2.0,0.0,0.0,22.0,7.9104,1.0,0.0,,,
50%,0.0,0.0,3.0,0.0,0.0,28.0,14.4542,1.0,1.0,,,
75%,1.0,1.0,3.0,1.0,0.0,35.0,31.0,2.0,1.0,,,


In [11]:
# Separate the predictors from the 'Survived' outcome column.
features = train_data.drop(columns='Survived')
labels = train_data['Survived']

# Hold out 33% of the labelled rows for evaluation; random_state fixes the
# shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    features,
    labels,
    test_size=0.33,
    random_state=10,
)

In [12]:
# Plain-text preview of the training features, the training labels and the
# test/validation frame, in that order.
for preview in (x_train, y_train, test_data):
    print(preview.head())

             Sex  Pclass  SibSp  Parch   Age    Fare  FamilySize  IsAlone  \
PassengerId                                                                 
464            0       2      0      0  48.0  13.000           1        1   
160            0       3      8      2  28.0  69.550          11        0   
48             1       3      0      0  28.0   7.750           1        1   
403            1       3      1      0  21.0   9.825           2        0   
619            1       2      2      1   4.0  39.000           4        0   

             Embarked_C  Embarked_Q  Embarked_S  
PassengerId                                      
464               False       False        True  
160               False       False        True  
48                False        True       False  
403               False       False        True  
619               False       False        True  
PassengerId
464    0
160    0
48     1
403    0
619    1
Name: Survived, dtype: int64
             Sex  Pclass

In [13]:
# Seed Python's and NumPy's global random number generators with the same
# fixed value so runs are repeatable.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

In [14]:
# Fit a depth-limited random forest on the training split
# (fit() returns the fitted estimator, so the calls can be chained).
rfc = ensemble.RandomForestClassifier(
    criterion="gini",
    max_depth=4,
    random_state=0,
).fit(x_train.values, y_train.values)

# Mean accuracy on the held-out split.
holdout_accuracy = rfc.score(x_test.values, y_test.values)
print(holdout_accuracy)

# Keep predictions and ground truth for the error analysis that follows.
y_truth = y_test.values
y_pred = rfc.predict(x_test.values)

0.847457627118644


In [15]:
# Compute the confusion matrix once and reuse it for both the printed matrix
# and the individual counts. labels=[0, 1] pins the row/column order, so the
# unpacked counts always describe the printed matrix and the result stays
# 2x2 (four values to unpack) even if a class is missing from y_truth/y_pred.
conf_matrix = sklearn.metrics.confusion_matrix(y_truth, y_pred, labels=[0, 1])

# With labels [0, 1], ravel() yields the cells row by row:
# [true 0 / pred 0, true 0 / pred 1, true 1 / pred 0, true 1 / pred 1].
tn, fp, fn, tp = conf_matrix.ravel()

print("Confusion Matrix")
print(conf_matrix)
print("")
print("True Negatives", tn)
print("False Positives", fp)
print("False Negatives", fn)
print("True Positives", tp)

Confusion Matrix
[[175  16]
 [ 29  75]]

True Negatives 175
False Positives 16
False Negatives 29
True Positives 75
