In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt

import os;
os.listdir('/kaggle/input/')

titanic_train = pd.read_csv("../input/titanic-data/train.csv")
titanic_test = pd.read_csv("../input/titanic-data/test.csv")
print(titanic_train.shape)
titanic_test.head()

(891, 12)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [2]:
unsurvivors = len(titanic_train[titanic_train['Survived'] == 0])
survivors = len(titanic_train[titanic_train['Survived'] == 1])
print(titanic_train.shape)
unsurvivors, survivors

(891, 12)


(549, 342)

In [3]:
#Look for useless features in the data
print(titanic_train.dtypes)
print(titanic_train.isna().sum())

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


Based on the information above and from the feature descriptions, there is barely any data on cabin numbers and location embarked from seems like an unnecessary variable. Along with that, the variables name, ticket, and embarked don't show much significance in my opinion, so they are candidates to be dropped as well.

In [4]:
#Survival Using M/F
survival_rate_sex = titanic_train.groupby('Sex')['Survived'].mean()
survival_rate_class = titanic_train.pivot_table('Survived', index = 'Sex', columns = 'Pclass')
survival_avg_age = titanic_train.groupby('Sex')['Age'].mean()
survival_rate_sex, survival_rate_class, survival_avg_age

(Sex
 female    0.742038
 male      0.188908
 Name: Survived, dtype: float64,
 Pclass         1         2         3
 Sex                                 
 female  0.968085  0.921053  0.500000
 male    0.368852  0.157407  0.135447,
 Sex
 female    27.915709
 male      30.726645
 Name: Age, dtype: float64)

In [5]:
#Survival Using Class, Age, and Sex
old_or_young = ['older' if x >= 18 else 'younger' for x in titanic_train["Age"]]
#lower_age = titanic_age[titanic_train["Age"] > 18]
titanic_train['Age_split'] = old_or_young

#older is 18+ & younger is 0-18
titanic_train.pivot_table('Survived', ['Sex', 'Age'], 'Pclass')

Unnamed: 0_level_0,Pclass,1,2,3
Sex,Age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.75,,,1.00
female,1.00,,,1.00
female,2.00,0.0,1.0,0.25
female,3.00,,1.0,0.00
female,4.00,,1.0,1.00
...,...,...,...,...
male,70.00,0.0,0.0,
male,70.50,,,0.00
male,71.00,0.0,,
male,74.00,,,0.00


Based on all this data, we will be using the remaining features and change the categorical data values to a numerical system.

In [6]:
sex_as_val = [1 if x == 'Male' else 0 for x in titanic_train["Age"]]
titanic_train['Sex'] = sex_as_val
print(titanic_train.shape)
titanic_train.dtypes

(891, 13)


PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
Age_split       object
dtype: object

In [7]:
#Fix Categorical Values
titanic_train['Age'] = [1 if x >= 18 else 0 for x in titanic_train["Age"]]
titanic_train['Sex'] = [1 if x == 'Male' else 0 for x in titanic_train["Sex"]]

#Remove Useless Features
titanic_train = titanic_train.drop(['Cabin', 'Embarked', 'Age_split', 'Ticket', 'Name'], axis = 1)
titanic_train = titanic_train.set_index('PassengerId')
titanic_train.shape, titanic_train.dtypes


((891, 7),
 Survived      int64
 Pclass        int64
 Sex           int64
 Age           int64
 SibSp         int64
 Parch         int64
 Fare        float64
 dtype: object)

In [8]:
#Do same on Test
titanic_test['Age'] = [1 if x >= 18 else 0 for x in titanic_test["Age"]]
titanic_test['Sex'] = [1 if x == 'Male' else 0 for x in titanic_test["Sex"]]

titanic_test = titanic_test.set_index('PassengerId')
titanic_test = titanic_test.drop(['Cabin', 'Embarked', 'Ticket', 'Name'], axis = 1)
titanic_test.shape, titanic_test.dtypes

((418, 6),
 Pclass      int64
 Sex         int64
 Age         int64
 SibSp       int64
 Parch       int64
 Fare      float64
 dtype: object)

In [9]:
titanic_test = titanic_test.fillna(0)
#Separate X, Y, Train, Test Properly
Y_train = titanic_train.iloc[:, 0]
X_train = titanic_train.iloc[:, 1:]
X_test = titanic_test.fillna(0)

In [10]:
#Scale data to Normalize
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(titanic_test)

In [11]:
#Check Test Data
print(titanic_test.isna().sum())

Pclass    0
Sex       0
Age       0
SibSp     0
Parch     0
Fare      0
dtype: int64


In [12]:
#Trying different Methods

#Logistical Regression
lg = LogisticRegression(random_state = 42)
lg.fit(X_train, Y_train)
print('Logistical Regression Score:', lg.score(X_train, Y_train))
lg_prediction = lg.predict(X_test)

Logistical Regression Score: 0.696969696969697


In [13]:
#Turn my predictions into a DataFrame

preditions = pd.DataFrame(data = lg_prediction, index = titanic_test.index)
tada = preditions.to_csv()