In [5]:
#Load pandas
import pandas as pd

#Load data
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

#Dropping the columns that we won't use
train = train.drop(['Cabin', 'Embarked'], axis=1)
test  = test.drop(['Cabin', 'Embarked'], axis=1)

train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05


In [6]:
train.isna().mean()

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
dtype: float64

In [12]:
#Converting sex to numeric
for df in [train,test]:
    df['Sex_binary']=df['Sex'].map({'male':1,'female':0})

#Fill NAs with 0
train['Age'] = train['Age'].fillna(0)
test['Age'] = test['Age'].fillna(0)

#Select features and target
features = ['Pclass', 'Sex_binary', 'Parch', 'SibSp']
target = ['Survived']

train[features].head(5)

Unnamed: 0,Pclass,Sex_binary,Parch,SibSp
0,3,1,0,1
1,1,0,0,1
2,3,0,0,0
3,1,0,0,1
4,3,1,0,0


In [13]:
df_corr = train.corr(method = 'pearson').round(decimals = 2)

df_corr['Survived'].sort_values(ascending = False)

Survived       1.00
Fare           0.26
Parch          0.08
Age            0.01
PassengerId   -0.01
SibSp         -0.04
Pclass        -0.34
Sex_binary    -0.54
Name: Survived, dtype: float64

In [14]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Sex_binary   891 non-null    int64  
dtypes: float64(2), int64(6), object(3)
memory usage: 76.7+ KB


In [15]:
from sklearn.tree import DecisionTreeClassifier

#instantiate the tree
clf = DecisionTreeClassifier()

#Fit our classifier using the training features and the training target values
clf.fit(train[features], train[target])

DecisionTreeClassifier()

In [16]:
#make predictions using the features from the test data set
predictions = clf.predict(test[features])

predictions

array([0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [17]:
#Creating a Dataframe with PassengerID in the first column and whether they survived in the second column
submission = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':predictions})

#Visualize the first 5 rows
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [18]:
#Convert Dataframe to csv file
filename = 'Titanic_Predictions_2.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)

Saved file: Titanic_Predictions_2.csv
