In [61]:
import pandas as pd
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


In [62]:
df.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis='columns',inplace=True)
df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [63]:
target = df.Survived
inputs=df.drop('Survived', axis='columns')  #drop survivors for separate target

In [75]:
dummies = pd.get_dummies(inputs.Sex, dtype=int)   #convert male/female into numeric using dummies func from pandas coz ML doesn't like strings
dummies.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [76]:
inputs = pd.concat([inputs,dummies], axis='columns') #concat inputs table w/o survivor and dummies(sex)
inputs.head(5)

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.25,0,1
1,1,female,38.0,71.2833,1,0
2,3,female,26.0,7.925,1,0
3,1,female,35.0,53.1,1,0
4,3,male,35.0,8.05,0,1


In [77]:
#drop sex column
inputs.drop('Sex', axis='columns', inplace=True)
inputs.head(5)

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1


In [78]:
inputs.columns[inputs.isna().any()]

Index(['Age'], dtype='object')

In [79]:
inputs.Age[:10]    #check NA values

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In [80]:
#fill the NA with mean

inputs.Age = inputs.Age.fillna(inputs.Age.mean())
inputs

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.000000,7.2500,0,1
1,1,38.000000,71.2833,1,0
2,3,26.000000,7.9250,1,0
3,1,35.000000,53.1000,1,0
4,3,35.000000,8.0500,0,1
...,...,...,...,...,...
886,2,27.000000,13.0000,0,1
887,1,19.000000,30.0000,1,0
888,3,29.699118,23.4500,1,0
889,1,26.000000,30.0000,0,1


In [81]:
pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [82]:
pip show scikit-learn

Name: scikit-learn
Version: 1.5.2
Summary: A set of python modules for machine learning and data mining
Home-page: https://scikit-learn.org
Author: 
Author-email: 
License: BSD 3-Clause License

Copyright (c) 2007-2024 The scikit-learn developers.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS 

In [83]:
import sklearn

In [84]:
from sklearn.model_selection import train_test_split   #splits data into train and test set
X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size=0.2)      #test size 20% of the list

In [85]:
#train set numbers
len(X_train)

712

In [86]:
#test set numbers
len(X_test)

179

In [87]:
#total num of inputs
len(inputs)

891

In [108]:
#create naive bayes model

from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

In [109]:
model.fit(X_train, y_train)

In [110]:
#accuracy of model
model.score(X_test, y_test)

0.8435754189944135

In [91]:
X_test[:10]

Unnamed: 0,Pclass,Age,Fare,female,male
555,1,62.0,26.55,0,1
259,2,50.0,26.0,1,0
855,3,18.0,9.35,1,0
661,3,40.0,7.225,0,1
389,2,17.0,12.0,1,0
367,3,29.699118,7.2292,1,0
339,1,45.0,35.5,0,1
40,3,40.0,9.475,1,0
584,3,29.699118,8.7125,0,1
544,1,50.0,106.425,0,1


In [95]:
y_test[:10]

555    0
259    1
855    1
661    0
389    1
367    1
339    0
40     0
584    0
544    0
Name: Survived, dtype: int64

In [102]:
#predict survival for first 10 guests, 0 is no, 1 is yes
model.predict(X_test[:10])

array([0, 1, 1, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

In [112]:
#predict survival using probability for first 10 guests, first column didnt survive, second survived
model.predict_proba(X_test[:10])

array([[0.84916948, 0.15083052],
       [0.04015474, 0.95984526],
       [0.09066795, 0.90933205],
       [0.98946988, 0.01053012],
       [0.0376137 , 0.9623863 ],
       [0.1061125 , 0.8938875 ],
       [0.87897158, 0.12102842],
       [0.10888523, 0.89111477],
       [0.98933585, 0.01066415],
       [0.26178898, 0.73821102]])

In [115]:
len(y_train)

712

In [116]:
len(y_test)

179