In [39]:
import pandas as pd

In [40]:
# Read the Titanic passenger records from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="titanic.csv")

# Preview the first five rows to see which columns were loaded.
df.head(n=5)

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


**Data Exploration**

Dropping variables whose impact on the prediction is small or negligible

In [41]:
# Remove the columns that are not used as model features. The drop happens on
# df itself (inplace=True), so running this cell a second time raises a
# KeyError because the columns are already gone.
df.drop(
    columns=['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'],
    inplace=True,
)

# Preview the columns that remain.
df.head(n=5)

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


*inplace=True* modifies the original DataFrame directly (and returns `None`), whereas *inplace=False* — the default when the argument is omitted — returns a new DataFrame and leaves the original unchanged.

In [42]:
# Separate the label from the features.

# 'target' is the 'Survived' column, i.e. the value the model will learn to predict.
target = df['Survived']

# 'inputs' is a new DataFrame holding every column of df except 'Survived',
# so it contains only the feature columns; df itself is left unchanged.
inputs = df.drop(columns=['Survived'])


In [43]:
# One-hot encode the categorical 'Sex' column: every distinct value becomes its
# own indicator column.
# dtype=int keeps the indicators as 0/1 integers on every pandas version;
# without it pandas 2.0+ returns True/False booleans instead.
# NOTE(review): with only two categories the 'female' and 'male' columns are
# complementary wherever 'Sex' is present, so keeping both gives the model the
# same information twice -- consider keeping just one of them.
dummies = pd.get_dummies(inputs.Sex, dtype=int)

# Show the first three rows of the indicator columns.
dummies.head(3)


Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0


In [44]:
# Append the indicator columns from 'dummies' to the feature table, side by side
# (axis=1 means "along the columns"); rows are matched on their index.
inputs = pd.concat(objs=[inputs, dummies], axis=1)

# Show the first three rows of the widened feature table.
inputs.head(n=3)


Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.25,0,1
1,1,female,38.0,71.2833,1,0
2,3,female,26.0,7.925,1,0


In [45]:
# The text-valued 'Sex' column is now represented by the indicator columns, so
# remove it. The removal is done on 'inputs' itself (inplace=True), which means
# running this cell twice raises a KeyError because 'Sex' is already gone.
inputs.drop(columns='Sex', inplace=True)

# Show the first three rows after the removal.
inputs.head(n=3)

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0


**Handling Missing Values**

In [46]:
# List the feature columns that contain at least one missing (NaN) value:
# isna() flags each cell, any(axis="index") collapses the flags per column.
inputs.columns[inputs.isna().any(axis="index")]

Index(['Age'], dtype='object')

In [47]:
# Look at the first ten ages to see what the missing entries look like.
inputs["Age"].head(10)

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In [48]:
# Replace every missing age with the average of the ages that are present.
# NOTE(review): the mean is computed over all rows, i.e. before the data is
# split into training and testing sets later in the notebook.
inputs["Age"] = inputs["Age"].fillna(value=inputs["Age"].mean())

# Show the first six rows; they include one that had a missing age.
inputs.head(n=6)


Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1
5,3,29.699118,8.4583,0,1


**Training and Testing**

*  20% data used for testing
*  80% data used for training



In [49]:
# scikit-learn helper that shuffles a dataset and splits it into train and test subsets.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing (test_size=0.2) and train on the other 80%.
# random_state fixes the shuffle so that "Restart Kernel -> Run All" reproduces
# the same split -- and therefore the same score and predictions -- every time.
# Without it each run draws a different random split. The outputs recorded
# further down came from an unseeded run, so they will change once after this
# cell is re-executed and then stay stable.
X_train, X_test, y_train, y_test = train_test_split(
    inputs, target, test_size=0.2, random_state=42
)


In [50]:
# Number of rows in the training features.
X_train.shape[0]

712

In [51]:
# Number of rows in the testing features.
X_test.shape[0]

179

In [52]:
# Number of rows in the full feature table (training + testing).
inputs.shape[0]

891

training data = 712 (80 %)

testing data = 179 (20 %)

total data = 891

**Naive Bayes Model**

Gaussian Naive Bayes is a machine learning algorithm used for classification tasks. It assumes that, within each class, the features are normally distributed and independent of one another. It applies Bayes' theorem to calculate the probability of a sample belonging to a certain class, making it a simple and efficient classifier.

In [53]:
# GaussianNB is scikit-learn's Gaussian Naive Bayes classifier.
from sklearn.naive_bayes import GaussianNB
# Create the classifier with its default settings; it is trained by the fit call in a later cell.
model = GaussianNB()

We always call `fit` to train the model on the training data.

In [54]:
# Train the classifier on the training features and their known labels.
model.fit(X=X_train, y=y_train)

In [55]:
# Mean accuracy on the held-out test set: the fraction of test rows whose
# predicted label matches the true label.
model.score(X=X_test, y=y_test)

0.770949720670391

In [56]:
# Show the first ten rows of the test features.
X_test.head(10)

Unnamed: 0,Pclass,Age,Fare,female,male
400,3,39.0,7.925,0,1
690,1,31.0,57.0,0,1
459,3,29.699118,7.75,0,1
235,3,29.699118,7.55,1,0
413,2,29.699118,0.0,0,1
285,3,33.0,8.6625,0,1
709,3,29.699118,15.2458,0,1
622,3,20.0,15.7417,0,1
666,2,25.0,13.0,0,1
450,2,36.0,27.75,0,1


In [57]:
# Show the true labels for the first ten rows of the test set.
y_test.head(10)


400    1
690    1
459    0
235    0
413    0
285    0
709    1
622    1
666    0
450    0
Name: Survived, dtype: int64

In [58]:
# Ask the trained model for its predicted label on each of the first ten test rows.
predictions = model.predict(X_test.head(10))

# Show the predicted labels.
predictions


array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0])

In [59]:
# Instead of a single label, get the model's estimated probability of every
# class for each of the first ten test rows (one row per sample, one column per class).
probability_predictions = model.predict_proba(X_test.head(10))

# Show the class probabilities (fractions between 0 and 1).
probability_predictions


array([[0.99068073, 0.00931927],
       [0.88422802, 0.11577198],
       [0.98961598, 0.01038402],
       [0.06761532, 0.93238468],
       [0.97718059, 0.02281941],
       [0.99011802, 0.00988198],
       [0.98985155, 0.01014845],
       [0.98793242, 0.01206758],
       [0.97737869, 0.02262131],
       [0.97969277, 0.02030723]])

In [61]:
# Same class probabilities as before, scaled by 100 so they read as percentages.
probability_predictions = 100 * model.predict_proba(X_test.head(10))

# Show the class probabilities in percent.
probability_predictions


array([[99.06807334,  0.93192666],
       [88.4228021 , 11.5771979 ],
       [98.96159778,  1.03840222],
       [ 6.76153234, 93.23846766],
       [97.71805896,  2.28194104],
       [99.01180212,  0.98819788],
       [98.98515543,  1.01484457],
       [98.79324158,  1.20675842],
       [97.73786923,  2.26213077],
       [97.96927725,  2.03072275]])

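***So for person 0 (the first row of `X_test`)***

predicted probability of not surviving = 99.07 %

predicted probability of surviving = 0.93 %

The probability of not surviving is higher -> **so the model predicts that this person did not survive**. Note that this is only the model's prediction: the true label for this passenger (the first entry of `y_test`) is 1, i.e. the passenger actually survived, so this particular prediction is wrong.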

In [64]:
# Look up the order of the class labels stored on the fitted model; the
# columns of predict_proba follow this order.
class_order = model.classes_
print("Class Order:", class_order)

# Meaning of the labels:
#   0 = not survived
#   1 = survived

# Class probabilities for the first ten test rows, expressed in percent.
probability_predictions = 100 * model.predict_proba(X_test.head(10))
probability_predictions

Class Order: [0 1]


array([[99.06807334,  0.93192666],
       [88.4228021 , 11.5771979 ],
       [98.96159778,  1.03840222],
       [ 6.76153234, 93.23846766],
       [97.71805896,  2.28194104],
       [99.01180212,  0.98819788],
       [98.98515543,  1.01484457],
       [98.79324158,  1.20675842],
       [97.73786923,  2.26213077],
       [97.96927725,  2.03072275]])