In [5]:
import pandas as pd

# Load the data
df = pd.read_csv("../data/Titanic_dataset/train.csv")

In [6]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Cabin,BirthYear,Age,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.25,,1890.0,22.0,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833,C85,1874.0,38.0,C
2,3,1,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,7.925,,1886.0,26.0,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,53.1,C123,1877.0,35.0,S
4,5,0,3,"Allen, Mr. William Henry",male,0,0,373450,8.05,,1877.0,35.0,S


 Probability Rules and Concepts

In [8]:
# check for missing values
df['Sex'].isna().sum()

0

In [12]:
# total passengers
total_passengers = len(df)
print(f"Total passengers: {total_passengers}")

Total passengers: 891


In [70]:
male_count = len(df[df['Sex'] == 'male'])
print(f"Number of male passengers: {male_count}")

Number of male passengers: 577


In [71]:
female_count = len(df[df['Sex'] == 'female'])
print(f"Number of female passengers: {female_count}")

Number of female passengers: 314


In [72]:
p_male = male_count / total_passengers
p_female = female_count / total_passengers

print(f"Probability of male passengers: {p_male}")
print(f"Probability of female passengers: {p_female}")


Probability of male passengers: 0.6475869809203143
Probability of female passengers: 0.35241301907968575


Basic Probability Rules for Quantitative Variables

In [73]:
# check for missing values
df['Age'].isna().sum()

177

In [74]:
# clean the data
df_age_clean = df.dropna(subset=['Age'])
print(f"Number of passengers with age data: {len(df_age_clean)}")

Number of passengers with age data: 714


In [75]:
# compute probability of for 'Age' < 18
minors_count = len(df_age_clean[df_age_clean['Age'] < 18])
p_minors = minors_count / len(df_age_clean)
print(f"Probability of passengers under 18: {p_minors}")


Probability of passengers under 18: 0.15826330532212884


In [86]:
# compute probability for 'Fare' > 100
rich_count = len(df_age_clean[df_age_clean['Fare'] > 100])
p_rich = rich_count / len(df_age_clean)
print(f"Probability of passengers with fare > 100: {p_rich}")

Probability of passengers with fare > 100: 0.06722689075630252


Conditional Probability

conditional probability on qualitative variable Sex on the subset of events where Survived

In [77]:
# compute probability of survival
survived_count = len(df[df['Survived'] == 1])
p_survived = survived_count / total_passengers
print(f"Probability of survival: {p_survived}")

# compute probability of female passengers
female_count = len(df[df['Sex'] == 'female'])
p_female = female_count / total_passengers
print(f"Probability of female passengers: {p_female}")

# compute probability for female and survived
survived_female_count = len(df[(df['Sex'] == 'female') & (df['Survived'] == 1)])
p_survived_female_count = survived_female_count / total_passengers
print(f"Probability of survived female passengers: {p_survived_female_count}")

# Conditional probability: Sex = female given Survived
if survived_count > 0:
    p_female_given_survived = survived_female_count / survived_count
else:
    p_female_given_survived = 0
print(f"Probability of female passengers given Survived: {p_female_given_survived}")

Probability of survival: 0.3838383838383838
Probability of female passengers: 0.35241301907968575
Probability of survived female passengers: 0.2615039281705948
Probability of female passengers given Survived: 0.6812865497076024


conditional probability on quantitative variable Age given criteria Fare > 100

In [78]:
# compute probability for 'Age' < 18 and 'Fare' > 100
minors_rich_count = len(df_age_clean[(df_age_clean['Age'] < 18) & (df_age_clean['Fare'] > 100)])
p_minors_rich = minors_rich_count / len(df_age_clean)
print(f"Probability of passengers under 18 with fare > 100: {p_minors_rich}")

# Conditional Probability: Fare > 100 given Age < 18
if minors_count > 0:
    p_rich_given_minors = minors_rich_count / minors_count
else:
    p_rich_given_minors = 0
print(f"Probability of fare > 100 given age < 18: {p_rich_given_minors}")

Probability of passengers under 18 with fare > 100: 0.00980392156862745
Probability of fare > 100 given age < 18: 0.061946902654867256


Bayes' Theorem

In [79]:
# check for missing values of survived and fare
df[['Survived', 'Fare']].isna().sum()

Survived    0
Fare        0
dtype: int64

In [80]:
print(f"Total passengers: {total_passengers}")

Total passengers: 891


In [81]:
# compute probability of survival
survived_count = len(df[df['Survived'] == 1])
p_survived = survived_count / total_passengers
print(f"Probability of survival: {p_survived}")

Probability of survival: 0.3838383838383838


In [82]:
# compute probability of fare > 100 given survival
rich_survived_count = len(df[(df['Fare'] > 100) & (df['Survived'] == 1)])
p_rich_survived = rich_survived_count / survived_count
print(f"Probability of fare > 100 given survival: {p_rich_survived}")

Probability of fare > 100 given survival: 0.11403508771929824


In [83]:
# compute probability of survival given fare > 100
if rich_count > 0:
    p_survived_rich = rich_survived_count / rich_count
else:
    p_survived_rich = 0
print(f"Probability of survival given fare > 100: {p_survived_rich}")

Probability of survival given fare > 100: 0.8125


In [84]:
# compute probability of survival given fare > 100 using Bayes' Theorem
p_rich = rich_count / len(df_age_clean)
p_survived = survived_count / total_passengers
p_survived_rich_bayes = (p_rich_survived * p_survived) / p_rich
print(f"Probability of survival given fare > 100 using Bayes' Theorem: {p_survived_rich_bayes}")

Probability of survival given fare > 100 using Bayes' Theorem: 0.6510942760942761
