The train.csv can be obtained here : https://www.kaggle.com/competitions/titanic/data?select=train.csv

### Understanding the data dictionary and the variables :

Data Dictionary of original dataset
================
| Variable | Description | Details |
| -------- | ----------- | ------- |
| Survival | Survival | 0 = No; 1 = Yes |
| Pclass | Passenger Class | 1 = upper; 2 = middle; 3 = lower |
| Name | First and Last Name | |
| Sex | Sex | |
| Age | Age | Fractional if Age less than One (1); If the Age is Estimated, it is in the form xx.5 |
| Sibsp | Number of Siblings/Spouses Aboard | |
| Parch | Number of Parents/Children Aboard | |
| Ticket | Ticket Number | |
| Fare | Passenger Fare | |
| Cabin | Cabin | |
| Embarked | Port of Embarkation | C = Cherbourg; Q = Queenstown; S = Southampton |


### Importing libraries and setting up plotly

In [1]:
import pandas as pd 
import numpy as np 

import plotly as py 
import plotly.express as px 
import plotly.graph_objects as go 
from plotly.subplots import make_subplots

import plotly.io as pio
pio.templates.default = "plotly_dark"

import warnings
warnings.filterwarnings('ignore')

### Loading and copying the train dataset

In [2]:
train = pd.read_csv("train.csv")

df = train.copy(deep=True)

### Viewing the first five rows of the dataset

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Creating New Features from Existing Columns

In [4]:
df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
df['CabinLetter'] = df['Cabin'].str[0]

### Checking the number of rows and columns in the dataframe

In [5]:
df.shape

(891, 14)

### Checking the data types and missing values

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
 12  FamilySize   891 non-null    int64  
 13  CabinLetter  204 non-null    object 
dtypes: float64(2), int64(6), object(6)
memory usage: 97.6+ KB


### Summary statistics of the numerical columns

In [7]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,FamilySize
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,1.904602
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,1.613459
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0,1.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104,1.0
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542,1.0
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0,2.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292,11.0


### Goal : Analyze the missing values in the dataset

In [8]:
def missing_values(df):
    missingInfo = pd.DataFrame(df.dtypes, columns=["dtypes"])
    missingInfo["missing"] = df.isnull().sum()
    missingInfo["% missing"] = round((df.isnull().sum() / len(df)) * 100, 2)
    missingInfo = missingInfo.sort_values(by="% missing", ascending=False)
    fig_missing = px.imshow(df.isnull(), width=500, title="Missing Values")
    fig_missing.show()
    return missingInfo
missing_values(df)

Unnamed: 0,dtypes,missing,% missing
Cabin,object,687,77.1
CabinLetter,object,687,77.1
Age,float64,177,19.87
Embarked,object,2,0.22
PassengerId,int64,0,0.0
Survived,int64,0,0.0
Pclass,int64,0,0.0
Name,object,0,0.0
Sex,object,0,0.0
SibSp,int64,0,0.0


### Replacing missing values

In [9]:
df["Age"] = df["Age"].fillna(df["Age"].mean())
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])
df["Cabin"] = df["Cabin"].fillna("Unknown")
df["CabinLetter"] = df["CabinLetter"].fillna("Unknown")
missing_values(df)

Unnamed: 0,dtypes,missing,% missing
PassengerId,int64,0,0.0
Survived,int64,0,0.0
Pclass,int64,0,0.0
Name,object,0,0.0
Sex,object,0,0.0
Age,float64,0,0.0
SibSp,int64,0,0.0
Parch,int64,0,0.0
Ticket,object,0,0.0
Fare,float64,0,0.0


### Goal : Analyze the outliers in titanic dataset

In [10]:
box_traces = [
    go.Box(x=df['Fare'], name='Fare', marker=dict(color='#87edff')),
    go.Box(x=df['Age'], name='Age', marker=dict(color='#f37a7a')),
    go.Box(x=df['SibSp'], name='SibSp', marker=dict(color='green')),
    go.Box(x=df['Parch'], name='Parch', marker=dict(color='orange'))
]

layout = go.Layout(title='Outliers in Titanic Dataset', height=400, font=dict(size=14))

fig = go.Figure(data=box_traces, layout=layout)

fig.show()

In [11]:
fare_stats = df['Fare'].describe()
age_stats = df['Age'].describe()
sibsp_stats = df['SibSp'].describe()
parch_stats = df['Parch'].describe()

fare_outlier_range = 1.5 * (fare_stats['75%'] - fare_stats['25%'])
age_outlier_range = 1.5 * (age_stats['75%'] - age_stats['25%'])
sibsp_outlier_range = 1.5 * (sibsp_stats['75%'] - sibsp_stats['25%'])
parch_outlier_range = 1.5 * (parch_stats['75%'] - parch_stats['25%'])

fare_outliers = df[df['Fare'] > fare_stats['75%'] + fare_outlier_range]
age_outliers = df[df['Age'] < age_stats['25%'] - age_outlier_range]
sibsp_outliers = df[df['SibSp'] > sibsp_stats['75%'] + sibsp_outlier_range]
parch_outliers = df[df['Parch'] > parch_stats['75%'] + parch_outlier_range]

print("Outlier Information:")
print(f"Parch Outliers (Above {parch_stats['75%'] + parch_outlier_range:.2f}): {len(parch_outliers)} passengers")
print(f"SibSp Outliers (Above {sibsp_stats['75%'] + sibsp_outlier_range:.2f}): {len(sibsp_outliers)} passengers")
print(f"Age Outliers (Below {age_stats['25%'] - age_outlier_range:.2f}): {len(age_outliers)} passengers")
print(f"Fare Outliers (Above {fare_stats['75%'] + fare_outlier_range:.2f}): {len(fare_outliers)} passengers")

Outlier Information:
Parch Outliers (Above 0.00): 213 passengers
SibSp Outliers (Above 2.50): 46 passengers
Age Outliers (Below 2.50): 24 passengers
Fare Outliers (Above 65.63): 116 passengers


### Goal : Distribution of data

In [12]:
def plot_row(column, row, pull=None):
    fig.add_trace(go.Bar(
        x=df[column].unique(), 
        y=df[column].value_counts().values,
        text=df[column].value_counts(),
        name="",
        marker_color=None),
    row=row, col=1)

    fig.add_trace(go.Pie(
        labels=df[column].unique(),
        values=df[column].value_counts().values, 
        name="",
        pull=pull,
        marker_colors=px.colors.qualitative.G10),
    row=row, col=2)


fig = make_subplots(rows=8, cols=2,
                    specs=[[{}, {'type': 'domain'}],
                           [{}, {'type': 'domain'}],
                           [{}, {'type': 'domain'}],
                           [{}, {'type': 'domain'}],
                           [{"rowspan": 2, "colspan": 2}, None],
                           [None, None],
                           [{"rowspan": 2, "colspan": 2}, None],
                           [None, None]],
                    subplot_titles=('Gender Bar', 'Gender Pie', 'Pclass Bar', 'Pclass Pie', 'Embarked Bar', 'Embarked Pie',
                                    'Survived Bar', 'Survived Pie', 'Age Distribution', 'Family Size'),
                    )

plot_row('Sex', 1, pull=[0.1, 0])
plot_row('Pclass', 2, pull=[0.1, 0, 0])
plot_row('Embarked', 3, pull=[0.1, 0, 0])
plot_row('Survived', 4, pull=[0.1, 0])

fig.add_trace(go.Histogram(
    x=df['Age'], name="",
    histnorm='density',
    marker_color="#f4a582"),
row=5, col=1)

fig.add_trace(go.Histogram(x=df['FamilySize'],
    name="",
    histnorm='density',
    text=df['FamilySize'].value_counts(),
    marker_color=px.colors.diverging.RdBu),
row=7, col=1)

fig.update_layout(height=1600,
    width=800,
    showlegend=False,
    title_text="Distribution of data",
    title_x=0.5,
    titlefont={'size': 25, 'family': 'Roboto', 'color': 'white'}, 
    paper_bgcolor="black",
    plot_bgcolor="black",
    font_color="white")

fig.update_yaxes(showgrid=False)

fig.update_traces(marker_line_color='black',
                  marker_line_width=2, textfont_color='white')

fig.update_annotations(font={'color': '#6bddff'})

fig.show()


In [13]:
def histogram_summary(column):
    print("Summary for", column)
    
    data = df[column].values
    
    data = data[~np.isnan(data)].astype(int)

    mean = np.mean(data)
    median = np.median(data)
    mode = np.argmax(np.bincount(data))

    range1 = np.ptp(data)
    iqr = np.percentile(data, 75) - np.percentile(data, 25)
    std = np.std(data)

    print("Mean :", mean)
    print("Median:", median)
    print("Mode:", mode)
    print("Range:", range1)
    print("Interquartile range:", iqr)
    print("Standard deviation:", std, "\n")

histogram_summary("Age")
histogram_summary("FamilySize")

Summary for Age
Mean : 29.544332210998878
Median: 29.0
Mode: 29
Range: 80
Interquartile range: 13.0
Standard deviation: 13.006473346327034 

Summary for FamilySize
Mean : 1.904601571268238
Median: 1.0
Mode: 1
Range: 10
Interquartile range: 1.0
Standard deviation: 1.6125528671095162 



### Goal : Correlation Among All Features

In [14]:
corr = df.corr(method='pearson')

mask = np.triu(np.ones_like(corr, dtype=bool))

masked_corr = corr.mask(mask)

fig = px.imshow(masked_corr,
                title='Correlations Among All Features',
                height=700, width=700)

fig.update_traces(text=corr.values.round(2),
                  hovertemplate='Feature 1: %{y}<br>Feature 2: %{x}<br>Correlation: %{text}',
                  colorbar=dict(title="Correlation"))

for i in range(len(masked_corr)):
    for j in range(len(masked_corr)):
        if i != j and not pd.isnull(masked_corr.iloc[i, j]):
            color = 'white' if float(masked_corr.iloc[i, j]) < 0.5 else 'black'
            fig.add_annotation(x=j, y=i, text=str(masked_corr.iloc[i, j].round(2)),
                               showarrow=False, font=dict(color=color))

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

fig.show()

### Distribution of Survivors and Non-Survivors

In [15]:
color_map = {0: '#ef553b', 1: '#00cc96'}
columns = ['Sex', 'Pclass', 'Age', 'FamilySize', 'SibSp', 'Parch']
fig = make_subplots(rows=3, cols=2, subplot_titles=columns)

for i, col in enumerate(columns):
    text = True if col in ['Sex', 'Pclass'] else False
    hist = px.histogram(df, x=col, color='Survived', text_auto=text, barmode="group", opacity=0.7, color_discrete_map=color_map)
    hist.update_traces(textfont_color='white')
    fig.add_trace(hist.data[0], row=(i//2)+1, col=(i % 2)+1) 
    fig.add_trace(hist.data[1], row=(i//2)+1, col=(i % 2)+1)

fig.update_layout(
    showlegend=False,
    width=900,
    height=900,
    title='Distribution of Survivors and Non-Survivors'
)

fig.show()

In [16]:
def age_group(age):
    if age < 18:
        return 'Child'
    elif age < 60:
        return 'Adult'
    else:
        return 'Senior'
    
df['AgeGroup'] = df['Age'].apply(age_group)

variables = ['Sex', 'Pclass', 'AgeGroup', 'FamilySize', 'SibSp', 'Parch']

for var in variables:
    dist = df.groupby([var, 'Survived'])['Survived'].count()
    print(pd.DataFrame(dist))

                 Survived
Sex    Survived          
female 0               81
       1              233
male   0              468
       1              109
                 Survived
Pclass Survived          
1      0               80
       1              136
2      0               97
       1               87
3      0              372
       1              119
                   Survived
AgeGroup Survived          
Adult    0              478
         1              274
Child    0               52
         1               61
Senior   0               19
         1                7
                     Survived
FamilySize Survived          
1          0              374
           1              163
2          0               72
           1               89
3          0               43
           1               59
4          0                8
           1               21
5          0               12
           1                3
6          0               19
           1          

### Goal : Distribution of Age by Sex and Survival status

In [17]:
fig = px.violin(df, y="Age", x="Sex", 
                title = "Distribution of age by sex and survival status", 
                color="Survived", box=True, points="all", 
                width=800, height=500)
fig.show()

In [18]:
grouped = df.groupby(["Sex", "Survived"])
stats = grouped["Age"].agg(["mean", "median", "std", "quantile"])
print(pd.DataFrame(stats))

                      mean     median        std   quantile
Sex    Survived                                            
female 0         26.023272  29.000000  12.234723  29.000000
       1         28.979263  29.699118  13.032597  29.699118
male   0         31.175224  29.699118  12.350532  29.699118
       1         27.631705  29.699118  15.257584  29.699118


### Goal : Relationship between Age, Fare, Pclass and Survival

In [19]:
color_sequence = ['#cd1e1e', '#00FF00']
fig = px.scatter_3d(df, x="Age", y="Fare", z="Pclass", color="Survived", color_continuous_scale=color_sequence, title="Relationship between Age, Fare, Pclass, and Survival", height=700, width=700)
fig.show()

### Goal : Survival Rate of Passengers by Cabin Letter

In [20]:
survival_rate = df.groupby("CabinLetter")["Survived"].mean()
survival_rate = pd.DataFrame(survival_rate).reset_index()

fig = go.Figure(data=[go.Pie(labels=survival_rate["CabinLetter"],
                             values=survival_rate["Survived"],
                             textinfo='label+percent',
                             hole=.3,
                             textfont={'color': 'dark blue', 'family': 'Arial Black', 'size': 15})])

fig.update_layout(title="Survival Rate of Passengers by Cabin Letter",
                  showlegend=False,
                  width=1000,
                  height=600,
                  colorway=['rgb(128, 0, 128)'])
fig.show()

### Goal : Survival Rate of Passengers by Title and Passenger Class

In [21]:
df['title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

df_grouped = df.groupby(['title', 'Pclass'])['Survived'].mean().reset_index()

fig = px.bar(df_grouped, x='title', y='Survived', color='Pclass', barmode='stack', labels={'title':'Title', 'Survived':'Survival Rate'}, title = "Survival Rate of Passengers by Title and Passenger Class")
fig.show()

In [22]:
df_grouped.sort_values(by='Survived', ascending=False)

Unnamed: 0,title,Pclass,Survived
25,Sir,1,1.0
10,Master,2,1.0
2,Countess,1,1.0
23,Ms,2,1.0
16,Mme,1,1.0
15,Mlle,1,1.0
7,Lady,1,1.0
9,Master,1,1.0
20,Mrs,1,0.97619
12,Miss,1,0.956522


### Goal : Passenger flow from embarkation to survival status

In [23]:
nodes = dict(pad=15, thickness=20, line=dict(color="black", width=0.5), label=["Southampton", "Cherbourg", "Queenstown", "3rd Class", "2nd Class", "1st Class", "Survived", "Died"], color=["#336699", "#FF9933", "#CC66CC", "#669966", "#FF99CC ", "#FF99CC", "#00AA00", "#AA0000"])

links = dict(source=[], target=[], value=[], color=[])

def get_count(condition):
    return df[condition].shape[0]

for i, port in enumerate(["S", "C", "Q"]):
    for j, pclass in enumerate([3, 2, 1]):
        links["source"].append(i)
        links["target"].append(3 + j)
        links["value"].append(get_count((df["Embarked"] == port) & (df["Pclass"] == pclass)))
        links["color"].append(nodes["color"][i])

for k, pclass in enumerate([3, 2, 1]):
    for l, survived in enumerate([1, 0]):
        links["source"].append(3 + k)
        links["target"].append(6 + l)
        links["value"].append(get_count((df["Pclass"] == pclass) & (df["Survived"] == survived)))
        links["color"].append(nodes["color"][3 + k])

fig = go.Figure(data=[go.Sankey(node=nodes, link=links)])

fig.update_layout(title_text="Passenger flow from embarkation to survival status", width = 900)

fig.show()