# Welcome to a Titanic Data Analysis

![Memorial to the passengers of the Titanic](fotos/memorial.jpg)

Welcome to the Titanic Data Analysis Project! This project involves cleaning, transforming, and visualizing data from the Titanic dataset, culminating in a user-friendly Streamlit application to present the results.

The RMS Titanic was a British passenger liner that embarked on its maiden voyage from Southampton to New York City on April 10, 1912. Renowned for its luxury and deemed "unsinkable," the ship tragically struck an iceberg on April 14, 1912, and sank in the early hours of April 15. Over 1,500 of the 2,224 passengers and crew lost their lives in one of the deadliest maritime disasters in history. This project aims to analyze the data surrounding the Titanic's passengers to uncover insights and stories from this historic event.


## Import libraries and load the data

In [1]:
#import libraries for data analysis and visualization
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.subplots as sp
from plotly.subplots import make_subplots

In [2]:
# Load the raw Titanic passenger data into a DataFrame.
# NOTE(review): this is an absolute path on one specific Windows machine, so the
# notebook cannot be re-run elsewhere as-is; consider a path relative to the notebook.
df = pd.read_csv(r"C:\Users\Win10\Desktop\UPGRADE\BOOTCAMP_PYTHON\modulos\modulo_1\trabajo_final\csv\titanic.csv")

In [3]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [5]:
# Count missing values per column to see which columns need imputation.
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
# Display the DataFrame as loaded, before any transformation.
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Feature engineering

### Column transformation

In [7]:
# Turn the 0/1 survival flag into readable "No"/"Yes" labels.
survival_labels = {0: "No", 1: "Yes"}
df["Survived"] = df["Survived"].replace(survival_labels)

In [8]:
def _title_from_name(full_name):
    """Return the text between the first comma and the following period of a name."""
    after_surname = full_name.split(',')[1]
    return after_surname.split('.')[0]


# Names are split on the first comma and then on the period, e.g. "Braund, Mr. Owen Harris".
# The extracted title keeps the space that followed the comma (" Mr").
df["Title"] = df["Name"].map(_title_from_name)

In [9]:
# Keep only the part of the name that follows "Surname, Title.".
# maxsplit=1 keeps everything after the title's period, so a name that contains a
# further period (e.g. a middle initial) is no longer cut off at that point, and
# strip() removes the space left behind after the period.
# NOTE: once this cell has run, "Name" no longer contains the comma, so this cell and
# the title-extraction cell cannot be re-run without reloading the data.
df["Name"] = df["Name"].apply(lambda x: x.split(',', 1)[1].split('.', 1)[1].strip())

In [10]:
# List the distinct titles to decide which variants should be merged.
unique_titles = df['Title'].unique()
unique_titles

array([' Mr', ' Mrs', ' Miss', ' Master', ' Don', ' Rev', ' Dr', ' Mme',
       ' Ms', ' Major', ' Lady', ' Sir', ' Mlle', ' Col', ' Capt',
       ' the Countess', ' Jonkheer'], dtype=object)

In [11]:
# Merge equivalent titles into their common form.
# The replacement is restricted to the "Title" column (a DataFrame-wide replace would
# also rewrite any matching value in other columns), and every title is stripped first
# so that all of them, merged or not, end up without the leading space.
title_aliases = {
    "Mlle": "Miss",
    "Ms": "Miss",
    "Mme": "Mrs",
    "Don": "Mr",
    "Sir": "Mr",
}
df["Title"] = df["Title"].str.strip().replace(title_aliases)

In [12]:
# Scale every fare by 98.4 and round to one decimal.
# NOTE(review): 98.4 is presumably a 1912-to-present price conversion factor - confirm the source.
df["Update_fare"] = df["Fare"].mul(98.4).round(1)

In [13]:
# Family size = siblings/spouses + parents/children + the passenger themselves.
relatives_aboard = df["SibSp"].add(df["Parch"])
df["FamilySize"] = relatives_aboard.add(1)

In [14]:
# SibSp and Parch are now summarised by FamilySize, so drop them.
df = df.drop(["SibSp", "Parch"], axis="columns")

In [15]:
# Display the DataFrame after the column transformations above.
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Cabin,Embarked,Title,Update_fare,FamilySize
0,1,No,3,Owen Harris,male,22.0,A/5 21171,7.2500,,S,Mr,713.4,2
1,2,Yes,1,John Bradley (Florence Briggs Thayer),female,38.0,PC 17599,71.2833,C85,C,Mrs,7014.3,2
2,3,Yes,3,Laina,female,26.0,STON/O2. 3101282,7.9250,,S,Miss,779.8,1
3,4,Yes,1,Jacques Heath (Lily May Peel),female,35.0,113803,53.1000,C123,S,Mrs,5225.0,2
4,5,No,3,William Henry,male,35.0,373450,8.0500,,S,Mr,792.1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,No,2,Juozas,male,27.0,211536,13.0000,,S,Rev,1279.2,1
887,888,Yes,1,Margaret Edith,female,19.0,112053,30.0000,B42,S,Miss,2952.0,1
888,889,No,3,"Catherine Helen ""Carrie""",female,,W./C. 6607,23.4500,,S,Miss,2307.5,4
889,890,Yes,1,Karl Howell,male,26.0,111369,30.0000,C148,C,Mr,2952.0,1


### Handling missing data

In [16]:
# Box plot of the recorded ages for each title.
fig = px.box(data_frame=df, x="Title", y="Age")
fig

In [17]:
# Median age of each title group; used as the replacement for missing ages.
median_ages = df.groupby('Title')['Age'].median()


def impute_age(row):
    """Return the row's age, or the median age of the row's title when the age is missing.

    Reads the notebook-level ``median_ages`` Series, indexed by title.
    """
    age = row['Age']
    return median_ages[row['Title']] if pd.isna(age) else age


# Apply row by row so each passenger is looked up by their own title.
df['Age'] = df.apply(impute_age, axis=1)

In [18]:
# Store ages as whole numbers; the cast drops any fractional part.
df["Age"] = df["Age"].astype("int")

In [19]:
# Passengers with no recorded cabin get an explicit "Unknown" label.
df["Cabin"] = df["Cabin"].fillna("Unknown")

In [20]:
# Passengers with no recorded port of embarkation get an explicit "Unknown" label.
df["Embarked"] = df["Embarked"].fillna("Unknown")

In [21]:
# Confirm that no missing values remain after imputation.
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
Ticket         0
Fare           0
Cabin          0
Embarked       0
Title          0
Update_fare    0
FamilySize     0
dtype: int64

In [22]:
# Count fully duplicated rows.
df.duplicated().sum()

0

In [23]:
# Final overview of columns, non-null counts and dtypes before exporting.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    object 
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    int32  
 6   Ticket       891 non-null    object 
 7   Fare         891 non-null    float64
 8   Cabin        891 non-null    object 
 9   Embarked     891 non-null    object 
 10  Title        891 non-null    object 
 11  Update_fare  891 non-null    float64
 12  FamilySize   891 non-null    int64  
dtypes: float64(2), int32(1), int64(3), object(7)
memory usage: 87.1+ KB


In [24]:
# Write the cleaned data to disk without the index column.
# NOTE(review): this is an absolute path on one specific Windows machine; the export
# fails on any other setup. Consider a path relative to the notebook.
df.to_csv(r"C:\Users\Win10\Desktop\UPGRADE\BOOTCAMP_PYTHON\modulos\modulo_1\trabajo_final\csv\titanic_clean.csv", index=False)

# Graphs

## Passengers

### How many passengers were on the Titanic

In [25]:
# Single-number indicator with the passenger count (non-null PassengerId values).
# `go` is already imported in the notebook's import cell, so it is not re-imported here.
fig = go.Figure()
area = go.Indicator(
    mode="number",
    value=df["PassengerId"].count(),
    name="Passenger on the Titanic",
    # The title is what is rendered above the number, so the figure explains itself.
    title={"text": "Passengers on the Titanic"},
)
fig.add_trace(area)
fig.update_layout(template="plotly_dark")
fig

### Number of passengers by class

In [26]:

# Keep the class as text so it matches the string keys of the colour map below.
df['Pclass'] = df['Pclass'].astype(str)

# One row per class with its passenger count.
class_sizes = df.groupby(['Pclass']).size()
total_passengers_by_class = class_sizes.reset_index(name='Total Passengers')

class_colors = {"1":"#48C9B0 ", "2":"#5499C7", "3":"#AF7AC5 "}
fig = px.bar(
    total_passengers_by_class,
    x='Pclass',
    y='Total Passengers',
    title='Total passengers by class',
    labels={'Pclass':'Class', 'Total Passengers':'Total passengers'},
    template='plotly_dark',
    color='Pclass',
    color_discrete_map=class_colors,
)
fig

In [27]:
# Keep the class as text so it matches the string keys of the colour map below.
df['Pclass'] = df['Pclass'].astype(str)

class_colors = {"1":"#48C9B0 ", "2":"#5499C7", "3":"#AF7AC5 "}
fig = px.pie(
    df,
    names="Pclass",
    title="Passengers by class",
    template='plotly_dark',
    color="Pclass",
    color_discrete_map=class_colors,
)
# Black slice text, fixed square size, then export alongside the notebook.
fig.update_traces(textfont={"color": "black"})
fig.update_layout(width=500, height=500)
fig.write_html("passengers_by_class.html")
fig

### Number of passengers by title

In [28]:

# Passenger count per title, keeping the eight most frequent titles.
title_counts = df.groupby('Title').size().reset_index(name='Total Passengers')
total_passengers_by_title = title_counts.sort_values(by='Total Passengers', ascending=False).head(8)

title_palette = ['#d78f88', '#7f7287', '#5b7a8e', 'silver', 'gold', 'lightblue', 'lightgreen', 'lightcoral']
fig = px.bar(
    total_passengers_by_title,
    x='Title',
    y='Total Passengers',
    title="Number of passengers by title",
    labels={'Title':'Title', 'Total Passengers':'Total passengers'},
    template='plotly_dark',
    color='Title',
    color_discrete_sequence=title_palette,
    height=600,
    width=800,
)
fig.write_html("passxtitulo.html")
fig

### Number of passengers by sex

In [29]:
# Share of male and female passengers.
sex_colors = {"male":"#F4D03F" , "female":"#58D68D " }
fig = px.pie(
    df,
    names="Sex",
    title="Distribution of passengers by sex",
    template="plotly_dark",
    width=500,
    height=500,
    color="Sex",
    color_discrete_map=sex_colors,
)
fig.update_traces(textfont={"color": "black"})
fig.write_html("passxsex.html")
fig

##  Age

### Distribution by age

In [30]:
# Histogram of passenger ages with a box plot in the margin.
fig = px.histogram(
    df,
    x='Age',
    title='Age distribution of passengers',
    template='plotly_dark',
    width=800,
    height=600,
    color_discrete_sequence=px.colors.sequential.RdBu,
    marginal='box',
)
fig.write_html("histograma.html")
fig

### Age distribution by title

In [31]:
# Age spread for each title, one colour per title.
title_palette = ['#d78f88', '#7f7287', '#5b7a8e', 'silver', 'gold', 'lightblue', 'lightgreen', 'lightcoral']
fig = px.box(
    df,
    x="Title",
    y="Age",
    title="Age distribution by title",
    template="plotly_dark",
    width=800,
    height=600,
    color="Title",
    color_discrete_sequence=title_palette,
)
fig.write_html("edadvstitulo.html")
fig

In [32]:
# Passenger count per family size, keeping the eight most common sizes.
family_size_counts = df.groupby('FamilySize').size().reset_index(name='Total Passengers')
total_passengers_by_family_size = family_size_counts.sort_values(by='Total Passengers', ascending=False).head(8)

fig = px.bar(
    total_passengers_by_family_size,
    x="FamilySize",
    y="Total Passengers",
    title="Number of passengers by family size",
    labels={"FamilySize":"Family size", "Total Passengers":"Number of passengers"},
    template="plotly_dark",
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.write_html("passxfamilia.html")
fig

## Distribution of survived passengers

In [33]:
# Share of passengers who survived versus those who did not.
# The figure size is set once here; the previous 800x600 passed to px.pie was
# immediately overridden by a 500x500 update_layout call, so 500x500 is the size
# that was actually rendered and exported.
fig = px.pie(
    df,
    names="Survived",
    title="Survived passengers",
    template="plotly_dark",
    width=500,
    height=500,
    color="Survived",
    color_discrete_map={"No":"#707B7C" , "Yes":"#1ABC9C "},
)
fig.update_traces(textfont=dict(color="black"))
fig.write_html("sobrevivientes.html")
fig

### Distribution of passengers survived by sex

In [34]:
# Treemap nested by survival outcome, then by sex.
survival_colors = {"No":"#707B7C" , "Yes":"#1ABC9C" }
fig = px.treemap(
    df,
    path=["Survived", "Sex"],
    title="Distribution of passengers survived by sex",
    template="plotly_dark",
    width=800,
    height=600,
    color="Survived",
    color_discrete_map=survival_colors,
)
fig.write_html("sobxsex.html")
fig

### Distribution of passengers survived by title

In [35]:
# Treemap nested by survival outcome, then by title.
survival_colors = {"No":"#707B7C" , "Yes":"#1ABC9C" }
fig = px.treemap(
    df,
    path=["Survived","Title"],
    title="Distribution of passengers survived by title",
    template="plotly_dark",
    width=900,
    height=800,
    color="Survived",
    color_discrete_map=survival_colors,
)
fig.write_html("sobrevxtitulo.html")
fig

### Passengers survived by class

In [36]:

# Keep the class as text so it matches the string keys of the colour map below.
df['Pclass'] = df['Pclass'].astype(str)

# Survivors only, then one row per class with the number of survivors.
survived_mask = df['Survived'] == "Yes"
df_survived = df[survived_mask]
sobrevivientes_por_clase = df_survived.groupby('Pclass').size().reset_index(name='Survived')

fig = px.bar(
    sobrevivientes_por_clase,
    x='Pclass',
    y="Survived",
    title='Survived passengers by class',
    template='plotly_dark',
    color='Pclass',
    color_discrete_map={"1":"#48C9B0 ", "2":"#5499C7", "3":"#AF7AC5 "},
)
fig.show()

In [37]:
# Grouped bars: passengers per class, split by survival outcome.
# The class order is derived from the column itself so its values have the same type
# as the data. Earlier cells cast Pclass to str, so the previous integer list
# [1, 2, 3] did not match the column's values.
class_order = sorted(df["Pclass"].unique())
fig = px.histogram(df, x="Pclass", color="Survived",
                   barmode="group",
                   category_orders={"Pclass": class_order, "Survived": ["No", "Yes"]},
                   labels={"Pclass": "Class", "Survived": "Survived"},
                   title="Survived by Class",
                   color_discrete_map= {"No":"#707B7C" , "Yes":"#1ABC9C" }, template="plotly_dark")
fig.write_html("sobrevivientesxclase.html")
fig.show()

In [38]:
def pie_chart_by_class(pclass):
    """Count survivors and non-survivors among the passengers of one class.

    Despite its name, this only prepares the data; the pie itself is drawn by the caller.
    Reads the notebook-level ``df``.

    Args:
        pclass: Class value to select; compared for equality with ``df["Pclass"]``.

    Returns:
        DataFrame with a "Survived" column ("Survived" / "Non-Survived") and a
        "Count" column holding the number of passengers with that outcome.
    """
    in_class = df["Pclass"] == pclass
    counts = df.loc[in_class, "Survived"].value_counts().reset_index()
    counts.columns = ["Survived", "Count"]

    # Replace the Yes/No flags with the labels shown on the chart.
    outcome_labels = {"Yes": "Survived", "No": "Non-Survived"}
    counts["Survived"] = counts["Survived"].map(outcome_labels)
    return counts

# Classes in ascending order; one pie per class, side by side.
class_order = sorted(df["Pclass"].unique())

fig = make_subplots(
    rows=1,
    cols=3,
    specs=[[{'type':'domain'} for _ in range(3)]],
    subplot_titles=[f'Clase {pclass}' for pclass in class_order],
)

# Survivors are drawn in green, everyone else in grey.
survivor_color = '#1ABC9C'
other_color = '#707B7C'

for i, pclass in enumerate(class_order, start=1):
    counts = pie_chart_by_class(pclass)
    slice_colors = [
        survivor_color if survived == 'Survived' else other_color
        for survived in counts["Survived"]
    ]
    class_pie = go.Pie(
        labels=counts["Survived"],
        values=counts["Count"],
        name=f'Clase {pclass}',
        marker_colors=slice_colors,
    )
    fig.add_trace(class_pie, row=1, col=i)

fig.update_layout(title_text="Survived by Class", title_x=0.5, template='plotly_dark', width=800, height=500)
fig.update_traces(textfont={"color": "black"})
fig.write_html("pie_chart_class.html")
fig.show()

## Fare

### Distribution of the fare paid by the passengers

In [39]:
# One point per passenger: updated fare on the y axis, grouped by class on the x axis.
# The title previously mentioned age, but no age is plotted here.
fig=px.strip(df.sort_values(by="Pclass"), x="Pclass", y="Update_fare",
             title="Fare by Class",
             template="plotly_dark",
             labels={"Update_fare":"Fare"}, width=800, height=600,
             color="Pclass", color_discrete_map = {"1":"#48C9B0 ", "2":"#5499C7", "3":"#AF7AC5 "})
fig

In [40]:
# Keep only passengers with a fare above zero; later cells reuse df_pagado.
paid_mask = df["Update_fare"] > 0
df_pagado = df[paid_mask]

class_colors = {"1":"#48C9B0 ", "2":"#5499C7", "3":"#AF7AC5 "}
fig = px.box(
    df_pagado.sort_values(by="Pclass"),
    x="Pclass",
    y="Update_fare",
    color="Pclass",
    title="Fare by Class",
    template="plotly_dark",
    labels={"Update_fare":"Fare updated in Libras", "Pclass":"Class"},
    color_discrete_map=class_colors,
)
fig.write_html("farexclass.html")
fig

### Max fare

In [41]:
# Highest updated fare among passengers with a fare above zero.
max_fare = df_pagado["Update_fare"].max()

fig = go.Figure()
area = go.Indicator(
    mode = "number",
    value = max_fare,
    number = {"prefix": "£", "valueformat": ".0f"},  # pound prefix, no decimals
    name = "Max fare",
    # The title is what is rendered above the number, so the figure explains itself.
    title = {"text": "Max fare"},
)
fig.add_trace(area)
fig.update_layout(template = "plotly_dark")
fig

### Min Fare

In [42]:
# Lowest updated fare among passengers with a fare above zero.
fig = go.Figure()
area = go.Indicator(
    mode = "number",
    value = df_pagado["Update_fare"].min(),
    name = "Min fare",
    number = {"prefix": "£", "valueformat": ".0f"},  # pound prefix, no decimals
    # The title is what is rendered above the number, so the figure explains itself.
    title = {"text": "Min fare"},
)
fig.add_trace(area)
fig.update_layout(template = "plotly_dark")
fig