# In [None]:
# Importing necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
# The Titanic dataset can be downloaded from https://www.kaggle.com/c/titanic/data;
# the relative path below is resolved against the current working directory.
df = pd.read_csv('titanic.csv')

# Show the first few rows of the dataset.
# Printed explicitly: a bare expression is only echoed when it is the last
# statement of a notebook cell, and never when the file runs as a script.
print(df.head())

# Basic information about the dataset (info() writes its own output)
df.info()

# Number of missing values per column
print(df.isnull().sum())

# Fill missing values in the 'Age' column with the column median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['Age']: that chained in-place form operates
# on an intermediate object, emits a FutureWarning on pandas 2.x and leaves
# df unchanged once Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].median())

# Drop the rows where 'Embarked' is missing.
# Called directly on df (not on a column selection), so inplace is safe here.
df.dropna(subset=['Embarked'], inplace=True)

# Fill missing values in 'Cabin' with the placeholder 'Unknown'
# (assigned back for the same reason as 'Age' above).
df['Cabin'] = df['Cabin'].fillna('Unknown')

# Exploratory visualizations: one figure per question, each shown before the
# next one is drawn.

# 1. Number of passengers per value of 'Survived'
survival_ax = sns.countplot(data=df, x='Survived')
survival_ax.set_title('Survival Count')
plt.show()

# 2. Number of passengers per passenger class
class_ax = sns.countplot(data=df, x='Pclass')
class_ax.set_title('Passenger Class Distribution')
plt.show()

# 3. 'Survived' aggregated per passenger class (barplot's default estimator)
class_rate_ax = sns.barplot(data=df, x='Pclass', y='Survived')
class_rate_ax.set_title('Survival Rate by Passenger Class')
plt.show()

# 4. Histogram of passenger ages with a KDE curve overlaid
age_ax = sns.histplot(data=df, x='Age', kde=True)
age_ax.set_title('Age Distribution of Passengers')
plt.show()

# 5. 'Survived' aggregated per value of 'Sex'
sex_rate_ax = sns.barplot(data=df, x='Sex', y='Survived')
sex_rate_ax.set_title('Survival Rate by Gender')
plt.show()

# 6. Histogram of fares with a KDE curve overlaid
fare_ax = sns.histplot(data=df, x='Fare', kde=True)
fare_ax.set_title('Fare Distribution')
plt.show()

# 7. Correlation heatmap
# Correlation is only defined for numeric data, so restrict the frame to its
# numeric columns first. Calling df.corr() on the full frame raises a
# ValueError on pandas >= 2.0 because of the string columns (e.g. 'Sex',
# 'Cabin', 'Embarked'); older versions silently dropped them.
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(10, 6))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Summary statistics
# Printed explicitly so the table is also shown when this file runs as a
# plain script, where the value of a bare expression is discarded.
print(df.describe())
