# Titanic Survival Analysis

Analyzing factors that influenced survival chances during the Titanic shipwreck.

Dataset source: [Kaggle Titanic Dataset](https://www.kaggle.com/datasets/yasserh/titanic-dataset/data)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plot styling: seaborn's 'whitegrid' theme and a default
# figure size (cells that pass their own figsize to plt.subplots override it).
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

In [None]:
# Load seaborn's bundled copy of the Titanic data.
# NOTE(review): the rest of this notebook uses lower-case column names
# ('survived', 'pclass', ...); confirm against the Kaggle CSV before treating
# the two sources as interchangeable.
df = sns.load_dataset('titanic')

# Quick overview: dimensions, column names and the first rows
dataset_shape = df.shape
column_names = df.columns.tolist()
print(f"Dataset shape: {dataset_shape}")
print(f"\nColumns: {column_names}")
df.head()

In [None]:
# Basic statistics
# describe() is called with its default arguments; as the last expression of
# the cell, the resulting summary table is what the notebook renders.
df.describe()

In [None]:
# Number of missing entries in each column (rendered as the cell output)
print("Missing values:")
df.isna().sum()

## 1. Histogram of Survival

In [None]:
# Bar chart of passenger counts per survival outcome; every bar is annotated
# with its count and its share of all passengers.
fig, ax = plt.subplots(figsize=(8, 6))

survival_counts = df['survived'].value_counts()
colors = ['#e74c3c', '#2ecc71']  # Red for died, Green for survived

# Counts are looked up by label (0 / 1) so the bar order is fixed and does not
# depend on the order in which value_counts() returns them.
outcome_labels = ['Did Not Survive (0)', 'Survived (1)']
outcome_counts = [survival_counts[code] for code in (0, 1)]
total_passengers = len(df)

bars = ax.bar(outcome_labels, outcome_counts, color=colors, edgecolor='black')

# Annotate each bar just above its top edge
for bar, count in zip(bars, outcome_counts):
    x_center = bar.get_x() + bar.get_width() / 2
    percent = count / total_passengers * 100
    ax.text(x_center, bar.get_height() + 5,
            f'{count}\n({percent:.1f}%)',
            ha='center', va='bottom', fontsize=12, fontweight='bold')

ax.set_title('Titanic Survival Distribution', fontsize=14, fontweight='bold')
ax.set_xlabel('Survival Status', fontsize=12)
ax.set_ylabel('Number of Passengers', fontsize=12)
# Head-room above the tallest bar so the two-line annotation is not clipped
ax.set_ylim(0, max(survival_counts) + 80)

plt.tight_layout()
plt.show()

overall_rate = df['survived'].mean() * 100
print(f"\nSurvival Rate: {overall_rate:.1f}%")

## 2. Survival Rate by Categorical Features

### 2.1 Survival Rate by Gender

In [None]:
# Survival by gender: survival rate per gender (left panel) and counts of
# survivors / non-survivors per gender (right panel), followed by a summary table.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Colours are keyed by gender label rather than by position. The groupby index
# is sorted (female, male), so a positional colour list silently assigns the
# colours in that order; keying by label keeps this chart consistent with the
# combined gender/class chart in this notebook (female = pink, male = blue).
gender_palette = {'female': '#e91e63', 'male': '#3498db'}
fallback_color = '#95a5a6'  # grey for any label not listed above

# Survival rate by gender
gender_survival = df.groupby('sex')['survived'].mean() * 100
gender_colors = [gender_palette.get(g, fallback_color) for g in gender_survival.index]

ax1 = axes[0]
bars = ax1.bar(gender_survival.index, gender_survival.values,
               color=gender_colors, edgecolor='black')
ax1.set_xlabel('Gender', fontsize=12)
ax1.set_ylabel('Survival Rate (%)', fontsize=12)
ax1.set_title('Survival Rate by Gender', fontsize=14, fontweight='bold')
ax1.set_ylim(0, 100)

# Percentage label just above each bar
for bar, val in zip(bars, gender_survival.values):
    ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 2,
             f'{val:.1f}%', ha='center', fontsize=12, fontweight='bold')

# Count by gender and survival
ax2 = axes[1]
gender_counts = df.groupby(['sex', 'survived']).size().unstack()
gender_counts.plot(kind='bar', ax=ax2, color=['#e74c3c', '#2ecc71'], edgecolor='black')
ax2.set_xlabel('Gender', fontsize=12)
ax2.set_ylabel('Count', fontsize=12)
ax2.set_title('Survival Count by Gender', fontsize=14, fontweight='bold')
# Legend entries follow the unstacked column order (survived = 0, then 1)
ax2.legend(['Did Not Survive', 'Survived'], loc='upper right')
ax2.set_xticklabels(ax2.get_xticklabels(), rotation=0)

plt.tight_layout()
plt.show()

print("\nGender Survival Statistics:")
print(df.groupby('sex')['survived'].agg(['count', 'sum', 'mean']).rename(
    columns={'count': 'Total', 'sum': 'Survived', 'mean': 'Survival Rate'}))

### 2.2 Survival Rate by Passenger Class

In [None]:
# Survival by passenger class: survival rate per class (left panel) and
# outcome counts per class (right panel), followed by a summary table.
class_labels = ['1st Class', '2nd Class', '3rd Class']
class_colors = ['#f1c40f', '#95a5a6', '#cd7f32']
outcome_colors = ['#e74c3c', '#2ecc71']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# --- Left panel: survival rate (%) per class ---
class_survival = df.groupby('pclass')['survived'].mean() * 100
class_rates = class_survival.values

bars = ax1.bar(class_labels, class_rates, color=class_colors, edgecolor='black')
ax1.set_title('Survival Rate by Passenger Class', fontsize=14, fontweight='bold')
ax1.set_xlabel('Passenger Class', fontsize=12)
ax1.set_ylabel('Survival Rate (%)', fontsize=12)
ax1.set_ylim(0, 100)

for bar, rate in zip(bars, class_rates):
    x_center = bar.get_x() + bar.get_width() / 2
    ax1.text(x_center, bar.get_height() + 2,
             f'{rate:.1f}%', ha='center', fontsize=12, fontweight='bold')

# --- Right panel: passenger counts per class, split by outcome ---
class_counts = df.groupby(['pclass', 'survived']).size().unstack()
class_counts.plot(kind='bar', ax=ax2, color=outcome_colors, edgecolor='black')
ax2.set_title('Survival Count by Passenger Class', fontsize=14, fontweight='bold')
ax2.set_xlabel('Passenger Class', fontsize=12)
ax2.set_ylabel('Count', fontsize=12)
ax2.legend(['Did Not Survive', 'Survived'], loc='upper right')
ax2.set_xticklabels(class_labels, rotation=0)

plt.tight_layout()
plt.show()

# Tabular summary of the same figures
class_stats = df.groupby('pclass')['survived'].agg(['count', 'sum', 'mean'])
class_stats = class_stats.rename(
    columns={'count': 'Total', 'sum': 'Survived', 'mean': 'Survival Rate'})
print("\nPassenger Class Survival Statistics:")
print(class_stats)

### 2.3 Survival Rate by Embarkation Port

In [None]:
# Survival by embarkation port: survival rate per port (left panel) and
# outcome counts per port (right panel), followed by a summary table.
port_names = {'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'}


def _port_labels(codes):
    """Translate port codes to full names, leaving unknown codes unchanged."""
    return [port_names.get(code, code) for code in codes]


fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# --- Left panel: survival rate (%) per port ---
embarked_survival = df.groupby('embarked')['survived'].mean() * 100
port_rates = embarked_survival.values

bars = ax1.bar(_port_labels(embarked_survival.index), port_rates,
               color=['#9b59b6', '#1abc9c', '#e67e22'], edgecolor='black')
ax1.set_title('Survival Rate by Embarkation Port', fontsize=14, fontweight='bold')
ax1.set_xlabel('Embarkation Port', fontsize=12)
ax1.set_ylabel('Survival Rate (%)', fontsize=12)
ax1.set_ylim(0, 100)

for bar, rate in zip(bars, port_rates):
    x_center = bar.get_x() + bar.get_width() / 2
    ax1.text(x_center, bar.get_height() + 2,
             f'{rate:.1f}%', ha='center', fontsize=12, fontweight='bold')

# --- Right panel: passenger counts per port, split by outcome ---
embarked_counts = df.groupby(['embarked', 'survived']).size().unstack()
embarked_counts.plot(kind='bar', ax=ax2, color=['#e74c3c', '#2ecc71'], edgecolor='black')
ax2.set_title('Survival Count by Embarkation Port', fontsize=14, fontweight='bold')
ax2.set_xlabel('Embarkation Port', fontsize=12)
ax2.set_ylabel('Count', fontsize=12)
ax2.legend(['Did Not Survive', 'Survived'], loc='upper right')
ax2.set_xticklabels(_port_labels(embarked_counts.index), rotation=0)

plt.tight_layout()
plt.show()

# Tabular summary, indexed by full port name
stats = df.groupby('embarked')['survived'].agg(['count', 'sum', 'mean'])
stats = stats.rename(
    columns={'count': 'Total', 'sum': 'Survived', 'mean': 'Survival Rate'})
stats.index = _port_labels(stats.index)
print("\nEmbarkation Port Survival Statistics:")
print(stats)

## 3. Correlation Matrix

In [None]:
# Correlation heatmap of survival against the numerical features, followed by
# a ranked list of each feature's correlation with survival.

# Select numerical features for correlation analysis
numerical_cols = ['survived', 'pclass', 'age', 'sibsp', 'parch', 'fare']
df_numeric = df[numerical_cols]

# DataFrame.corr() excludes missing values pair by pair, so every coefficient
# uses all rows in which both columns are present. Dropping every incomplete
# row up front would also discard rows for pairs that do not involve the
# missing column, and would make this matrix disagree with the per-factor
# correlations printed by the final summary cell (which drops NaNs per pair).
correlation_matrix = df_numeric.corr()

# Create heatmap
fig, ax = plt.subplots(figsize=(10, 8))

sns.heatmap(correlation_matrix,
            annot=True,
            fmt='.3f',
            cmap='RdYlGn',
            center=0,
            square=True,
            linewidths=0.5,
            cbar_kws={'shrink': 0.8},
            ax=ax)

ax.set_title('Correlation Matrix: Survival vs Numerical Features',
             fontsize=14, fontweight='bold', pad=20)

# Readable tick labels, looked up per column so that they always follow the
# matrix's actual column order instead of a separately maintained list.
display_names = {
    'survived': 'Survived',
    'pclass': 'Passenger Class',
    'age': 'Age',
    'sibsp': 'Siblings/Spouse',
    'parch': 'Parents/Children',
    'fare': 'Fare',
}
labels = [display_names.get(col, col) for col in correlation_matrix.columns]
ax.set_xticklabels(labels, rotation=45, ha='right')
ax.set_yticklabels(labels, rotation=0)

plt.tight_layout()
plt.show()

# Features ranked by absolute correlation with survival, strongest first
print("\nCorrelation with Survival:")
survival_corr = correlation_matrix['survived'].drop('survived').sort_values(key=abs, ascending=False)
for feature, corr in survival_corr.items():
    print(f"  {feature}: {corr:.3f}")

## 4. Additional Analysis: Age Distribution by Survival

In [None]:
# Age and survival: overlaid age histograms per outcome (left panel) and
# survival rate per age group (right panel).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Age distribution by survival
ax1 = axes[0]
# Both histograms are binned over the same value range so that, with the same
# bin count, their bin edges coincide and the overlaid bars are directly
# comparable. Without a shared range each subset is binned over its own
# min/max and the bars are offset against each other.
age_range = (df['age'].min(), df['age'].max())
df.loc[df['survived'] == 0, 'age'].hist(ax=ax1, bins=30, range=age_range, alpha=0.7,
                                        label='Did Not Survive', color='#e74c3c')
df.loc[df['survived'] == 1, 'age'].hist(ax=ax1, bins=30, range=age_range, alpha=0.7,
                                        label='Survived', color='#2ecc71')
ax1.set_xlabel('Age', fontsize=12)
ax1.set_ylabel('Count', fontsize=12)
ax1.set_title('Age Distribution by Survival', fontsize=14, fontweight='bold')
ax1.legend()

# Survival rate by age group
ax2 = axes[1]
# Adds an 'age_group' column to df (kept, as in the original cell). Intervals
# are right-inclusive: (0, 12], (12, 18], (18, 35], (35, 50], (50, 80].
df['age_group'] = pd.cut(df['age'], bins=[0, 12, 18, 35, 50, 80],
                         labels=['Child (0-12)', 'Teen (13-18)', 'Adult (19-35)',
                                 'Middle-aged (36-50)', 'Senior (51+)'])
# 'age_group' is categorical. observed=False keeps all five categories in the
# result (matching the five bar colours below) and states the grouping mode
# explicitly instead of relying on the implicit default.
age_survival = df.groupby('age_group', observed=False)['survived'].mean() * 100

bars = ax2.bar(range(len(age_survival)), age_survival.values,
               color=plt.cm.viridis([0.2, 0.35, 0.5, 0.65, 0.8]), edgecolor='black')
ax2.set_xlabel('Age Group', fontsize=12)
ax2.set_ylabel('Survival Rate (%)', fontsize=12)
ax2.set_title('Survival Rate by Age Group', fontsize=14, fontweight='bold')
ax2.set_xticks(range(len(age_survival)))
ax2.set_xticklabels(age_survival.index, rotation=45, ha='right')
ax2.set_ylim(0, 100)

# Percentage label just above each bar
for bar, val in zip(bars, age_survival.values):
    ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 2,
             f'{val:.1f}%', ha='center', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.show()

## 5. Combined Analysis: Gender and Class

In [None]:
# Grouped bar chart: survival rate (%) per passenger class, with one bar for
# female and one for male passengers in each class, followed by the same
# figures as a table.
fig, ax = plt.subplots(figsize=(10, 6))

survival_by_gender_class = df.groupby(['pclass', 'sex'])['survived'].mean() * 100
survival_pivot = survival_by_gender_class.unstack()

x = range(len(survival_pivot))
width = 0.35
half_width = width / 2

# Female bars sit left of each class position, male bars right of it
female_positions = [pos - half_width for pos in x]
male_positions = [pos + half_width for pos in x]

bars1 = ax.bar(female_positions, survival_pivot['female'], width,
               label='Female', color='#e91e63', edgecolor='black')
bars2 = ax.bar(male_positions, survival_pivot['male'], width,
               label='Male', color='#3498db', edgecolor='black')

ax.set_title('Survival Rate by Gender and Passenger Class', fontsize=14, fontweight='bold')
ax.set_xlabel('Passenger Class', fontsize=12)
ax.set_ylabel('Survival Rate (%)', fontsize=12)
ax.set_xticks(x)
ax.set_xticklabels(['1st Class', '2nd Class', '3rd Class'])
ax.set_ylim(0, 110)  # above 100 so labels on tall bars have room
ax.legend()

# Percentage label just above every bar (female bars first, then male)
for bar in (*bars1, *bars2):
    bar_height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, bar_height + 2,
            f'{bar_height:.1f}%', ha='center', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.show()

print("\nSurvival Rate by Gender and Class:")
print(survival_pivot.round(1))

## 6. Conclusions

### Key Findings on Survival Factors:

**1. Gender (STRONGEST FACTOR)**
- Female passengers had a **~74% survival rate** vs only **~19% for males**
- This reflects the "women and children first" evacuation policy
- Gender is the most significant predictor of survival

**2. Passenger Class (STRONG FACTOR)**
- 1st Class: **~63% survival rate**
- 2nd Class: **~47% survival rate**
- 3rd Class: **~24% survival rate**
- Higher class passengers had better access to lifeboats and information
- Moderate negative correlation (-0.34) between class number and survival

**3. Age (MODERATE FACTOR)**
- Children (0-12 years) had the highest survival rate (~58%)
- Slight negative correlation with survival (-0.07)
- Children were prioritized in evacuation

**4. Fare (MODERATE FACTOR)**
- Positive correlation with survival (~0.26)
- Higher fare = higher class = better survival chances
- Fare is somewhat redundant with passenger class

**5. Embarkation Port (WEAK FACTOR)**
- Cherbourg: ~55% survival rate
- Queenstown: ~39% survival rate  
- Southampton: ~34% survival rate
- Likely reflects class composition of passengers from each port

**6. Family Size (WEAK FACTOR)**
- Siblings/Spouse (SibSp): slight negative correlation (-0.04)
- Parents/Children (Parch): minimal correlation (0.08)
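- These weak linear correlations may mask a non-linear effect: having some family members helped, but large families had lower survival (not directly tested in this notebook)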

### Summary Ranking of Factors:
1. **Gender** - Most influential (women survived at ~4x the rate of men)
2. **Passenger Class** - Very influential (1st class ~2.5x survival of 3rd)
3. **Age** - Moderately influential (children prioritized)
4. **Fare** - Correlated with class
5. **Embarkation Port** - Weak/indirect factor
6. **Family aboard** - Minimal direct impact

In [None]:
# Final summary: headline survival figures, then each factor's correlation
# with survival, listed from largest to smallest absolute value.
heavy_rule = "=" * 60
light_rule = "-" * 60

total_passengers = len(df)
survivor_count = df['survived'].sum()
survival_share = df['survived'].mean()

print(heavy_rule)
print("SURVIVAL ANALYSIS SUMMARY")
print(heavy_rule)
print(f"\nTotal Passengers: {total_passengers}")
print(f"Survived: {survivor_count} ({survival_share*100:.1f}%)")
print(f"Did Not Survive: {total_passengers - survivor_count} ({(1-survival_share)*100:.1f}%)")
print("\n" + light_rule)
print("TOP SURVIVAL FACTORS (by correlation magnitude):")
print(light_rule)

# Work on a copy so the encoded gender column is not added to df itself
df_analysis = df.copy()
df_analysis['is_female'] = df_analysis['sex'].eq('female').astype(int)

factors = ['is_female', 'pclass', 'fare', 'age', 'parch', 'sibsp']
factor_names = ['Gender (Female)', 'Passenger Class', 'Fare', 'Age', 'Parents/Children', 'Siblings/Spouse']

# One correlation per factor; rows missing either value are dropped per pair
correlations = [
    df_analysis[['survived', column]].dropna().corr().iloc[0, 1]
    for column in factors
]

ranked = sorted(zip(factor_names, correlations),
                key=lambda pair: abs(pair[1]), reverse=True)
for name, corr in ranked:
    if corr > 0:
        direction = "POSITIVE"
    else:
        direction = "NEGATIVE"
    print(f"  {name}: {corr:+.3f} ({direction} correlation)")