In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load sample data: read the Titanic CSV from the datasciencedojo/datasets
# GitHub repository into the raw frame `df`
TITANIC_CSV_URL = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df = pd.read_csv(TITANIC_CSV_URL)

## YOUR TASKS 

### Task 1: Inspect the data
* What's the shape?
* What data types?
* How many missing values per column?

In [10]:
# 1.1: What's the shape?
# df.shape is a (rows, columns) tuple; unpack it once for the report line
n_rows, n_cols = df.shape
print("Dataset shape:")
print(f"Rows: {n_rows}, Columns: {n_cols}")
print()

Dataset shape:
Rows: 891, Columns: 12



In [11]:
# 1.2: What data types?
# One dtype per column; the trailing empty string reproduces the blank
# spacer line after the listing
column_types = df.dtypes
print("Data types:", column_types, "", sep="\n")

Data types:
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object



In [12]:
# 1.3: How many missing values per column?
# isna() flags each missing cell; summing the flags counts them per column
missing_per_column = df.isna().sum()
print("Missing values:")
print(missing_per_column)
print()

Missing values:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64



In [8]:
# Preview the first five rows to get a feel for the raw values
first_rows = df.head(5)
print("First 5 rows:")
print(first_rows)

First 5 rows:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN

### Task 2: Basic Cleaning
* Drop rows where age is missing
* Fill missing embarked with most common value
* Standardize Name column (remove extra spaces)

In [23]:
# Work on a separate (deep) copy so the original frame `df` is never modified
df_clean = df.copy(deep=True)

In [24]:
# 2.1: Drop rows where Age is missing
# Record the row count on either side of the drop so the report shows the impact
rows_before = len(df_clean)
df_clean = df_clean.dropna(subset=['Age'])
rows_after = len(df_clean)

print(f"Before dropping Age nulls: {rows_before} rows.")
print(f"After dropping Age nulls: {rows_after} rows.")
# The dropped count is measured against the original, untouched frame `df`
print(f"Dropping {len(df) - rows_after} rows.\n")

Before dropping Age nulls: 891 rows.
After dropping Age nulls: 714 rows.
Dropping 177 rows.



In [25]:
# 2.2: Fill missing Embarked with most common value
# Step 1: tabulate the existing Embarked values to see which one dominates
embarked_counts = df_clean['Embarked'].value_counts()
print("Embarked value counts:")
print(embarked_counts)
print()

Embarked value counts:
Embarked
S    554
C    130
Q     28
Name: count, dtype: int64



In [27]:
# Step 2: fill the gaps with the most frequent port ('S' in the counts above).
# mode() returns the most frequent value(s) as a Series; take the first entry.
embarked_modes = df_clean['Embarked'].mode()
most_common_port = embarked_modes.iloc[0]
print(f"Most common port: {most_common_port}")
df_clean['Embarked'] = df_clean['Embarked'].fillna(value=most_common_port)

Most common port: S


In [28]:
# Confirm the fill worked: the remaining null count should print as 0
remaining_missing = df_clean['Embarked'].isna().sum()
print(f"Missing Embarked after fill: {remaining_missing}\n")

Missing Embarked after fill: 0



In [29]:
# 2.3: Standardize Name column (remove extra spaces)
# Look at a few raw names first, as a baseline for the cleaning step
names_preview = df_clean['Name'].head(3)
print("Sample names before cleaning:")
print(names_preview)
print()

Sample names before cleaning:
0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
Name: Name, dtype: object



In [30]:
# Normalise whitespace in the Name column:
#   1. strip() trims leading/trailing whitespace;
#   2. every run of one or more whitespace characters inside the name is
#      collapsed to a single space.
# The pattern must be r'\s+' (whitespace class, one or more). The previous
# pattern r's\+' matched a literal "s" followed by a literal "+", so the
# collapse step never changed anything.
df_clean['Name'] = (
    df_clean['Name']
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
)

print("Sample names after cleaning:")
print(df_clean['Name'].head(3))

Sample names after cleaning:
0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
Name: Name, dtype: object


### Task 3: Create New Features
* Extract title from Name (Mr., Mrs., etc)
* Create age groups (child, adult, senior)
* Create family size (SibSp + Parch + 1)

In [33]:
# 3.1: Extract title from Name (Mr., Mrs., etc.)
# Print a handful of names to see how the title is embedded in the string
name_examples = df_clean['Name'].head()
print("Sample names to understand structure.")
print(name_examples)
print()

Sample names to understand structure.
0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
Name: Name, dtype: object



In [34]:
# Names follow the layout "Last, Title. First".
# The pattern skips the comma and any whitespace after it, then captures
# everything up to (but not including) the next period - i.e. the title.
title_pattern = r',\s*([^\.]+)\.'
df_clean['Title'] = df_clean['Name'].str.extract(title_pattern, expand=False)

In [35]:
# Frequency of each extracted title, most common first
title_counts = df_clean['Title'].value_counts()
print("Titles extracted:")
print(title_counts)
print()

Titles extracted:
Title
Mr              398
Miss            146
Mrs             108
Master           36
Rev               6
Dr                6
Mlle              2
Major             2
Col               2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Mme               1
Don               1
Jonkheer          1
Name: count, dtype: int64



In [36]:
# 3.2: Create age groups (child, adult, senior)
# Check the Age distribution first to choose sensible group boundaries
age_stats = df_clean['Age'].describe()
print("Age statistics:")
print(age_stats)
print()

Age statistics:
count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64



In [40]:
# Define age groups with one boolean mask per bucket:
#   Child  -> Age below 18
#   Adult  -> Age from 18 up to, but not including, 60
#   Senior -> Age 60 and above
age = df_clean['Age']
is_child = age.lt(18)
is_adult = age.ge(18) & age.lt(60)
is_senior = age.ge(60)

df_clean.loc[is_child, 'AgeGroup'] = 'Child'
df_clean.loc[is_adult, 'AgeGroup'] = 'Adult'
df_clean.loc[is_senior, 'AgeGroup'] = 'Senior'

age_group_counts = df_clean['AgeGroup'].value_counts()
print("Age group distributions:")
print(age_group_counts)
print()

Age group distributions:
AgeGroup
Adult     575
Child     113
Senior     26
Name: count, dtype: int64



In [41]:
# 3.3: Create family size (SibSp + Parch + 1)
#   SibSp - siblings/spouses aboard
#   Parch - parents/children aboard
#   1     - the passenger themselves
df_clean['FamilySize'] = df_clean['SibSp'].add(df_clean['Parch']).add(1)

In [42]:
# Passenger count per family size, ordered by size rather than by frequency
family_size_counts = df_clean['FamilySize'].value_counts().sort_index()
print("Family size distribution:")
print(family_size_counts)
print()

Family size distribution:
FamilySize
1    404
2    139
3     93
4     27
5     11
6     22
7     12
8      6
Name: count, dtype: int64



In [43]:
# Show each engineered feature next to the source column(s) it was derived from
feature_columns = ['Name', 'Title', 'Age', 'AgeGroup', 'SibSp', 'Parch', 'FamilySize']
print("Sample of new features:")
print(df_clean[feature_columns].head(10))

Sample of new features:
                                                 Name   Title   Age AgeGroup  \
0                             Braund, Mr. Owen Harris      Mr  22.0    Adult   
1   Cumings, Mrs. John Bradley (Florence Briggs Th...     Mrs  38.0    Adult   
2                              Heikkinen, Miss. Laina    Miss  26.0    Adult   
3        Futrelle, Mrs. Jacques Heath (Lily May Peel)     Mrs  35.0    Adult   
4                            Allen, Mr. William Henry      Mr  35.0    Adult   
6                             McCarthy, Mr. Timothy J      Mr  54.0    Adult   
7                      Palsson, Master. Gosta Leonard  Master   2.0    Child   
8   Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)     Mrs  27.0    Adult   
9                 Nasser, Mrs. Nicholas (Adele Achem)     Mrs  14.0    Child   
10                    Sandstrom, Miss. Marguerite Rut    Miss   4.0    Child   

    SibSp  Parch  FamilySize  
0       1      0           2  
1       1      0           2  
2 

In [44]:
# Standardizing Titles: collapsing similar/rare titles
# Start from the full breakdown of raw titles so no value is overlooked
raw_title_counts = df_clean['Title'].value_counts()
print("All titles and their counts:")
print(raw_title_counts)
print()

All titles and their counts:
Title
Mr              398
Miss            146
Mrs             108
Master           36
Rev               6
Dr                6
Mlle              2
Major             2
Col               2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Mme               1
Don               1
Jonkheer          1
Name: count, dtype: int64



In [61]:
# Standardize titles - this is REAL data cleaning!
# Each standardized category lists the raw titles that collapse into it.
title_groups = {
    'Mr': ['Mr'],
    'Miss': [
        'Miss',
        'Ms',    # Treated the same as Miss
        'Mlle',  # Mademoiselle (French for Miss)
    ],
    'Mrs': [
        'Mrs',
        'Mme',   # Madame (French for Mrs)
    ],
    'Master': ['Master'],  # Young boys
    'Professional': [
        'Dr',
        'Rev',   # Reverend
    ],
    'Military': ['Col', 'Major', 'Capt'],
    'Nobility': [
        'Don',
        'Dona',
        'Jonkheer',  # Dutch nobility
        'Sir',
        'Lady',
        'the Countess',
    ],
}

# Invert the grouping into a flat raw-title -> category lookup
title_mapping = {
    raw_title: category
    for category, raw_titles in title_groups.items()
    for raw_title in raw_titles
}

# Titles missing from the lookup come out of map() as NaN
df_clean['Title_Clean'] = df_clean['Title'].map(title_mapping)

In [62]:
# Distribution of the standardized title categories
clean_title_counts = df_clean['Title_Clean'].value_counts()
print("Standardized titles:")
print(clean_title_counts)
print()

Standardized titles:
Title_Clean
Mr              398
Miss            149
Mrs             109
Master           36
Professional     12
Nobility          5
Military          5
Name: count, dtype: int64



In [63]:
# Any raw title absent from the mapping leaves Title_Clean null; collect the
# distinct raw titles on those rows and report them only if there are any
is_unmapped = df_clean['Title_Clean'].isna()
unmapped = df_clean.loc[is_unmapped, 'Title'].unique()
if len(unmapped):
    print(f"⚠️ Unmapped titles: {unmapped}")

### Task 4: Validate
* Check no missing values in critical columns
* Verify data types are correct
* Generate summary statistics

In [50]:
# Banner for the validation section: a 50-character rule above and below the title
banner_rule = '=' * 50
print(banner_rule)
print("VALIDATION REPORT")
print(banner_rule)
print()

VALIDATION REPORT



In [51]:
# 4.1: Check no missing values in critical columns
# These are the columns the cleaning and feature steps are expected to fill
critical_columns = ['Age', 'Embarked', 'Title_Clean', 'AgeGroup', 'FamilySize']
missing_in_critical = df_clean[critical_columns].isna().sum()
print("Missing values in critical columns:")
print(missing_in_critical)
print()

Missing values in critical columns:
Age            0
Embarked       0
Title_Clean    0
AgeGroup       0
FamilySize     0
dtype: int64



In [52]:
# 4.2: Verify data types are correct
# Pick the critical columns' entries out of the frame-wide dtype listing
critical_dtypes = df_clean.dtypes.loc[critical_columns]
print("Data types:")
print(critical_dtypes)
print()

Data types:
Age            float64
Embarked        object
Title_Clean     object
AgeGroup        object
FamilySize       int64
dtype: object



In [53]:
# 4.3: Generate summary statistics
# Descriptive statistics for Age after the null rows were removed
age_summary = df_clean['Age'].describe()
print("Summary statistics for Age:")
print(age_summary)
print()

Summary statistics for Age:
count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64



In [54]:
# Descriptive statistics for the engineered FamilySize feature
family_size_summary = df_clean['FamilySize'].describe()
print("Summary statistics for FamilySize:")
print(family_size_summary)
print()

Summary statistics for FamilySize:
count    714.000000
mean       1.943978
std        1.483788
min        1.000000
25%        1.000000
50%        1.000000
75%        2.000000
max        8.000000
Name: FamilySize, dtype: float64



In [56]:
# 4.4: Check for any weird values
# Sanity-check the Age range and flag any negative ages
ages = df_clean['Age']
print("Age range check:")
print(f"Min age: {ages.min()}")
print(f"Max age: {ages.max()}")
print(f"Any negative ages? {ages.lt(0).any()}")
print()

Age range check:
Min age: 0.42
Max age: 80.0
Any negative ages? False



In [57]:
# Sanity-check the FamilySize range (it is SibSp + Parch + 1, so never below 1)
family_sizes = df_clean['FamilySize']
print("Family size range check:")
print(f"Min family size: {family_sizes.min()}")
print(f"Max family size: {family_sizes.max()}")
print()

Family size range check:
Min family size: 1
Max family size: 8



In [58]:
# 4.5: Final row count
# shape gives (rows, columns) of the cleaned frame in a single lookup
final_rows, final_cols = df_clean.shape
print(f"Final dataset: {final_rows} rows, {final_cols} columns.")
print()

Final dataset: 714 rows, 16 columns.



In [59]:
# Closing marker for the validation section
completion_message = "✅ Validation complete!"
print(completion_message)

✅ Validation complete!


### Task 5: Document
* Write markdown cells explaining each step
* Note any decisions made
* List any assumptions

# Titanic Data Cleaning Pipeline

**Author:** Miles Heslen
**Date:** November 10, 2025
**Purpose:** Practice data cleaning for VCI internship preparation

## Dataset
- **Source:** Kaggle Titanic dataset, loaded from the `datasciencedojo/datasets` GitHub repository
- **Original Size:** 891 passengers, 12 columns
- **Final Size:** 714 passengers, 16 columns

## Cleaning Steps Performed

### 1. Data Inspection
- Identified 891 passengers with 12 features
- Found missing values in: Cabin (687), Age (177), Embarked (2)

### 2. Data Cleaning
- **Dropped rows:** Removed 177 passengers with missing Age (20% of data)
  - *Decision rationale:* Age is critical for analysis; filling would introduce bias
- **Filled values:** Embarked nulls filled with 'S' (Southampton - most common port)
- **Standardized:** Removed whitespace from Name column

### 3. Feature Engineering
- **Title Extraction:** Extracted 17 unique titles from Name field
  - Standardized into 7 categories: Mr, Miss, Mrs, Master, Professional, Military, Nobility
  - Collapsed variations (Ms -> Miss, Mlle -> Miss, Mme -> Mrs)
- **Age groups:** Created 3 categories
  - Child (<18): 113 passengers
  - Adult (18-59): 575 passengers
  - Senior (≥60): 26 passengers
- **Family size:** Combined SibSp + Parch + 1
  - Range: 1-8 passengers
  - Most common: Solo travellers (404 passengers)

### 4. Validation Results
- No missing values in critical columns
- All data types correct
- No anomalous values detected
- Age range: 0.42 - 80 years
- Family size range: 1 - 8 people

## Key Insights
- 20% data loss from Age null removal (acceptable for practice)
- Southampton was the primary embarkation point (556 of the 714 passengers in the cleaned data, including the 2 filled values)
- Majority were adults traveling alone
- Title reveals social class (nobility, military, professional vs common)

## Next Steps
- Could explore filling Age instead of dropping
- Could analyze survival rates by new features
- Could create more features (Cabin deck, ticket class)

In [65]:
# Save the cleaned dataset to CSV in the working directory.
# index=False leaves the row index out of the file.
output_path = 'titanic_cleaned.csv'
df_clean.to_csv(path_or_buf=output_path, index=False)
print("Cleaned data saved to 'titanic_cleaned.csv'")

Cleaned data saved to 'titanic_cleaned.csv'


In [66]:
# Build a data dictionary: one row per column of the cleaned frame, giving its
# dtype, null/non-null counts and the first three values as examples.
# Note: the dictionary is only printed here; it is not written to disk.
sample_values = [df_clean[col].head(3).tolist() for col in df_clean.columns]

data_dict = pd.DataFrame({
    'Column': df_clean.columns,
    'Type': df_clean.dtypes,
    'Non-Null Count': df_clean.count(),
    'Null Count': df_clean.isna().sum(),
    'Sample Values': sample_values,
})

print("\nData Dictionary:")
print(data_dict)


Data Dictionary:
                  Column     Type  Non-Null Count  Null Count  \
PassengerId  PassengerId    int64             714           0   
Survived        Survived    int64             714           0   
Pclass            Pclass    int64             714           0   
Name                Name   object             714           0   
Sex                  Sex   object             714           0   
Age                  Age  float64             714           0   
SibSp              SibSp    int64             714           0   
Parch              Parch    int64             714           0   
Ticket            Ticket   object             714           0   
Fare                Fare  float64             714           0   
Cabin              Cabin   object             185         529   
Embarked        Embarked   object             714           0   
Title              Title   object             714           0   
AgeGroup        AgeGroup   object             714           0   
FamilyS