# ETL Pipeline for Immigration Data

This notebook demonstrates an ETL (Extract, Transform, Load) pipeline for immigration data. The pipeline includes:

1. Load the full raw immigration data from a CSV file
2. Optionally add a new record to the data
3. Transform both datasets (enrichment, structural changes, categorization)
4. Extract and transform only the latest (incremental) records
5. Save and display the results as tables


In [1]:
import pandas as pd
from tabulate import tabulate

# Load the dataset.
# The path is written as a raw string: it contains backslash sequences such as
# "\C", "\E", "\R" and "\I" that are not valid escapes, and a plain literal
# triggers an invalid-escape warning on recent Python versions. The resulting
# string value is unchanged.
file_path = r"K:\Code Projects\Cloned_Projects_From_Github\ETL_Extract_Justice_Chawanda_670444\Raw_Data\Immigration_Data.csv"
data = pd.read_csv(file_path)

# Display basic stats: the frame's dimensions followed by its first five rows.
row_count, column_count = data.shape
print(f"Rows: {row_count}, Columns: {column_count}")
print("Sample data (first 5 rows):")
print(tabulate(data.head(), headers='keys', tablefmt='grid'))

Rows: 105, Columns: 9
Sample data (first 5 rows):
+----+----------------+-------------------+--------------+-----------------+-----------------------+----------------+--------------------+------------------+------------------+
|    | immigrant_id   | passport_number   | name         | date_of_birth   | contact               | country        | purpose_of_visit   | payment_status   | timestamp        |
|  0 | IM0001         | A12345678         | Emily Smith  | 27/11/1989      | emily.smith@email.com | United Kingdom | Tourism            | Paid             | 01/06/2025 00:00 |
+----+----------------+-------------------+--------------+-----------------+-----------------------+----------------+--------------------+------------------+------------------+
|  1 | IM0002         | B23456789         | John Doe     | 10/05/1972      | johndoe@email.com     | United States  | nan                | Paid             | 01/06/2025 02:16 |
+----+----------------+-------------------+--------------+-------

In [None]:
# ==============================================================================================================
# Imports and Setup
# ==============================================================================================================
import os
from datetime import datetime
import numpy as np
import pandas as pd
from tabulate import tabulate

# ==============================================================================================================
# Ensure Output Directory Exists
# ==============================================================================================================
os.makedirs('Tranformed', exist_ok=True)

# ==============================================================================================================
# Load and Clean Raw Data
# ==============================================================================================================
# Raw string: the Windows path contains backslash sequences ("\C", "\E", "\R", "\I")
# that are not valid escapes; a plain literal triggers an invalid-escape warning
# on recent Python versions. The string value itself is unchanged.
raw_data_path = r"K:\Code Projects\Cloned_Projects_From_Github\ETL_Extract_Justice_Chawanda_670444\Raw_Data\Immigration_Data.csv"
raw_data = pd.read_csv(raw_data_path)
raw_data = raw_data.dropna()  # Drop records with missing values in raw_data

# ==============================================================================================================
# Load or Initialize Transformed Data
# ==============================================================================================================
# Resume from the previously saved transformed file when one exists; otherwise
# start from a copy of the cleaned raw data.
transformed_full_path = 'Tranformed/Tranformed_Full.csv'
if os.path.exists(transformed_full_path):
    data = pd.read_csv(transformed_full_path)
    data = data.dropna()  # Drop records with missing values in data if loaded from file
else:
    data = raw_data.copy()

# ==============================================================================================================
# Add Derived Columns (age, country_category)
# ==============================================================================================================
# Add age column if not present
if 'date_of_birth' in data.columns and 'age' not in data.columns:
    # The sample rows show dates written day/month/year (e.g. 27/11/1989), so
    # parse day-first. Without this, ambiguous values such as 10/05/1972 can be
    # read month-first or coerced to NaT, depending on the pandas version.
    data['date_of_birth'] = pd.to_datetime(
        data['date_of_birth'], dayfirst=True, errors='coerce'
    )
    today = pd.to_datetime('today')
    # Age in whole years, approximated as elapsed days divided by 365.
    data['age'] = (today - data['date_of_birth']).dt.days // 365

# Map a country name to its region label
def country_to_category(country):
    """Return the region label for ``country``.

    Only an exact match against one of the names listed below is recognised;
    every other value (different spelling or case, missing value, unknown
    country) yields ``'Other'``.
    """
    # Checked in order; the first region whose member list contains the
    # country wins.
    regions = (
        ('Africa', ('Nigeria', 'Kenya', 'South Africa', 'Egypt', 'Ghana', 'Malawi', 'Zimbabwe')),
        ('Europe', ('United Kingdom', 'France', 'Germany', 'Italy', 'Spain')),
        ('Asia', ('China', 'India', 'Japan', 'Pakistan', 'Bangladesh')),
        ('North America', ('United States', 'Canada', 'Mexico')),
        ('South America', ('Brazil', 'Argentina', 'Colombia')),
        ('Australia', ('Australia', 'New Zealand')),
    )
    for label, members in regions:
        if country in members:
            return label
    return 'Other'

# Derive the region label for every row, unless the column is already there
# (as it is when the data was loaded from a previously saved transformed file).
category_missing = 'country_category' not in data.columns
if category_missing and 'country' in data.columns:
    data['country_category'] = data['country'].map(country_to_category)

# ==============================================================================================================
# Optionally Add a New Record
# ==============================================================================================================
# Ask whether a record should be appended; "yes" or "y" (any case, surrounding
# whitespace ignored) means yes, anything else means no.
add_record = input("Do you want to add a new record? (yes/no): ").strip().lower()
if add_record in ('yes', 'y'):
    # Generate a random date_of_birth between 1970-01-01 and 2010-12-31, both
    # ends inclusive. A local generator is used so global NumPy random state is
    # left untouched.
    rng = np.random.default_rng()
    start_date = pd.to_datetime('1970-01-01')
    end_date = pd.to_datetime('2010-12-31')
    random_days = int(rng.integers(0, (end_date - start_date).days, endpoint=True))
    random_dob = (start_date + pd.to_timedelta(random_days, unit='D')).date()

    # Collect new record details from the user. Surrounding whitespace is
    # stripped so that, for example, " Kenya" still maps to its region.
    new_record = {
        "immigrant_id": input("Enter immigrant ID: ").strip(),
        "passport_number": input("Enter passport number: ").strip(),
        "name": input("Enter name: ").strip(),
        "country": input("Enter country: ").strip(),
        "date_of_birth": str(random_dob),
        "purpose_of_visit": input("Enter purpose of visit: ").strip(),
        "contact": input("Enter contact: ").strip(),
        "payment_status": input("Enter payment status: ").strip(),
        # Same day/month/year layout as the timestamps already in the data
        # (e.g. "01/06/2025 00:00"), so the column stays in a single format.
        "timestamp": datetime.now().strftime("%d/%m/%Y %H:%M")
    }

    # Calculate age for the new record (whole years, approximated as days // 365,
    # matching how the age column is derived for existing rows).
    today = pd.to_datetime('today')
    dob = pd.to_datetime(new_record['date_of_birth'], errors='coerce')
    new_record['age'] = (today - dob).days // 365 if pd.notnull(dob) else None

    # Assign country_category for the new record
    new_record['country_category'] = country_to_category(new_record['country'])

    # Append the new record to the dataset
    data = pd.concat([data, pd.DataFrame([new_record])], ignore_index=True)

    print("New record added successfully and timestamp updated")
else:
    print("No new record added.")

# ==============================================================================================================
# Save and Display Transformed Full Data
# ==============================================================================================================
# Write the complete transformed dataset back to the file it is loaded from at
# the start of this cell.
data.to_csv(transformed_full_path, index=False)

# Show only the first rows, as the heading announces; the full dataset is in
# the CSV written above.
print("\nHead of Transformed Full Data:")
print(tabulate(data.head(), headers='keys', tablefmt='grid'))

# Add blank lines for spacing between tables
print("\n\n\n")

# ==============================================================================================================
# Generate and Update Incremental Data with All New Records
# ==============================================================================================================
# Incremental data = transformed records whose immigrant_id does not appear in
# the raw data, keeping the first occurrence of each immigrant_id.
if 'immigrant_id' in data.columns and 'immigrant_id' in raw_data.columns:
    is_new = ~data['immigrant_id'].isin(raw_data['immigrant_id'])
    incremental = data[is_new].drop_duplicates(subset=['immigrant_id'])
else:
    incremental = data.iloc[0:0]  # empty if no unique id

# The incremental file is overwritten on every run with all new records found
# above (nothing is appended to an earlier incremental file).
incremental.to_csv('Tranformed/Transformed_Incremental.csv', index=False)

# Display the head of the incremental data only when there is something to show;
# the complete set is in the CSV written above.
if not incremental.empty:
    print("Head of Transformed Incremental Data:")
    print(tabulate(incremental.head(), headers='keys', tablefmt='grid'))
else:
    print("No incremental data available.")

# =============================================================================================================
# Record Last Extraction Timestamp
# =============================================================================================================
# Build the status line first, then write it in a single place. When there is
# data, the line carries the timestamp of the last row of the dataset.
has_rows = not data.empty
if has_rows:
    last_timestamp = data['timestamp'].iloc[-1]
    status_line = f"Last extraction timestamp recorded: {last_timestamp}"
else:
    status_line = 'No data available.'

with open('last_extraction.txt', 'w', encoding='utf-8') as f:
    f.write(status_line)

if not has_rows:
    print("No data available to record timestamp.")

No new record added.

Head of Transformed Full Data:
+-----+----------------+-------------------+------------------+-----------------+---------------------------+----------------+--------------------+------------------+------------------+-------+--------------------+
|     | immigrant_id   | passport_number   | name             | date_of_birth   | contact                   | country        | purpose_of_visit   | payment_status   | timestamp        |   age | country_category   |
|   0 | IM0001         | A12345678         | Emily Smith      | 27/11/1989      | emily.smith@email.com     | United Kingdom | Tourism            | Paid             | 01/06/2025 00:00 |    35 | Europe             |
+-----+----------------+-------------------+------------------+-----------------+---------------------------+----------------+--------------------+------------------+------------------+-------+--------------------+
|   1 | IM0004         | D45678901         | Maria Garcia     | 19/03/1984      | maria