In [14]:
# This cell merges two raw datasets, temporary residents (TFWP) 2015-2024 and permanent residents 2015-2024, into combined monthly totals.

import pandas as pd
import os

# Locations of the two raw Excel workbooks
file_path_1_latest_2 = '../data/raw/EN_ODP-TR-Work-TFWP_PT_program_sign.xlsx'
file_path_2_latest_2 = '../data/raw/EN_ODP-PR-ProvImmCat.xlsx'

# Read one named sheet from each workbook, in the same order as the paths above
df_1_latest_2, df_2_latest_2 = (
    pd.read_excel(workbook_path, sheet_name=sheet)
    for workbook_path, sheet in (
        (file_path_1_latest_2, 'TR - TFWP Program'),
        (file_path_2_latest_2, 'PR - ImmCat'),
    )
)

# Define a transformation function to process the data
def transform_to_month_year_format_v2(df, prefix):
    """Relabel a raw sheet with prefixed Month-Year columns and numeric values.

    The first two rows of ``df`` are read as header rows: row 0 supplies the
    year of each column and row 1 supplies the month. Every remaining row is
    treated as data.

    Args:
        df: Raw DataFrame whose rows 0 and 1 hold the year and month labels.
            It is left unmodified.
        prefix: Text placed, followed by an underscore, in front of every
            resulting column name.

    Returns:
        A new DataFrame without the two header rows, whose columns are named
        ``"<prefix>_<month>-<year>"`` and whose cells are all converted to
        numbers. Cells that cannot be parsed as a number become NaN.

    Raises:
        IndexError: If ``df`` has fewer than two rows.
    """
    # Rows 0 and 1 carry the labels; they are consumed here, not discarded
    years = df.iloc[0, :].tolist()
    months = df.iloc[1, :].tolist()

    # Construct Month-Year column names
    month_years = [f"{month}-{year}" for month, year in zip(months, years)]

    # Rename a copy of the data rows: assigning to `df.columns` directly would
    # rename the caller's frame as a side effect.
    data = df.iloc[2:].reset_index(drop=True).copy()
    data.columns = month_years

    # Strip thousands separators (literal commas), then coerce to numbers;
    # anything unparseable becomes NaN
    data = data.apply(
        lambda col: pd.to_numeric(
            col.astype(str).str.replace(',', '', regex=False), errors='coerce'
        )
    )

    # Add a prefix to distinguish the datasets
    return data.add_prefix(prefix + '_')

# Run both raw sheets through the same transformation, each tagged with its own prefix
df_1_transformed_v2, df_2_transformed_v2 = (
    transform_to_month_year_format_v2(raw_frame, prefix=tag)
    for raw_frame, tag in ((df_1_latest_2, "TFWP"), (df_2_latest_2, "PR"))
)

# Place the two transformed frames side by side (column-wise concatenation)
merged_df_final_v2 = pd.concat(
    [df_1_transformed_v2, df_2_transformed_v2],
    axis=1,
)

# Month abbreviation -> calendar position, used only to order the rows
month_mapping = dict(zip(
    ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    range(1, 13),
))

# Column totals: one value per prefixed Month-Year column
combined_sums = merged_df_final_v2.sum()

# Build a two-column table, drop the dataset prefixes so that matching
# periods share a label, then add up the rows that share a label.
combined_df = (
    combined_sums
    .rename_axis('Month-Year')
    .reset_index(name='Total Value')
    .assign(**{
        'Month-Year': lambda frame: (
            frame['Month-Year'].str.replace('TFWP_', '').str.replace('PR_', '')
        )
    })
    .groupby('Month-Year')
    .sum()
    .reset_index()
)

# Split each label into temporary sort keys: numeric month and integer year
combined_df[['Month', 'Year']] = combined_df['Month-Year'].str.split('-', expand=True)
combined_df = combined_df.assign(
    Month=combined_df['Month'].map(month_mapping),
    Year=combined_df['Year'].astype(int),
)

# Order chronologically, renumber the rows, and discard the sort keys
combined_df_sorted = (
    combined_df
    .sort_values(by=['Year', 'Month'])
    .reset_index(drop=True)
    .drop(columns=['Month', 'Year'])
)




In [15]:
# Display the sorted monthly totals as this cell's output
combined_df_sorted

Unnamed: 0,Month-Year,Total Value
0,Jan-2015,18690
1,Feb-2015,21055
2,Mar-2015,29955
3,Apr-2015,33580
4,May-2015,32975
...,...,...
109,Feb-2024,51585
110,Mar-2024,52395
111,Apr-2024,68645
112,May-2024,68815


In [16]:
# Export the sorted combined DataFrame to a CSV file in the processed-data folder
directory_path = '../data/processed/'

# exist_ok=True creates the folder (and any missing parents) when absent and
# does nothing when it already exists, so no separate existence check is needed.
os.makedirs(directory_path, exist_ok=True)

# Path to save the CSV file
csv_file_path = os.path.join(directory_path, 'immigration_combined_monthly_year_data.csv')

# index=False keeps the positional row index out of the file
combined_df_sorted.to_csv(csv_file_path, index=False)

print(f"Data has been exported to: {csv_file_path}")

Data has been exported to: ../data/processed/immigration_combined_monthly_year_data.csv
