In [49]:
import pandas as pd
import numpy as np
import glob
import os

In [50]:
# Read the identifier/label columns as strings. The columns are addressed by
# name rather than by position so the spec keeps applying to the right columns
# even if the CSV's column order changes.
dtype_spec = {'ISO3_code': str, 'ISO2_code': str, 'LocTypeName': str}

# Load UN population data.
# NOTE(review): with pandas' default NA handling the literal text "NA" is parsed
# as missing even in a str column, so an ISO2 code of "NA" (Namibia) would be
# read as NaN -- confirm whether that matters for this dataset.
population_df = pd.read_csv('modified_UN_population_data_1950_to_2100.csv', dtype=dtype_spec)

In [51]:
# Display the dtype of every column in the loaded population table
# (sanity check that the dtype spec passed to read_csv was applied).
population_df.dtypes

SortOrder      float64
LocID            int64
ISO3_code       object
ISO2_code       object
LocTypeID      float64
LocTypeName     object
ParentID       float64
Location        object
VarID            int64
Variant         object
Time             int64
MidPeriod      float64
PopTotal       float64
dtype: object

In [52]:
# Create a mapping from ISO3_code to the standardized country name (Location).
# Rows without an ISO3_code are dropped first: otherwise the dict would contain
# a NaN key, which Series.map could match against rows whose ISO3_code is
# missing and label them with an unrelated Location.
# If one ISO3_code appears with several Location spellings, the last one wins.
mapping = (
    population_df[['ISO3_code', 'Location']]
    .dropna(subset=['ISO3_code'])
    .drop_duplicates()
    .set_index('ISO3_code')['Location']
    .to_dict()
)

In [53]:
# Merge every historical CSV into the UN population table and export the result.
#
# NOTE: this cell reassigns `population_df`, so running it twice without
# re-running the load cell appends the historical rows a second time.

# Path where modified historical CSV files are located
input_path = './historicalPopulationFiles/renamed/'

# Directory to save the merged file
output_path = './finalDataset/'

# Create output directory if it doesn't exist
os.makedirs(output_path, exist_ok=True)

# Get all CSV files in the directory (adjust the pattern if needed).
# glob returns paths in arbitrary order; sorting makes the row order of the
# merged file reproducible between runs and machines.
csv_files = sorted(glob.glob(os.path.join(input_path, '*.csv')))

# Collect the prepared frames and concatenate once after the loop; calling
# pd.concat inside the loop copies the whole accumulated table on every file.
historical_frames = []

for file in csv_files:
    # Read the CSV file into a DataFrame
    historical_df = pd.read_csv(file)

    # Cast PopTotal to float and express it in thousands
    # (presumably to match the unit of the UN table -- TODO confirm).
    historical_df['PopTotal'] = historical_df['PopTotal'].astype(float) / 1000

    # Replace the country names with the standardized UN names, looked up by
    # ISO3_code. Codes absent from `mapping` end up with a NaN Location.
    historical_df['Location'] = historical_df['ISO3_code'].map(mapping)

    # Tag the historical rows with the same Variant / LocTypeID values used
    # for the rows they are merged with (4.0 is presumably the country-level
    # location type -- TODO confirm against the UN table).
    historical_df['Variant'] = 'Medium'
    historical_df['LocTypeID'] = 4.0

    historical_frames.append(historical_df)

# Merge: UN table first, then the historical frames in file order
population_df = pd.concat([population_df, *historical_frames], ignore_index=True, sort=False)

# Save the merged DataFrame to a new CSV file
new_file = os.path.join(output_path, "mergedHistoricalAndProjectionData.csv")
population_df.to_csv(new_file, index=False)
