In [158]:
import pandas as pd
import numpy as np
import pycountry
from datetime import datetime

Read and clean RawData

In [159]:
def get_iso3(country_name):
    """Convert a raw country label into an ISO 3166-1 alpha-3 code.

    The label is first normalised with an ordered list of substring
    substitutions (punctuation, "Republic of"-style qualifiers, a few
    source-specific footnotes). Some substitutions deliberately rewrite the
    label straight to an ISO3 code; those are returned as-is. Everything else
    is resolved with ``pycountry.countries.search_fuzzy``.

    Parameters
    ----------
    country_name : str
        Country label as found in the raw data.

    Returns
    -------
    str
        The alpha-3 code when one can be determined; otherwise the cleaned
        label (possibly the empty string) so the caller can inspect it.
    """
    # Applied strictly in this order: later patterns rely on what earlier
    # ones leave behind (e.g. 'Eswatini Kingdom' only appears once ',' and
    # 'of' have been stripped). Double spaces inside patterns are intentional.
    substitutions = (
        ('St. ', 'Saint '),
        ('The', ' '),
        (',', ''),
        ('(', ''),
        (')', ''),
        ("People's", ''),
        ('Republic', 'Rep'),
        ('Rep', ''),
        ('of', ''),
        ('.', ''),
        ('until 1990 former territory  the FRG', ''),
        ('under United Nations Security Council Resolution 1244/99', ''),
        ('PR: Hong Kong', ''),
        ('Federal Dem', ''),
        ('Türkiye', 'Turkey'),
        ('Lesotho Kingdom', 'Lesotho'),
        ('Eswathini Kingdom', 'SWZ'),
        ('Eswatini Kingdom', 'SWZ'),
        ('Bahrain Kingdom', 'Bahrain'),
        ('Tanzania United', 'Tanzania'),
        ('Bahrain United', 'Bahrain'),
        ('Egypt Arab', 'Egypt'),
        ('Mauritania Islamic', 'Mauritania'),
        ('Lao  Dem', 'LAO'),
        ('Bolivariana de', ''),
        ('Netherlands Antilles', 'ANT'),
    )
    # Codes that the table above writes out explicitly.
    hard_coded_iso3 = {'SWZ', 'LAO', 'ANT'}

    for old, new in substitutions:
        country_name = country_name.replace(old, new)
    country_name = country_name.strip()

    # Nothing left to look up.
    if country_name == "":
        return country_name

    # The label was already mapped to a code by the table; do not send it
    # through the fuzzy *name* search, where a short token such as 'ANT' may
    # resolve to an unrelated country.
    if country_name in hard_coded_iso3:
        return country_name

    try:
        country = pycountry.countries.search_fuzzy(country_name)[0]
        return country.alpha_3
    except LookupError:
        print(f'Could not find iso3 for {country_name}')
        return country_name

In [160]:
# Path to the Excel file
excel_file = '../data/RawData/EC_DSA_stochastic_data.xlsx'

# Create a dictionary to store DataFrames, keyed by the formatted sheet name
ec_dfs = {}

# Open the workbook once and read every sheet from that handle; the context
# manager closes the file even if one of the sheets fails to parse.
with pd.ExcelFile(excel_file) as xls:
    # Get the sheet names
    sheet_names = xls.sheet_names

    # Loop through each sheet except the first and save it as a separate DataFrame
    for sheet_name in sheet_names[1:]:
        # Build a lowercase key: spaces and hyphens become underscores,
        # parentheses are dropped
        formatted_sheet_name = sheet_name.replace(' ', '_').replace('(', '').replace(')', '').replace('-', '_').lower()

        # Read the sheet from the already-open workbook
        df = pd.read_excel(xls, sheet_name=sheet_name)

        # Clean country names: drop rows whose label starts with 'Euro', then
        # map the remaining labels with get_iso3. The .copy() makes the
        # filtered frame independent so the column assignment does not raise
        # SettingWithCopyWarning.
        df = df.loc[~df['Country'].str.startswith('Euro')].copy()
        df['Country'] = df['Country'].apply(get_iso3)

        # Set date format: transpose so the former column headers become the
        # index, parse them as dates, keep 1970 onwards, and label by quarter
        df = df.rename(columns = {'Country':'date'}).set_index('date').T
        df.index = pd.to_datetime(df.index)
        df = df.loc[df.index >= pd.to_datetime('1970')]
        df.index = df.index.to_period('Q').strftime('%YQ%q')

        # uniform nan: the placeholders '...' and ':' become NaN
        df = df.replace('...', np.nan).replace(':', np.nan)

        # Store the DataFrame in the dictionary with the formatted sheet name
        ec_dfs[formatted_sheet_name] = df

In [161]:
# For every variable (sheet) record, per country column, the index label of
# the first non-null observation (None when the column has no valid entry).
first_date_dict = {}
for variable, df in ec_dfs.items():
    first_date_dict[variable] = {
        country: df[country].first_valid_index()
        for country in df.columns
    }

In [162]:
# Tabulate the first valid dates (one column per variable, one row per
# country) and export them to Excel.
first_dates = pd.DataFrame(first_date_dict)
first_dates.to_excel('../output/first_dates.xlsx')

In [166]:
# Write every cleaned DataFrame to its own sheet of a single workbook.
# The context manager saves and closes the workbook on exit, including when
# one of the to_excel calls raises.
with pd.ExcelWriter('../data/InputData/EC_DSA_clean.xlsx') as writer:
    # Iterate over the dictionary items
    for key, df in ec_dfs.items():
        # Sheet name is the dictionary key; the index is kept
        df.to_excel(writer, sheet_name=key, index=True)