In [1]:
import pandas as pd
import os

In [2]:
# CSV files to combine. Order matters downstream: the first entry seeds the
# merge and the remaining ones are joined onto it in the order listed here.
files = [
    'gdp-worldbank-constant-usd.csv',
    'foreign-direct-investment-net-inflows-as-share-of-gdp.csv',
    'foreign-direct-investment-net-outflows-as-share-of-gdp.csv',
    'imports-of-goods-and-services-constant-2010-us.csv',
    'population-with-un-projections.csv',
    'trade-as-share-of-gdp.csv',
    'exports-of-goods-and-services-constant-2010-us.csv',
]

# Every file is looked up inside the local 'datasets' directory.
paths = list(map(lambda csv_name: os.path.join('datasets', csv_name), files))

In [8]:
# Load every source CSV into a list of dataframes (same order as `paths`).
dfs = []

for path in paths:

    filename = os.path.basename(path)
    df = pd.read_csv(path)

    df['Year'] = df['Year'].astype(int)

    # The population dataset includes estimates/projections starting from 2024
    # and is the only one containing years before 1960, so it is trimmed to
    # the 1960-2023 window (both bounds inclusive).
    if 'population-with-un-projections.csv' in filename:
        mask = (df['Year'] >= 1960) & (df['Year'] <= 2023)
        df = df[mask]

    dfs.append(df)

# Prepare the frames for merging: drop the 'Entity' column (country names are
# re-attached after the merge) and drop rows whose 'Code' is NaN.
# drop()/dropna() return new frames, so the frames stored in `dfs` are left
# untouched without needing an explicit copy.
dfs_for_merge = [
    df.drop(columns=['Entity']).dropna(subset=['Code'])
    for df in dfs
]

# Build the Code -> Entity lookup used to add the country names back later.
# 'Year' is carried along so the name kept for each code really is the most
# recent one, not merely the one that happens to come last in file order.
code_name_map_list = []
for df in dfs:
    if 'Code' in df.columns and 'Entity' in df.columns:
        code_name_map_list.append(df[['Code', 'Entity', 'Year']].dropna())

country_code_map = (
    pd.concat(code_name_map_list)
    # mergesort is stable: rows sharing a year keep their concatenation order,
    # so on ties the later file still wins.
    .sort_values('Year', kind='mergesort')
    .drop_duplicates(subset='Code', keep='last')
    # 'Year' must not leak into the lookup, otherwise the later merge on
    # 'Code' would produce duplicated Year columns.
    .loc[:, ['Code', 'Entity']]
)

    

In [9]:
# Outer-join all prepared frames on (Code, Year), starting from the first one,
# so a country-year present in any dataset is kept.
final_df = dfs_for_merge[0]

for next_frame in dfs_for_merge[1:]:
    final_df = final_df.merge(next_frame, how='outer', on=['Code', 'Year'])

# Re-attach the country name and move it to the first column.
final_df = final_df.merge(country_code_map, how='left', on='Code')
cols = ['Entity', *final_df.columns.drop('Entity')]
final_df = final_df[cols]

# Order rows by country and year, then renumber the index from zero.
final_df = final_df.sort_values(by=['Entity', 'Year']).reset_index(drop=True)


In [10]:
# Show the merged dataframe as this cell's output.
final_df

Unnamed: 0,Entity,Code,Year,GDP (constant 2015 US$),"Foreign direct investment, net inflows (% of GDP)","Foreign direct investment, net outflows (% of GDP)",Imports of goods and services (constant 2015 US$),Population - Sex: all - Age: all - Variant: estimates,Population - Sex: all - Age: all - Variant: medium,Trade (% of GDP),Exports of goods and services (constant 2015 US$)
0,Afghanistan,AFG,1960,,,,,9035048.0,,,
1,Afghanistan,AFG,1961,,,,,9214082.0,,,
2,Afghanistan,AFG,1962,,,,,9404410.0,,,
3,Afghanistan,AFG,1963,,,,,9604491.0,,,
4,Afghanistan,AFG,1964,,,,,9814317.0,,,
...,...,...,...,...,...,...,...,...,...,...,...
15427,Zimbabwe,ZWE,2020,1.910105e+10,0.559613,-0.013026,1.948442e+09,15526887.0,,47.313380,1.736087e+09
15428,Zimbabwe,ZWE,2021,2.071853e+10,0.871791,0.009178,3.146538e+09,15797220.0,,50.847122,2.552880e+09
15429,Zimbabwe,ZWE,2022,2.199048e+10,1.027034,0.177800,4.846439e+09,16069061.0,,64.763610,3.661619e+09
15430,Zimbabwe,ZWE,2023,2.316406e+10,1.583455,0.086571,4.311510e+09,16340829.0,,50.794964,3.355619e+09


In [None]:
# Removing a redundant column that the population dataset had.
# errors='ignore' keeps this cell idempotent: it reassigns `final_df`, so
# running it a second time would otherwise raise KeyError because the column
# is already gone.
final_df = final_df.drop(
    columns=['Population - Sex: all - Age: all - Variant: medium'],
    errors='ignore',
)

In [None]:
# NOTE: this rename step is disabled on purpose. The code below is wrapped in a
# string literal, so nothing is renamed when the cell runs.
# It would not work as written: the mapping still expects separate 'gdp',
# 'gdp_growth' and 'gdp_ppp' columns, but only one GDP column is used now.
# Update the mapping to the current column names before re-enabling it.
# renaming columns
'''
new_names = {
    'Entity': 'country',
    'Code': 'code',
    'Year': 'year',
    'gdp': 'gdp', # USD $
    'gdp_growth': 'gdp_growth', # percentage %
    'gdp_ppp': 'gdp_ppp', # international dollars $
    'Foreign direct investment, net inflows (% of GDP)': 'fdi_inflows', # % of GDP
    'Foreign direct investment, net outflows (% of GDP)': 'fdi_outflows', # % of GDP
    'Imports of goods and services (constant 2015 US$)': 'imports', # contstant 2015 USD $
    'Population - Sex: all - Age: all - Variant: estimates': 'population', # count
    'Trade (% of GDP)': 'trade', # % of GDP
    'Exports of goods and services (constant 2015 US$)': 'exports' # contstant 2015 USD $
}

final_df = final_df.rename(columns=new_names)
'''

In [None]:
# Reordering columns.
# The rename step is disabled, so the frame still carries the original dataset
# headers. The previous list used the short names ('country', 'gdp_growth',
# 'gdp_ppp', ...) which do not exist and made this cell fail with KeyError.
# Preferred layout: identifiers, GDP, FDI, imports/exports, trade, population.
preferred_order = [
    'Entity',
    'Code',
    'Year',
    'GDP (constant 2015 US$)',
    'Foreign direct investment, net inflows (% of GDP)',
    'Foreign direct investment, net outflows (% of GDP)',
    'Imports of goods and services (constant 2015 US$)',
    'Exports of goods and services (constant 2015 US$)',
    'Trade (% of GDP)',
    'Population - Sex: all - Age: all - Variant: estimates',
]

# Use only the preferred columns that are actually present, then append any
# column not listed above so that reordering never silently drops data.
new_order = [col for col in preferred_order if col in final_df.columns]
new_order += [col for col in final_df.columns if col not in new_order]

final_df = final_df[new_order]

In [None]:
# Quick sanity check: print the (rows, columns) dimensions, then preview the
# first five rows as the cell output.
n_rows, n_cols = final_df.shape
print((n_rows, n_cols))
final_df.head(n=5)

In [None]:
# Number of missing values in each column.
final_df.isnull().sum(axis=0)

In [None]:
# Count the distinct non-missing values found in each column.
unique_counts = final_df.nunique(axis=0, dropna=True)
unique_counts

In [None]:
# Missing values per column for each year.
# isna() is applied once to the whole frame and the boolean flags are summed
# per year, which gives the same counts as a per-group Python lambda.
missing_values_by_year = (
    final_df.drop(columns=['Year'])
    .isna()
    .groupby(final_df['Year'])
    .sum()
)

# Show every year, but lift the row limit for this display only. Setting the
# option globally would make every later (or re-run) cell dump full frames.
with pd.option_context('display.max_rows', None):
    display(missing_values_by_year)

In [None]:
# Write the merged dataset to a CSV file; the row index is not written.
output_file = 'globalization.csv'
final_df.to_csv(path_or_buf=output_file, index=False)