In [1]:
# Imports

import os
import pandas as pd
import numpy as np


from sklearn.linear_model import LinearRegression


## Load Data

In [14]:
# Root data Directory
# Base path
base_root = '../data/raw'

file_paths = {
    'capital': os.path.join(base_root, 'CapitalStockData.csv'),
    'energy': os.path.join(base_root, 'energy_use.csv'),
    'labor_force': os.path.join(base_root, 'labor_force.csv'),
    'patents': os.path.join(base_root, 'patents_res_nonres.csv'),
    'rnd': os.path.join(base_root, 'R&D.csv'),
    'unemployment': os.path.join(base_root, 'unemployed_ilo_estimate.csv'),
    'population': os.path.join(base_root, 'population_Data.csv'),
}

In [15]:
dfs = {name: pd.read_csv(path) for name, path in file_paths.items()}

# for name, df in dfs.items():
#     print(f"DataFrame name: {name}")
#     print(df.head())
#     print("\n")

## Wrangle Data

In [4]:
def reshape_to_long(df, id_vars):
    df = df.replace('..', pd.NA)
    df_long = df.melt(id_vars=id_vars, var_name='year', value_name='value')
    df_long['year'] = df_long['year'].str.extract(r'(\d{4})').astype('Int64')
    df_long['value'] = pd.to_numeric(df_long['value'], errors='coerce')
    return df_long

def impute_group_linear(df, group_cols):
    def interpolate_numeric(g):
        g = g.sort_values('year')
        if 'value' in g.columns:
            g['value'] = g['value'].interpolate(method='linear').ffill().bfill()
        return g

    df = df.groupby(group_cols).apply(interpolate_numeric).reset_index(drop=True)
    df['value'] = df['value'].fillna(0)
    return df

In [5]:
processed_dfs = {}


In [6]:
# 1. Capital
capital_df = dfs['capital'].replace('..', pd.NA)
capital_df = capital_df.rename(columns={
    'countryname': 'country',
    'countrycode': 'country_code',
    'year': 'year'
})
# numeric_cols = capital_df.select_dtypes(include='number').columns
# capital_df[numeric_cols] = capital_df[numeric_cols].interpolate().ffill().bfill()
capital_df.fillna(0, inplace=True)

processed_dfs['capital'] = capital_df

capital_df.describe()

Unnamed: 0,ifscode,year,igov_rppp,kgov_rppp,ipriv_rppp,kpriv_rppp,ippp_rppp,kppp_rppp,GDP_rppp,igov_n,kgov_n,ipriv_n,kpriv_n,kppp_n,GDP_n
count,11640.0,11640.0,11640.0,11640.0,11640.0,11640.0,11640.0,11640.0,11640.0,11640.0,11640.0,11640.0,11640.0,11640.0,11640.0
mean,551.195876,1989.5,15.577749,219.564605,46.797938,485.950773,0.310997,2.554725,282.360481,214675.5,19366590.0,36713.95,11657000.0,13560.18,930898.5
std,259.70684,17.318846,94.454951,1063.875166,209.273722,1884.333244,2.435493,18.426828,1123.474887,22814650.0,2076237000.0,3013077.0,1242035000.0,1354151.0,95536720.0
min,111.0,1960.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,314.0,1974.75,0.0,2.0,0.0,3.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,565.5,1989.5,1.0,14.0,2.0,28.0,0.0,0.0,22.0,0.0,8.0,1.0,14.0,0.0,24.0
75%,733.0,2004.25,6.0,81.0,19.0,196.0,0.0,0.0,138.0,23.0,357.0,64.0,751.25,0.0,521.0
max,968.0,2019.0,3124.0,30187.0,5444.0,35323.0,76.0,462.0,20564.0,2461376000.0,224000000000.0,324779700.0,134000000000.0,146085600.0,10306910000.0


In [7]:
# 2. World Bank-style datasets
wb_datasets = ['energy', 'patents', 'rnd']
id_vars_map = {
    'Series Name': 'series_name',
    'Series Code': 'series_code',
    'Country Name': 'country',
    'countrycode': 'country_code'
}

for name in wb_datasets:
    df = dfs[name].rename(columns=id_vars_map)
    df_long = reshape_to_long(df, list(id_vars_map.values()))
    df_long = impute_group_linear(df_long, ['country', 'series_name'])
    processed_dfs[name] = df_long

In [8]:
# 3. Labor Force enhancement with working-age population
pop_df = dfs['population']
# Filter only the relevant series: ages 15–64
wa_series = pop_df[
    pop_df['Series Name'].isin([
        'Population ages 15-64, female',
        'Population ages 15-64, male'
    ])
]
# Standardize column names
wa_series = wa_series.rename(columns={
    'Country Name': 'country',
    'Country Code': 'country_code',
    'Series Name': 'series_name',
    'Series Code': 'series_code'
})

wa_long = reshape_to_long(wa_series, ['country', 'country_code', 'series_name', 'series_code'])

# Pivot to get female and male in separate columns
wa_pivot = wa_long.pivot_table(
    index=['country', 'country_code', 'year'],
    columns='series_name',
    values='value',
    aggfunc='first'
).reset_index()

# Calculate total working-age population
wa_pivot['working_age_population'] = (
    wa_pivot['Population ages 15-64, female'] +
    wa_pivot['Population ages 15-64, male']
)

working_age_df = wa_pivot[['country', 'country_code', 'year', 'working_age_population']]

# ---------- Labor Force Data ----------

labor_df = dfs['labor_force'].rename(columns={
    'Country Name': 'country',
    'countrycode': 'country_code',
    'Series Name': 'series_name',
    'Series Code': 'series_code'
})

labor_long = reshape_to_long(
    labor_df,
    ['series_name', 'series_code', 'country', 'country_code']
)

# ---------- Merge working-age population with labor force ----------
labor_merged = labor_long.merge(
    working_age_df,
    on=['country', 'country_code', 'year'],
    how='left'
)
labor_merged

# ---------- Train regression model using years 1990–2023 ----------
train_df = labor_merged[(labor_merged['year'] >= 1990) & (labor_merged['value'].notna())]
train_df_clean = train_df.dropna(subset=['working_age_population', 'value'])

X_train = train_df_clean[['working_age_population']]
y_train = train_df_clean['value']

model = LinearRegression()
model.fit(X_train, y_train)

# ---------- Predict labor force for years before 1990 ----------
predict_df = labor_merged[(labor_merged['year'] < 1990) & (labor_merged['working_age_population'].notna())].copy()
predict_df['predicted_labor_force'] = model.predict(predict_df[['working_age_population']])

# Merge predictions back
labor_merged = labor_merged.merge(
    predict_df[['country', 'country_code', 'year', 'predicted_labor_force']],
    on=['country', 'country_code', 'year'],
    how='left'
)

# Fill labor force with prediction if missing
labor_merged['filled_labor_force'] = labor_merged['value'].combine_first(labor_merged['predicted_labor_force'])

# ---------- Final df ----------
final_labor_df = labor_merged[[
    'series_name', 'series_code', 'country', 'country_code', 'year',
    'working_age_population', 'value', 'filled_labor_force'
]].rename(columns={
    'value': 'observed_labor_force'
})

processed_dfs['labor_force'] = final_labor_df

In [16]:
print(processed_dfs['capital'].describe())
processed_dfs['capital']

            ifscode          year     igov_rppp     kgov_rppp    ipriv_rppp   
count  11640.000000  11640.000000  11640.000000  11640.000000  11640.000000  \
mean     551.195876   1989.500000     15.577749    219.564605     46.797938   
std      259.706840     17.318846     94.454951   1063.875166    209.273722   
min      111.000000   1960.000000      0.000000      0.000000      0.000000   
25%      314.000000   1974.750000      0.000000      2.000000      0.000000   
50%      565.500000   1989.500000      1.000000     14.000000      2.000000   
75%      733.000000   2004.250000      6.000000     81.000000     19.000000   
max      968.000000   2019.000000   3124.000000  30187.000000   5444.000000   

         kpriv_rppp     ippp_rppp     kppp_rppp      GDP_rppp        igov_n   
count  11640.000000  11640.000000  11640.000000  11640.000000  1.164000e+04  \
mean     485.950773      0.310997      2.554725    282.360481  2.146755e+05   
std     1884.333244      2.435493     18.426828   1

Unnamed: 0,country_code,ifscode,country,year,igov_rppp,kgov_rppp,ipriv_rppp,kpriv_rppp,ippp_rppp,kppp_rppp,GDP_rppp,igov_n,kgov_n,ipriv_n,kpriv_n,kppp_n,GDP_n,income
0,AFG,512,Afghanistan,1960,3.0,50.0,1.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Income Developing Countries
1,AFG,512,Afghanistan,1961,3.0,52.0,1.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Income Developing Countries
2,AFG,512,Afghanistan,1962,4.0,54.0,1.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Income Developing Countries
3,AFG,512,Afghanistan,1963,4.0,56.0,1.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Income Developing Countries
4,AFG,512,Afghanistan,1964,4.0,59.0,1.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Income Developing Countries
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11635,ZWE,698,Zimbabwe,2015,0.0,0.0,0.0,0.0,0.0,0.0,42.0,0.0,0.0,0.0,0.0,0.0,20.0,Low Income Developing Countries
11636,ZWE,698,Zimbabwe,2016,0.0,0.0,0.0,0.0,0.0,0.0,42.0,0.0,0.0,0.0,0.0,0.0,21.0,Low Income Developing Countries
11637,ZWE,698,Zimbabwe,2017,0.0,0.0,0.0,0.0,0.0,0.0,44.0,0.0,0.0,0.0,0.0,0.0,22.0,Low Income Developing Countries
11638,ZWE,698,Zimbabwe,2018,0.0,0.0,0.0,0.0,0.0,0.0,46.0,0.0,0.0,0.0,0.0,0.0,24.0,Low Income Developing Countries


In [17]:
print(processed_dfs['energy'].describe)
processed_dfs['energy']


<bound method NDFrame.describe of                                         series_name        series_code   
0      Energy use (kg of oil equivalent per capita)  EG.USE.PCAP.KG.OE  \
1      Energy use (kg of oil equivalent per capita)  EG.USE.PCAP.KG.OE   
2      Energy use (kg of oil equivalent per capita)  EG.USE.PCAP.KG.OE   
3      Energy use (kg of oil equivalent per capita)  EG.USE.PCAP.KG.OE   
4      Energy use (kg of oil equivalent per capita)  EG.USE.PCAP.KG.OE   
...                                             ...                ...   
17019  Energy use (kg of oil equivalent per capita)  EG.USE.PCAP.KG.OE   
17020  Energy use (kg of oil equivalent per capita)  EG.USE.PCAP.KG.OE   
17021  Energy use (kg of oil equivalent per capita)  EG.USE.PCAP.KG.OE   
17022  Energy use (kg of oil equivalent per capita)  EG.USE.PCAP.KG.OE   
17023  Energy use (kg of oil equivalent per capita)  EG.USE.PCAP.KG.OE   

           country country_code  year       value  
0      Afghanistan       

Unnamed: 0,series_name,series_code,country,country_code,year,value
0,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Afghanistan,AFG,1960,0.000000
1,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Afghanistan,AFG,1961,0.000000
2,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Afghanistan,AFG,1962,0.000000
3,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Afghanistan,AFG,1963,0.000000
4,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Afghanistan,AFG,1964,0.000000
...,...,...,...,...,...,...
17019,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Zimbabwe,ZWE,2019,805.339134
17020,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Zimbabwe,ZWE,2020,805.339134
17021,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Zimbabwe,ZWE,2021,805.339134
17022,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Zimbabwe,ZWE,2022,805.339134


In [18]:
print(processed_dfs['patents'].describe())
processed_dfs['patents']

            year         value
count    34048.0  3.404800e+04
mean      1991.5  1.089502e+04
std    18.473224  8.472374e+04
min       1960.0  0.000000e+00
25%      1975.75  0.000000e+00
50%       1991.5  9.000000e+00
75%      2007.25  2.500000e+02
max       2023.0  2.386300e+06


Unnamed: 0,series_name,series_code,country,country_code,year,value
0,"Patent applications, nonresidents",IP.PAT.NRES,Afghanistan,AFG,1960,0.0
1,"Patent applications, nonresidents",IP.PAT.NRES,Afghanistan,AFG,1961,0.0
2,"Patent applications, nonresidents",IP.PAT.NRES,Afghanistan,AFG,1962,0.0
3,"Patent applications, nonresidents",IP.PAT.NRES,Afghanistan,AFG,1963,0.0
4,"Patent applications, nonresidents",IP.PAT.NRES,Afghanistan,AFG,1964,0.0
...,...,...,...,...,...,...
34043,"Patent applications, residents",IP.PAT.RESD,Zimbabwe,ZWE,2019,8.0
34044,"Patent applications, residents",IP.PAT.RESD,Zimbabwe,ZWE,2020,8.0
34045,"Patent applications, residents",IP.PAT.RESD,Zimbabwe,ZWE,2021,8.0
34046,"Patent applications, residents",IP.PAT.RESD,Zimbabwe,ZWE,2022,8.0


In [19]:
print(processed_dfs['rnd'].describe())
processed_dfs['rnd']

            year         value
count    17024.0  17024.000000
mean      1991.5    706.468886
std    18.473496   1282.435104
min       1960.0      0.000000
25%      1975.75      0.000000
50%       1991.5     85.800087
75%      2007.25    714.554627
max       2023.0   9081.935547


Unnamed: 0,series_name,series_code,country,country_code,year,value
0,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Afghanistan,AFG,1960,0.000000
1,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Afghanistan,AFG,1961,0.000000
2,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Afghanistan,AFG,1962,0.000000
3,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Afghanistan,AFG,1963,0.000000
4,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Afghanistan,AFG,1964,0.000000
...,...,...,...,...,...,...
17019,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Zimbabwe,ZWE,2019,99.443527
17020,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Zimbabwe,ZWE,2020,99.443527
17021,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Zimbabwe,ZWE,2021,99.443527
17022,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Zimbabwe,ZWE,2022,99.443527


In [20]:
print(processed_dfs['labor_force'].describe())
processed_dfs['labor_force']

            year  working_age_population  observed_labor_force   
count    17024.0            1.667400e+04          7.986000e+03  \
mean      1991.5            1.221623e+08          1.345144e+08   
std    18.473496            4.283611e+08          4.045214e+08   
min       1960.0            1.384000e+03          1.681200e+04   
25%      1975.75            5.314632e+05          1.348892e+06   
50%       1991.5            3.712378e+06          4.801255e+06   
75%      2007.25            2.469562e+07          3.420624e+07   
max       2023.0            5.238188e+09          3.650969e+09   

       filled_labor_force  
count        1.578600e+04  
mean         9.723952e+07  
std          3.209808e+08  
min          1.681200e+04  
25%          9.654877e+05  
50%          3.407617e+06  
75%          2.276696e+07  
max          3.650969e+09  


Unnamed: 0,series_name,series_code,country,country_code,year,working_age_population,observed_labor_force,filled_labor_force
0,"Labor force, total",SL.TLF.TOTL.IN,Afghanistan,AFG,1960,4.990044e+06,,3.992326e+06
1,"Labor force, total",SL.TLF.TOTL.IN,Albania,ALB,1960,8.581070e+05,,1.099570e+06
2,"Labor force, total",SL.TLF.TOTL.IN,Algeria,DZA,1960,5.865620e+06,,4.605314e+06
3,"Labor force, total",SL.TLF.TOTL.IN,American Samoa,ASM,1960,9.703000e+03,,5.056050e+05
4,"Labor force, total",SL.TLF.TOTL.IN,Andorra,AND,1960,6.275000e+03,,5.032050e+05
...,...,...,...,...,...,...,...,...
17019,"Labor force, total",SL.TLF.TOTL.IN,Sub-Saharan Africa,SSF,2023,7.031632e+08,5.005046e+08,5.005046e+08
17020,"Labor force, total",SL.TLF.TOTL.IN,Sub-Saharan Africa (excluding high income),SSA,2023,7.030776e+08,5.005046e+08,5.005046e+08
17021,"Labor force, total",SL.TLF.TOTL.IN,Sub-Saharan Africa (IDA & IBRD countries),TSS,2023,7.031632e+08,5.005046e+08,5.005046e+08
17022,"Labor force, total",SL.TLF.TOTL.IN,Upper middle income,UMC,2023,1.924485e+09,1.404778e+09,1.404778e+09


## EDA