In [24]:
# Imports

import os
import pandas as pd
import numpy as np


from sklearn.linear_model import LinearRegression


## Load Data

In [48]:
# Root data Directory
# Base path
base_root = '../data/raw'

file_paths = {
    'capital': os.path.join(base_root, 'CapitalStockData.csv'),
    'energy': os.path.join(base_root, 'energy_use.csv'),
    'labor_force': os.path.join(base_root, 'labor_force.csv'),
    'patents': os.path.join(base_root, 'patents_res_nonres.csv'),
    'rnd': os.path.join(base_root, 'R&D.csv'),
    'unemployment': os.path.join(base_root, 'unemployed_ilo_estimate.csv'),
    'population': os.path.join(base_root, 'population_Data.csv'),
}

In [49]:
dfs = {name: pd.read_csv(path) for name, path in file_paths.items()}

# for name, df in dfs.items():
#     print(f"DataFrame name: {name}")
#     print(df.head())
#     print("\n")

## Wrangle Data

In [50]:
def reshape_to_long(df, id_vars):
    df = df.replace('..', pd.NA)
    df_long = df.melt(id_vars=id_vars, var_name='year', value_name='value')
    df_long['year'] = df_long['year'].str.extract(r'(\d{4})').astype('Int64')
    df_long['value'] = pd.to_numeric(df_long['value'], errors='coerce')
    return df_long

def impute_group_linear(df, group_cols):
    def interpolate_numeric(g):
        g = g.sort_values('year')
        if 'value' in g.columns:
            g['value'] = g['value'].interpolate(method='linear').ffill().bfill()
        return g

    df = df.groupby(group_cols).apply(interpolate_numeric).reset_index(drop=True)
    #df['value'] = df['value'].fillna(0)
    return df

In [51]:
processed_dfs = {}


In [52]:
# 1. Capital
capital_df = dfs['capital'].replace('..', pd.NA)
capital_df = capital_df.rename(columns={
    'countryname': 'country',
    'countrycode': 'country_code',
    'year': 'year'
})
numeric_cols = capital_df.select_dtypes(include='number').columns
capital_df[numeric_cols] = capital_df[numeric_cols].interpolate().ffill().bfill()
#capital_df.fillna(0, inplace=True)

processed_dfs['capital'] = capital_df

capital_df.describe()

Unnamed: 0,ifscode,year,igov_rppp,kgov_rppp,ipriv_rppp,kpriv_rppp,ippp_rppp,kppp_rppp,GDP_rppp,igov_n,kgov_n,ipriv_n,kpriv_n,kppp_n,GDP_n
count,11640.0,11640.0,11640.0,11640.0,11640.0,11640.0,11640.0,11640.0,11640.0,11640.0,11640.0,11640.0,11640.0,11640.0,11640.0
mean,551.195876,1989.5,16.962758,241.690163,53.714433,560.728351,0.832517,9.107517,312.876718,1272919.0,115603300.0,181762.3,69275290.0,172812.1,5373663.0
std,259.70684,17.318846,94.481525,1066.671149,211.387675,1922.291548,3.476959,34.379212,1128.846605,46638450.0,4244368000.0,6155827.0,2539040000.0,4097627.0,195296600.0
min,111.0,1960.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,314.0,1974.75,0.0,4.0,1.0,8.0,0.0,0.0,8.0,1.0,9.0,2.015957,17.0,0.0,4.0
50%,565.5,1989.5,2.0,21.0,4.0,41.0,0.0,0.32207,36.727273,11.0,159.0,32.0,350.0,4.615385,66.0
75%,733.0,2004.25,8.0,113.0,29.341335,288.098361,0.0,2.844444,197.0,116.4879,1948.273,388.4905,4095.514,108.0667,1004.25
max,968.0,2019.0,3124.0,30187.0,5444.0,35323.0,76.0,462.0,20564.0,2461376000.0,224000000000.0,324779700.0,134000000000.0,146085600.0,10306910000.0


In [53]:
capital_df

Unnamed: 0,country_code,ifscode,country,year,igov_rppp,kgov_rppp,ipriv_rppp,kpriv_rppp,ippp_rppp,kppp_rppp,GDP_rppp,igov_n,kgov_n,ipriv_n,kpriv_n,kppp_n,GDP_n,income
0,AFG,512,Afghanistan,1960,3.0,50.0,1.0,15.0,0.0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Income Developing Countries
1,AFG,512,Afghanistan,1961,3.0,52.0,1.0,15.0,0.0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Income Developing Countries
2,AFG,512,Afghanistan,1962,4.0,54.0,1.0,16.0,0.0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Income Developing Countries
3,AFG,512,Afghanistan,1963,4.0,56.0,1.0,17.0,0.0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Income Developing Countries
4,AFG,512,Afghanistan,1964,4.0,59.0,1.0,17.0,0.0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Income Developing Countries
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11635,ZWE,698,Zimbabwe,2015,0.0,8.0,1.0,29.0,0.0,0.0,42.0,0.0,6.0,0.0,19.0,0.0,20.0,Low Income Developing Countries
11636,ZWE,698,Zimbabwe,2016,0.0,8.0,1.0,29.0,0.0,0.0,42.0,0.0,6.0,0.0,19.0,0.0,21.0,Low Income Developing Countries
11637,ZWE,698,Zimbabwe,2017,0.0,8.0,1.0,29.0,0.0,0.0,44.0,0.0,6.0,0.0,19.0,0.0,22.0,Low Income Developing Countries
11638,ZWE,698,Zimbabwe,2018,0.0,8.0,1.0,29.0,0.0,0.0,46.0,0.0,6.0,0.0,19.0,0.0,24.0,Low Income Developing Countries


In [54]:
# 2. World Bank-style datasets
wb_datasets = ['energy', 'patents', 'rnd']
id_vars_map = {
    'Series Name': 'series_name',
    'Series Code': 'series_code',
    'Country Name': 'country',
    'countrycode': 'country_code'
}

for name in wb_datasets:
    df = dfs[name].rename(columns=id_vars_map)
    df_long = reshape_to_long(df, list(id_vars_map.values()))
    df_long = impute_group_linear(df_long, ['country', 'series_name'])
    processed_dfs[name] = df_long

In [55]:
# Total number of NaN values in the entire DataFrame
total_nan = processed_dfs['rnd'].isna().sum()
print(f"Total number of NaN values: {total_nan}")

Total number of NaN values: series_name        0
series_code        0
country            0
country_code       0
year               0
value           6080
dtype: int64


In [45]:
# 3. Labor Force enhancement with working-age population
pop_df = dfs['population']
# Filter only the relevant series: ages 15–64
wa_series = pop_df[
    pop_df['Series Name'].isin([
        'Population ages 15-64, female',
        'Population ages 15-64, male'
    ])
]
# Standardize column names
wa_series = wa_series.rename(columns={
    'Country Name': 'country',
    'Country Code': 'country_code',
    'Series Name': 'series_name',
    'Series Code': 'series_code'
})

wa_long = reshape_to_long(wa_series, ['country', 'country_code', 'series_name', 'series_code'])

# Pivot to get female and male in separate columns
wa_pivot = wa_long.pivot_table(
    index=['country', 'country_code', 'year'],
    columns='series_name',
    values='value',
    aggfunc='first'
).reset_index()

# Calculate total working-age population
wa_pivot['working_age_population'] = (
    wa_pivot['Population ages 15-64, female'] +
    wa_pivot['Population ages 15-64, male']
)

working_age_df = wa_pivot[['country', 'country_code', 'year', 'working_age_population']]

# ---------- Labor Force Data ----------

labor_df = dfs['labor_force'].rename(columns={
    'Country Name': 'country',
    'countrycode': 'country_code',
    'Series Name': 'series_name',
    'Series Code': 'series_code'
})

labor_long = reshape_to_long(
    labor_df,
    ['series_name', 'series_code', 'country', 'country_code']
)

# ---------- Merge working-age population with labor force ----------
labor_merged = labor_long.merge(
    working_age_df,
    on=['country', 'country_code', 'year'],
    how='left'
)
labor_merged

# ---------- Train regression model using years 1990–2023 ----------
train_df = labor_merged[(labor_merged['year'] >= 1990) & (labor_merged['value'].notna())]
train_df_clean = train_df.dropna(subset=['working_age_population', 'value'])

X_train = train_df_clean[['working_age_population']]
y_train = train_df_clean['value']

model = LinearRegression()
model.fit(X_train, y_train)

# ---------- Predict labor force for years before 1990 ----------
predict_df = labor_merged[(labor_merged['year'] < 1990) & (labor_merged['working_age_population'].notna())].copy()
predict_df['predicted_labor_force'] = model.predict(predict_df[['working_age_population']])

# Merge predictions back
labor_merged = labor_merged.merge(
    predict_df[['country', 'country_code', 'year', 'predicted_labor_force']],
    on=['country', 'country_code', 'year'],
    how='left'
)

# Fill labor force with prediction if missing
labor_merged['filled_labor_force'] = labor_merged['value'].combine_first(labor_merged['predicted_labor_force'])

# ---------- Final df ----------
final_labor_df = labor_merged[[
    'series_name', 'series_code', 'country', 'country_code', 'year',
    'working_age_population', 'value', 'filled_labor_force'
]].rename(columns={
    'value': 'observed_labor_force'
})

processed_dfs['labor_force'] = final_labor_df

In [10]:
print(processed_dfs['capital'].describe())
processed_dfs['capital']

            ifscode          year     igov_rppp     kgov_rppp    ipriv_rppp   
count  11640.000000  11640.000000  11640.000000  11640.000000  11640.000000  \
mean     551.195876   1989.500000     16.962758    241.690163     53.714433   
std      259.706840     17.318846     94.481525   1066.671149    211.387675   
min      111.000000   1960.000000      0.000000      0.000000      0.000000   
25%      314.000000   1974.750000      0.000000      4.000000      1.000000   
50%      565.500000   1989.500000      2.000000     21.000000      4.000000   
75%      733.000000   2004.250000      8.000000    113.000000     29.341335   
max      968.000000   2019.000000   3124.000000  30187.000000   5444.000000   

         kpriv_rppp     ippp_rppp     kppp_rppp      GDP_rppp        igov_n   
count  11640.000000  11640.000000  11640.000000  11640.000000  1.164000e+04  \
mean     560.728351      0.832517      9.107517    312.876718  1.272919e+06   
std     1922.291548      3.476959     34.379212   1

Unnamed: 0,country_code,ifscode,country,year,igov_rppp,kgov_rppp,ipriv_rppp,kpriv_rppp,ippp_rppp,kppp_rppp,GDP_rppp,igov_n,kgov_n,ipriv_n,kpriv_n,kppp_n,GDP_n,income
0,AFG,512,Afghanistan,1960,3.0,50.0,1.0,15.0,0.0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Income Developing Countries
1,AFG,512,Afghanistan,1961,3.0,52.0,1.0,15.0,0.0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Income Developing Countries
2,AFG,512,Afghanistan,1962,4.0,54.0,1.0,16.0,0.0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Income Developing Countries
3,AFG,512,Afghanistan,1963,4.0,56.0,1.0,17.0,0.0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Income Developing Countries
4,AFG,512,Afghanistan,1964,4.0,59.0,1.0,17.0,0.0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Income Developing Countries
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11635,ZWE,698,Zimbabwe,2015,0.0,8.0,1.0,29.0,0.0,0.0,42.0,0.0,6.0,0.0,19.0,0.0,20.0,Low Income Developing Countries
11636,ZWE,698,Zimbabwe,2016,0.0,8.0,1.0,29.0,0.0,0.0,42.0,0.0,6.0,0.0,19.0,0.0,21.0,Low Income Developing Countries
11637,ZWE,698,Zimbabwe,2017,0.0,8.0,1.0,29.0,0.0,0.0,44.0,0.0,6.0,0.0,19.0,0.0,22.0,Low Income Developing Countries
11638,ZWE,698,Zimbabwe,2018,0.0,8.0,1.0,29.0,0.0,0.0,46.0,0.0,6.0,0.0,19.0,0.0,24.0,Low Income Developing Countries


In [11]:
print(processed_dfs['energy'].describe)
processed_dfs['energy']


<bound method NDFrame.describe of                                         series_name        series_code   
0      Energy use (kg of oil equivalent per capita)  EG.USE.PCAP.KG.OE  \
1      Energy use (kg of oil equivalent per capita)  EG.USE.PCAP.KG.OE   
2      Energy use (kg of oil equivalent per capita)  EG.USE.PCAP.KG.OE   
3      Energy use (kg of oil equivalent per capita)  EG.USE.PCAP.KG.OE   
4      Energy use (kg of oil equivalent per capita)  EG.USE.PCAP.KG.OE   
...                                             ...                ...   
17019  Energy use (kg of oil equivalent per capita)  EG.USE.PCAP.KG.OE   
17020  Energy use (kg of oil equivalent per capita)  EG.USE.PCAP.KG.OE   
17021  Energy use (kg of oil equivalent per capita)  EG.USE.PCAP.KG.OE   
17022  Energy use (kg of oil equivalent per capita)  EG.USE.PCAP.KG.OE   
17023  Energy use (kg of oil equivalent per capita)  EG.USE.PCAP.KG.OE   

           country country_code  year       value  
0      Afghanistan       

Unnamed: 0,series_name,series_code,country,country_code,year,value
0,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Afghanistan,AFG,1960,0.000000
1,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Afghanistan,AFG,1961,0.000000
2,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Afghanistan,AFG,1962,0.000000
3,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Afghanistan,AFG,1963,0.000000
4,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Afghanistan,AFG,1964,0.000000
...,...,...,...,...,...,...
17019,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Zimbabwe,ZWE,2019,805.339134
17020,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Zimbabwe,ZWE,2020,805.339134
17021,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Zimbabwe,ZWE,2021,805.339134
17022,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Zimbabwe,ZWE,2022,805.339134


In [12]:
print(processed_dfs['patents'].describe())
processed_dfs['patents']

            year         value
count    34048.0  3.404800e+04
mean      1991.5  1.089502e+04
std    18.473224  8.472374e+04
min       1960.0  0.000000e+00
25%      1975.75  0.000000e+00
50%       1991.5  9.000000e+00
75%      2007.25  2.500000e+02
max       2023.0  2.386300e+06


Unnamed: 0,series_name,series_code,country,country_code,year,value
0,"Patent applications, nonresidents",IP.PAT.NRES,Afghanistan,AFG,1960,0.0
1,"Patent applications, nonresidents",IP.PAT.NRES,Afghanistan,AFG,1961,0.0
2,"Patent applications, nonresidents",IP.PAT.NRES,Afghanistan,AFG,1962,0.0
3,"Patent applications, nonresidents",IP.PAT.NRES,Afghanistan,AFG,1963,0.0
4,"Patent applications, nonresidents",IP.PAT.NRES,Afghanistan,AFG,1964,0.0
...,...,...,...,...,...,...
34043,"Patent applications, residents",IP.PAT.RESD,Zimbabwe,ZWE,2019,8.0
34044,"Patent applications, residents",IP.PAT.RESD,Zimbabwe,ZWE,2020,8.0
34045,"Patent applications, residents",IP.PAT.RESD,Zimbabwe,ZWE,2021,8.0
34046,"Patent applications, residents",IP.PAT.RESD,Zimbabwe,ZWE,2022,8.0


In [13]:
print(processed_dfs['rnd'].describe())
processed_dfs['rnd']

            year         value
count    17024.0  17024.000000
mean      1991.5    706.468886
std    18.473496   1282.435104
min       1960.0      0.000000
25%      1975.75      0.000000
50%       1991.5     85.800087
75%      2007.25    714.554627
max       2023.0   9081.935547


Unnamed: 0,series_name,series_code,country,country_code,year,value
0,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Afghanistan,AFG,1960,0.000000
1,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Afghanistan,AFG,1961,0.000000
2,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Afghanistan,AFG,1962,0.000000
3,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Afghanistan,AFG,1963,0.000000
4,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Afghanistan,AFG,1964,0.000000
...,...,...,...,...,...,...
17019,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Zimbabwe,ZWE,2019,99.443527
17020,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Zimbabwe,ZWE,2020,99.443527
17021,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Zimbabwe,ZWE,2021,99.443527
17022,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Zimbabwe,ZWE,2022,99.443527


In [14]:
print(processed_dfs['labor_force'].describe())
processed_dfs['labor_force']

            year  working_age_population  observed_labor_force   
count    17024.0            1.667400e+04          7.986000e+03  \
mean      1991.5            1.221623e+08          1.345144e+08   
std    18.473496            4.283611e+08          4.045214e+08   
min       1960.0            1.384000e+03          1.681200e+04   
25%      1975.75            5.314632e+05          1.348892e+06   
50%       1991.5            3.712378e+06          4.801255e+06   
75%      2007.25            2.469562e+07          3.420624e+07   
max       2023.0            5.238188e+09          3.650969e+09   

       filled_labor_force  
count        1.578600e+04  
mean         9.723952e+07  
std          3.209808e+08  
min          1.681200e+04  
25%          9.654877e+05  
50%          3.407617e+06  
75%          2.276696e+07  
max          3.650969e+09  


Unnamed: 0,series_name,series_code,country,country_code,year,working_age_population,observed_labor_force,filled_labor_force
0,"Labor force, total",SL.TLF.TOTL.IN,Afghanistan,AFG,1960,4.990044e+06,,3.992326e+06
1,"Labor force, total",SL.TLF.TOTL.IN,Albania,ALB,1960,8.581070e+05,,1.099570e+06
2,"Labor force, total",SL.TLF.TOTL.IN,Algeria,DZA,1960,5.865620e+06,,4.605314e+06
3,"Labor force, total",SL.TLF.TOTL.IN,American Samoa,ASM,1960,9.703000e+03,,5.056050e+05
4,"Labor force, total",SL.TLF.TOTL.IN,Andorra,AND,1960,6.275000e+03,,5.032050e+05
...,...,...,...,...,...,...,...,...
17019,"Labor force, total",SL.TLF.TOTL.IN,Sub-Saharan Africa,SSF,2023,7.031632e+08,5.005046e+08,5.005046e+08
17020,"Labor force, total",SL.TLF.TOTL.IN,Sub-Saharan Africa (excluding high income),SSA,2023,7.030776e+08,5.005046e+08,5.005046e+08
17021,"Labor force, total",SL.TLF.TOTL.IN,Sub-Saharan Africa (IDA & IBRD countries),TSS,2023,7.031632e+08,5.005046e+08,5.005046e+08
17022,"Labor force, total",SL.TLF.TOTL.IN,Upper middle income,UMC,2023,1.924485e+09,1.404778e+09,1.404778e+09


## EDA

In [15]:
# Print column names and first 3 rows for each processed DataFrame
for df_name, df in processed_dfs.items():
    print(f"\n=== {df_name.upper()} ===")
    print("Columns:", df.columns.tolist())
    print("Sample Data:")
    display(df.head(3))  # Using display() for better Jupyter formatting
    print("\n" + "-"*80 + "\n")


=== CAPITAL ===
Columns: ['country_code', 'ifscode', 'country', 'year', 'igov_rppp', 'kgov_rppp', 'ipriv_rppp', 'kpriv_rppp', 'ippp_rppp', 'kppp_rppp', 'GDP_rppp', 'igov_n', 'kgov_n', 'ipriv_n', 'kpriv_n', 'kppp_n', 'GDP_n', 'income']
Sample Data:


Unnamed: 0,country_code,ifscode,country,year,igov_rppp,kgov_rppp,ipriv_rppp,kpriv_rppp,ippp_rppp,kppp_rppp,GDP_rppp,igov_n,kgov_n,ipriv_n,kpriv_n,kppp_n,GDP_n,income
0,AFG,512,Afghanistan,1960,3.0,50.0,1.0,15.0,0.0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Income Developing Countries
1,AFG,512,Afghanistan,1961,3.0,52.0,1.0,15.0,0.0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Income Developing Countries
2,AFG,512,Afghanistan,1962,4.0,54.0,1.0,16.0,0.0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Income Developing Countries



--------------------------------------------------------------------------------


=== ENERGY ===
Columns: ['series_name', 'series_code', 'country', 'country_code', 'year', 'value']
Sample Data:


Unnamed: 0,series_name,series_code,country,country_code,year,value
0,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Afghanistan,AFG,1960,0.0
1,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Afghanistan,AFG,1961,0.0
2,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Afghanistan,AFG,1962,0.0



--------------------------------------------------------------------------------


=== PATENTS ===
Columns: ['series_name', 'series_code', 'country', 'country_code', 'year', 'value']
Sample Data:


Unnamed: 0,series_name,series_code,country,country_code,year,value
0,"Patent applications, nonresidents",IP.PAT.NRES,Afghanistan,AFG,1960,0.0
1,"Patent applications, nonresidents",IP.PAT.NRES,Afghanistan,AFG,1961,0.0
2,"Patent applications, nonresidents",IP.PAT.NRES,Afghanistan,AFG,1962,0.0



--------------------------------------------------------------------------------


=== RND ===
Columns: ['series_name', 'series_code', 'country', 'country_code', 'year', 'value']
Sample Data:


Unnamed: 0,series_name,series_code,country,country_code,year,value
0,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Afghanistan,AFG,1960,0.0
1,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Afghanistan,AFG,1961,0.0
2,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Afghanistan,AFG,1962,0.0



--------------------------------------------------------------------------------


=== LABOR_FORCE ===
Columns: ['series_name', 'series_code', 'country', 'country_code', 'year', 'working_age_population', 'observed_labor_force', 'filled_labor_force']
Sample Data:


Unnamed: 0,series_name,series_code,country,country_code,year,working_age_population,observed_labor_force,filled_labor_force
0,"Labor force, total",SL.TLF.TOTL.IN,Afghanistan,AFG,1960,4990044.0,,3992326.0
1,"Labor force, total",SL.TLF.TOTL.IN,Albania,ALB,1960,858107.0,,1099570.0
2,"Labor force, total",SL.TLF.TOTL.IN,Algeria,DZA,1960,5865620.0,,4605314.0



--------------------------------------------------------------------------------



In [16]:
def analyze_series_categories(df_dict):
    for df_name, df in df_dict.items():
        if 'series_name' in df.columns:
            print(f"\n=== {df_name.upper()} SERIES CATEGORIES ===")
            
            # Show unique categories and counts
            category_counts = df['series_name'].value_counts()
            print(f"Total Categories: {len(category_counts)}")
            print("Category Distribution:")
            print(category_counts)
            
            # Show sample records per category
            for category in category_counts.index:
                print(f"\nCategory: {category}")
                display(df[df['series_name'] == category].head(1))
                print("─" * 50)

analyze_series_categories(processed_dfs)


=== ENERGY SERIES CATEGORIES ===
Total Categories: 1
Category Distribution:
series_name
Energy use (kg of oil equivalent per capita)    17024
Name: count, dtype: int64

Category: Energy use (kg of oil equivalent per capita)


Unnamed: 0,series_name,series_code,country,country_code,year,value
0,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Afghanistan,AFG,1960,0.0


──────────────────────────────────────────────────

=== PATENTS SERIES CATEGORIES ===
Total Categories: 2
Category Distribution:
series_name
Patent applications, nonresidents    17024
Patent applications, residents       17024
Name: count, dtype: int64

Category: Patent applications, nonresidents


Unnamed: 0,series_name,series_code,country,country_code,year,value
0,"Patent applications, nonresidents",IP.PAT.NRES,Afghanistan,AFG,1960,0.0


──────────────────────────────────────────────────

Category: Patent applications, residents


Unnamed: 0,series_name,series_code,country,country_code,year,value
64,"Patent applications, residents",IP.PAT.RESD,Afghanistan,AFG,1960,0.0


──────────────────────────────────────────────────

=== RND SERIES CATEGORIES ===
Total Categories: 1
Category Distribution:
series_name
Researchers in R&D (per million people)    17024
Name: count, dtype: int64

Category: Researchers in R&D (per million people)


Unnamed: 0,series_name,series_code,country,country_code,year,value
0,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Afghanistan,AFG,1960,0.0


──────────────────────────────────────────────────

=== LABOR_FORCE SERIES CATEGORIES ===
Total Categories: 1
Category Distribution:
series_name
Labor force, total    17024
Name: count, dtype: int64

Category: Labor force, total


Unnamed: 0,series_name,series_code,country,country_code,year,working_age_population,observed_labor_force,filled_labor_force
0,"Labor force, total",SL.TLF.TOTL.IN,Afghanistan,AFG,1960,4990044.0,,3992326.0


──────────────────────────────────────────────────


In [17]:
# 1. Split the tables
split_tables = {}

for df_name in ['energy', 'patents', 'rnd', 'labor_force']:
    df = processed_dfs[df_name].copy()
    categories = df['series_name'].unique()
    
    if len(categories) > 1:
        for category in categories:
            clean_name = (category.lower()
                          .replace(' ', '_')
                          .replace(',', '')
                          .replace('(', '')
                          .replace(')', ''))
            split_df = df[df['series_name'] == category].drop(columns=['series_name', 'series_code'])
            split_tables[f"{df_name}_{clean_name}"] = split_df.rename(
                columns={'value': f"{clean_name}_value"}
            )
    else:
        clean_name = (categories[0].lower()
                      .replace(' ', '_')
                      .replace(',', '')
                      .replace('(', '')
                      .replace(')', ''))
        split_tables[f"{df_name}_{clean_name}"] = df.drop(columns=['series_name', 'series_code'])

# 2. Access the split patent DataFrames
non_resident_df = split_tables['patents_patent_applications_nonresidents']
resident_df = split_tables['patents_patent_applications_residents']

# 3. Merge the patent DataFrames
merged_patents = non_resident_df.merge(
    resident_df,
    on=['country', 'country_code', 'year'],
    how='outer',
    validate='one_to_one'
)

# 4. Verify the merge
print("Merged Patents Table:")
display(merged_patents.head(3))

Merged Patents Table:


Unnamed: 0,country,country_code,year,patent_applications_nonresidents_value,patent_applications_residents_value
0,Afghanistan,AFG,1960,0.0,0.0
1,Afghanistan,AFG,1961,0.0,0.0
2,Afghanistan,AFG,1962,0.0,0.0


In [18]:
display(merged_patents.head(3))
display(processed_dfs['labor_force'].head(3))
display(processed_dfs['rnd'].head(3))
display(processed_dfs['energy'].head(3))
display(processed_dfs['capital'].head(3))

Unnamed: 0,country,country_code,year,patent_applications_nonresidents_value,patent_applications_residents_value
0,Afghanistan,AFG,1960,0.0,0.0
1,Afghanistan,AFG,1961,0.0,0.0
2,Afghanistan,AFG,1962,0.0,0.0


Unnamed: 0,series_name,series_code,country,country_code,year,working_age_population,observed_labor_force,filled_labor_force
0,"Labor force, total",SL.TLF.TOTL.IN,Afghanistan,AFG,1960,4990044.0,,3992326.0
1,"Labor force, total",SL.TLF.TOTL.IN,Albania,ALB,1960,858107.0,,1099570.0
2,"Labor force, total",SL.TLF.TOTL.IN,Algeria,DZA,1960,5865620.0,,4605314.0


Unnamed: 0,series_name,series_code,country,country_code,year,value
0,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Afghanistan,AFG,1960,0.0
1,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Afghanistan,AFG,1961,0.0
2,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Afghanistan,AFG,1962,0.0


Unnamed: 0,series_name,series_code,country,country_code,year,value
0,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Afghanistan,AFG,1960,0.0
1,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Afghanistan,AFG,1961,0.0
2,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Afghanistan,AFG,1962,0.0


Unnamed: 0,country_code,ifscode,country,year,igov_rppp,kgov_rppp,ipriv_rppp,kpriv_rppp,ippp_rppp,kppp_rppp,GDP_rppp,igov_n,kgov_n,ipriv_n,kpriv_n,kppp_n,GDP_n,income
0,AFG,512,Afghanistan,1960,3.0,50.0,1.0,15.0,0.0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Income Developing Countries
1,AFG,512,Afghanistan,1961,3.0,52.0,1.0,15.0,0.0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Income Developing Countries
2,AFG,512,Afghanistan,1962,4.0,54.0,1.0,16.0,0.0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,Low Income Developing Countries


In [19]:
print("Existing DataFrames in processed_dfs:")
for key in processed_dfs.keys():
    print(f"- {key}")
    print(f"  Columns: {processed_dfs[key].columns.tolist()}")
    print(f"  Shape: {processed_dfs[key].shape}")

Existing DataFrames in processed_dfs:
- capital
  Columns: ['country_code', 'ifscode', 'country', 'year', 'igov_rppp', 'kgov_rppp', 'ipriv_rppp', 'kpriv_rppp', 'ippp_rppp', 'kppp_rppp', 'GDP_rppp', 'igov_n', 'kgov_n', 'ipriv_n', 'kpriv_n', 'kppp_n', 'GDP_n', 'income']
  Shape: (11640, 18)
- energy
  Columns: ['series_name', 'series_code', 'country', 'country_code', 'year', 'value']
  Shape: (17024, 6)
- patents
  Columns: ['series_name', 'series_code', 'country', 'country_code', 'year', 'value']
  Shape: (34048, 6)
- rnd
  Columns: ['series_name', 'series_code', 'country', 'country_code', 'year', 'value']
  Shape: (17024, 6)
- labor_force
  Columns: ['series_name', 'series_code', 'country', 'country_code', 'year', 'working_age_population', 'observed_labor_force', 'filled_labor_force']
  Shape: (17024, 8)


In [22]:
# Create new merged dataframe from scratch
new_df = (
    processed_dfs['capital'][['country', 'year', 'GDP_rppp', 'kgov_rppp','kpriv_rppp','kppp_rppp']]
    .merge(
        merged_patents,
        on=['country', 'year'],
        how='left'
    )
    .merge(
        processed_dfs['energy'][['country', 'year', 'value']].rename(columns={'value': 'energy_use'}),
        on=['country', 'year'],
        how='left'
    )
    .merge(
        processed_dfs['labor_force'][['country', 'year', 'filled_labor_force']],
        on=['country', 'year'],
        how='left'
    )
    .merge(
        processed_dfs['rnd'][['country', 'year', 'value']].rename(columns={'value': 'rnd_researchers'}),
        on=['country','year'],
        how='left'
    )
)

# Clean column names
new_df.columns = [col.lower().replace(' ', '_') for col in new_df.columns]

# Verify the new dataframe
print("New DataFrame Structure:")
print(f"Rows: {len(new_df)}, Columns: {new_df.columns.tolist()}")
display(new_df.head(3))

# Verify original processed_dfs remains unchanged
print("\nOriginal processed_dfs Keys:")
print(list(processed_dfs.keys()))

New DataFrame Structure:
Rows: 11640, Columns: ['country', 'year', 'gdp_rppp', 'kgov_rppp', 'kpriv_rppp', 'kppp_rppp', 'country_code', 'patent_applications_nonresidents_value', 'patent_applications_residents_value', 'energy_use', 'filled_labor_force', 'rnd_researchers']


Unnamed: 0,country,year,gdp_rppp,kgov_rppp,kpriv_rppp,kppp_rppp,country_code,patent_applications_nonresidents_value,patent_applications_residents_value,energy_use,filled_labor_force,rnd_researchers
0,Afghanistan,1960,39.0,50.0,15.0,0.0,AFG,0.0,0.0,0.0,3992326.0,0.0
1,Afghanistan,1961,39.0,52.0,15.0,0.0,AFG,0.0,0.0,0.0,4057179.0,0.0
2,Afghanistan,1962,39.0,54.0,16.0,0.0,AFG,0.0,0.0,0.0,4126221.0,0.0



Original processed_dfs Keys:
['capital', 'energy', 'patents', 'rnd', 'labor_force']


In [23]:
# 1. Total number of rows
print(f"Total Rows: {len(new_df)}")

# 2. Missing values analysis
missing_values = new_df.isnull().sum()
missing_percent = (missing_values / len(new_df)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing_values,
    'Missing %': missing_percent
}).sort_values(by='Missing Count', ascending=False)

print("\nMissing Values Analysis:")
display(missing_df[missing_df['Missing Count'] > 0])

# 3. Zero value analysis (for numerical columns)
numeric_cols = new_df.select_dtypes(include=[np.number]).columns
zero_counts = (new_df[numeric_cols] == 0).sum()
zero_percent = (zero_counts / len(new_df)) * 100
zero_df = pd.DataFrame({
    'Zero Count': zero_counts,
    'Zero %': zero_percent
}).sort_values(by='Zero Count', ascending=False)

print("\nZero Values Analysis:")
display(zero_df[zero_df['Zero Count'] > 0])

Total Rows: 11640

Missing Values Analysis:


Unnamed: 0,Missing Count,Missing %
filled_labor_force,1590,13.659794
country_code,1200,10.309278
patent_applications_nonresidents_value,1200,10.309278
patent_applications_residents_value,1200,10.309278
energy_use,1200,10.309278
rnd_researchers,1200,10.309278



Zero Values Analysis:


Unnamed: 0,Zero Count,Zero %
kppp_rppp,5220,44.845361
rnd_researchers,3240,27.835052
patent_applications_residents_value,2220,19.072165
patent_applications_nonresidents_value,1680,14.43299
energy_use,1320,11.340206
kgov_rppp,859,7.379725
kpriv_rppp,695,5.97079
gdp_rppp,571,4.905498
