Advanced ALITE‐Style Integrator
--------------------------------
Clusters columns across heterogeneous tables by both name and content,
then produces the full‐disjunction (n‐way outer join) with meaningful
representative names and diagnostic outputs.


In [1]:
import os
import re
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import (
    train_test_split,
    TimeSeriesSplit,
    GroupKFold,
    cross_val_score,
     RandomizedSearchCV
)
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import partial_dependence, PartialDependenceDisplay
import statsmodels.api as sm
from linearmodels.panel import PanelOLS, RandomEffects, compare
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import shap
from xgboost import XGBRegressor


In [2]:
import os
import re
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
from itertools import combinations
from collections import defaultdict, deque

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import ks_2samp

Let's treat the raw data tables using same technique as in manual merging. 

In [3]:
def process_time_series(
    df: pd.DataFrame,
    id_vars_map: dict,
    year_col: str = 'year',
    cutoff_year: int = 2019,
    drop_id_na_all: bool = True,
    value_name: str = 'value'
):
    df2 = df.rename(columns=id_vars_map).copy()
    yr_pat = re.compile(r'^(\d{4}) \[YR\d{4}\]$')
    strip  = {c: yr_pat.match(c).group(1)
              for c in df2.columns if yr_pat.match(c)}
    if strip:
        df2 = df2.rename(columns=strip)

    id_vars = list(id_vars_map.values())

    if year_col in df2.columns:
        ivy        = id_vars + [year_col]
        value_vars = [c for c in df2.columns if c not in ivy]
        df_long    = df2.melt(
            id_vars    = ivy,
            value_vars = value_vars,
            var_name   = 'series',
            value_name = value_name
        )
        df_long[year_col] = df_long[year_col].astype(int)
    else:
        year_cols = [c for c in df2.columns if c not in id_vars]
        df_long   = df2.melt(
            id_vars    = id_vars,
            value_vars = year_cols,
            var_name   = year_col,
            value_name = value_name
        )
        df_long[year_col] = df_long[year_col].astype(int)

    df_long[value_name] = pd.to_numeric(df_long[value_name], errors='coerce')
    df_long            = df_long[df_long[year_col] <= cutoff_year]

    if drop_id_na_all:
        df_long = df_long.dropna(subset=id_vars + [year_col], how='all')

    df_long['_missing_before'] = df_long[value_name].isna()

    if 'series' in df_long:
        group_cols = id_vars + ['series']
        metric_col = 'series'
    else:
        group_cols = id_vars
        metric_col = id_vars[0]
    df_long = df_long.sort_values(group_cols + [year_col])
    df_long[value_name] = (
        df_long
        .groupby(group_cols)[value_name]
        .transform(lambda s: s.interpolate('linear').ffill().bfill())
    )

    df_long['_imputed']       = df_long['_missing_before'] & df_long[value_name].notna()
    df_long['_missing_after'] = df_long[value_name].isna()

    summary = (
        df_long
        .groupby(metric_col)
        .agg(
            total           =(value_name,      'size'),
            missing_before  =('_missing_before','sum'),
            imputed         =('_imputed',      'sum'),
            missing_after   =('_missing_after','sum'),
        )
        .assign(
            missing_before_pct=lambda d: d['missing_before']/d['total']*100,
            imputed_pct       =lambda d: d['imputed']/d['total']*100,
            missing_after_pct =lambda d: d['missing_after']/d['total']*100,
        )
        .reset_index()
    )
    return df_long, summary


In [4]:
base_root = '../data/raw'
file_paths = {
    'capital': os.path.join(base_root, 'CapitalStockData.csv'),
    'energy': os.path.join(base_root, 'energy_use.csv'),
    'labor_force': os.path.join(base_root, 'labor_force.csv'),
    'patents': os.path.join(base_root, 'patents_res_nonres.csv'),
    'rnd': os.path.join(base_root, 'R&D.csv'),
    'unemployment': os.path.join(base_root, 'unemployed_ilo_estimate.csv'),
    'population': os.path.join(base_root, 'population_Data.csv'),
    'human_capital': os.path.join(base_root, 'Human_Capital_Data.csv'),
    'penn_table': os.path.join(base_root, 'penn_table.csv'),
}
for name, path in file_paths.items():
    if not os.path.exists(path):
        raise FileNotFoundError(f"Data file not found: {path}")
        
dfs = {}
for name, path in file_paths.items():
    try:
        dfs[name] = pd.read_csv(path)
        print(f"Loaded {name}_df: {dfs[name].shape[0]} rows, {dfs[name].shape[1]} columns")
    except Exception as e:
        raise ValueError(f"Failed to load {path}: {str(e)}")
for name, df in dfs.items():
    print(f"\n{name}_df (first 2 rows):")
    display(df.head(2))

Loaded capital_df: 11640 rows, 18 columns
Loaded energy_df: 266 rows, 68 columns
Loaded labor_force_df: 266 rows, 68 columns
Loaded patents_df: 532 rows, 68 columns
Loaded rnd_df: 266 rows, 68 columns
Loaded unemployment_df: 266 rows, 68 columns
Loaded population_df: 6123 rows, 95 columns
Loaded human_capital_df: 803 rows, 68 columns
Loaded penn_table_df: 12810 rows, 52 columns

capital_df (first 2 rows):


Unnamed: 0,countrycode,ifscode,countryname,year,igov_rppp,kgov_rppp,ipriv_rppp,kpriv_rppp,ippp_rppp,kppp_rppp,GDP_rppp,igov_n,kgov_n,ipriv_n,kpriv_n,kppp_n,GDP_n,income
0,AFG,512,Afghanistan,1960,3.0,50.0,1.0,15.0,,,,,,,,,,Low Income Developing Countries
1,AFG,512,Afghanistan,1961,3.0,52.0,1.0,15.0,,,,,,,,,,Low Income Developing Countries



energy_df (first 2 rows):


Unnamed: 0,Series Name,Series Code,Country Name,countrycode,1960 [YR1960],1961 [YR1961],1962 [YR1962],1963 [YR1963],1964 [YR1964],1965 [YR1965],...,2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021],2022 [YR2022],2023 [YR2023]
0,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Afghanistan,AFG,..,..,..,..,..,..,...,..,..,..,..,..,..,..,..,..,..
1,Energy use (kg of oil equivalent per capita),EG.USE.PCAP.KG.OE,Albania,ALB,..,..,..,..,..,..,...,808.4558396,..,..,..,..,..,..,..,..,..



labor_force_df (first 2 rows):


Unnamed: 0,Series Name,Series Code,Country Name,countrycode,1960 [YR1960],1961 [YR1961],1962 [YR1962],1963 [YR1963],1964 [YR1964],1965 [YR1965],...,2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021],2022 [YR2022],2023 [YR2023]
0,"Labor force, total",SL.TLF.TOTL.IN,Afghanistan,AFG,..,..,..,..,..,..,...,8229616,8575264,8883595,9223903,9215855,9194531,9060510,9183905,8617545,8866507
1,"Labor force, total",SL.TLF.TOTL.IN,Albania,ALB,..,..,..,..,..,..,...,1244156,1295052,1341701,1363232,1393620,1415825,1352399,1367734,1389262,1376643



patents_df (first 2 rows):


Unnamed: 0,Series Name,Series Code,Country Name,countrycode,1960 [YR1960],1961 [YR1961],1962 [YR1962],1963 [YR1963],1964 [YR1964],1965 [YR1965],...,2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021],2022 [YR2022],2023 [YR2023]
0,"Patent applications, nonresidents",IP.PAT.NRES,Afghanistan,AFG,..,..,..,..,..,..,...,..,..,..,..,..,..,..,..,..,..
1,"Patent applications, nonresidents",IP.PAT.NRES,Albania,ALB,..,..,..,..,..,..,...,3,5,5,8,3,1,..,1,..,..



rnd_df (first 2 rows):


Unnamed: 0,Series Name,Series Code,Country Name,countrycode,1960 [YR1960],1961 [YR1961],1962 [YR1962],1963 [YR1963],1964 [YR1964],1965 [YR1965],...,2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021],2022 [YR2022],2023 [YR2023]
0,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Afghanistan,AFG,..,..,..,..,..,..,...,..,..,..,..,..,..,..,..,..,..
1,Researchers in R&D (per million people),SP.POP.SCIE.RD.P6,Albania,ALB,..,..,..,..,..,..,...,..,..,..,..,..,..,..,..,..,..



unemployment_df (first 2 rows):


Unnamed: 0,Series Name,Series Code,Country Name,countrycode,1960 [YR1960],1961 [YR1961],1962 [YR1962],1963 [YR1963],1964 [YR1964],1965 [YR1965],...,2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021],2022 [YR2022],2023 [YR2023]
0,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,Afghanistan,AFG,..,..,..,..,..,..,...,7.915,9.052,10.133,11.184,11.196,11.185,11.71,11.994,14.1,13.991
1,"Unemployment, total (% of total labor force) (...",SL.UEM.TOTL.ZS,Albania,ALB,..,..,..,..,..,..,...,18.055,17.193,15.418,13.616,12.304,11.466,11.69,11.474,10.137,10.108



population_df (first 2 rows):


Unnamed: 0,Country Name,Country Code,Series Name,Series Code,1960 [YR1960],1961 [YR1961],1962 [YR1962],1963 [YR1963],1964 [YR1964],1965 [YR1965],...,2041 [YR2041],2042 [YR2042],2043 [YR2043],2044 [YR2044],2045 [YR2045],2046 [YR2046],2047 [YR2047],2048 [YR2048],2049 [YR2049],2050 [YR2050]
0,Afghanistan,AFG,Age dependency ratio (% of working-age populat...,SP.POP.DPND,81.0613894386502,81.2840359519206,81.5068083352119,81.8545805002831,82.3730133850468,83.0962139283287,...,64.8010341971567,63.8742481928,62.9710690099967,62.1049065054236,61.2935028902416,60.4908498351356,59.7165081417511,58.985722383206,58.2601597694968,57.5832779071994
1,Afghanistan,AFG,"Age dependency ratio, old",SP.POP.DPND.OL,5.11201905233701,5.09290674973051,5.06917574290293,5.04656900335327,5.02945951585938,5.01989044519751,...,5.4115807557828,5.50825257645877,5.60517537301781,5.70257398087721,5.80326473462997,5.90269830827955,5.99954027587882,6.09781396102577,6.19962014092639,6.31400253014127



human_capital_df (first 2 rows):


Unnamed: 0,Series Name,Series Code,Country Name,countrycode,1960 [YR1960],1961 [YR1961],1962 [YR1962],1963 [YR1963],1964 [YR1964],1965 [YR1965],...,2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017],2018 [YR2018],2019 [YR2019],2020 [YR2020],2021 [YR2021],2022 [YR2022],2023 [YR2023]
0,"Labor force, total",SL.TLF.TOTL.IN,Afghanistan,AFG,..,..,..,..,..,..,...,8229616,8575264,8883595,9223903,9215855,9194531,9060510,9183905,8617545,8866507
1,"Labor force, total",SL.TLF.TOTL.IN,Albania,ALB,..,..,..,..,..,..,...,1244156,1295052,1341701,1363232,1393620,1415825,1352399,1367734,1389262,1376643



penn_table_df (first 2 rows):


Unnamed: 0,countrycode,country,currency_unit,year,rgdpe,rgdpo,pop,emp,avh,hc,...,csh_x,csh_m,csh_r,pl_c,pl_i,pl_g,pl_x,pl_m,pl_n,pl_k
0,ABW,Aruba,Aruban Guilder,1950,,,,,,,...,,,,,,,,,,
1,ABW,Aruba,Aruban Guilder,1951,,,,,,,...,,,,,,,,,,


In [5]:
for name, df in dfs.items():
    if df.empty:
        print(f"\n{name}: Empty DataFrame")
        continue
    print(f"\n{name}:")
    if 'Series Name' in df.columns:
        unique_series = df['Series Name'].dropna().unique().tolist()
        print(f"  Series Names: {unique_series}")
    else:
        skip_cols = {'countrycode', 'ifs', 'currency_unit', 'unnamed'}
        data_columns = [
            col for col in df.columns
            if col.lower() not in skip_cols and not col.lower().startswith('unnamed')
        ]
        print(f"  Columns: {data_columns}")


capital:
  Columns: ['ifscode', 'countryname', 'year', 'igov_rppp', 'kgov_rppp', 'ipriv_rppp', 'kpriv_rppp', 'ippp_rppp', 'kppp_rppp', 'GDP_rppp', 'igov_n', 'kgov_n', 'ipriv_n', 'kpriv_n', 'kppp_n', 'GDP_n', 'income']

energy:
  Series Names: ['Energy use (kg of oil equivalent per capita)']

labor_force:
  Series Names: ['Labor force, total']

patents:
  Series Names: ['Patent applications, nonresidents', 'Patent applications, residents']

rnd:
  Series Names: ['Researchers in R&D (per million people)']

unemployment:
  Series Names: ['Unemployment, total (% of total labor force) (modeled ILO estimate)']

population:
  Series Names: ['Age dependency ratio (% of working-age population)', 'Age dependency ratio, old', 'Age dependency ratio, young', 'Life expectancy at birth, total (years)', 'Life expectancy at birth, male (years)', 'Life expectancy at birth, female (years)', 'Population ages 15-64 (% of total population)', 'Population ages 15-64, female', 'Population ages 15-64, male', '

In [6]:
for name, df in dfs.items():
    if df.empty:
        print(f"\n{name}: Empty DataFrame")
        continue
    print(f"\n{name}:")
    if 'Series Name' in df.columns:
        unique_series = df['Series Name'].dropna().unique().tolist()
        print(f"  Series Names: {unique_series}")
    else:
        skip_cols = {'countrycode', 'ifs', 'currency_unit', 'unnamed'}
        data_columns = [
            col for col in df.columns
            if col.lower() not in skip_cols and not col.lower().startswith('unnamed')
        ]
        print(f"  Columns: {data_columns}")


capital:
  Columns: ['ifscode', 'countryname', 'year', 'igov_rppp', 'kgov_rppp', 'ipriv_rppp', 'kpriv_rppp', 'ippp_rppp', 'kppp_rppp', 'GDP_rppp', 'igov_n', 'kgov_n', 'ipriv_n', 'kpriv_n', 'kppp_n', 'GDP_n', 'income']

energy:
  Series Names: ['Energy use (kg of oil equivalent per capita)']

labor_force:
  Series Names: ['Labor force, total']

patents:
  Series Names: ['Patent applications, nonresidents', 'Patent applications, residents']

rnd:
  Series Names: ['Researchers in R&D (per million people)']

unemployment:
  Series Names: ['Unemployment, total (% of total labor force) (modeled ILO estimate)']

population:
  Series Names: ['Age dependency ratio (% of working-age population)', 'Age dependency ratio, old', 'Age dependency ratio, young', 'Life expectancy at birth, total (years)', 'Life expectancy at birth, male (years)', 'Life expectancy at birth, female (years)', 'Population ages 15-64 (% of total population)', 'Population ages 15-64, female', 'Population ages 15-64, male', '

In [7]:
penn_table_df = dfs['penn_table']
columns = [
    'countrycode', 'country', 'year',
    'rgdpo',      # Output-side Real GDP at PPP
    'emp',        # Employment
    'avh',        # Average hours worked
    'hc',         # Human capital index
    'cn',         # Capital stock (PPP 2017 US$)
    'ck',         # Capital services (USA=1)
    'ctfp'        # TFP at current PPPs (USA=1)
]

penn_df = penn_table_df[columns].copy()
penn_df = penn_df.rename(columns={
    'countrycode': 'country_code',
    'rgdpo':       'real_gdp_ppp_output',
    'emp':         'employment_million',
    'avh':         'avg_hours_per_year',
    'hc':          'human_capital',
    'cn':          'capital_stock_total',
    'ck':          'capital_services_index',
    'ctfp':        'tfp_index'
})

display(penn_df.head())

Unnamed: 0,country_code,country,year,real_gdp_ppp_output,employment_million,avg_hours_per_year,human_capital,capital_stock_total,capital_services_index,tfp_index
0,ABW,Aruba,1950,,,,,,,
1,ABW,Aruba,1951,,,,,,,
2,ABW,Aruba,1952,,,,,,,
3,ABW,Aruba,1953,,,,,,,
4,ABW,Aruba,1954,,,,,,,


In [8]:

numeric_cols = [
    'real_gdp_ppp_output',
    'employment_million',
    'avg_hours_per_year',
    'human_capital',
    'capital_stock_total',
    'capital_services_index',
    'tfp_index'
]

for col in numeric_cols:
    if col in penn_df.columns:
        penn_df[col] = (
            penn_df[col]
            .astype(str)
            .str.replace(',', '')     
            .replace('nan', np.nan)   
            .astype(float)
        )

penn_map = {
    'country_code': 'country_code',
    'country':      'country'
}

penn_long, penn_summary = process_time_series(
    df          = penn_df,
    id_vars_map = penn_map,
    year_col    = 'year',
    cutoff_year = 2019
)

penn_df_processed = (
    penn_long
    .pivot_table(
        index   = list(penn_map.values()) + ['year'],
        columns = 'series',
        values  = 'value'
    )
    .reset_index()
)

print("\nPenn Table Imputation Summary:")
display(penn_summary)



Penn Table Imputation Summary:


Unnamed: 0,series,total,missing_before,imputed,missing_after,missing_before_pct,imputed_pct,missing_after_pct
0,avg_hours_per_year,12810,9318,1338,7980,72.740047,10.444965,62.295082
1,capital_services_index,12810,5720,2500,3220,44.652615,19.516003,25.136612
2,capital_stock_total,12810,2496,2286,210,19.484778,17.845433,1.639344
3,employment_million,12810,3281,3211,70,25.612802,25.066354,0.546448
4,human_capital,12810,4173,1513,2660,32.576112,11.811085,20.765027
5,real_gdp_ppp_output,12810,2411,2411,0,18.821233,18.821233,0.0
6,tfp_index,12810,6403,1853,4550,49.984387,14.465262,35.519126


In [9]:
capital_df = dfs['capital']
capital_df = capital_df.rename(columns={
    'countryname': 'country',
    'countrycode': 'country_code',
    'income':      'income_group'
})
cap_map = {
    'countrycode': 'country_code',   
    'ifscode':     'ifscode',
    'country':     'country',
    'income':      'income_group'
}
cap_long, cap_summary = process_time_series(
    capital_df,
    id_vars_map = cap_map,
    year_col    = 'year',
    cutoff_year = 2019   # we have target variable only up to 2019
)

capital_df_processed = (
    cap_long
    .pivot_table(
        index   = list(cap_map.values()) + ['year'],
        columns = 'series',
        values  = 'value'
    )
    .reset_index()
)

print("\nAfter imputation summary:")
display(cap_summary)



After imputation summary:


Unnamed: 0,series,total,missing_before,imputed,missing_after,missing_before_pct,imputed_pct,missing_after_pct
0,GDP_n,11640,1344,1344,0,11.546392,11.546392,0.0
1,GDP_rppp,11640,1356,1296,60,11.649485,11.134021,0.515464
2,igov_n,11640,3676,2476,1200,31.580756,21.271478,10.309278
3,igov_rppp,11640,1350,90,1260,11.597938,0.773196,10.824742
4,ippp_rppp,11640,6766,3646,3120,58.127148,31.323024,26.804124
5,ipriv_n,11640,3676,2476,1200,31.580756,21.271478,10.309278
6,ipriv_rppp,11640,1350,90,1260,11.597938,0.773196,10.824742
7,kgov_n,11640,3465,2505,960,29.768041,21.520619,8.247423
8,kgov_rppp,11640,1099,79,1020,9.441581,0.678694,8.762887
9,kppp_n,11640,6836,3596,3240,58.728522,30.893471,27.835052


In [10]:
energy_df = dfs['energy']
labor_force_df = dfs['labor_force']
patents_df = dfs['patents']
rnd_df = dfs['rnd']
unemployment_df = dfs['unemployment']
population_df = dfs['population']
human_capital_df = dfs['human_capital']

wb_map = {
    'Series Name':  'series_name',
    'Series Code':  'series_code',
    'Country Name': 'country',
}

wb_dfs = {
    'energy':        energy_df,
    'labor_force':   labor_force_df,
    'patents':       patents_df,
    'rnd':           rnd_df,
    'unemployment':  unemployment_df,
    'population':    population_df,
    'human_capital': human_capital_df,
}

for name, df in wb_dfs.items():
    print(f"{name}_df: {df.shape[0]} rows, {df.shape[1]} columns")

energy_df: 266 rows, 68 columns
labor_force_df: 266 rows, 68 columns
patents_df: 532 rows, 68 columns
rnd_df: 266 rows, 68 columns
unemployment_df: 266 rows, 68 columns
population_df: 6123 rows, 95 columns
human_capital_df: 803 rows, 68 columns


In [11]:
cleaned = {}
for name, df in wb_dfs.items(): 
    df_cleaned = df.drop(columns=[col for col in ['Country Code', 'countrycode'] if col in df.columns])
    cleaned[name] = df_cleaned

processed = {}
summaries = {}
for name, df in cleaned.items():
    long_filled, summary = process_time_series(
        df,
        id_vars_map = wb_map,
        year_col    = 'year',
        cutoff_year = 2019
    )
    processed[name]  = long_filled
    summaries[name]  = summary
id_cols = ['series_name', 'country']

processed_clean = {}

for name, long_df in processed.items():
    initial_len = len(long_df)
    missing_rows = long_df[id_cols].isna().all(axis=1).sum()

    print(f"\n{name.upper()} dataset:")
    print(f"Initial rows: {initial_len}, Missing ID rows: {missing_rows}")

    df_clean = long_df.dropna(subset=id_cols, how='all')
    dropped_rows = initial_len - len(df_clean)
    print(f"Rows after dropping: {len(df_clean)}, Dropped rows: {dropped_rows}")

    processed_clean[name] = df_clean

    wide_df = df_clean.pivot(index=['country', 'year'], columns='series_name', values='value').reset_index()
    wide_df.columns.name = None
    wide_df.columns = wide_df.columns.astype(str)
    print(len(wide_df.columns))
    print(wide_df.columns)

    locals()[f"{name}_df_processed"] = wide_df



ENERGY dataset:
Initial rows: 15960, Missing ID rows: 0
Rows after dropping: 15960, Dropped rows: 0
3
Index(['country', 'year', 'Energy use (kg of oil equivalent per capita)'], dtype='object')

LABOR_FORCE dataset:
Initial rows: 15960, Missing ID rows: 0
Rows after dropping: 15960, Dropped rows: 0
3
Index(['country', 'year', 'Labor force, total'], dtype='object')

PATENTS dataset:
Initial rows: 31920, Missing ID rows: 0
Rows after dropping: 31920, Dropped rows: 0
4
Index(['country', 'year', 'Patent applications, nonresidents',
       'Patent applications, residents'],
      dtype='object')

RND dataset:
Initial rows: 15960, Missing ID rows: 0
Rows after dropping: 15960, Dropped rows: 0
3
Index(['country', 'year', 'Researchers in R&D (per million people)'], dtype='object')

UNEMPLOYMENT dataset:
Initial rows: 15960, Missing ID rows: 0
Rows after dropping: 15960, Dropped rows: 0
3
Index(['country', 'year',
       'Unemployment, total (% of total labor force) (modeled ILO estimate)'],
  

In [12]:
wide_dfs = {
    'capital': capital_df_processed,
    'energy': energy_df_processed,
    'labor_force': labor_force_df_processed,
    'patents': patents_df_processed,
    'rnd': rnd_df_processed,
    'unemployment': unemployment_df_processed,
    'population': population_df_processed,
    'human_capital': human_capital_df_processed,
    'penn': penn_df_processed,
}

for df in wide_dfs.values():
    print(len(df.columns))
    print(df.columns)
tables = list(wide_dfs.values())

18
Index(['country_code', 'ifscode', 'country', 'income_group', 'year', 'GDP_n',
       'GDP_rppp', 'igov_n', 'igov_rppp', 'ippp_rppp', 'ipriv_n', 'ipriv_rppp',
       'kgov_n', 'kgov_rppp', 'kppp_n', 'kppp_rppp', 'kpriv_n', 'kpriv_rppp'],
      dtype='object', name='series')
3
Index(['country', 'year', 'Energy use (kg of oil equivalent per capita)'], dtype='object')
3
Index(['country', 'year', 'Labor force, total'], dtype='object')
4
Index(['country', 'year', 'Patent applications, nonresidents',
       'Patent applications, residents'],
      dtype='object')
3
Index(['country', 'year', 'Researchers in R&D (per million people)'], dtype='object')
3
Index(['country', 'year',
       'Unemployment, total (% of total labor force) (modeled ILO estimate)'],
      dtype='object')
26
Index(['country', 'year', 'nan',
       'Age dependency ratio (% of working-age population)',
       'Age dependency ratio, old', 'Age dependency ratio, young',
       'Life expectancy at birth, female (years)',
  

In [13]:
class AdvancedALITE:
    def __init__(
        self,
        tables: list[pd.DataFrame],
        name_weight: float = 0.5,
        text_weight: float = 0.3,
        numeric_weight: float = 0.2,
        overall_thresh: float = 0.5,
        max_content_samples: int = 200
    ):
        """
        tables            : list of DataFrames to integrate
        name_weight       : weight for name‐based similarity
        text_weight       : weight for text content similarity
        numeric_weight    : weight for numeric distribution similarity
        overall_thresh    : threshold on weighted similarity to cluster
        max_content_samples: max unique values per column for TF-IDF
        """
        self.tables = tables
        self.n_w = name_weight
        self.t_w = text_weight
        self.nm_w = numeric_weight
        self.thresh = overall_thresh
        self.max_samples = max_content_samples

        # will be filled in .integrate()
        self.keys = []       # list of (table_idx, col_name)
        self.col_types = {}  # mapping from key to 'numeric'|'text'|...
        self.sim = {}        # mapping pair→(name_sim, text_sim, num_sim, combined)
        self.clusters = {}   # mapping key→cluster_id
        self.rep_name = {}   # mapping cluster_id→representative column name

    def classify_column(self, series: pd.Series) -> str:
        """Classify column as 'numeric' or 'text'."""
        if pd.api.types.is_numeric_dtype(series):
            return 'numeric'
        else:
            # treat everything else as text for simplicity
            return 'text'

    def compute_name_sim(self, a: str, b: str) -> float:
        return SequenceMatcher(None, a.lower(), b.lower()).ratio()

    def compute_text_sims(self):
        """Compute TF-IDF + cosine for all text columns."""
        docs, keys = [], []
        for idx, tbl in enumerate(self.tables):
            for col in tbl.columns:
                key = (idx, col)
                if self.col_types[key] == 'text':
                    keys.append(key)
                    vals = tbl[col].dropna().astype(str).unique()
                    if len(vals) > self.max_samples:
                        vals = np.random.choice(vals, self.max_samples, replace=False)
                    docs.append(" ".join(vals))

        if not docs:
            return {}

        tfidf = TfidfVectorizer().fit_transform(docs)
        sim_mat = cosine_similarity(tfidf)
        text_sim = {}
        for i, k1 in enumerate(keys):
            for j, k2 in enumerate(keys):
                if i < j:
                    text_sim[(k1, k2)] = sim_mat[i,j]
        return text_sim

    def compute_numeric_sims(self):
        """Compute 1 - KS statistic for all numeric columns."""
        num_keys = [(i,c) for i,t in enumerate(self.tables) for c in t.columns
                    if self.col_types[(i,c)] == 'numeric']
        num_sim = {}
        for (k1, k2) in combinations(num_keys, 2):
            s1 = self.tables[k1[0]][k1[1]].dropna()
            s2 = self.tables[k2[0]][k2[1]].dropna()
            if len(s1)>1 and len(s2)>1:
                stat = ks_2samp(s1, s2).statistic
                num_sim[(k1,k2)] = 1 - stat
        return num_sim

    def build_similarity(self):
        """Compute name/text/num sims and combined score for every pair."""
        # keys and types
        for tidx, tbl in enumerate(self.tables):
            for col in tbl.columns:
                key = (tidx, col)
                self.keys.append(key)
                self.col_types[key] = self.classify_column(tbl[col])

        text_sim = self.compute_text_sims()
        num_sim  = self.compute_numeric_sims()

        for (k1, k2) in combinations(self.keys, 2):
            name_s = self.compute_name_sim(k1[1], k2[1])
            t_s   = text_sim.get((k1,k2), text_sim.get((k2,k1), 0.0))
            n_s   = num_sim.get((k1,k2), num_sim.get((k2,k1), 0.0))
            combined = (
                self.n_w * name_s +
                self.t_w * t_s +
                self.nm_w * n_s
            )
            self.sim[(k1,k2)] = (name_s, t_s, n_s, combined)

    def cluster_columns(self):
        """Build threshold graph and extract connected components."""
        # Build adjacency
        adj = defaultdict(list)
        for (k1,k2), sims in self.sim.items():
            if sims[3] >= self.thresh:
                adj[k1].append(k2)
                adj[k2].append(k1)
        visited = set()
        cid = 0

        # BFS for each component
        for key in self.keys:
            if key in visited:
                continue
            # new component
            queue = deque([key])
            while queue:
                k = queue.popleft()
                if k in visited:
                    continue
                visited.add(k)
                self.clusters[k] = cid
                for nbr in adj.get(k, []):
                    if nbr not in visited:
                        queue.append(nbr)
            cid += 1

    def choose_representative_names(self):
        """Pick a human‐readable name per cluster."""
        id_to_cols = defaultdict(list)
        for k, c in self.clusters.items():
            id_to_cols[c].append(k[1])  # just the column name
        used = set()
        for c, cols in id_to_cols.items():
            # shortest wins
            rep = min(cols, key=len)
            if rep in used:
                rep = f"{rep}_{c}"
            used.add(rep)
            self.rep_name[c] = rep

    def expand_and_merge(self):
        """Perform the full‐disjunction merge with representative names."""
        # expand each table
        expanded = []
        for tidx, tbl in enumerate(self.tables):
            df = pd.DataFrame()
            for (i,col), c in self.clusters.items():
                if i == tidx and col in tbl.columns:
                    df[self.rep_name[c]] = tbl[col]
            expanded.append(df)

        # full outer join with concat fallback
        result = expanded[0]
        for nxt in expanded[1:]:
            common = result.columns.intersection(nxt.columns).tolist()
            if common:
                result = pd.merge(result, nxt, how='outer', on=common)
            else:
                result = pd.concat(
                    [result.reset_index(drop=True),
                     nxt.reset_index(drop=True)],
                    axis=1
                )
        return result

    def diagnostics(self) -> pd.DataFrame:
        """Return DataFrame of column pairs with their similarity scores."""
        rows = []
        for (k1,k2), (ns, ts, nums, cmb) in self.sim.items():
            rows.append({
                'table1': k1[0], 'col1': k1[1],
                'table2': k2[0], 'col2': k2[1],
                'name_sim': ns, 'text_sim': ts,
                'numeric_sim': nums, 'combined_sim': cmb,
                'cluster1': self.clusters.get(k1),
                'cluster2': self.clusters.get(k2)
            })
        return pd.DataFrame(rows)

    def integrate(self) -> tuple[pd.DataFrame, pd.DataFrame]:
        """
        Run the full pipeline:
         1) build similarity
         2) cluster
         3) choose rep names
         4) merge
        Returns:
          integrated_df, diagnostics_df
        """
        self.build_similarity()
        self.cluster_columns()
        self.choose_representative_names()
        integrated = self.expand_and_merge()
        diag = self.diagnostics()
        return integrated, diag

In [14]:
# Integrate
integrator = AdvancedALITE(
    tables,
    name_weight=0.3,
    text_weight=0.3,
    numeric_weight=0.2,
    overall_thresh=0.65
)
integrated_df, diagnostics_df = integrator.integrate()

# Show results
print("Integrated shape:", integrated_df.shape)
print("Missing-value ratio:", f"{integrated_df.isna().mean().mean():.2%}")
print("\nSample integrated columns:", list(integrated_df.columns)[:10])
print("\nDiagnostics (sample):")
print(diagnostics_df.sort_values('combined_sim', ascending=False).head(10))

Integrated shape: (16080, 77)
Missing-value ratio: 19.77%

Sample integrated columns: ['country_code', 'ifscode', 'country', 'income_group', 'year', 'GDP_n', 'GDP_rppp', 'igov_n', 'igov_rppp', 'ippp_rppp']

Diagnostics (sample):
      table1          col1  table2          col2  name_sim  text_sim   
1395       2       country       5       country       1.0  0.912672  \
1388       2       country       3       country       1.0  0.906692   
1554       3       country       5       country       1.0  0.905677   
66         0  country_code       8  country_code       1.0  0.898429   
1752       4       country       5       country       1.0  0.896938   
1583       3       country       7       country       1.0  0.895031   
1919       5       country       7       country       1.0  0.893854   
1551       3       country       4       country       1.0  0.893755   
1392       2       country       4       country       1.0  0.893331   
1424       2       country       7       country   

In [15]:
integrated_df.head()

Unnamed: 0,country_code,ifscode,country,income_group,year,GDP_n,GDP_rppp,igov_n,igov_rppp,ippp_rppp,...,country_code_67,country_68,year_69,avg_hours_per_year,capital_services_index,capital_stock_total,employment_million,human_capital,real_gdp_ppp_output,tfp_index
0,ABW,314.0,Aruba,Emerging Market Economies,1960.0,0.0,0.0,,,,...,ABW,Aruba,1950.0,,0.0,1224.0,0.0,,463.0,
1,ABW,314.0,Aruba,Emerging Market Economies,1961.0,0.0,0.0,,,,...,ABW,Aruba,1951.0,,0.0,1224.0,0.0,,463.0,
2,ABW,314.0,Aruba,Emerging Market Economies,1962.0,0.0,0.0,,,,...,ABW,Aruba,1952.0,,0.0,1224.0,0.0,,463.0,
3,ABW,314.0,Aruba,Emerging Market Economies,1963.0,0.0,0.0,,,,...,ABW,Aruba,1953.0,,0.0,1224.0,0.0,,463.0,
4,ABW,314.0,Aruba,Emerging Market Economies,1964.0,0.0,0.0,,,,...,ABW,Aruba,1954.0,,0.0,1224.0,0.0,,463.0,


In [16]:
integrated_df.columns

Index(['country_code', 'ifscode', 'country', 'income_group', 'year', 'GDP_n',
       'GDP_rppp', 'igov_n', 'igov_rppp', 'ippp_rppp', 'ipriv_n', 'ipriv_rppp',
       'kgov_n', 'kgov_rppp', 'kppp_n', 'kppp_rppp', 'kpriv_n', 'kpriv_rppp',
       'country_18', 'year_19', 'Energy use (kg of oil equivalent per capita)',
       'country_21', 'year_22', 'Labor force, total', 'country_24', 'year_25',
       'Patent applications, nonresidents', 'Patent applications, residents',
       'country_28', 'year_29', 'Researchers in R&D (per million people)',
       'country_31', 'year_32',
       'Unemployment, total (% of total labor force) (modeled ILO estimate)',
       'country_34', 'year_35', 'nan',
       'Age dependency ratio (% of working-age population)',
       'Age dependency ratio, old', 'Age dependency ratio, young',
       'Life expectancy at birth, female (years)',
       'Life expectancy at birth, male (years)',
       'Life expectancy at birth, total (years)',
       'Population ages 1

In [17]:
merged_data = integrated_df.copy()

In [18]:

merged_data.to_csv("alite_merge_before_processing.csv", index=False)


In [19]:
print("Rows in merged_data before operation:", len(merged_data))
merged_data['GDP_rppp'] = merged_data['GDP_rppp'].replace(0, np.nan)

merged_data['real_gdp_ppp_bil'] = merged_data['real_gdp_ppp_output'] / 1000
mask_both = merged_data[['GDP_rppp', 'real_gdp_ppp_bil']].notna().all(axis=1)
correlation = merged_data.loc[mask_both, 'GDP_rppp'].corr(merged_data.loc[mask_both, 'real_gdp_ppp_bil'])
print("\nCorrelation between GDP_rppp and real_gdp_ppp_output / 1000:", correlation)

def impute_gdp_group(group):
    gdp_preds = ['GDP_n', 'real_gdp_ppp_bil']
    mask_train = group[['GDP_rppp'] + gdp_preds].notna().all(axis=1)
    if mask_train.sum() >= 3: 
        X_train = group.loc[mask_train, gdp_preds]
        y_train = group.loc[mask_train, 'GDP_rppp']
        model = LinearRegression()
        model.fit(X_train, y_train)
        mask_predict = group['GDP_rppp'].isna() & group[gdp_preds].notna().all(axis=1)
        if mask_predict.any():
            group.loc[mask_predict, 'GDP_rppp'] = model.predict(group.loc[mask_predict, gdp_preds])
    return group

merged_data = merged_data.groupby('country').apply(impute_gdp_group).reset_index(drop=True)
components = ['kgov_rppp', 'kpriv_rppp', 'kppp_rppp']

for comp in components:
    merged_data[comp] = merged_data.groupby('country')[comp].transform(lambda x: x.interpolate())

merged_data['capital_stock_bil'] = merged_data['capital_stock_total'] / 1000

def impute_capital_group(group):
    for comp in components:
        other_comps = [c for c in components if c != comp]
        mask = (
            group[comp].isna() & 
            group[other_comps].notna().all(axis=1) & 
            group['capital_stock_bil'].notna()
        )
        if mask.any():
            other_sum = group.loc[mask, other_comps].sum(axis=1)
            group.loc[mask, comp] = group.loc[mask, 'capital_stock_bil'] - other_sum
    return group

merged_data = merged_data.groupby('country').apply(impute_capital_group).reset_index(drop=True)

for col in ['GDP_rppp'] + components:
    merged_data[col] = merged_data[col].fillna(0)

mask_all_components_zero = (merged_data[components] == 0).all(axis=1)
mask_gdp_zero = (merged_data['GDP_rppp'] == 0)
mask_drop = mask_all_components_zero | mask_gdp_zero

rows_to_drop = mask_drop.sum()
print(f"\nRows to drop: {rows_to_drop}")
merged_data = merged_data[~mask_drop]

print("Rows in merged_data after operation:", len(merged_data))

print("\nFinal merged shape:", merged_data.shape)
print("Columns in merged_data:")
print(merged_data.columns.tolist())

Rows in merged_data before operation: 16080

Correlation between GDP_rppp and real_gdp_ppp_output / 1000: -0.038117438656051134

Rows to drop: 1176
Rows in merged_data after operation: 10464

Final merged shape: (10464, 79)
Columns in merged_data:
['country_code', 'ifscode', 'country', 'income_group', 'year', 'GDP_n', 'GDP_rppp', 'igov_n', 'igov_rppp', 'ippp_rppp', 'ipriv_n', 'ipriv_rppp', 'kgov_n', 'kgov_rppp', 'kppp_n', 'kppp_rppp', 'kpriv_n', 'kpriv_rppp', 'country_18', 'year_19', 'Energy use (kg of oil equivalent per capita)', 'country_21', 'year_22', 'Labor force, total', 'country_24', 'year_25', 'Patent applications, nonresidents', 'Patent applications, residents', 'country_28', 'year_29', 'Researchers in R&D (per million people)', 'country_31', 'year_32', 'Unemployment, total (% of total labor force) (modeled ILO estimate)', 'country_34', 'year_35', 'nan', 'Age dependency ratio (% of working-age population)', 'Age dependency ratio, old', 'Age dependency ratio, young', 'Life expe

In [20]:
metrics_cols = [c for c in merged_data.columns if c not in ['country', 'country_code', 'year']]

summary_by_country = (
    merged_data.groupby(['country', 'country_code'])
    .apply(lambda g: pd.Series({
        'periods': len(g),
        'total_cells': g[metrics_cols].size,
        'missing_cells': g[metrics_cols].isna().sum().sum(),
        'zero_cells': (g[metrics_cols] == 0).sum().sum()
    }))
    .reset_index()
)

summary_by_country['missing_pct'] = (summary_by_country['missing_cells'] / summary_by_country['total_cells'] * 100).round(2)
summary_by_country['zero_pct'] = (summary_by_country['zero_cells'] / summary_by_country['total_cells'] * 100).round(2)

print(summary_by_country.sort_values('missing_pct', ascending=False).head(10))


           country country_code  periods  total_cells  missing_cells   
99      Madagascar          MDG       60         4560           1920  \
123           Oman          OMN       60         4560           1860   
111        Morocco          MAR       60         4560           1850   
102       Maldives          MDV       36         2736           1100   
171      Venezuela          VEN       60         4560           1830   
107         Mexico          MEX       60         4560           1680   
7            Aruba          ABW       60         4560           1200   
170     Uzbekistan          UZB       60         4560            990   
168  United States          USA       60         4560            900   
144        Somalia          SOM       60         4560            850   

     zero_cells  missing_pct  zero_pct  
99          311        42.11      6.82  
123         106        40.79      2.32  
111         151        40.57      3.31  
102         214        40.20      7.82  
17

In [21]:
drop_countries = summary_by_country.query(
    'missing_pct > 25 or (missing_pct + zero_pct) > 35'
)['country'].tolist()

print(f"Countries dropped ({len(drop_countries)}):", drop_countries)

filtered_data = merged_data[~merged_data['country'].isin(drop_countries)].copy()

print(f"Rows before filtering: {len(merged_data)}, after filtering: {len(filtered_data)}")


Countries dropped (7): ['Aruba', 'Madagascar', 'Maldives', 'Mexico', 'Morocco', 'Oman', 'Venezuela']
Rows before filtering: 10464, after filtering: 10068


In [22]:
filtered_data['kppp_rppp'] = filtered_data['kppp_rppp'].fillna(0)

n_rows_filtered = filtered_data.shape[0]
filtered_col_stats = pd.DataFrame({
    'missing_count': filtered_data.isna().sum(),
    'missing_pct':   (filtered_data.isna().mean() * 100),
    'zero_count':    (filtered_data == 0).sum(),
    'zero_pct':      ((filtered_data == 0).mean() * 100)
})

filtered_col_stats['total_void'] = filtered_col_stats['missing_count'] + filtered_col_stats['zero_count']
filtered_col_stats['total_void_pct'] = (filtered_col_stats['total_void'] / n_rows_filtered) * 100

cols_100_missing = filtered_col_stats[filtered_col_stats['missing_pct'] == 100].index.tolist()
cols_high_void = filtered_col_stats[filtered_col_stats['total_void_pct'] > 75].index.tolist()
cols_to_drop = list(set(cols_100_missing + cols_high_void))  # Combine and remove duplicates

print(f"Columns with 100% missing data (to be dropped): {cols_100_missing}")
print(f"Columns with total void percentage > 75% (to be dropped): {cols_high_void}")

Columns with 100% missing data (to be dropped): ['nan', 'Data from database: World Development Indicators', 'Last Updated: 01/28/2025']
Columns with total void percentage > 75% (to be dropped): ['ippp_rppp', 'kppp_n', 'nan', 'Data from database: World Development Indicators', 'Last Updated: 01/28/2025', 'capital_services_index']


In [23]:
filtered_data = filtered_data.drop(columns=cols_to_drop)
filtered_col_stats_updated = pd.DataFrame({
    'missing_count': filtered_data.isna().sum(),
    'missing_pct':   (filtered_data.isna().mean() * 100).round(2),
    'zero_count':    (filtered_data == 0).sum(),
    'zero_pct':      ((filtered_data == 0).mean() * 100).round(2)
})

filtered_col_stats_updated['total_void'] = filtered_col_stats_updated['missing_count'] + filtered_col_stats_updated['zero_count']
filtered_col_stats_updated['total_void_pct'] = (filtered_col_stats_updated['total_void'] / n_rows_filtered * 100).round(2)

filtered_col_stats_updated = filtered_col_stats_updated[
    ['missing_count', 'missing_pct', 'zero_count', 'zero_pct', 'total_void', 'total_void_pct']
].sort_values('missing_pct', ascending=False)

print(f"Dropped columns: {cols_to_drop}")
print(f"Updated filtered_data shape: {filtered_data.shape}")
# Sort the DataFrame alphabetically by column names
sorted_df = filtered_col_stats_updated.sort_index(axis=1)

# Display the sorted DataFrame
display(sorted_df)


Dropped columns: ['capital_services_index', 'Data from database: World Development Indicators', 'nan', 'ippp_rppp', 'Last Updated: 01/28/2025', 'kppp_n']
Updated filtered_data shape: (10068, 73)


Unnamed: 0,missing_count,missing_pct,total_void,total_void_pct,zero_count,zero_pct
avg_hours_per_year,6441,63.97,6441,63.97,0,0.00
Human capital index (HCI) (scale 0-1),3934,39.07,3934,39.07,0,0.00
tfp_index,3585,35.61,5264,52.28,1679,16.68
Researchers in R&D (per million people),3554,35.30,3554,35.30,0,0.00
"Patent applications, residents",3545,35.21,3545,35.21,0,0.00
...,...,...,...,...,...,...
"Age dependency ratio, old",0,0.00,0,0.00,0,0.00
ifscode,0,0.00,0,0.00,0,0.00
Population ages 15-64 (% of total population),0,0.00,0,0.00,0,0.00
"Population ages 15-64, female",0,0.00,0,0.00,0,0.00


In [24]:

keep = [
    'country', 'income_group', 'country_code', 'year',
    'GDP_rppp',
    'kppp_rppp', 'kpriv_rppp', 'kgov_rppp',  # Capital group
    'Labor force, total', 'Human capital index (HCI) (scale 0-1)', 'Unemployment, total (% of total labor force) (modeled ILO estimate)',  # Labor group
    'tfp_index', 'Researchers in R&D (per million people)', 'Patent applications, residents', 'Patent applications, nonresidents', 'Energy use (kg of oil equivalent per capita)',  # Technology group
    'Population, total', 'Age dependency ratio (% of working-age population)', 'Urban population (% of total population)'  # Demographics group
]

df = filtered_data[keep].copy()



In [114]:

df.to_csv("alite_merge.csv", index=False)


In [None]:
rename_map = {
    'kppp_rppp': 'ppp_capital',
    'kpriv_rppp': 'priv_capital',
    'kgov_rppp': 'gov_capital',
    'Labor force, total': 'labor_force',
    'Unemployment, total (% of total labor force) (modeled ILO estimate)': 'unemployment_pct',
    'Human capital index (HCI) (scale 0-1)': 'HCI',
    'tfp_index': 'tfp_idx',
    'Population, total': 'pop_total',
    'Age dependency ratio (% of working-age population)': 'age_dep_ratio',
    'Urban population (% of total population)': 'urban_pop_pct',
    'Researchers in R&D (per million people)': 'rnd_per_mil',
    'Patent applications, residents': 'patents_res',
    'Patent applications, nonresidents': 'patents_nonres',
    'Energy use (kg of oil equivalent per capita)': 'energy_pc'
}

df.rename(columns=rename_map, inplace=True)

df['effective_labor'] = df['labor_force'] * df['HCI']

df['total_patents'] = df['patents_res'] + df['patents_nonres']
df['energy_per_labor'] = (df['energy_pc'] * df['pop_total']) / df['labor_force']

df.drop(columns=['patents_res', 'patents_nonres'], inplace=True)
print("\nStarting Imputation Process")

df['labor_force'] = df.groupby('country')['labor_force'].transform(
    lambda x: x.fillna(filtered_data.loc[x.index, 'Labor force, total'])
)
df['unemployment_pct'] = df.groupby('country')['unemployment_pct'].transform(
    lambda x: x.fillna(filtered_data.loc[x.index, 'Unemployment, total (% of total labor force) (modeled ILO estimate)'])
)

print("\nHCI value range before adjustment:")
print(f"Min: {df['HCI'].min()}, Max: {df['HCI'].max()}")

if df['HCI'].max() > 1 or df['HCI'].min() < 0:
    human_capital_scaled = (filtered_data['human_capital'] - filtered_data['human_capital'].min()) / (
        filtered_data['human_capital'].max() - filtered_data['human_capital'].min()
    )
    df['HCI'] = df.groupby('country')['HCI'].transform(lambda x: x.fillna(human_capital_scaled.loc[x.index]))
else:
    df['HCI'] = df.groupby('country')['HCI'].transform(
        lambda x: x.fillna(filtered_data.loc[x.index, 'human_capital'])
    )

df['HCI'] = df['HCI'].clip(0, 1)

rnd_medians = df[df['rnd_per_mil'] != 0].groupby('country')['rnd_per_mil'].median()
df['rnd_per_mil'] = df.apply(
    lambda row: rnd_medians[row['country']] if row['rnd_per_mil'] == 0 and row['country'] in rnd_medians else row['rnd_per_mil'],
    axis=1
)
df['rnd_per_mil'] = df['rnd_per_mil'].fillna(df['rnd_per_mil'].median())

group_medians = df.groupby('country')['energy_pc'].median()
df['energy_pc'] = df.apply(
    lambda row: group_medians[row['country']] if pd.isna(row['energy_pc']) else row['energy_pc'],
    axis=1
)

categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_features = [col for col in numerical_features if col != 'GDP_rppp' and col not in ['effective_labor', 'total_patents', 'energy_per_labor']]

for col in categorical_features:
    mode_per_country = df.groupby('country')[col].apply(lambda x: x.mode()[0] if not x.mode().empty else np.nan)
    df[col] = df.apply(
        lambda row: mode_per_country[row['country']] if pd.isna(row[col]) else row[col],
        axis=1
    )

df = df.sort_values(['country', 'year'])
for col in numerical_features:
    df[col] = df.groupby('country')[col].transform(lambda x: x.interpolate(method='linear', limit_direction='both'))
    df[col] = df.groupby('country')[col].transform(lambda x: x.ffill())
    df[col] = df.groupby('country')[col].transform(lambda x: x.bfill())
    df[col] = df[col].fillna(df[col].median())



In [113]:
xxxxxxxxx

NameError: name 'xxxxxxxxx' is not defined

In [None]:





# ------------------------------------------------------------------------------
# Usage example (when run as script)
# ------------------------------------------------------------------------------
if __name__ == "__main__":
    # Load your tables

    # Optionally: integrated_df.to_csv("advanced_integrated.csv", index=False)
    #               diagnostics_df.to_csv("integration_diagnostics.csv", index=False)


In [None]:
integrated_df

In [None]:
integrated_df.shape

In [None]:
xxxxx

#### Barebone Version

In [None]:
import os
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
from collections import defaultdict
from itertools import combinations
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
BASE_DIR = '../data/raw'
FILE_PATHS = {
    'capital':      os.path.join(BASE_DIR, 'CapitalStockData.csv'),
    'energy':       os.path.join(BASE_DIR, 'energy_use.csv'),
    'labor_force':  os.path.join(BASE_DIR, 'labor_force.csv'),
    'patents':      os.path.join(BASE_DIR, 'patents_res_nonres.csv'),
    'rnd':          os.path.join(BASE_DIR, 'R&D.csv'),
    'unemployment': os.path.join(BASE_DIR, 'unemployed_ilo_estimate.csv'),
    'population':   os.path.join(BASE_DIR, 'population_Data.csv'),
}

In [None]:
dfs    = {name: pd.read_csv(path) for name, path in FILE_PATHS.items()}
tables = list(dfs.values())

In [None]:
def column_name_similarity(a: str, b: str) -> float:
    """Normalized SequenceMatcher ratio between two column names."""
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

def compute_content_similarity(tables, max_samples: int = 100) -> dict:
    """
    Build a TF-IDF vector for each column (sampling up to max_samples unique values),
    then compute cosine similarities between every pair of columns.
    Returns a dict keyed by ((table_index, col1), (table_index, col2)).
    """
    keys, docs = [], []
    for tidx, tbl in enumerate(tables):
        for col in tbl.columns:
            keys.append((tidx, col))
            vals = tbl[col].dropna().astype(str).unique()
            if len(vals) > max_samples:
                vals = np.random.choice(vals, max_samples, replace=False)
            docs.append(" ".join(vals))
    vec = TfidfVectorizer()
    tfidf = vec.fit_transform(docs)
    sim_mat = cosine_similarity(tfidf)
    content_sim = {}
    for i, ki in enumerate(keys):
        for j, kj in enumerate(keys):
            if i < j:
                content_sim[(ki, kj)] = sim_mat[i, j]
    return content_sim


In [None]:
def assign_integration_ids(tables, name_thresh=0.8, content_thresh=0.3):
    pool = [(i, col) for i, tbl in enumerate(tables) for col in tbl.columns]
    content_sim = compute_content_similarity(tables)
    col_to_id, next_id = {}, 0

    # Group by name or content
    for (i1, c1), (i2, c2) in combinations(pool, 2):
        if (i1, c1) in col_to_id or (i2, c2) in col_to_id:
            continue
        nm = column_name_similarity(c1, c2)
        ct = content_sim.get(((i1, c1), (i2, c2)),
             content_sim.get(((i2, c2), (i1, c1)), 0.0))
        if nm >= name_thresh or ct >= content_thresh:
            col_to_id[(i1, c1)] = next_id
            col_to_id[(i2, c2)] = next_id
            next_id += 1

    # Unique IDs for leftovers
    for key in pool:
        if key not in col_to_id:
            col_to_id[key] = next_id
            next_id += 1

    return col_to_id


In [None]:
def full_disjunction_preserve_names(tables, col_to_id):
    # Reverse map: ID -> original names
    id_to_cols = defaultdict(list)
    for (tidx, col), cid in col_to_id.items():
        id_to_cols[cid].append(col)

    # Pick representative names
    rep_names, used = {}, set()
    for cid, cols in id_to_cols.items():
        rep = min(cols, key=len)
        if rep in used:
            rep = f"{rep}_{cid}"
        used.add(rep)
        rep_names[cid] = rep

    # Expand each table
    expanded = []
    for idx, tbl in enumerate(tables):
        df_exp = pd.DataFrame()
        for (i, col), cid in col_to_id.items():
            if i == idx and col in tbl.columns:
                df_exp[rep_names[cid]] = tbl[col]
        expanded.append(df_exp)

    # Sequential full outer join with fallback to concat when no shared cols
    result = expanded[0]
    for nxt in expanded[1:]:
        common = result.columns.intersection(nxt.columns).tolist()
        if common:
            result = pd.merge(result, nxt, how='outer', on=common)
        else:
            # side-by-side concat when no merge key
            result = pd.concat([result.reset_index(drop=True),
                                nxt.reset_index(drop=True)],
                               axis=1)
    return result

def alite_integrate(tables, **kwargs):
    ids = assign_integration_ids(tables, **kwargs)
    return full_disjunction_preserve_names(tables, ids)

In [None]:
integrated_df = alite_integrate(
        tables,
        name_thresh=0.8,
        content_thresh=0.3
    )
print("Integrated shape:", integrated_df.shape)
print("Missing-value ratio:",
      f"{integrated_df.isna().mean().mean():.2%}")
print("\nPreview:")
display(integrated_df.head(5))