In [5]:
import os
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
from collections import defaultdict
from itertools import combinations
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
BASE_DIR = '../data/raw'
FILE_PATHS = {
    'capital':      os.path.join(BASE_DIR, 'CapitalStockData.csv'),
    'energy':       os.path.join(BASE_DIR, 'energy_use.csv'),
    'labor_force':  os.path.join(BASE_DIR, 'labor_force.csv'),
    'patents':      os.path.join(BASE_DIR, 'patents_res_nonres.csv'),
    'rnd':          os.path.join(BASE_DIR, 'R&D.csv'),
    'unemployment': os.path.join(BASE_DIR, 'unemployed_ilo_estimate.csv'),
    'population':   os.path.join(BASE_DIR, 'population_Data.csv'),
}

In [7]:
dfs    = {name: pd.read_csv(path) for name, path in FILE_PATHS.items()}
tables = list(dfs.values())

In [8]:
def column_name_similarity(a: str, b: str) -> float:
    """Normalized SequenceMatcher ratio between two column names."""
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

def compute_content_similarity(tables, max_samples: int = 100) -> dict:
    """
    Build a TF-IDF vector for each column (sampling up to max_samples unique values),
    then compute cosine similarities between every pair of columns.
    Returns a dict keyed by ((table_index, col1), (table_index, col2)).
    """
    keys, docs = [], []
    for tidx, tbl in enumerate(tables):
        for col in tbl.columns:
            keys.append((tidx, col))
            vals = tbl[col].dropna().astype(str).unique()
            if len(vals) > max_samples:
                vals = np.random.choice(vals, max_samples, replace=False)
            docs.append(" ".join(vals))
    vec = TfidfVectorizer()
    tfidf = vec.fit_transform(docs)
    sim_mat = cosine_similarity(tfidf)
    content_sim = {}
    for i, ki in enumerate(keys):
        for j, kj in enumerate(keys):
            if i < j:
                content_sim[(ki, kj)] = sim_mat[i, j]
    return content_sim


In [9]:
def assign_integration_ids(tables, name_thresh=0.8, content_thresh=0.3):
    pool = [(i, col) for i, tbl in enumerate(tables) for col in tbl.columns]
    content_sim = compute_content_similarity(tables)
    col_to_id, next_id = {}, 0

    # Group by name or content
    for (i1, c1), (i2, c2) in combinations(pool, 2):
        if (i1, c1) in col_to_id or (i2, c2) in col_to_id:
            continue
        nm = column_name_similarity(c1, c2)
        ct = content_sim.get(((i1, c1), (i2, c2)),
             content_sim.get(((i2, c2), (i1, c1)), 0.0))
        if nm >= name_thresh or ct >= content_thresh:
            col_to_id[(i1, c1)] = next_id
            col_to_id[(i2, c2)] = next_id
            next_id += 1

    # Unique IDs for leftovers
    for key in pool:
        if key not in col_to_id:
            col_to_id[key] = next_id
            next_id += 1

    return col_to_id


In [10]:
def full_disjunction_preserve_names(tables, col_to_id):
    # Reverse map: ID -> original names
    id_to_cols = defaultdict(list)
    for (tidx, col), cid in col_to_id.items():
        id_to_cols[cid].append(col)

    # Pick representative names
    rep_names, used = {}, set()
    for cid, cols in id_to_cols.items():
        rep = min(cols, key=len)
        if rep in used:
            rep = f"{rep}_{cid}"
        used.add(rep)
        rep_names[cid] = rep

    # Expand each table
    expanded = []
    for idx, tbl in enumerate(tables):
        df_exp = pd.DataFrame()
        for (i, col), cid in col_to_id.items():
            if i == idx and col in tbl.columns:
                df_exp[rep_names[cid]] = tbl[col]
        expanded.append(df_exp)

    # Sequential full outer join with fallback to concat when no shared cols
    result = expanded[0]
    for nxt in expanded[1:]:
        common = result.columns.intersection(nxt.columns).tolist()
        if common:
            result = pd.merge(result, nxt, how='outer', on=common)
        else:
            # side-by-side concat when no merge key
            result = pd.concat([result.reset_index(drop=True),
                                nxt.reset_index(drop=True)],
                               axis=1)
    return result

def alite_integrate(tables, **kwargs):
    ids = assign_integration_ids(tables, **kwargs)
    return full_disjunction_preserve_names(tables, ids)

In [11]:
integrated_df = alite_integrate(
        tables,
        name_thresh=0.8,
        content_thresh=0.3
    )
print("Integrated shape:", integrated_df.shape)
print("Missing-value ratio:",
      f"{integrated_df.isna().mean().mean():.2%}")
print("\nPreview:")
display(integrated_df.head(5))

Integrated shape: (12531, 232)
Missing-value ratio: 71.61%

Preview:


Unnamed: 0,countrycode,countryname,igov_rppp,ipriv_rppp,ippp_rppp,igov_n,ipriv_n,ifscode,year,GDP_rppp,...,2040 [YR2040],2042 [YR2042],2044 [YR2044],2046 [YR2046],2048 [YR2048],Country Name_227,Country Code,Series Name_229,Series Code_230,2050 [YR2050]
0,AFG,Afghanistan,50.0,15.0,,,,512.0,1960.0,,...,64.8010341971567,62.9710690099967,61.2935028902416,59.7165081417511,58.2601597694968,Afghanistan,AFG,Age dependency ratio (% of working-age populat...,SP.POP.DPND,57.5832779071994
1,AFG,Afghanistan,52.0,15.0,,,,512.0,1961.0,,...,5.4115807557828,5.60517537301781,5.80326473462997,5.99954027587882,6.19962014092639,Afghanistan,AFG,"Age dependency ratio, old",SP.POP.DPND.OL,6.31400253014127
2,AFG,Afghanistan,54.0,16.0,,,,512.0,1962.0,,...,59.3894534413739,57.3658936369789,55.4902381556116,53.7169678658722,52.0605396285704,Afghanistan,AFG,"Age dependency ratio, young",SP.POP.DPND.YG,51.2692753770581
3,AFG,Afghanistan,56.0,17.0,,,,512.0,1963.0,,...,..,..,69.456,..,..,Afghanistan,AFG,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,70.263
4,AFG,Afghanistan,59.0,17.0,,,,512.0,1964.0,,...,..,..,66.297,..,..,Afghanistan,AFG,"Life expectancy at birth, male (years)",SP.DYN.LE00.MA.IN,67.057


In [1]:
#!/usr/bin/env python3
"""
Advanced ALITE‐Style Integrator
--------------------------------
Clusters columns across heterogeneous tables by both name and content,
then produces the full‐disjunction (n‐way outer join) with meaningful
representative names and diagnostic outputs.
"""

import os
import pandas as pd
import numpy as np
from difflib import SequenceMatcher
from itertools import combinations
from collections import defaultdict, deque

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import ks_2samp

class AdvancedALITE:
    def __init__(
        self,
        tables: list[pd.DataFrame],
        name_weight: float = 0.5,
        text_weight: float = 0.3,
        numeric_weight: float = 0.2,
        overall_thresh: float = 0.5,
        max_content_samples: int = 200
    ):
        """
        tables            : list of DataFrames to integrate
        name_weight       : weight for name‐based similarity
        text_weight       : weight for text content similarity
        numeric_weight    : weight for numeric distribution similarity
        overall_thresh    : threshold on weighted similarity to cluster
        max_content_samples: max unique values per column for TF-IDF
        """
        self.tables = tables
        self.n_w = name_weight
        self.t_w = text_weight
        self.nm_w = numeric_weight
        self.thresh = overall_thresh
        self.max_samples = max_content_samples

        # will be filled in .integrate()
        self.keys = []       # list of (table_idx, col_name)
        self.col_types = {}  # mapping from key to 'numeric'|'text'|...
        self.sim = {}        # mapping pair→(name_sim, text_sim, num_sim, combined)
        self.clusters = {}   # mapping key→cluster_id
        self.rep_name = {}   # mapping cluster_id→representative column name

    def classify_column(self, series: pd.Series) -> str:
        """Classify column as 'numeric' or 'text'."""
        if pd.api.types.is_numeric_dtype(series):
            return 'numeric'
        else:
            # treat everything else as text for simplicity
            return 'text'

    def compute_name_sim(self, a: str, b: str) -> float:
        return SequenceMatcher(None, a.lower(), b.lower()).ratio()

    def compute_text_sims(self):
        """Compute TF-IDF + cosine for all text columns."""
        docs, keys = [], []
        for idx, tbl in enumerate(self.tables):
            for col in tbl.columns:
                key = (idx, col)
                if self.col_types[key] == 'text':
                    keys.append(key)
                    vals = tbl[col].dropna().astype(str).unique()
                    if len(vals) > self.max_samples:
                        vals = np.random.choice(vals, self.max_samples, replace=False)
                    docs.append(" ".join(vals))

        if not docs:
            return {}

        tfidf = TfidfVectorizer().fit_transform(docs)
        sim_mat = cosine_similarity(tfidf)
        text_sim = {}
        for i, k1 in enumerate(keys):
            for j, k2 in enumerate(keys):
                if i < j:
                    text_sim[(k1, k2)] = sim_mat[i,j]
        return text_sim

    def compute_numeric_sims(self):
        """Compute 1 - KS statistic for all numeric columns."""
        num_keys = [(i,c) for i,t in enumerate(self.tables) for c in t.columns
                    if self.col_types[(i,c)] == 'numeric']
        num_sim = {}
        for (k1, k2) in combinations(num_keys, 2):
            s1 = self.tables[k1[0]][k1[1]].dropna()
            s2 = self.tables[k2[0]][k2[1]].dropna()
            if len(s1)>1 and len(s2)>1:
                stat = ks_2samp(s1, s2).statistic
                num_sim[(k1,k2)] = 1 - stat
        return num_sim

    def build_similarity(self):
        """Compute name/text/num sims and combined score for every pair."""
        # keys and types
        for tidx, tbl in enumerate(self.tables):
            for col in tbl.columns:
                key = (tidx, col)
                self.keys.append(key)
                self.col_types[key] = self.classify_column(tbl[col])

        text_sim = self.compute_text_sims()
        num_sim  = self.compute_numeric_sims()

        for (k1, k2) in combinations(self.keys, 2):
            name_s = self.compute_name_sim(k1[1], k2[1])
            t_s   = text_sim.get((k1,k2), text_sim.get((k2,k1), 0.0))
            n_s   = num_sim.get((k1,k2), num_sim.get((k2,k1), 0.0))
            combined = (
                self.n_w * name_s +
                self.t_w * t_s +
                self.nm_w * n_s
            )
            self.sim[(k1,k2)] = (name_s, t_s, n_s, combined)

    def cluster_columns(self):
        """Build threshold graph and extract connected components."""
        # Build adjacency
        adj = defaultdict(list)
        for (k1,k2), sims in self.sim.items():
            if sims[3] >= self.thresh:
                adj[k1].append(k2)
                adj[k2].append(k1)
        visited = set()
        cid = 0

        # BFS for each component
        for key in self.keys:
            if key in visited:
                continue
            # new component
            queue = deque([key])
            while queue:
                k = queue.popleft()
                if k in visited:
                    continue
                visited.add(k)
                self.clusters[k] = cid
                for nbr in adj.get(k, []):
                    if nbr not in visited:
                        queue.append(nbr)
            cid += 1

    def choose_representative_names(self):
        """Pick a human‐readable name per cluster."""
        id_to_cols = defaultdict(list)
        for k, c in self.clusters.items():
            id_to_cols[c].append(k[1])  # just the column name
        used = set()
        for c, cols in id_to_cols.items():
            # shortest wins
            rep = min(cols, key=len)
            if rep in used:
                rep = f"{rep}_{c}"
            used.add(rep)
            self.rep_name[c] = rep

    def expand_and_merge(self):
        """Perform the full‐disjunction merge with representative names."""
        # expand each table
        expanded = []
        for tidx, tbl in enumerate(self.tables):
            df = pd.DataFrame()
            for (i,col), c in self.clusters.items():
                if i == tidx and col in tbl.columns:
                    df[self.rep_name[c]] = tbl[col]
            expanded.append(df)

        # full outer join with concat fallback
        result = expanded[0]
        for nxt in expanded[1:]:
            common = result.columns.intersection(nxt.columns).tolist()
            if common:
                result = pd.merge(result, nxt, how='outer', on=common)
            else:
                result = pd.concat(
                    [result.reset_index(drop=True),
                     nxt.reset_index(drop=True)],
                    axis=1
                )
        return result

    def diagnostics(self) -> pd.DataFrame:
        """Return DataFrame of column pairs with their similarity scores."""
        rows = []
        for (k1,k2), (ns, ts, nums, cmb) in self.sim.items():
            rows.append({
                'table1': k1[0], 'col1': k1[1],
                'table2': k2[0], 'col2': k2[1],
                'name_sim': ns, 'text_sim': ts,
                'numeric_sim': nums, 'combined_sim': cmb,
                'cluster1': self.clusters.get(k1),
                'cluster2': self.clusters.get(k2)
            })
        return pd.DataFrame(rows)

    def integrate(self) -> tuple[pd.DataFrame, pd.DataFrame]:
        """
        Run the full pipeline:
         1) build similarity
         2) cluster
         3) choose rep names
         4) merge
        Returns:
          integrated_df, diagnostics_df
        """
        self.build_similarity()
        self.cluster_columns()
        self.choose_representative_names()
        integrated = self.expand_and_merge()
        diag = self.diagnostics()
        return integrated, diag


# ------------------------------------------------------------------------------
# Usage example (when run as script)
# ------------------------------------------------------------------------------
if __name__ == "__main__":
    # Load your tables
    BASE_DIR = '../data/raw'
    paths = {
        'capital':      os.path.join(BASE_DIR, 'CapitalStockData.csv'),
        'energy':       os.path.join(BASE_DIR, 'energy_use.csv'),
        'labor_force':  os.path.join(BASE_DIR, 'labor_force.csv'),
        'patents':      os.path.join(BASE_DIR, 'patents_res_nonres.csv'),
        'rnd':          os.path.join(BASE_DIR, 'R&D.csv'),
        'unemployment': os.path.join(BASE_DIR, 'unemployed_ilo_estimate.csv'),
        'population':   os.path.join(BASE_DIR, 'population_Data.csv'),
    }
    tables = [pd.read_csv(p) for p in paths.values()]

    # Integrate
    integrator = AdvancedALITE(
        tables,
        name_weight=0.5,
        text_weight=0.3,
        numeric_weight=0.2,
        overall_thresh=0.5
    )
    integrated_df, diagnostics_df = integrator.integrate()

    # Show results
    print("Integrated shape:", integrated_df.shape)
    print("Missing-value ratio:", f"{integrated_df.isna().mean().mean():.2%}")
    print("\nSample integrated columns:", list(integrated_df.columns)[:10])
    print("\nDiagnostics (sample):")
    print(diagnostics_df.sort_values('combined_sim', ascending=False).head(10))
    # Optionally: integrated_df.to_csv("advanced_integrated.csv", index=False)
    #               diagnostics_df.to_csv("integration_diagnostics.csv", index=False)


Integrated shape: (19186, 72)
Missing-value ratio: 34.71%

Sample integrated columns: ['countrycode', 'ifscode', 'countryname', 'year', 'ipriv_n', 'ippp_rppp', 'igov_n', 'kppp_n', 'GDP_n', 'income']

Diagnostics (sample):
       table1          col1  table2          col2  name_sim  text_sim   
9121        1  Country Name       5  Country Name       1.0  0.922635  \
58489       3  Country Name       4  Country Name       1.0  0.918351   
58557       3  Country Name       5  Country Name       1.0  0.917172   
36015       2  Country Name       3  Country Name       1.0  0.916578   
36083       2  Country Name       4  Country Name       1.0  0.914820   
8917        1  Country Name       2  Country Name       1.0  0.913077   
8985        1  Country Name       3  Country Name       1.0  0.912968   
36151       2  Country Name       5  Country Name       1.0  0.907158   
9053        1  Country Name       4  Country Name       1.0  0.901999   
9187        1  Country Name       6  Country Nam

In [2]:
integrated_df

Unnamed: 0,countrycode,ifscode,countryname,year,ipriv_n,ippp_rppp,igov_n,kppp_n,GDP_n,income,...,2041 [YR2041],2042 [YR2042],2043 [YR2043],2044 [YR2044],2045 [YR2045],2046 [YR2046],2047 [YR2047],2048 [YR2048],2049 [YR2049],2050 [YR2050]
0,AFG,512.0,Afghanistan,1960.0,,,,,,Low Income Developing Countries,...,,,,,,,,,,
1,AFG,512.0,Afghanistan,1961.0,,,,,,Low Income Developing Countries,...,,,,,,,,,,
2,AFG,512.0,Afghanistan,1962.0,,,,,,Low Income Developing Countries,...,,,,,,,,,,
3,AFG,512.0,Afghanistan,1963.0,,,,,,Low Income Developing Countries,...,,,,,,,,,,
4,AFG,512.0,Afghanistan,1964.0,,,,,,Low Income Developing Countries,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19181,,,,,,,,,,,...,,,,,,,,,,
19182,,,,,,,,,,,...,,,,,,,,,,
19183,,,,,,,,,,,...,,,,,,,,,,
19184,,,Data from database: Population estimates and p...,,,,,,,,...,,,,,,,,,,


In [3]:
integrated_df.shape

(19186, 72)