In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv("Merged_WB_PWT_DEMO.csv", encoding="utf-8-sig")
df.columns = [c.lower().strip().replace(" ", "_") for c in df.columns]
df = df.replace('..', np.nan)
wb_wide = df.pivot_table(
    index=["country", "year"],
    columns="series_code",
    values="wb_value",
    aggfunc="first"
).reset_index()
wb_rename = {
    "NY.GDP.PCAP.KD": "gdp_pc",             
    "NE.TRD.GNFS.ZS": "trade_openness",     
    "FP.CPI.TOTL.ZG": "inflation",         
    "BX.KLT.DINV.WD.GD.ZS": "FDI",         
    "NE.GDI.TOTL.ZS": "investment_ratio"    
}
wb_wide = wb_wide.rename(columns={k: v for k, v in wb_rename.items() if k in wb_wide.columns})
if "gdp_pc" in wb_wide.columns:
    wb_wide["gdp_pc"] = pd.to_numeric(wb_wide["gdp_pc"], errors="coerce")
    wb_wide = wb_wide.sort_values(["country", "year"])
    wb_wide["gdp_pc_growth"] = (
        wb_wide.groupby("country")["gdp_pc"].pct_change() * 100
    )
base_cols = [
    "country", "country_code", "series_name", "series_code",
    "year", "wb_value", "countrycode", "currency_unit",
    "demographic_indicator", "unnamed:_3", "unnamed:_4",
    "unnamed:_5", "unnamed:_6"
]
base_cols = [c for c in base_cols if c in df.columns]
pwt_cols = [c for c in df.columns if c not in base_cols]
pwt_panel = df.groupby(["country", "year"], as_index=False)[pwt_cols].first()
if "pop" in pwt_panel.columns:
    pwt_panel = pwt_panel.sort_values(["country", "year"])
    pwt_panel["pop_growth"] = (
        pwt_panel.groupby("country")["pop"].pct_change() * 100
    )
if "hc" in pwt_panel.columns:
    pwt_panel = pwt_panel.rename(columns={"hc": "education"})
wgi = pd.read_excel("wgidataset.xlsx")
wgi.columns = [c.lower().strip() for c in wgi.columns]
wgi = wgi.rename(columns={"countryname": "country"})
wgi["estimate"] = pd.to_numeric(wgi["estimate"], errors="coerce")
wgi_pivot = wgi.pivot_table(
    index=["country", "year"],
    columns="indicator",
    values="estimate",
    aggfunc="mean"
).reset_index()
print("WGI columns:", wgi_pivot.columns)
panel = wb_wide.merge(pwt_panel, on=["country", "year"], how="left") \
               .merge(wgi_pivot, on=["country", "year"], how="left")
print(panel.columns.tolist())
print(panel.head())
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
target = "gdp_pc_growth"
candidate_features = [
    "va", "pv", "ge", "rq", "rl", "cc",
    "pop_growth", "education",
    "trade_openness", "inflation", "FDI", "investment_ratio",
    "rgdpe", "rgdpo", "pop"
]
features = [f for f in candidate_features if f in panel.columns]
print("Using features:", features)
model_df = panel.dropna(subset=[target]).copy()
model_df = model_df.replace('..', np.nan)
for col in features + [target]:
    model_df[col] = pd.to_numeric(model_df[col], errors='coerce')
model_df = model_df.dropna(subset=features + [target])
X = model_df[features]
y = model_df[target]
print(X.dtypes)
print("Any '..' left in X? ->", (X == '..').any().any())
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5         
r2 = r2_score(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
print("\n=== Linear Regression (GDP per capita growth) ===")
print(f"RMSE: {rmse:.3f}")
print(f"R²:   {r2:.3f}")
print(f"MAPE: {mape:.2f}%")
coef_df = pd.DataFrame({
    "feature": features,
    "coefficient": model.coef_
}).sort_values("coefficient", ascending=False)
print("\n=== Coefficients ===")
print(coef_df)
model_df["pred_gdp_pc_growth"] = model.predict(model_df[features])
recent = model_df[model_df["year"] >= 2010]
country_rank = recent.groupby("country")["pred_gdp_pc_growth"] \
                     .mean() \
                     .sort_values(ascending=False)
print("\n=== Predicted fastest-growing countries (since 2010) ===")
print(country_rank.head(10))



  df = pd.read_csv("Merged_WB_PWT_DEMO.csv", encoding="utf-8-sig")
  wb_wide.groupby("country")["gdp_pc"].pct_change() * 100
  pwt_panel.groupby("country")["pop"].pct_change() * 100


WGI columns: Index(['country', 'year', 'cc', 'ge', 'pv', 'rl', 'rq', 'va'], dtype='object', name='indicator')
['country', 'year', 'EG.ELC.ACCS.ZS', 'GC.DOD.TOTL.GD.ZS', 'GC.NLD.TOTL.GD.ZS', 'MS.MIL.XPND.GD.ZS', 'trade_openness', 'NY.ADJ.NNTY.KD.ZG', 'NY.GDP.MKTP.KD.ZG', 'gdp_pc', 'gdp_pc_growth', 'rgdpe', 'rgdpo', 'pop', 'emp', 'avh', 'education', 'ccon', 'cda', 'cgdpe', 'cgdpo', 'cn', 'ck', 'ctfp', 'cwtfp', 'rgdpna', 'rconna', 'rdana', 'rnna', 'rkna', 'rtfpna', 'rwtfpna', 'labsh', 'irr', 'delta', 'xr', 'pl_con', 'pl_da', 'pl_gdpo', 'i_cig', 'i_xm', 'i_xr', 'i_outlier', 'i_irr', 'cor_exp', 'csh_c', 'csh_i', 'csh_g', 'csh_x', 'csh_m', 'csh_r', 'pl_c', 'pl_i', 'pl_g', 'pl_x', 'pl_m', 'pl_n', 'pl_k', 'pop_growth', 'cc', 'ge', 'pv', 'rl', 'rq', 'va']
   country  year EG.ELC.ACCS.ZS GC.DOD.TOTL.GD.ZS GC.NLD.TOTL.GD.ZS  \
0  Albania  1980            NaN               NaN               NaN   
1  Albania  1981            NaN               NaN               NaN   
2  Albania  1982            Na