In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [26]:
# Read the QS ranking CSV (path is relative to the notebook's working directory) into a DataFrame.
df_rank_1=pd.read_csv("QS World University Rankings 2019.csv")

In [32]:
# Display the dtype pandas inferred for every column of the ranking frame.
df_rank_1.dtypes

year              int64
rank_display     object
university       object
score           float64
link             object
country          object
city             object
region           object
logo             object
dtype: object

In [35]:
# Count missing values per column of the ranking frame.
df_rank_1.isna().sum()

year              0
rank_display     20
university        0
score           515
link              0
country           0
city             25
region            0
logo              0
dtype: int64

In [None]:
# Rows that repeat an earlier row in every column, collected separately per dataset.
# NOTE(review): `df_rank_uni` and `df_gdp` are not created in the cells visible
# before this one — confirm they are defined before this cell runs.
duplicate_rank = df_rank_uni.loc[df_rank_uni.duplicated()]
duplicate_gdp = df_gdp.loc[df_gdp.duplicated()]

# Heading and frame are separated by a newline, i.e. the same text two
# consecutive print calls would produce.
print("\nDuplicate rows in Ranking Dataset:", duplicate_rank, sep="\n")
print("\n Duplicate rows in GDP Dataset:", duplicate_gdp, sep="\n")

In [None]:
# Report how many rows in each dataset are exact repeats of an earlier row.
# NOTE(review): `df_rank` and `df_gdp` are not defined in the cells visible
# before this one — confirm the intended frame names.
print("Duplicate Rows in Ranking Dataset:", df_rank.duplicated().sum())
print("Duplicate Rows in GDP Dataset:", df_gdp.duplicated().sum())

In [54]:
# Impute missing scores from their immediate row neighbours.
#
# Neighbour values are taken from the ORIGINAL column (via shift), never from
# values imputed a moment earlier. Reading from the frame while writing to it
# made each imputed value feed the next one, so a run of consecutive gaps was
# effectively forward-filled and never reached the interpolation step below.
df_rank_clean = df_rank_1.reset_index(drop=True).copy()
missing_idx = np.where(df_rank_clean['score'].isna())[0]  # positions that were imputed
n = len(df_rank_clean)

# Row-wise mean of the previous and next original score. The default skipna
# keeps a lone valid neighbour, and a row with no valid neighbour stays NaN
# (no "mean of empty slice" warning).
neighbour_mean = pd.concat(
    [df_rank_clean['score'].shift(1), df_rank_clean['score'].shift(-1)],
    axis=1,
).mean(axis=1)
df_rank_clean['score'] = df_rank_clean['score'].fillna(neighbour_mean)

# Gaps whose two neighbours were both missing: interpolate linearly between the
# nearest known scores, extending to the first/last rows as well.
df_rank_clean['score'] = df_rank_clean['score'].interpolate(limit_direction='both')

In [56]:
# Write the cleaned ranking frame to CSV without the index column.
# NOTE(review): `df_rank_clean_2` is not defined in the cells visible before
# this one (only `df_rank_clean` is) — confirm which frame should be saved.
df_rank_clean_2.to_csv("df_rank_clean_new_2.csv", index = False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_top10_all_years(df_clean):
    """Plot, for every year, the ten countries with the highest mean score.

    One horizontal bar chart is drawn per year, stacked vertically in a
    single figure that is displayed with ``plt.show()``. Nothing is returned.

    Parameters
    ----------
    df_clean : pandas.DataFrame
        Must contain the columns ``country``, ``year`` and ``score``.

    Raises
    ------
    ValueError
        If there are no rows, or no (country, year) groups, to plot.
    """
    # Fail early with a clear message instead of letting the subplot grid be
    # requested with zero rows.
    if df_clean.empty:
        raise ValueError("df_clean has no rows; nothing to plot")

    # Mean score for each (country, year) pair
    avg_score = (
        df_clean.groupby(["country", "year"], as_index=False)["score"]
        .mean()
        .rename(columns={"score": "avg_score"})
    )

    # Within each year keep the ten highest averages (rows are pre-sorted so
    # head(10) picks the top ones)
    top10_per_year = (
        avg_score.sort_values(["year", "avg_score"], ascending=[True, False])
        .groupby("year")
        .head(10)
    )

    years = sorted(top10_per_year["year"].unique())
    n_years = len(years)
    if n_years == 0:
        raise ValueError("df_clean has no usable (country, year) rows; nothing to plot")

    # squeeze=False always returns a 2-D array of axes, so a single year needs
    # no special case.
    fig, axes = plt.subplots(
        nrows=n_years,
        ncols=1,
        figsize=(10, 4 * n_years),
        constrained_layout=True,
        squeeze=False,
    )

    # One panel per year
    for ax, year in zip(axes.ravel(), years):
        data = top10_per_year[top10_per_year["year"] == year]
        # NOTE(review): newer seaborn releases warn when `palette` is passed
        # without `hue` — confirm against the installed version.
        sns.barplot(
            data=data,
            x="avg_score",
            y="country",
            palette="viridis",
            ax=ax
        )
        ax.set_title(f"Top 10 Countries by Average Score ({year})", fontsize=14)
        ax.set_xlabel("Average Score")
        ax.set_ylabel("Country")

    plt.show()

In [None]:
# Render the per-year top-10 charts.
# NOTE(review): `df_clean` is not defined in the cells visible before this one —
# confirm which cleaned frame is meant.
plot_top10_all_years(df_clean)

In [34]:
def tidy_university_dataset(df_clean: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse university-level rows to one row per (year, country).

    Output columns:
      - year, country: the grouping keys
      - total_score: sum of 'score' within the group
      - num_universities: number of distinct 'university' values in the group

    The raw 'university' and 'score' columns are removed from the result
    if they are present.
    """
    per_country_year = df_clean.groupby(["year", "country"], as_index=False)
    summary = per_country_year.agg(
        total_score=pd.NamedAgg(column="score", aggfunc="sum"),
        num_universities=pd.NamedAgg(column="university", aggfunc="nunique"),
    )

    # Only drop raw columns that actually exist (normally none do after aggregation).
    leftover_raw = [name for name in ("university", "score") if name in summary.columns]
    return summary.drop(columns=leftover_raw)

Forecasting GDP and ARIMAX Modeling
Forecast GDP for 2023 using ARIMA

Use GDP as the exogenous variable. Target: the 2023 university score and/or the number of universities.

Evaluate the forecast against the actual 2023 data, and compare it with an ARIMA model that does not use GDP.

Document how you chose lag values and whether GDP improves forecasts.

In [None]:
# Narrow every source frame to its join keys (country, year) plus the
# indicator column(s) it contributes.
gdp_long = gdp_long.loc[:, ["country", "year", "gdp_per_capita"]]
edu_long = edu_long.loc[:, ["country", "year", "gov_exp_edu"]]
gov_eff_long = gov_eff_long.loc[:, ["country", "year", "gov_effectiveness"]]
rd_exp_long = rd_exp_long.loc[:, ["country", "year", "rd_exp_gdp"]]
merged_df_gov = merged_df_gov.loc[
    :, ["country", "year", "gov_effectiveness", "total_score"]
]

In [None]:
import pandas as pd
from functools import reduce

# --- 0) Minimal sanity cleaning for each source ---
def prep(df, cols):
    """Return a cleaned copy of ``df`` restricted to ``cols``.

    - ``country`` is cast to ``str`` and stripped of surrounding whitespace.
    - ``year`` is parsed as a number (unparseable values become missing) and
      stored as nullable ``Int64``.
    - Only the first row for each (country, year) pair is kept.

    The input frame is not modified.
    """
    return (
        df[cols]
        .assign(
            country=lambda frame: frame["country"].astype(str).str.strip(),
            year=lambda frame: pd.to_numeric(frame["year"], errors="coerce").astype("Int64"),
        )
        .drop_duplicates(subset=["country", "year"])
    )

# Clean every source the same way: keep keys + indicator, normalise the key
# columns, one row per (country, year).
gdp_long      = prep(gdp_long,      ["country", "year", "gdp_per_capita"])
edu_long      = prep(edu_long,      ["country", "year", "gov_exp_edu"])
gov_eff_long  = prep(gov_eff_long,  ["country", "year", "gov_effectiveness"])
rd_exp_long   = prep(rd_exp_long,   ["country", "year", "rd_exp_gdp"])
merged_df_gov = prep(merged_df_gov, ["country", "year", "gov_effectiveness", "total_score"])

# The score table carries its own copy of gov_effectiveness. Rename that copy
# so the dedicated gov_eff_long source can supply the authoritative column.
# The renamed frame gets a NEW name: overwriting merged_df_gov would remove its
# "gov_effectiveness" column, and re-running this cell would then fail with a
# KeyError in the prep(merged_df_gov, ...) call above.
score_base = merged_df_gov.rename(columns={"gov_effectiveness": "gov_effectiveness_from_score"})

# --- 1) Merge all indicators onto the score base (left join preserves your scoring rows) ---
dfs_to_merge = [
    score_base,
    gdp_long,
    edu_long,
    gov_eff_long,   # authoritative gov_effectiveness
    rd_exp_long
]

def left_merge(a, b):
    """Left-join ``b`` onto ``a`` on (country, year), rejecting ambiguous inputs.

    Parameters
    ----------
    a, b : pandas.DataFrame
        Both must contain ``country`` and ``year``; each (country, year) pair
        must be unique within each frame.

    Returns
    -------
    pandas.DataFrame
        Every row of ``a`` with the columns of ``b`` attached.

    Raises
    ------
    ValueError
        If the frames share a non-key column name (the merge would otherwise
        produce ``_x``/``_y`` copies).
    pandas.errors.MergeError
        If the keys are not unique on both sides (``validate="one_to_one"``).
    """
    keys = ["country", "year"]
    # Check for clashes on the inputs rather than scanning the result for
    # "_x"/"_y" endings: that scan also rejected legitimate columns whose own
    # name happens to end that way.
    dupes = [col for col in b.columns if col in a.columns and col not in keys]
    if dupes:
        raise ValueError(f"Unexpected duplicate columns in merge inputs: {dupes}")
    return a.merge(b, on=keys, how="left", validate="one_to_one")

# Fold the list of frames into one table, left-joining each onto the score base.
full_df = reduce(left_merge, dfs_to_merge)

# gov_effectiveness: the dedicated source wins; the copy carried by the score
# table only fills its gaps (or stands in entirely if the column is absent).
if "gov_effectiveness" not in full_df.columns:
    full_df["gov_effectiveness"] = full_df["gov_effectiveness_from_score"]
else:
    full_df["gov_effectiveness"] = full_df["gov_effectiveness"].fillna(full_df["gov_effectiveness_from_score"])

# The helper column has served its purpose
full_df = full_df.drop(columns=["gov_effectiveness_from_score"], errors="ignore")

# --- 2) Keep only the 2017-2023 modeling window and store year as a plain int ---
full_df = full_df[full_df["year"].ge(2017) & full_df["year"].le(2023)].copy()
full_df["year"] = full_df["year"].astype(int)

# --- 3) Collapse any remaining repeated (country, year) rows ---
if full_df.duplicated(subset=["country", "year"]).any():
    agg_map = dict(
        total_score="sum",            # country-year total from top 300
        gdp_per_capita="mean",
        gov_exp_edu="mean",
        gov_effectiveness="mean",
        rd_exp_gdp="mean",
    )
    full_df = full_df.groupby(["country", "year"], as_index=False).agg(agg_map)

# --- 4) Per-year count of complete rows vs rows missing any modeling column ---
needed_cols = ["total_score", "gdp_per_capita", "gov_exp_edu", "gov_effectiveness", "rd_exp_gdp"]
missing_summary = (
    full_df.assign(missing_any=full_df[needed_cols].isna().any(axis=1))
    .groupby("year")["missing_any"]
    .value_counts(dropna=False)
    .unstack(fill_value=0)
    .rename(columns={True: "rows_with_NA", False: "rows_complete"})
)
print("Completeness by year (rows):\n", missing_summary)

# --- 5) Train (2017-2022) and test (2023) frames, complete rows only ---
train_2017_2022 = full_df[full_df["year"].between(2017, 2022)].dropna(subset=needed_cols).copy()
test_2023 = full_df[full_df["year"].eq(2023)].dropna(subset=needed_cols).copy()

print("\nTrain rows:", len(train_2017_2022), " | Test 2023 rows:", len(test_2023))
print("\nColumns:", list(full_df.columns))
full_df.head()

In [None]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# 0) Merge target + GDP
# 0) Merge target + GDP
panel = (
    merged_df_gov[["country","year","total_score"]]
    .merge(gdp_long[["country","year","gdp_per_capita"]], on=["country","year"], how="left")
    .copy()
)

# Hygiene
panel["country"] = panel["country"].astype(str).str.strip()
panel["year"] = panel["year"].astype(int)
panel["gdp_per_capita"] = pd.to_numeric(panel["gdp_per_capita"], errors="coerce")

# Use log GDP (often more linear). Non-positive GDP is masked to NaN first so
# the log yields NaN (dropped below) rather than -inf, which dropna would keep.
panel["log_gdp"] = np.log(panel["gdp_per_capita"].where(panel["gdp_per_capita"] > 0))

# 1) Train on 2017–2022
train = panel.query("2017 <= year <= 2022").dropna(subset=["total_score","log_gdp"]).copy()

# 2) Top-10 countries by avg total_score (2017–2022)
top10 = (train.groupby("country")["total_score"].mean()
               .nlargest(10).index.tolist())

# Restrict training to those countries only (optional, keeps focus consistent)
train_top10 = train[train["country"].isin(top10)].copy()

# 3) Fit fixed-effects OLS with a time trend
# total_score ~ country dummies + year + log_gdp
fe = smf.ols("total_score ~ C(country) + year + log_gdp", data=train_top10)\
        .fit(cov_type="cluster", cov_kwds={"groups": train_top10["country"]})
print(fe.summary())

# 4) Build 2023 prediction frame for the same top-10 countries (pull 2023 GDP)
future_2023 = (
    gdp_long.query("year == 2023 and country in @top10")
            [["country","year","gdp_per_capita"]]
            .drop_duplicates()
            .copy()
)
# Give the prediction frame the same plain-int year as the training frame.
# NOTE(review): a nullable Int64 year may be handed to the formula machinery as
# a non-numeric array on some pandas versions — confirm for the installed one.
future_2023["year"] = future_2023["year"].astype(int)
future_2023["gdp_per_capita"] = pd.to_numeric(future_2023["gdp_per_capita"], errors="coerce")
future_2023["log_gdp"] = np.log(future_2023["gdp_per_capita"].where(future_2023["gdp_per_capita"] > 0))

# Countries without a usable 2023 GDP cannot be predicted; drop them here.
future_2023 = future_2023.dropna(subset=["log_gdp"])

# 5) Predict 2023
future_2023["forecast_2023"] = fe.predict(future_2023)

pred_lr = future_2023[["country","forecast_2023"]].sort_values("country").reset_index(drop=True)
print(pred_lr)

In [None]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# 0) Merge target + GDP
# 0) Merge target + GDP, keeping only rows where both are present
panel = (
    merged_df_gov[["country","year","total_score"]]
    .merge(gdp_long[["country","year","gdp_per_capita"]], on=["country","year"], how="left")
    .dropna(subset=["total_score","gdp_per_capita"])
    .copy()
)

# Store year as a plain int so the formula treats it as a numeric time trend.
# NOTE(review): a nullable Int64 year may reach the formula machinery as a
# non-numeric array on some pandas versions — confirm for the installed one.
panel["year"] = panel["year"].astype(int)
panel["gdp_per_capita"] = pd.to_numeric(panel["gdp_per_capita"], errors="coerce")

# 1) Transform GDP (often helps linearity). Non-positive GDP is masked to NaN
# first so the log never produces -inf.
panel["log_gdp"] = np.log(panel["gdp_per_capita"].where(panel["gdp_per_capita"] > 0))

# 2) Train on 2017–2022
train = panel.query("2017 <= year <= 2022").dropna(subset=["log_gdp"]).copy()

# 3) Fit Fixed-Effects OLS with time trend
fe_model = smf.ols("total_score ~ C(country) + year + log_gdp", data=train).fit(cov_type="cluster", cov_kwds={"groups": train["country"]})
print(fe_model.summary())

# 4) Prepare 2023 prediction rows for top-10 countries by 2017–2022 avg score
top10 = (train.groupby("country")["total_score"].mean().nlargest(10).index.tolist())

# Take 2023 GDP straight from gdp_long. `panel` only holds 2023 rows for
# countries that already have a 2023 total_score, and its NaNs were dropped
# above, so checking it for missing GDP could never detect an absent country.
future_2023 = (
    gdp_long.query("year == 2023 and country in @top10")[["country","year","gdp_per_capita"]]
    .drop_duplicates()
    .copy()
)
future_2023["year"] = future_2023["year"].astype(int)
future_2023["gdp_per_capita"] = pd.to_numeric(future_2023["gdp_per_capita"], errors="coerce")
future_2023["log_gdp"] = np.log(future_2023["gdp_per_capita"].where(future_2023["gdp_per_capita"] > 0))

# Countries without a usable 2023 GDP cannot be predicted; drop them here.
future_2023 = future_2023.dropna(subset=["log_gdp"])

# 5) Predict 2023
future_2023["forecast_2023"] = fe_model.predict(future_2023)

pred_fe = future_2023[["country","forecast_2023"]].sort_values("country").reset_index(drop=True)
print(pred_fe)

In [None]:
import numpy as np
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX

def zscore_train(x):
    """Standardize ``x`` with its own NaN-ignoring mean and population std.

    Returns a tuple ``(standardized, mean, std)``. When the std is zero or
    NaN, 1.0 is used instead so the division is always defined.
    """
    center = float(np.nanmean(x))
    spread = float(np.nanstd(x, ddof=0))
    if np.isnan(spread) or spread == 0:
        spread = 1.0
    standardized = (x - center) / spread
    return standardized, center, spread

def make_year_end_index(years):
    """Return a DatetimeIndex holding 31 December of each year in ``years``.

    Parameters
    ----------
    years : array-like of int
        Calendar years, e.g. a Series such as ``[2017, 2018, ...]``.

    Returns
    -------
    pandas.DatetimeIndex
        One midnight timestamp per input year, dated 31 December. pandas
        attaches a frequency only when it can infer one from the values.
    """
    # Cast to int before str so values such as 2017.0 still produce "2017".
    labels = pd.Series(years).astype(int).astype(str)
    # "Y" is the annual (December-anchored) period alias; the longer "Y-DEC"
    # spelling is rejected by older pandas releases.
    annual = pd.PeriodIndex(labels, freq="Y")
    # how="end" gives the last instant of each year (the default is the first
    # day, i.e. 1 January); normalize() rounds that down to midnight 31 Dec.
    return annual.to_timestamp(how="end").normalize()

pred_rows = []

# Use your df_gdp_exog with the OUTER merge from earlier
# NOTE(review): `df_gdp_exog` is not created in the cells visible before this
# one — confirm it holds country, year, total_score and gdp_per_capita.
top10 = (df_gdp_exog.query("year <= 2022")
         .dropna(subset=["total_score"])
         .groupby("country")["total_score"].mean()
         .nlargest(10).index.tolist())

for c in top10:
    cd = df_gdp_exog[df_gdp_exog["country"] == c].copy()
    # Sort by year so the series handed to the model is chronological whatever
    # the row order of df_gdp_exog.
    train = (
        cd[(cd["year"] >= 2017) & (cd["year"] <= 2022)]
        .dropna(subset=["total_score","gdp_per_capita"])
        .sort_values("year")
    )
    fut  = cd[(cd["year"] == 2023)][["gdp_per_capita"]]

    # Need at least four training years and a known 2023 GDP to forecast.
    if len(train) < 4 or fut.empty or fut.isna().any().any():
        pred_rows.append({"country": c, "forecast_2023": None, "lo80": None, "hi80": None, "order": "", "note": "too little data or missing 2023 GDP"})
        continue

    # Date index for the training years
    idx = make_year_end_index(train["year"])
    y  = pd.Series(train["total_score"].to_numpy(float), index=idx, name="total_score")
    X  = pd.DataFrame({"gdp_per_capita": train["gdp_per_capita"].to_numpy(float)}, index=idx)

    # Standardize endog and exog (fit-time stats)
    y_z, y_mu, y_sd = zscore_train(y.values)
    X_z = X.copy()
    X_z["gdp_per_capita"], x_mu, x_sd = zscore_train(X["gdp_per_capita"].values)

    y_z = pd.Series(y_z, index=idx, name="y_z")
    X_z.index = idx

    # 2023 exog (apply SAME scaling as training). zscore_train already replaces
    # a zero std with 1.0, so no extra guard is needed on x_sd.
    x2023_raw = float(fut["gdp_per_capita"].iloc[0])
    x2023_z   = (x2023_raw - x_mu) / x_sd
    X_2023_z  = pd.DataFrame({"gdp_per_capita":[x2023_z]},
                             index=make_year_end_index(pd.Series([2023], dtype=int)))

    # Try a few robust orders for tiny samples; the first one that fits is used.
    tried_orders = [(0,1,1), (1,0,0), (1,1,0), (0,1,0)]
    fitted = None
    last_err = ""
    for order in tried_orders:
        try:
            model = SARIMAX(
                y_z, exog=X_z, order=order, trend="c",  # include constant
                enforce_stationarity=False, enforce_invertibility=False
            )
            res = model.fit(disp=False, maxiter=1000, method="lbfgs")
            fitted = (order, res)
            break
        except Exception as e:
            # Best effort: remember the failure and move on to the next order.
            last_err = str(e)
            continue

    if fitted is None:
        pred_rows.append({"country": c, "forecast_2023": None, "lo80": None, "hi80": None, "order": "", "note": f"fit failed: {last_err[:80]}..."})
        continue

    order_used, res = fitted

    # One-step forecast on standardized scale
    fc = res.get_forecast(steps=1, exog=X_2023_z)
    mean_z = float(fc.predicted_mean.iloc[0])
    ci_z   = fc.conf_int(alpha=0.2)  # 80% CI
    lo_z   = float(ci_z.iloc[0, 0])
    hi_z   = float(ci_z.iloc[0, 1])

    # Unscale back to original total_score units
    mean = mean_z * y_sd + y_mu
    lo80 = lo_z   * y_sd + y_mu
    hi80 = hi_z   * y_sd + y_mu

    pred_rows.append({
        "country": c,
        "forecast_2023": mean,
        "lo80": lo80,
        "hi80": hi80,
        "order": str(order_used),
        "note": ""
    })

# Name the columns explicitly so the frame still has a "country" column to sort
# on when no country produced a row.
pred_df = (
    pd.DataFrame(pred_rows, columns=["country", "forecast_2023", "lo80", "hi80", "order", "note"])
    .sort_values("country")
    .reset_index(drop=True)
)
print(pred_df)