In [None]:
import numpy as np

# Impute missing scores from their immediate neighbours.
# Work on a positionally indexed copy so label `i` and position `i` coincide
# and the source frame `df_rank` is left untouched.
df_rank_clean = df_rank.reset_index(drop=True).copy()
missing_idx = np.flatnonzero(df_rank_clean['score'].isna())
n = len(df_rank_clean)

for i in missing_idx:
    # Neighbours outside the frame count as missing. Rows are filled in
    # ascending order, so a value filled earlier in this loop can serve as
    # the left neighbour of the next missing row.
    left = df_rank_clean['score'].iat[i - 1] if i > 0 else np.nan
    right = df_rank_clean['score'].iat[i + 1] if i + 1 < n else np.nan
    known = [value for value in (left, right) if not np.isnan(value)]
    if known:
        df_rank_clean.at[i, 'score'] = np.mean(known)
    # When both neighbours are missing the row is left as NaN on purpose:
    # np.nanmean([nan, nan]) would only emit a "Mean of empty slice"
    # RuntimeWarning and still return NaN.

# Fallback for rows whose neighbours were both missing.
df_rank_clean['score'] = df_rank_clean['score'].interpolate(limit_direction='both')

In [None]:
# Load dataset (path is relative to the notebook's working directory)
df = pd.read_csv("qs-world-university-rankings-2017-to-2022-V2.csv")

# Keep only rows that carry every field the aggregation below needs
df_clean = df.dropna(subset=["country", "year", "score"]).copy()

# Convert data types.
# Plain column assignment is deliberate: `df.loc[:, col] = values` tries to
# write into the existing column in place on pandas >= 2.0, which can leave
# the column with its old dtype instead of the converted one.
df_clean["year"] = df_clean["year"].astype(int)
df_clean["score"] = pd.to_numeric(df_clean["score"], errors="coerce")
# Values that could not be parsed as numbers became NaN above; drop them.
df_clean = df_clean.dropna(subset=["score"]).copy()

# Strip spaces in country names so variants of one country group together
df_clean["country"] = df_clean["country"].str.strip()

# Group by year and country, calculate average score
avg_scores = df_clean.groupby(["year", "country"], as_index=False)["score"].mean()

# Top 10 countries per year: order by year, then by score descending, and
# keep the first 10 rows of each year. This avoids `groupby.apply`, whose
# handling of the grouping column is deprecated in recent pandas releases.
top10_by_year = (
    avg_scores.sort_values(["year", "score"], ascending=[True, False])
    .groupby("year")
    .head(10)
    .reset_index(drop=True)
)

# Display results, one table per year in ascending year order
for year, year_rows in top10_by_year.groupby("year"):
    print(f"\n📅 Top 10 Countries in {year}:")
    display(year_rows)

In [None]:
# Number of rows (universities) per (year, country) pair, shown as a flat table.
(
    df_clean
    .groupby(["year", "country"])
    .size()
    .rename("university_count")
    .reset_index()
)

In [None]:
# Count universities per country per year (one row per (year, country) pair)
counts_per_year = (
    df_clean.groupby(["year", "country"])
    .size()
    .reset_index(name="university_count")
)

# Get top 10 countries for each year.
# Sorting once and taking the first 10 rows per year keeps the "year" column
# in the result. The previous `groupby.apply` call operated on the grouping
# column, which is deprecated in recent pandas releases.
# The row labels of `counts_per_year` are preserved, as before.
top10_by_universities = (
    counts_per_year.sort_values(["year", "university_count"], ascending=[True, False])
    .groupby("year")
    .head(10)
)
top10_by_universities

In [None]:
# How many of each country's best-scoring universities feed its yearly average.
# NOTE(review): the variable names below (`df_top5`, `avg_top5_scores`) and the
# earlier comments said "top 5", but the code has always used 10. The runtime
# value is kept at 10 here; confirm which cut-off is intended.
UNIVERSITIES_PER_COUNTRY = 10

# Step 1: Sort original data by year, country, and score descending
df_sorted = df_clean.sort_values(by=["year", "country", "score"], ascending=[True, True, False])

# Step 2: Keep the best-scoring universities per country per year
df_top5 = df_sorted.groupby(["year", "country"]).head(UNIVERSITIES_PER_COUNTRY)

# Step 3: Compute each country's average score over those universities
avg_top5_scores = df_top5.groupby(["year", "country"], as_index=False)["score"].mean()

# Step 4: Get top 10 countries per year.
# Sort + groupby.head replaces `groupby.apply`, whose handling of the grouping
# column is deprecated in recent pandas releases.
# NOTE: this rebinds `top10_by_year`, which an earlier cell also assigns.
top10_by_year = (
    avg_top5_scores.sort_values(["year", "score"], ascending=[True, False])
    .groupby("year")
    .head(10)
    .reset_index(drop=True)
)

# Display results, one table per year in ascending year order
for year, year_rows in top10_by_year.groupby("year"):
    print(f"\n📅 Top 10 Countries in {year}:")
    display(year_rows)

In [None]:
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Single-predictor regression of score on GDP per capita.
# X stays two-dimensional (a one-column frame) because the estimator expects
# a feature matrix; y is the plain target column.
X = filtered_df[["gdp_per_capita"]]
y = filtered_df["score"]

# Fit in one step; `fit` returns the estimator itself.
model = LinearRegression().fit(X, y)

# Pull out the fitted line and its fit quality on the same data it was trained on.
slope, intercept = model.coef_[0], model.intercept_
r_squared = model.score(X, y)

# Report the fitted equation and R², one item per line.
print(
    "📈 Linear Regression Model:",
    f"score = {slope:.4f} * gdp_per_capita + {intercept:.2f}",
    f"R² (explained variance): {r_squared:.4f}",
    sep="\n",
)

In [None]:
# Observed points plus the fitted line, drawn on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=filtered_df, x="gdp_per_capita", y="score", label="Data", ax=ax)
ax.plot(X, model.predict(X), color="red", label="Regression Line")

# Titles and labels so the figure stands on its own.
ax.set_title("GDP per Capita vs. University Score (Top Countries)")
ax.set_xlabel("GDP per Capita (USD)")
ax.set_ylabel("Average University Score")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Predictors for the multiple regression; GDP enters on a log scale.
log_features = [
    "log_gdp_per_capita",
    "gov_exp_pct_gdp",
    "lit_rate_adult_pct",
    "school_enrol_primary_pct",
    "school_enrol_secondary_pct",
    "school_enrol_tertiary_pct",
]

# Keep only rows where every predictor and the target are present.
log_model_data = merged_df.dropna(subset=[*log_features, "score"])
X_log, y_log = log_model_data[log_features], log_model_data["score"]

# Fit, then predict on the training rows themselves (in-sample fit).
log_model = LinearRegression().fit(X_log, y_log)
y_log_pred = log_model.predict(X_log)

# Report one coefficient per predictor, then the intercept and in-sample R².
print("📊 Multiple Regression with log(GDP):")
for feature_name, weight in zip(log_features, log_model.coef_):
    print(f"{feature_name}: {weight:.4f}")

print(
    f"\nIntercept: {log_model.intercept_:.2f}",
    f"R² (explained variance): {r2_score(y_log, y_log_pred):.4f}",
    sep="\n",
)

In [None]:
import numpy as np
import pandas as pd

# Step 1: Define the features used in the log-GDP regression model.
# NOTE(review): this list is expected to match, in name and order, the columns
# `log_model` was fitted on in an earlier cell — confirm against that cell.
features = [
    "log_gdp_per_capita",
    "gov_exp_pct_gdp",
    "lit_rate_adult_pct",
    "school_enrol_primary_pct",
    "school_enrol_secondary_pct",
    "school_enrol_tertiary_pct"
]

# Step 2: Create a minimal input row for 2024.
# Every value except GDP is a hand-picked placeholder; replace these with
# averages or reasonable estimates before relying on the prediction.
# `forecasted_gdp` comes from an earlier cell.
input_row_2024 = pd.DataFrame([{
    "log_gdp_per_capita": np.log(forecasted_gdp),
    "gov_exp_pct_gdp": 5.0,                  # average: ~5% of GDP
    "lit_rate_adult_pct": 99.0,              # high literacy for US
    "school_enrol_primary_pct": 102.0,       # slightly over 100% (common due to repeaters)
    "school_enrol_secondary_pct": 95.0,
    "school_enrol_tertiary_pct": 70.0
}])[features]
# Selecting through `features` fixes the column order explicitly and raises a
# KeyError if a feature is missing or misspelled in the row above, instead of
# relying on the dict literal's key order.

# Step 3: Predict score (a single row in, so take the single prediction out)
predicted_score_2024 = log_model.predict(input_row_2024)[0]
print(f"🎓 Predicted university score for United States in 2024 (GDP-based): {predicted_score_2024:.2f}")
