In [7]:
import time
import nbimporter
import os
import pandas as pd
from scraper import scrape_nirf_data
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from merge_parameter_scores import merge_csvs

# Ranking editions covered by this notebook: 2017 through 2025 inclusive.
years = [*range(2017, 2026)]

# Per-year frames appended by the 2019+ processing loop further down.
all_data = []

# Placeholders; each is replaced by a DataFrame of unmatched rows once the
# corresponding 2017 / 2018 mapping step has run.
institutes_2018_not_in_others, institutes_2017_not_in_others = [], []

# Per-year CSVs and the combined CSV are all written under this directory.
os.makedirs("csv_data", exist_ok=True)

# ---------------------------------------------------------------------------
# 2017 edition: scrape, tag metric columns with the year, then try to replace
# each 2017 Institute ID with the ID the same institute carries in a later
# edition (matched by fuzzy name comparison). Result is written to
# csv_data/nirf_data_2017.csv and kept in `mapped_df_2017` (None on failure).
# ---------------------------------------------------------------------------
print("\nProcessing and mapping data for 2017...")
df_2017 = scrape_nirf_data(2017)
mapped_df_2017 = None
if df_2017 is not None:
    if "Institute ID" not in df_2017.columns:
        print("Skipping 2017: 'Institute ID' column missing")
    else:
        # Keep the identifiers plus whichever metric columns this edition has,
        # suffixing every metric with the year so editions can sit side by side.
        base_cols_2017 = ["Institute ID", "Name"]
        metric_cols = ["Rank", "Score", "TLR (100)", "RPC (100)", "GO (100)", "OI (100)", "PERCEPTION (100)"]
        existing_metrics_2017 = [col for col in metric_cols if col in df_2017.columns]
        df_2017_processed = df_2017[base_cols_2017 + existing_metrics_2017].copy()
        rename_map_2017 = {col: f"{col}_2017" for col in existing_metrics_2017}
        df_2017_processed = df_2017_processed.rename(columns=rename_map_2017)

        # Name -> Institute ID lookup over every later edition. Years are
        # visited in ascending order, so a name seen in several editions ends
        # up with the ID from the most recent one.
        institute_names_later_years = {}
        for year in range(2018, 2026):
            temp_df = scrape_nirf_data(year)
            if temp_df is None or not {"Name", "Institute ID"}.issubset(temp_df.columns):
                continue
            name_id_map = (
                temp_df[["Name", "Institute ID"]]
                .drop_duplicates(subset=["Name"])
                .set_index("Name")["Institute ID"]
                .to_dict()
            )
            institute_names_later_years.update(name_id_map)

        # Build the candidate list once instead of on every row; non-string
        # keys (e.g. NaN names) are not meaningful fuzzy-match targets.
        candidate_names = [name for name in institute_names_later_years if isinstance(name, str)]

        mapped_ids_2017 = {}
        unmapped_2017 = []
        for _, row in df_2017_processed.iterrows():
            name = row["Name"]
            # extractOne returns None when it has nothing to match against, so
            # the result must not be tuple-unpacked unconditionally (that raised
            # TypeError whenever no later edition could be scraped).
            match = None
            if candidate_names and isinstance(name, str):
                match = process.extractOne(name, candidate_names, scorer=fuzz.ratio)

            if match is not None and match[0] and match[1] > 85:
                mapped_ids_2017[row["Institute ID"]] = institute_names_later_years[match[0]]
            else:
                unmapped_2017.append(row.to_dict())

        # Rows without a confident match keep their original 2017 ID.
        df_2017_processed["Consistent Institute ID"] = (
            df_2017_processed["Institute ID"].map(mapped_ids_2017).fillna(df_2017_processed["Institute ID"])
        )
        mapped_df_2017 = df_2017_processed.drop(columns=["Institute ID"], errors="ignore").rename(
            columns={"Consistent Institute ID": "Institute ID"}
        )
        mapped_df_2017.to_csv("csv_data/nirf_data_2017.csv", index=False)
        print("Saved and mapped: nirf_data_2017.")

        # An empty list already yields an empty DataFrame, so no branch is needed.
        institutes_2017_not_in_others = pd.DataFrame(unmapped_2017)

else:
    print("Could not retrieve 2017 data.")


# ---------------------------------------------------------------------------
# 2018 edition: scrape, tag metric columns with the year, then try to replace
# each 2018 Institute ID with the ID the same institute carries in a later
# edition (matched by fuzzy name comparison). Result is written to
# csv_data/nirf_data_2018.csv and kept in `mapped_df_2018` (None on failure).
# ---------------------------------------------------------------------------
print("\nProcessing and mapping data for 2018...")
df_2018 = scrape_nirf_data(2018)
mapped_df_2018 = None
if df_2018 is not None:
    if "Institute ID" not in df_2018.columns:
        print("Skipping 2018: 'Institute ID' column missing")
    else:
        # Keep the identifiers plus whichever metric columns this edition has,
        # suffixing every metric with the year so editions can sit side by side.
        base_cols_2018 = ["Institute ID", "Name"]
        metric_cols = ["Rank", "Score", "TLR (100)", "RPC (100)", "GO (100)", "OI (100)", "PERCEPTION (100)"]
        existing_metrics_2018 = [col for col in metric_cols if col in df_2018.columns]
        df_2018_processed = df_2018[base_cols_2018 + existing_metrics_2018].copy()
        rename_map_2018 = {col: f"{col}_2018" for col in existing_metrics_2018}
        df_2018_processed = df_2018_processed.rename(columns=rename_map_2018)

        # Name -> Institute ID lookup over every later edition. Years are
        # visited in ascending order, so a name seen in several editions ends
        # up with the ID from the most recent one.
        institute_names_later_years = {}
        for year in range(2019, 2026):
            temp_df = scrape_nirf_data(year)
            if temp_df is None or not {"Name", "Institute ID"}.issubset(temp_df.columns):
                continue
            name_id_map = (
                temp_df[["Name", "Institute ID"]]
                .drop_duplicates(subset=["Name"])
                .set_index("Name")["Institute ID"]
                .to_dict()
            )
            institute_names_later_years.update(name_id_map)

        # Build the candidate list once instead of on every row; non-string
        # keys (e.g. NaN names) are not meaningful fuzzy-match targets.
        candidate_names = [name for name in institute_names_later_years if isinstance(name, str)]

        mapped_ids_2018 = {}
        unmapped_2018 = []
        for _, row in df_2018_processed.iterrows():
            name = row["Name"]
            # extractOne returns None when it has nothing to match against, so
            # the result must not be tuple-unpacked unconditionally (that raised
            # TypeError whenever no later edition could be scraped).
            match = None
            if candidate_names and isinstance(name, str):
                match = process.extractOne(name, candidate_names, scorer=fuzz.ratio)

            if match is not None and match[0] and match[1] > 85:
                mapped_ids_2018[row["Institute ID"]] = institute_names_later_years[match[0]]
            else:
                unmapped_2018.append(row.to_dict())

        # Rows without a confident match keep their original 2018 ID.
        df_2018_processed["Consistent Institute ID"] = (
            df_2018_processed["Institute ID"].map(mapped_ids_2018).fillna(df_2018_processed["Institute ID"])
        )
        mapped_df_2018 = df_2018_processed.drop(columns=["Institute ID"], errors="ignore").rename(
            columns={"Consistent Institute ID": "Institute ID"}
        )
        mapped_df_2018.to_csv("csv_data/nirf_data_2018.csv", index=False)
        print("Saved and mapped: nirf_data_2018.")

        # An empty list already yields an empty DataFrame, so no branch is needed.
        institutes_2018_not_in_others = pd.DataFrame(unmapped_2018)

else:
    print("Could not retrieve 2018 data.")


# ---------------------------------------------------------------------------
# 2019 onwards: use the cached per-year CSV when it exists, otherwise scrape.
# Metric columns are suffixed with the year, the frame is appended to
# `all_data`, and freshly scraped data is written back to the cache.
# ---------------------------------------------------------------------------
for year in range(2019, 2026):
    print(f"\nProcessing data for {year}...")
    try:
        csv_path = f"csv_data/nirf_data_{year}.csv"
        base_cols = ["Institute ID", "Name"]
        metric_cols = ["Rank", "Score", "TLR (100)", "RPC (100)", "GO (100)", "OI (100)", "PERCEPTION (100)"]

        from_cache = os.path.exists(csv_path)
        if from_cache:
            print(f"📁 Cached file found: {csv_path}")
            df = pd.read_csv(csv_path)
        else:
            df = scrape_nirf_data(year)
            if df is None:
                print(f"Could not retrieve {year} data.")
                continue

        if "Institute ID" not in df.columns:
            print(f"Skipping {year}: 'Institute ID' column missing")
            continue

        # For each metric pick the column that actually exists: the raw name
        # (to be renamed) or the already year-suffixed name. Selecting by raw
        # name alone raised KeyError on a cache file that mixed both forms.
        keep_cols = []
        rename_map = {}
        for col in metric_cols:
            suffixed = f"{col}_{year}"
            if col in df.columns:
                keep_cols.append(col)
                rename_map[col] = suffixed
            elif suffixed in df.columns:
                keep_cols.append(suffixed)

        # Scraped frames are always trimmed to the known columns; a cached
        # frame is only touched when it still carries un-suffixed metrics.
        if not from_cache or rename_map:
            df = df[base_cols + keep_cols].rename(columns=rename_map)

        all_data.append(df)

        if not from_cache:
            df.to_csv(csv_path, index=False)
            print(f"Saved: nirf_data_{year}.")
            # Pause between live scrapes.
            time.sleep(1)

    except Exception as e:
        # Per-year boundary: report the failure and carry on with the next year.
        print(f"Error in {year}: {e}")


# ---------------------------------------------------------------------------
# Merge the in-memory yearly frames (2017 as the base) into one wide table
# keyed on "Institute ID", write it as CSV and JSON, then call merge_csvs().
# ---------------------------------------------------------------------------
if all_data and mapped_df_2017 is not None:
    combined_df = mapped_df_2017.copy()
    # mapped_df_2018 stays None when the 2018 step failed; drop it here so the
    # loops below never call `.columns` on None.
    all_data_to_merge = [df for df in [mapped_df_2018] + all_data if df is not None]

    for df in all_data_to_merge:
        # A merge on "Institute ID" needs the key on both sides; without it
        # pd.merge raises KeyError, so warn and skip that frame instead.
        if "Institute ID" not in combined_df.columns or "Institute ID" not in df.columns:
            print("Warning: 'Institute ID' column not found in one of the DataFrames during merge.")
            continue
        combined_df = pd.merge(
            combined_df,
            df.drop(columns=["Name"], errors="ignore"),
            on="Institute ID",
            how="outer",
            suffixes=("", "_y"),
        )
        combined_df = combined_df.loc[:, ~combined_df.columns.duplicated(keep="first")]

    # Frames are in chronological order, so iterating forward lets the most
    # recent edition's name overwrite older ones (the previous reversed()
    # iteration left the *oldest* name in place despite the variable's name).
    all_data_with_2017 = [mapped_df_2017] + all_data_to_merge
    latest_names = {}
    for df in all_data_with_2017:
        if "Institute ID" in df.columns and "Name" in df.columns:
            name_map = df.set_index("Institute ID")["Name"].dropna().to_dict()
            latest_names.update(name_map)

    combined_df["Name"] = combined_df["Institute ID"].map(latest_names)

    # Column layout: identifiers, then each metric's yearly columns grouped
    # together in sorted order, then anything left over.
    fixed_cols = ["Institute ID", "Name"]
    ordered_cols = []
    metric_cols_all = ["Rank", "Score", "TLR (100)", "RPC (100)", "GO (100)", "OI (100)", "PERCEPTION (100)"]
    for metric in metric_cols_all:
        yearly_cols = sorted(col for col in combined_df.columns if col.startswith(metric + "_"))
        ordered_cols.extend(yearly_cols)

    already_placed = set(fixed_cols) | set(ordered_cols)
    remaining_cols = [col for col in combined_df.columns if col not in already_placed]
    # "Name" is already the second column after this selection.
    combined_df = combined_df[fixed_cols + ordered_cols + remaining_cols]

    combined_df.to_csv("csv_data/nirf_combined_data.csv", index=False)
    combined_df.to_json("nirf_combined_data.json", orient="records", indent=2)
    print("\nAll data saved to nirf_combined_data.")
    merge_csvs()

else:
    print("\nNo data extracted for merging.")


# ---------------------------------------------------------------------------
# Reload every per-year CSV from disk. Collects the frames in
# `all_years_data` (oldest first) and builds `name_map_latest`, an
# Institute ID -> Name lookup in which later years overwrite earlier ones.
# ---------------------------------------------------------------------------
print("\n🔗 Combining all years...")
csv_dir = "csv_data"
all_years_data = []
name_map_latest = {}

for year in range(2017, 2026):
    csv_file = os.path.join(csv_dir, f"nirf_data_{year}.csv")
    if not os.path.exists(csv_file):
        print(f"❌ Missing: {csv_file}")
        continue

    df = pd.read_csv(csv_file)

    # The downstream merge joins on "Institute ID"; a frame without that
    # column would make it raise KeyError, so leave such a file out.
    if "Institute ID" not in df.columns:
        print(f"Skipping {year}: 'Institute ID' column missing")
        continue

    if "Name" in df.columns:
        latest_name_map = df.set_index("Institute ID")["Name"].dropna().to_dict()
        name_map_latest.update(latest_name_map)

    all_years_data.append(df)


def merge_yearwise_data(dfs) -> pd.DataFrame:
    """Outer-join a sequence of per-year frames on ``"Institute ID"``.

    The first frame is taken as-is (including its ``"Name"`` column, if any).
    Every following frame has its ``"Name"`` column dropped before being
    merged, so the result carries at most one ``"Name"`` column.

    Args:
        dfs: Sequence of DataFrames, each containing an ``"Institute ID"``
            column.

    Returns:
        The merged DataFrame. For a single-element input this is that same
        frame object (no copy is made). For an empty input an empty frame
        with only an ``"Institute ID"`` column is returned instead of
        raising ``IndexError``.
    """
    if not dfs:
        return pd.DataFrame(columns=["Institute ID"])

    frames = iter(dfs)
    combined = next(frames)
    for frame in frames:
        # Names are re-attached by the caller from a separate lookup, so only
        # the first frame's copy is carried through the merge.
        frame = frame.drop(columns=["Name"], errors="ignore")
        combined = pd.merge(combined, frame, on="Institute ID", how="outer")
    return combined


# ---------------------------------------------------------------------------
# Merge the reloaded yearly frames, order the columns (identifiers, headline
# metrics, then parameter columns in the order of a sample
# parameter_scores.csv, then everything else) and write CSV + JSON.
# ---------------------------------------------------------------------------
if all_years_data:
    import re
    from collections import defaultdict

    combined_df = merge_yearwise_data(all_years_data)
    combined_df["Name"] = combined_df["Institute ID"].map(name_map_latest)

    fixed_cols = ["Institute ID", "Name"]
    rest_cols = [col for col in combined_df.columns if col not in fixed_cols]

    # Group "<metric>_<yyyy>" columns by their metric prefix; columns without
    # a four-digit year suffix are collected under "Other".
    year_suffix = re.compile(r"(.+)_\d{4}$")
    col_groups = defaultdict(list)
    for col in rest_cols:
        match = year_suffix.match(col)
        key = match.group(1) if match else "Other"
        col_groups[key].append(col)

    ordered_cols = []
    for metric in ["Rank", "Score", "TLR (100)", "RPC (100)", "GO (100)", "OI (100)", "PERCEPTION (100)"]:
        if metric in col_groups:
            ordered_cols.extend(sorted(col_groups.pop(metric)))

    # Find one parameter_scores.csv, newest output folder first, to borrow its
    # column order. Directory entries are sorted because os.listdir returns
    # them in arbitrary order, which made the chosen sample (and therefore
    # the output column order) vary between runs.
    sample_param_path = None
    for y in range(2025, 2018, -1):
        output_folder = f"output{y}"
        if not os.path.isdir(output_folder):
            continue
        for college in sorted(os.listdir(output_folder)):
            param_file = os.path.join(output_folder, college, "parameter_scores.csv")
            if os.path.isfile(param_file):
                sample_param_path = param_file
                break
        if sample_param_path:
            break

    if sample_param_path:
        # Only the header is needed, so skip parsing the data rows.
        param_df = pd.read_csv(sample_param_path, nrows=0)
        parameter_order = param_df.columns.tolist()
    else:
        parameter_order = []

    for param in parameter_order:
        if param in col_groups:
            ordered_cols.extend(sorted(col_groups.pop(param)))

    # Whatever was not claimed above keeps its group's insertion order.
    for cols in col_groups.values():
        ordered_cols.extend(sorted(cols))

    combined_df = combined_df[fixed_cols + ordered_cols]

    combined_df.to_csv(os.path.join(csv_dir, "nirf_combined_data.csv"), index=False)
    combined_df.to_json("nirf_combined_data.json", orient="records", indent=2)

    print("✅ Combined data saved to:")
    print("  → csv_data/nirf_combined_data.csv")
    print("  → nirf_combined_data.json")

else:
    print("⚠️ No yearly data to combine.")


ModuleNotFoundError: No module named 'scraper'

In [4]:
!pip  install nbimporter

Defaulting to user installation because normal site-packages is not writeable
