<a href="https://colab.research.google.com/github/kihagama/hero/blob/main/allmodels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [95]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

# Read the cleaned dataset from disk and wrap it in a DataFrame
data = pd.read_csv("cleaned_data.csv")
df = pd.DataFrame(data)

# Normalise the headers: trim surrounding whitespace, then turn the
# remaining spaces into underscores
df.columns = df.columns.map(lambda name: name.strip().replace(' ', '_'))

# Debug output: the normalised column names followed by the first 5 rows
print("Column names in the dataset (after stripping spaces):", df.columns.tolist(), sep="\n")
print("\nFirst 5 rows of the dataset:", df.head(), sep="\n")

# Target columns the models are expected to predict (names as they appear
# after the header normalisation)
expected_targets = ['Application', 'Admision', 'Academic', 'student_Portal']

# Partition the expected names in a single pass: those present in the
# DataFrame become modelling targets, the rest are reported as missing
available_columns = set(df.columns)
targets = []
missing_cols = []
for col in expected_targets:
    if col in available_columns:
        targets.append(col)
    else:
        missing_cols.append(col)

# Missing columns only produce a warning; having no usable target at all is fatal
if missing_cols:
    print(f"\nWarning: The following expected columns are missing from the dataset: {missing_cols}")
if not targets:
    raise ValueError("No valid target columns found in the dataset. Available columns: " + str(df.columns.tolist()))

# Train-test split
# Hold-out by year: rows with year <= SPLIT_YEAR are used for fitting, rows
# with a later year are used for evaluation. The single feature given to the
# regression models is the year itself.
SPLIT_YEAR = 2019

# Fail early with a readable message instead of a bare KeyError
if 'year' not in df.columns:
    raise ValueError("Column 'year' is required for the train-test split. Available columns: " + str(df.columns.tolist()))

train = df[df['year'] <= SPLIT_YEAR]
test = df[df['year'] > SPLIT_YEAR]

# An empty partition would otherwise only surface later as an obscure error
# from the model fitting / metric code
if train.empty or test.empty:
    raise ValueError(
        f"Train-test split at year {SPLIT_YEAR} produced an empty partition "
        f"(train rows: {len(train)}, test rows: {len(test)})."
    )

# NOTE(review): the time-series models fitted later consume the training rows
# in their stored order - confirm that cleaned_data.csv is sorted by year.
X_train = train[['year']]
X_test = test[['year']]

# Column-oriented accumulator: one list per output column, one entry per
# (model, target) evaluation
results = {column: [] for column in ('Model', 'Target', 'MSE', 'R2')}

# Function to evaluate models
def evaluate_model(y_true, y_pred, model_name, target):
    """Score one set of predictions and append the outcome to ``results``.

    Args:
        y_true: Observed values of the target.
        y_pred: Predicted values, in the same order as ``y_true``.
        model_name: Label stored in the 'Model' column.
        target: Name of the target variable, stored in the 'Target' column.

    Returns:
        None. The module-level ``results`` dictionary is extended in place
        with the model name, the target, the mean squared error and the
        R2 score.
    """
    # Both metrics are computed before anything is stored, so a failing
    # metric call leaves ``results`` untouched
    row = {
        'Model': model_name,
        'Target': target,
        'MSE': mean_squared_error(y_true, y_pred),
        'R2': r2_score(y_true, y_pred),
    }
    for column, value in row.items():
        results[column].append(value)

def _record_failure(model_name, target, error):
    """Report a model that could not be fitted or scored and store NaN metrics.

    Args:
        model_name: Label stored in the 'Model' column.
        target: Name of the target variable, stored in the 'Target' column.
        error: The exception that made the model fail; shown in the warning.

    Returns:
        None. Appends one row with NaN for 'MSE' and 'R2' to ``results`` so
        the failed combination still shows up in the results table.
    """
    print(f"Warning: {model_name} failed for target '{target}': {error!r}")
    results['Model'].append(model_name)
    results['Target'].append(target)
    results['MSE'].append(np.nan)
    results['R2'].append(np.nan)


# Loop through each target variable
for target in targets:
    y_train = train[target]
    y_test = test[target]

    # ARIMA
    # Only Exception is caught (not a bare except) so that KeyboardInterrupt
    # and SystemExit still stop the run, and the cause of a failure is printed
    # instead of being silently replaced by NaN.
    try:
        arima_model = ARIMA(y_train, order=(1, 1, 1)).fit()
        arima_pred = arima_model.forecast(steps=len(test))
        evaluate_model(y_test, arima_pred, 'ARIMA', target)
    except Exception as exc:
        _record_failure('ARIMA', target, exc)

    # Exponential Smoothing (additive trend, no seasonal component)
    try:
        es_model = ExponentialSmoothing(y_train, trend='add', seasonal=None).fit()
        es_pred = es_model.forecast(len(test))
        evaluate_model(y_test, es_pred, 'Exponential Smoothing', target)
    except Exception as exc:
        _record_failure('Exponential Smoothing', target, exc)

    # The regression models below use X_train / X_test (the year) as the
    # only feature; their errors are not caught and abort the run.

    # Linear Regression
    lr_model = LinearRegression().fit(X_train, y_train)
    lr_pred = lr_model.predict(X_test)
    evaluate_model(y_test, lr_pred, 'Linear Regression', target)

    # Decision Tree
    dt_model = DecisionTreeRegressor().fit(X_train, y_train)
    dt_pred = dt_model.predict(X_test)
    evaluate_model(y_test, dt_pred, 'Decision Tree', target)

    # XGBoost
    xgb_model = XGBRegressor().fit(X_train, y_train)
    xgb_pred = xgb_model.predict(X_test)
    evaluate_model(y_test, xgb_pred, 'XGBoost', target)

# Turn the accumulated metric lists into a table with one row per
# (model, target) evaluation
results_df = pd.DataFrame(results)

# Mean MSE and R2 of each model over all targets, printed with the lowest
# average MSE first
metric_columns = ['MSE', 'R2']
per_model_mean = results_df.groupby('Model')[metric_columns].mean()
avg_results = per_model_mean.reset_index()
ranked_results = avg_results.sort_values(by='MSE')
print("\nAverage Performance Across All Targets:")
print(ranked_results)

# Full per-target breakdown
print("\nDetailed Results:")
print(results_df)

Column names in the dataset (after stripping spaces):
['year', 'Application', 'Admision', 'Academic', 'student_Portal']

First 5 rows of the dataset:
   year  Application  Admision  Academic  student_Portal
0  1996           63       100        88              94
1  1997           52        78        92              75
2  1998           54       100       100              82
3  2000           65        80        95              89
4  2001           58        87        88              88

Average Performance Across All Targets:
                   Model         MSE        R2
0                  ARIMA  439.236753 -0.108143
3      Linear Regression  441.968376 -0.112610
2  Exponential Smoothing  448.437746 -0.127969
4                XGBoost  455.199295 -0.114864
1          Decision Tree  455.200000 -0.114856

Detailed Results:
                    Model          Target          MSE        R2
0                   ARIMA     Application   186.173176 -0.050639
1   Exponential Smoothing     Applic