### Import CSV and Required Packages

In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Warnings
import warnings
warnings.filterwarnings('ignore')


# Create Dataframe
df = pd.read_csv(".\EDA_data\EDA_UG_Student_CGPA_Prediction (1).csv")

# Print shape of dataset
print(df.shape)

  df = pd.read_csv(".\EDA_data\EDA_UG_Student_CGPA_Prediction (1).csv")


(1000, 12)


**Split X and y**
- Why do we split our data?
> Training Dataset is the part of Original Dataset that we use to train our ML model. The model learns on this data by running the algorithm and maps a function F(x) where “x” in the independent variable (inputs) for “y” where “y” is the dependent variable(output).

In [2]:
df

Unnamed: 0,Gender,Age,Attendance (%),HighSchool_GPA,Internal_Assessments (%),Participation_Score,Projects_Completed,Study_Hours_Per_Week,Backlogs,Sem_1_GPA,Sem_2_GPA,Target_CGPA
0,Male,17,90.0,73.3,82.8,2.9,5,18.1,0,7.92,8.33,8.18
1,Female,18,90.4,69.4,69.4,3.3,1,2.0,0,7.22,6.91,7.08
2,Male,22,75.9,58.1,92.6,3.0,0,11.1,0,5.34,5.48,5.35
3,Male,21,88.8,67.8,71.5,4.2,3,7.9,0,6.50,6.93,6.70
4,Male,17,87.1,51.5,59.1,4.9,0,12.8,1,5.32,5.49,5.46
...,...,...,...,...,...,...,...,...,...,...,...,...
995,Male,20,96.8,69.8,71.3,3.4,2,12.2,2,5.93,6.41,6.30
996,Female,20,81.5,100.0,73.9,4.7,5,16.7,0,10.00,10.00,9.94
997,Male,19,85.2,89.8,81.3,3.2,0,19.1,0,9.02,9.28,9.35
998,Female,17,83.6,79.9,75.2,2.7,2,8.1,2,6.66,6.93,6.89


In [None]:
# Convert Gender to numeric
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})

In [4]:
X = df.drop("Target_CGPA", axis=1) #dropping the target column which is 'cluster'
y = df["Target_CGPA"] #target column

## Select the best model
- so here we have some list of the best classification algorithms we imported. Now we will compare each model's score and see which model is performing better than rest of the others

In [5]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Regression Models Dictionary

models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "XGB Regressor": XGBRegressor(),
    "CatBoost Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

In [None]:
# Create a function which can evaluate models and return a report 
def evaluate_models(X, y, models):
    '''
    This function takes in X and y and models dictionary as input
    It splits the data into Train Test split
    Iterates through the given model dictionary and evaluates the metrics
    Returns: Dataframe which contains report of all models metrics with cost
    '''
    # separate dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
    

    models_list = []
    scores = []
    
    for i in range(len(list(models))):
        model = list(models.values())[i]
        model.fit(X_train, y_train) # Train model

        # Make predictions
        y_pred = model.predict(X_test)

        score = r2_score(y_test,y_pred)
        
        model_name = list(models.keys())[i]
        print(f'---- score for --- {model_name} ----')
        print(f"{score}")
        models_list.append(model_name)
        scores.append(score)
    
    print()
    
    report = pd.DataFrame()
    report['Model_name'] = models_list
    report['Score'] = scores        
    return report

In [7]:
### Let's check the report

report = evaluate_models(X, y, models)

---- score for --- Linear Regression ----
0.9939892283184453
---- score for --- Random Forest ----
0.991484011609508
---- score for --- Decision Tree ----
0.9853226059650664
---- score for --- Gradient Boosting ----
0.9928385516026889
---- score for --- Ridge Regression ----
0.9939912510976296
---- score for --- Lasso Regression ----
0.8083999392202442
---- score for --- K-Neighbors Regressor ----
0.7792808800914421
---- score for --- XGB Regressor ----
0.9898548271703007
---- score for --- CatBoost Regressor ----
0.9920178311484148
---- score for --- AdaBoost Regressor ----
0.985532736647144



In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
report.sort_values('Score')
from sklearn.linear_model import LinearRegression
import joblib

# Fit Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Save the trained model
joblib.dump(lr_model, "model.pkl")

['model.pkl']

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit CatBoost model
best_cb_model = CatBoostRegressor(verbose=False)
best_cb_model.fit(X_train, y_train)

# Predict on test set
y_pred = best_cb_model.predict(X_test)

# Mean Absolute Error
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))

# R² Score
print("R² Score:", r2_score(y_test, y_pred))

Mean Absolute Error: 0.027525139775072677
R² Score: 0.999150687353549


In [None]:
from sklearn.linear_model import LinearRegression
import joblib

# Fit Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Save the trained model
joblib.dump(lr_model, "model.pkl")

['model.pkl']