# Model Training and Preparation for Deployment

## Imports

### Import Dependencies

In [4]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

### Import Dataset

In [5]:
df = pd.read_csv('../dataset/StudentPerformanceFactors.csv')
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


In [6]:
df.columns

Index(['Hours_Studied', 'Attendance', 'Parental_Involvement',
       'Access_to_Resources', 'Extracurricular_Activities', 'Sleep_Hours',
       'Previous_Scores', 'Motivation_Level', 'Internet_Access',
       'Tutoring_Sessions', 'Family_Income', 'Teacher_Quality', 'School_Type',
       'Peer_Influence', 'Physical_Activity', 'Learning_Disabilities',
       'Parental_Education_Level', 'Distance_from_Home', 'Gender',
       'Exam_Score'],
      dtype='object')

## Selecting Features and Target Variable

In [8]:
features = ['Hours_Studied', 'Attendance', 'Access_to_Resources', 'Previous_Scores', 'Teacher_Quality', 'Tutoring_Sessions']
target = 'Exam_Score'

In [9]:
df_model = df[features + [target]].copy()

In [10]:
df_model

Unnamed: 0,Hours_Studied,Attendance,Access_to_Resources,Previous_Scores,Teacher_Quality,Tutoring_Sessions,Exam_Score
0,23,84,High,73,Medium,0,67
1,19,64,Medium,59,Medium,2,61
2,24,98,Medium,91,Medium,2,74
3,29,89,Medium,98,Medium,1,71
4,19,92,Medium,65,High,3,70
...,...,...,...,...,...,...,...
6602,25,69,Medium,76,Medium,1,68
6603,23,76,Medium,81,High,3,69
6604,20,90,Low,65,Medium,3,68
6605,10,86,High,91,Medium,2,68


## Encoding Object Features into Numeric Values

In [11]:
le = LabelEncoder()

In [12]:
df_model['Teacher_Quality'] = le.fit_transform(df_model['Teacher_Quality'])
df_model['Access_to_Resources'] = le.fit_transform(df_model['Access_to_Resources'])

In [13]:
X = df_model[features]
y = df_model[target]

## Splitting Data into Train and Test Sets

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [24]:
models = {
    "LinearRegression": {
        "model": LinearRegression(),
        "params": {}
    },
    "DecisionTree": {
        "model": DecisionTreeRegressor(),
        "params": {"max_depth": [3,5,10], "min_samples_split": [2,5]}
    },
    "RandomForest": {
        "model": RandomForestRegressor(),
        "params": {"n_estimators": [50,100], "max_depth": [5,10]}
    }
}

In [25]:
best_models = []

In [26]:
for name, config in models.items():
    print(f"Training {name}")

    grid = GridSearchCV(config["model"], config["params"], cv=5, scoring="neg_mean_squared_error")
    grid.fit(X_train, y_train)

    y_pred = grid.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    best_models.append({
        "model": name,
        "best_params": grid.best_params_,
        "rmse": rmse,
        "r2": r2
    })    

Training LinearRegression
Training DecisionTree
Training RandomForest


In [27]:
results_df = pd.DataFrame(best_models)

In [29]:
results_df.sort_values(by='rmse')

Unnamed: 0,model,best_params,rmse,r2
0,LinearRegression,{},2.151168,0.646355
2,RandomForest,"{'max_depth': 10, 'n_estimators': 100}",2.289375,0.599454
1,DecisionTree,"{'max_depth': 5, 'min_samples_split': 2}",2.496967,0.52352
