## Importing the libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib
%matplotlib inline
import math
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv("Student Info.csv")
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [4]:
categorical_col = df.select_dtypes(include=['object']).columns

In [85]:
# label encoder
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
for i in categorical_col:
    df[i] = label_encoder.fit_transform(df[i])

df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,0,0,18,1,0,0,4,4,0,4,...,4,3,4,1,1,3,6,5,6,6
1,0,0,17,1,0,1,1,1,0,2,...,5,3,3,1,1,3,4,5,5,6
2,0,0,15,1,1,1,1,1,0,2,...,4,3,2,2,3,3,10,7,8,10
3,0,0,15,1,0,1,4,2,1,3,...,3,2,2,1,1,5,2,15,14,15
4,0,0,16,1,0,1,3,3,2,2,...,4,3,2,1,2,5,4,6,10,10


## Feature selection using SelectKBest

In [86]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.ensemble import RandomForestRegressor

# Define the pipeline
pipeline = Pipeline([
    ('feature_selection', SelectKBest(mutual_info_regression)),
    ('regressor', RandomForestRegressor())
])

# Define the parameter grid
param_grid = {
    'feature_selection__k': [5, 10, 15],  # Adjust the number of features
    'regressor__n_estimators': [50, 100, 200]  # Adjust other model hyperparameters
}

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5)
grid_search.fit(X, y)


In [87]:
# Access the best feature selection model from the grid search
best_feature_selection_model = grid_search.best_estimator_['feature_selection']

# Access the selected features
selected_features_indices = best_feature_selection_model.get_support(indices=True)

# Get the feature names from the original DataFrame
selected_feature_names = X.columns[selected_features_indices]

# Display the selected features
print("Selected Features:")
print(selected_feature_names)


Selected Features:
Index(['school', 'address', 'Medu', 'Fedu', 'guardian', 'traveltime',
       'failures', 'paid', 'higher', 'famrel', 'freetime', 'goout', 'absences',
       'G1', 'G2'],
      dtype='object')


## Building the model on selected features

In [88]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Assuming X_selected contains the selected features and df is your original DataFrame
X_selected = df[['school', 'address', 'Medu', 'Fedu', 'guardian', 'traveltime',
       'failures', 'paid', 'higher', 'famrel', 'freetime', 'goout', 'absences',
       'G1', 'G2']]
y = df['G3']


In [89]:
X_selected.columns

Index(['school', 'address', 'Medu', 'Fedu', 'guardian', 'traveltime',
       'failures', 'paid', 'higher', 'famrel', 'freetime', 'goout', 'absences',
       'G1', 'G2'],
      dtype='object')

In [90]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split
import numpy as np

def train(model, X, y):
    # train the model
    x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)
    model.fit(x_train, y_train)
    
    # predict the training set
    pred = model.predict(x_test)
    
    # perform cross-validation
    cv_score = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=5)
    cv_score = np.abs(np.mean(cv_score))
    
    # calculate R2 score
    r2 = r2_score(y_test, pred)
    
    print("Model Report")
    print("MSE:", mean_squared_error(y_test, pred))
    print('CV Score:', cv_score)
    print('R2 Score:', r2)


In [92]:
import lightgbm as lgb
model_1 = lgb.LGBMRegressor(objective='regression', random_state=42)
train(model_1, X_selected, y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000172 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 107
[LightGBM] [Info] Number of data points in the train set: 783, number of used features: 15
[LightGBM] [Info] Start training from score 11.375479
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000120 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 107
[LightGBM] [Info] Number of data points in the train set: 835, number of used features: 15
[LightGBM] [Info] Start training from score 11.525749
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000159 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, y

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008439 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 107
[LightGBM] [Info] Number of data points in the train set: 835, number of used features: 15
[LightGBM] [Info] Start training from score 11.170060
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 108
[LightGBM] [Info] Number of data points in the train set: 835, number of used features: 15
[LightGBM] [Info] Start training from score 10.950898
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000243 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1

## Saving the model

In [93]:
import joblib
joblib.dump(model_1, 'V2_M1.pkl')

['V2_M1.pkl']

In [95]:
model = joblib.load('V2_M1.pkl' ) 
input_array = X_selected.iloc[0].values.reshape(1, -1)

# Predict using the reshaped array
prediction = model.predict(input_array)
print(round(prediction[0]))

7
