# importation and setup

In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# data preprocessing

In [43]:
df = pd.read_csv('student_performance.csv')
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [44]:
df = df.dropna()

In [45]:
df = pd.get_dummies(df, columns=['Extracurricular Activities'])

In [46]:
X = np.array(df.drop(columns=['Performance Index']))
y = np.array(df['Performance Index'])

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# model training

In [49]:
from sklearn.linear_model import LinearRegression

In [50]:
lr_model = LinearRegression()

In [51]:
lr_model.fit(X_train, y_train)

In [52]:
y_pred = lr_model.predict(X_test)

In [53]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Model: {lr_model}")
print(f"MSE: {mse}")
print(f"R squared: {r2}")

Model: LinearRegression()
MSE: 4.105609215835831
R squared: 0.9880686410711422


In [54]:
dt_model = DecisionTreeRegressor()

# Train a model
dt_model.fit(X_train, y_train)

# Predict with the model
y_pred = dt_model.predict(X_test)

# Evaluate the performance of the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Model: {dt_model}")
print(f"MSE: {mse}")
print(f"R squared: {r2}")

Model: DecisionTreeRegressor()
MSE: 9.002888888888888
R squared: 0.9738365993734523


In [55]:
rf_model = RandomForestRegressor()

# Train a model
rf_model.fit(X_train, y_train)

# Predict with the model
y_pred = rf_model.predict(X_test)

# Evaluate the performance of the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Model: {rf_model}")
print(f"MSE: {mse}")
print(f"R squared: {r2}")

Model: RandomForestRegressor()
MSE: 5.402843268469671
R squared: 0.9842987340285987


In [65]:
import joblib


joblib.dump(rf_model, 'rf_model.pkl') 



['rf_model.pkl']