In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Load the CSV file
df = pd.read_csv("form3.csv")

# Display first few rows
df.head(5)


Unnamed: 0,Timestamp,Name,Roll Number,Branch,Year,Before,During,Study_Method,Environment,CGPA
0,2/20/2025 16:05:00,Ani,BTECH/11026/23,PRODUCTION,K23,8.0,8.0,"Solo(Offline via books and lecture notes), Cla...",Library,7.2
1,2/20/2025 16:05:12,Ankit Kumar,BTech/10233/24,CSE,K24,8.0,8.0,Solo(Online via video lectures),Library,9.2
2,2/20/2025 16:05:54,Tathya Varma,btech1080024,AIML,K24,0.0,6.0,Solo(Online via video lectures),Room(Group Study),7.8
3,2/20/2025 16:09:15,Saubhagya Shashank,IED/10009/24,QEDS,K24,3.0,5.0,"Class lectures and notes only, Solo(Online via...",Room(Solo),8.7
4,2/20/2025 16:12:07,Pratyay Banerjee,39,CSE,K22,6.0,12.0,Solo(Online via video lectures),Room(Solo),9.3


In [51]:
# Selecting relevant columns for regression
df = df[['Before', 'During', 'Study_Method', 'Environment', 'CGPA']]


In [52]:
# Checking for missing values
df.isnull().sum()


Before          0
During          0
Study_Method    0
Environment     0
CGPA            0
dtype: int64

In [53]:
import re  # Regular expressions for text cleaning

# Function to extract numeric CGPA values
def extract_numeric_cgpa(value):
    match = re.search(r'\d+(\.\d+)?', str(value))  # Find first number in the string
    return float(match.group()) if match else None  # Convert to float if found

# Apply function to clean CGPA
df['CGPA'] = df['CGPA'].apply(extract_numeric_cgpa)


In [54]:
#testing if cleaned 
df.head()

Unnamed: 0,Before,During,Study_Method,Environment,CGPA
0,8.0,8.0,"Solo(Offline via books and lecture notes), Cla...",Library,7.2
1,8.0,8.0,Solo(Online via video lectures),Library,9.2
2,0.0,6.0,Solo(Online via video lectures),Room(Group Study),7.8
3,3.0,5.0,"Class lectures and notes only, Solo(Online via...",Room(Solo),8.7
4,6.0,12.0,Solo(Online via video lectures),Room(Solo),9.3


In [55]:
# One-hot encoding categorical variables
df_encoded = pd.get_dummies(df, columns=['Study_Method', 'Environment'], drop_first=True)


In [56]:
# testing if the above task is completed or not 
# i have not done inplace for testing
df_encoded.head(10)

Unnamed: 0,Before,During,CGPA,"Study_Method_Class lectures and notes only, Solo(Online via video lectures)","Study_Method_I prefer both class notes, solo study with online lectures when needed",Study_Method_Solo(Offline via books and lecture notes),"Study_Method_Solo(Offline via books and lecture notes), Class lectures and notes only","Study_Method_Solo(Offline via books and lecture notes), Class lectures and notes only, Solo(Online via video lectures)","Study_Method_Solo(Offline via books and lecture notes), Solo(Online via video lectures)",Study_Method_Solo(Online via video lectures),"Study_Method_Solo(Online via video lectures),","Study_Method_Solo(Online via video lectures), Friend teaches me",Environment_Library,Environment_Room(Group Study),Environment_Room(Solo)
0,8.0,8.0,7.2,False,False,False,False,True,False,False,False,False,True,False,False
1,8.0,8.0,9.2,False,False,False,False,False,False,True,False,False,True,False,False
2,0.0,6.0,7.8,False,False,False,False,False,False,True,False,False,False,True,False
3,3.0,5.0,8.7,True,False,False,False,False,False,False,False,False,False,False,True
4,6.0,12.0,9.3,False,False,False,False,False,False,True,False,False,False,False,True
5,0.0,5.0,6.67,False,False,False,False,False,False,True,False,False,False,True,False
6,0.0,15.0,8.3,False,False,True,False,False,False,False,False,False,False,True,False
7,2.0,4.0,8.1,True,False,False,False,False,False,False,False,False,False,False,True
8,4.0,5.0,8.86,False,False,False,False,False,False,True,False,False,False,False,True
9,1.5,4.0,8.3,True,False,False,False,False,False,False,False,False,True,False,False


In [None]:
print(df_encoded.columns)


Index(['Before', 'During', 'CGPA',
       'Study_Method_Class lectures and notes only, Solo(Online via video lectures)',
       'Study_Method_I prefer both class notes, solo study with online lectures when needed ',
       'Study_Method_Solo(Offline via books and lecture notes)',
       'Study_Method_Solo(Offline via books and lecture notes), Class lectures and notes only',
       'Study_Method_Solo(Offline via books and lecture notes), Class lectures and notes only, Solo(Online via video lectures)',
       'Study_Method_Solo(Offline via books and lecture notes), Solo(Online via video lectures)',
       'Study_Method_Solo(Online via video lectures)',
       'Study_Method_Solo(Online via video lectures), ',
       'Study_Method_Solo(Online via video lectures), Friend teaches me',
       'Environment_Library', 'Environment_Room(Group Study)',
       'Environment_Room(Solo)'],
      dtype='object')


In [None]:
# to save the file in the same directory 
df_encoded.to_csv("df_encoded.csv", index=False)

In [58]:
from sklearn.model_selection import train_test_split

# Define features (X) and target variable (y)
X = df_encoded.drop(columns=['CGPA'])  # Independent variables 

# that is X contains study hours, study environment, study material (everything except CGPA).
y = df_encoded['CGPA']  # Target variable

# Splitting into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [59]:
# Train Model
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)

# Predictions
y_pred_lr = model_lr.predict(X_test)

# Error Metrics
mae_lr = mean_absolute_error(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)


In [60]:
print(y_pred_lr) # data obtained due to linear regression 

[7.93705149 7.88637437 8.42995161 8.86867277 8.13135182 7.96614705
 8.15484447 8.61544835 8.80408181 8.77300368 8.5670494  8.33384999
 8.64630546 8.22079666 8.83238356 7.98660459 8.26731464 8.04232801
 8.33948344 7.91706067 8.81730969 8.41926706]


In [61]:
print(f"Mean Absolute Error (MAE): {mae_lr}")
print(f"Mean Squared Error (MSE): {mse_lr}")
print(f"Root Mean Squared Error (RMSE): {rmse_lr}")

Mean Absolute Error (MAE): 0.6893442373425428
Mean Squared Error (MSE): 0.6159086891936512
Root Mean Squared Error (RMSE): 0.784798502288104


In [62]:

# for polynomial regression 
# Polynomial Features
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Train Model
model_poly = LinearRegression()
model_poly.fit(X_train_poly, y_train)

# Predictions
y_pred_poly = model_poly.predict(X_test_poly)

# Error Metrics
mae_poly = mean_absolute_error(y_test, y_pred_poly)
mse_poly = mean_squared_error(y_test, y_pred_poly)
rmse_poly = np.sqrt(mse_poly)


In [63]:
print(f"Mean Absolute Error (MAE) for Polynomial Regression: {mae_poly}")
print(f"Mean Squared Error (MSE) for Polynomial Regression: {mse_poly}")
print(f"Root Mean Squared Error (RMSE) for Polynomial Regression: {rmse_poly}")

Mean Absolute Error (MAE) for Polynomial Regression: 1.9170528814203946
Mean Squared Error (MSE) for Polynomial Regression: 11.032375738325152
Root Mean Squared Error (RMSE) for Polynomial Regression: 3.321502030456274


In [64]:
# decision tree regression 
# Train Model
model_tree = DecisionTreeRegressor(random_state=42)
model_tree.fit(X_train, y_train)

# Predictions
y_pred_tree = model_tree.predict(X_test)

# Error Metrics
mae_tree = mean_absolute_error(y_test, y_pred_tree)
mse_tree = mean_squared_error(y_test, y_pred_tree)
rmse_tree = np.sqrt(mse_tree)



In [65]:
print(f"Mean Absolute Error (MAE) for Decision Tree Regression: {mae_tree}")
print(f"Mean Squared Error (MSE) for Decision Tree Regression: {mse_tree}")
print(f"Root Mean Squared Error (RMSE) for Decision Tree Regression: {rmse_tree}")

Mean Absolute Error (MAE) for Decision Tree Regression: 0.9095454545454543
Mean Squared Error (MSE) for Decision Tree Regression: 1.1895931818181813
Root Mean Squared Error (RMSE) for Decision Tree Regression: 1.0906847307165262


In [66]:
# Train Model
# for random forest model 
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

# Predictions
y_pred_rf = model_rf.predict(X_test)

# Error Metrics
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)



In [67]:
print(f"Mean Absolute Error (MAE) for Random Forest Regression: {mae_rf}")
print(f"Mean Squared Error (MSE) for Random Forest Regression: {mse_rf}")
print(f"Root Mean Squared Error (RMSE) for Random Forest Regression: {rmse_rf}")

Mean Absolute Error (MAE) for Random Forest Regression: 0.7343760606060609
Mean Squared Error (MSE) for Random Forest Regression: 0.6792667970116365
Root Mean Squared Error (RMSE) for Random Forest Regression: 0.8241764356080781


In [68]:
print(f"Mean Absolute Error (MAE) for Linear Regression: {mae_lr}")
print(f"Mean Squared Error (MSE) for Linear Regression: {mse_lr}")
print(f"Root Mean Squared Error (RMSE) for Linear Regression: {rmse_lr}")

print(f"Mean Absolute Error (MAE) for Polynomial Regression: {mae_poly}")
print(f"Mean Squared Error (MSE) for Polynomial Regression: {mse_poly}")
print(f"Root Mean Squared Error (RMSE) for Polynomial Regression: {rmse_poly}")

print(f"Mean Absolute Error (MAE) for Decision Tree Regression: {mae_tree}")
print(f"Mean Squared Error (MSE) for Decision Tree Regression: {mse_tree}")
print(f"Root Mean Squared Error (RMSE) for Decision Tree Regression: {rmse_tree}")

print(f"Mean Absolute Error (MAE) for Random Forest Regression: {mae_rf}")
print(f"Mean Squared Error (MSE) for Random Forest Regression: {mse_rf}")
print(f"Root Mean Squared Error (RMSE) for Random Forest Regression: {rmse_rf}")

Mean Absolute Error (MAE) for Linear Regression: 0.6893442373425428
Mean Squared Error (MSE) for Linear Regression: 0.6159086891936512
Root Mean Squared Error (RMSE) for Linear Regression: 0.784798502288104
Mean Absolute Error (MAE) for Polynomial Regression: 1.9170528814203946
Mean Squared Error (MSE) for Polynomial Regression: 11.032375738325152
Root Mean Squared Error (RMSE) for Polynomial Regression: 3.321502030456274
Mean Absolute Error (MAE) for Decision Tree Regression: 0.9095454545454543
Mean Squared Error (MSE) for Decision Tree Regression: 1.1895931818181813
Root Mean Squared Error (RMSE) for Decision Tree Regression: 1.0906847307165262
Mean Absolute Error (MAE) for Random Forest Regression: 0.7343760606060609
Mean Squared Error (MSE) for Random Forest Regression: 0.6792667970116365
Root Mean Squared Error (RMSE) for Random Forest Regression: 0.8241764356080781
