In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

**Creating target variable**

In [21]:
df['AcademicSuccess'] = df[['TestScore_Math', 'TestScore_Reading', 'TestScore_Science']].mean(axis=1)

**Prepare features**

X = df.drop(columns=['TestScore_Math', 'TestScore_Reading', 'TestScore_Science', 'AcademicSuccess'])
y = df['AcademicSuccess']

In [25]:
num_cols = X.select_dtypes(include=np.number).columns

In [27]:
#standardizing numeric features

scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

In [29]:
#encoding
obj_cols = X.select_dtypes(include='object').columns
X = pd.get_dummies(X, columns=obj_cols, drop_first=True)

**Model Time**

In [31]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,        # 20% for testing, 80% for training (common choice)
    random_state=42,      # ensures reproducibility
    shuffle=True          # shuffles data before splitting (default)
)

In [33]:
#training model
model = LinearRegression()
model.fit(X_train, y_train)

In [35]:
#making predictions
y_pred = model.predict(X_test)

**Evaluating Model Performance**

In [37]:
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"R^2 Score: {r2:.3f}")
print(f"RMSE: {rmse:.3f}")

R^2 Score: 0.869
RMSE: 3.219


**Want to use Lasso**



In [40]:
from sklearn.linear_model import Lasso

# Initialize Lasso regression with a regularization strength (alpha)
lasso_model = Lasso(alpha=0.1, random_state=42)

# Fit the model
lasso_model.fit(X_train, y_train)

# Predict
y_pred_lasso = lasso_model.predict(X_test)


Evaluate performance

In [43]:
from sklearn.metrics import mean_squared_error, r2_score

In [51]:
from sklearn.metrics import root_mean_squared_error

rmse = root_mean_squared_error(y_test, y_pred)
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 3.2193165898656577


In [53]:
r2 = r2_score(y_test, y_pred)
print("R^2 Score:", r2)

R^2 Score: 0.8692744685581472


**Use Ridge Regression**

In [56]:
from sklearn.linear_model import Ridge

# Initialize Ridge regression with a regularization strength (alpha)
ridge_model = Ridge(alpha=1.0, random_state=42)

# Fit the model
ridge_model.fit(X_train, y_train)

# Predict
y_pred_ridge = ridge_model.predict(X_test)


In [60]:
# For Ridge
rmse = root_mean_squared_error(y_test, y_pred_ridge)
print("Ridge RMSE:", rmse)

r2 = r2_score(y_test, y_pred_ridge)
print("Ridge R^2:", r2)

Ridge RMSE: 3.2193165887201807
Ridge R^2: 0.8692744686511751


In [62]:
from sklearn.linear_model import LassoCV

lasso_cv = LassoCV(alphas=[0.01, 0.1, 1, 10, 100], cv=5, random_state=42)
lasso_cv.fit(X_train, y_train)
print("Best alpha:", lasso_cv.alpha_)
print("Lasso coefficients:", lasso_cv.coef_)

Best alpha: 0.01
Lasso coefficients: [ 0.          0.         -0.          2.1240325   6.24580301  0.79671776
  0.         -0.          0.          0.          0.         -0.
  0.          0.         -0.          0.         -0.          0.
 -0.          0.          0.         -0.         -0.          0.
  0.         -0.        ]


Zero coefficients mean those features are considered not useful for predicting your target variable (AcademicSuccess) by the Lasso model at the chosen alpha value

Lasso’s L1 penalty encourages sparsity, especially when you have many features or correlated variables. It will keep only the most predictive variables and set the rest to zero


**Map coefficients to features**

In [67]:
# Get feature names after preprocessing (e.g., after get_dummies)
feature_names = X_train.columns

# Print nonzero coefficients and their corresponding features
for coef, name in zip(lasso_cv.coef_, feature_names):
    if coef != 0:
        print(f"{name}: {coef}")

GPA: 2.1240325037558097
AttendanceRate: 6.245803008307738
StudyHours: 0.7967177607099242


According to the Lasso regression, only GPA, AttendanceRate, and StudyHours are significant predictors of AcademicSuccess. For each standardized unit increase in AttendanceRate, AcademicSuccess increases by 6.25 units, holding other variables constant.

**we're trying this again**

In [80]:
selected_features = ['GPA', 'AttendanceRate', 'StudyHours']
X_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

In [82]:
ols_model = LinearRegression()
ols_model.fit(X_selected, y_train)
y_pred_ols = ols_model.predict(X_test_selected)

In [86]:
rmse = root_mean_squared_error(y_test, y_pred_ols)
print("Root Mean Squared Error (RMSE):", rmse)

# Calculate R^2
r2 = r2_score(y_test, y_pred_ols)
print("R^2 Score:", r2)

Root Mean Squared Error (RMSE): 3.2193022820796497
R^2 Score: 0.8692756305369219


**Making a table**

In [91]:
results = {
    'Model': ['Linear Regression', 'Ridge Regression', 'Lasso Regression'],
    'RMSE': [3.219, 3.2193165887201807, 3.2193165887201807],
    'R^2': [0.869, 0.8692744686511751, 0.8692744686511751]
}

# Create a comparison table
comparison_df = pd.DataFrame(results)

print(comparison_df)

               Model      RMSE       R^2
0  Linear Regression  3.219000  0.869000
1   Ridge Regression  3.219317  0.869274
2   Lasso Regression  3.219317  0.869274
