In [1]:
# WEEK 5: SUPERVISED LEARNING - REGRESSION
# Dataset: students.csv

# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Step 2: Read the student records into a DataFrame
csv_path = '/content/students.csv'
df = pd.read_csv(csv_path, sep=';')  # the file is read with ';' as the field separator
print("Dataset Loaded Successfully!\n")

# Show every column name so the feature choice below can be checked by eye.
column_names = list(df.columns)
print("Columns in dataset:\n", column_names, "\n")

# Step 3: Choose the predictors (features) and the value to predict (target)
# The model predicts the final grade G3 from G1, G2 and the failures column.
feature_names = ['G1', 'G2', 'failures']
target_name = 'G3'

X = df[feature_names]   # feature matrix (DataFrame with the three columns above)
y = df[target_name]     # target vector (Series)

# The feature list is printed from the same list used for selection,
# so the message cannot drift from what is actually fed to the model.
print("Features used: " + ", ".join(feature_names))
print("Target variable: G3 (Final Grade)\n")

# Step 4: Hold out 20% of the rows for testing
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many rows ended up on each side of the split.
n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
print("Training samples:", n_train_rows)
print("Testing samples:", n_test_rows, "\n")

# Step 5: Build the Linear Regression model and fit it on the training split
# fit() returns the estimator itself, so construction and training chain cleanly.
model = LinearRegression().fit(X_train, y_train)
print("Model trained successfully!\n")

# Step 6: Report the fitted coefficients and intercept
# model.coef_ is paired with X.columns by position: one coefficient per feature.
print("Model Coefficients (Importance of each feature):")
for position, column in enumerate(X.columns):
    weight = model.coef_[position]
    print(f"{column}: {weight:.3f}")

intercept_rounded = round(model.intercept_, 3)
print("Intercept (constant value):", intercept_rounded, "\n")

# Step 7: Make predictions on the held-out test split
y_pred = model.predict(X_test)

# Show up to the first 10 actual/predicted pairs. Slicing (rather than
# indexing with range(10)) avoids an IndexError when the test split
# contains fewer than 10 rows.
print("Sample predictions (first 10):")
for actual, predicted in zip(y_test.iloc[:10], y_pred[:10]):
    print(f"Actual G3: {actual} | Predicted G3: {predicted:.2f}")
print()

# Step 8: Score the predictions against the true test-set grades
mae = mean_absolute_error(y_test, y_pred)

# RMSE is the square root of the mean squared error.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Model Evaluation Results:")
mae_rounded = round(mae, 2)
rmse_rounded = round(rmse, 2)
print("Mean Absolute Error (MAE):", mae_rounded)
print("Root Mean Squared Error (RMSE):", rmse_rounded, "\n")

# Step 9: Print a plain-language reading of the results
# NOTE: these lines are fixed text; they are not computed from the fitted
# coefficients or error values, so re-check them if the data or features change.
interpretation_lines = (
    "Interpretation:",
    "- G1 and G2 have positive coefficients → higher earlier grades mean higher final grades.",
    "- 'failures' has a negative coefficient → more past failures lower the final grade.",
    "- MAE and RMSE show how close predictions are to real grades.",
    "- Errors around 1–2 mean predictions are fairly accurate for this dataset.",
)
for line in interpretation_lines:
    print(line)

print("\nBaseline Linear Regression Model built successfully!")



Columns in dataset:
 ['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3'] 

Features used: G1, G2, failures
Target variable: G3 (Final Grade)

Training samples: 519
Testing samples: 130 

Model trained successfully!

Model Coefficients (Importance of each feature):
G1: 0.181
G2: 0.876
failures: -0.148
Intercept (constant value): -0.243 

Sample predictions (first 10):
Actual G3: 19 | Predicted G3: 18.60
Actual G3: 12 | Predicted G3: 11.38
Actual G3: 18 | Predicted G3: 18.78
Actual G3: 11 | Predicted G3: 11.20
Actual G3: 11 | Predicted G3: 11.74
Actual G3: 17 | Predicted G3: 16.66
Actual G3: 18 | Predicted G3: 17.72
Actual G3: 8 | Predicted G3: 9.00
Actual G3: 10 | Predicted G3: 10.32
Actual G3: 11 | Predicted G3: 1