In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Locate the dataset without tying the notebook to one machine.
# Resolution order:
#   1. the STUDENT_SCORES_CSV environment variable, if set;
#   2. 'student_scores_large.csv' relative to the current working directory;
#   3. the original author's absolute path, kept only as a backward-compatible fallback.
LEGACY_CSV_PATH = 'C:\\Users\\Hardik\\Desktop\\ML_Pipeline\\Student-Score-Predictor\\student_scores_large.csv'
csv_path = os.environ.get('STUDENT_SCORES_CSV', 'student_scores_large.csv')
if not os.path.exists(csv_path) and os.path.exists(LEGACY_CSV_PATH):
    csv_path = LEGACY_CSV_PATH

# If none of the candidates exist, read_csv raises FileNotFoundError naming csv_path.
df = pd.read_csv(csv_path)
print("Dataset Shape:", df.shape)
df.head()

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
print("\nDataset Info:")
df.info()

In [None]:
# Descriptive statistics as computed by DataFrame.describe()
summary_stats = df.describe()
print("\nSummary Statistics:")
print(summary_stats)

In [None]:
# Count of null entries in each column
missing_per_column = df.isna().sum()
print("\nMissing values:\n", missing_per_column)

In [None]:
# Exploratory Data Analysis (EDA)

In [None]:
# Histograms for Hours and Scores, side by side.
# Both panels are drawn in a single cell on explicitly created axes: with the
# inline backend a figure is finalised when its cell ends, so a later cell
# calling plt.subplot(1,2,2) would draw on a brand-new, default-sized figure
# instead of completing this one.
fig, (ax_hours, ax_scores) = plt.subplots(1, 2, figsize=(12, 5))

sns.histplot(df['Hours'], bins=15, kde=True, color='blue', ax=ax_hours)
ax_hours.set_title('Distribution of Study Hours')

sns.histplot(df['Scores'], bins=15, kde=True, color='green', ax=ax_scores)
ax_scores.set_title('Distribution of Scores')

plt.show()

In [None]:
# Study hours plotted against exam scores, one marker per row of df
fig, ax = plt.subplots(figsize=(6, 5))
sns.scatterplot(data=df, x='Hours', y='Scores', color='blue', s=70, ax=ax)
ax.set_title('Hours vs Scores')
ax.set_xlabel('Study Hours')
ax.set_ylabel('Exam Score')
plt.show()

In [None]:
# Correlation Heatmap
# Restrict to numeric columns first: DataFrame.corr() raises on non-numeric
# columns in recent pandas versions, and select_dtypes works on old and new
# releases alike. With an all-numeric dataset the result is unchanged.
numeric_df = df.select_dtypes(include='number')

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Feature matrix: a one-column DataFrame (list selector keeps it two-dimensional)
X = df.loc[:, ['Hours']]
# Target: the Scores column as a Series
y = df.loc[:, 'Scores']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Ordinary least-squares fit on the training split; fit() returns the estimator itself
lr_model = LinearRegression().fit(X_train, y_train)
# Bare expression so the notebook still displays the fitted estimator
lr_model

In [None]:
# Single feature, so coef_ holds exactly one slope
slope, intercept = lr_model.coef_[0], lr_model.intercept_
print("\nModel Coefficient (Slope):", slope)
print("Model Intercept:", intercept)

In [None]:
# Plain-language reading of the fitted slope
slope_per_hour = lr_model.coef_[0]
print(f"Interpretation: For every additional hour studied, score increases by approximately {slope_per_hour:.2f} marks.")


In [None]:
# Predictions of the linear model for the held-out rows
y_pred = lr_model.predict(X=X_test)

In [None]:
# Actual and predicted scores side by side, indexed like y_test
comparison = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
preview = comparison.head()
print("\nActual vs Predicted:\n", preview)


In [None]:
# Error metrics of the linear model on the held-out split
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)  # root of the mean squared error
r2 = r2_score(y_test, y_pred)

In [None]:
# Report each metric rounded to two decimals
print("\nEvaluation Metrics:")
for metric_label, metric_value in (("MAE", mae), ("RMSE", rmse), ("R² Score", r2)):
    print(f"{metric_label}: {metric_value:.2f}")

In [None]:
# Visualization – Actual vs Predicted
# Held-out observations as points, the linear model's predictions as a line
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(X_test, y_test, color='blue', label='Actual')
ax.plot(X_test, y_pred, color='red', linewidth=2, label='Predicted')
ax.set_title('Actual vs Predicted Scores')
ax.set_xlabel('Study Hours')
ax.set_ylabel('Scores')
ax.legend()
plt.show()

In [None]:
# Residual = actual score minus predicted score, for each held-out row
residuals = y_test - y_pred

fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(residuals, bins=10, kde=True, color='purple', ax=ax)
ax.set_title('Residual Distribution')
ax.set_xlabel('Residual (Actual - Predicted)')
plt.show()


In [None]:
# User Input Prediction
# Only reading and parsing the input is guarded, and only for the specific
# exceptions expected there. A bare `except:` would also swallow
# KeyboardInterrupt and hide genuine model errors behind "Invalid input".
try:
    user_hours = float(input("\nEnter the number of study hours: "))
except ValueError:
    user_hours = None
    print("Invalid input! Please enter a numeric value.")
except (EOFError, NotImplementedError):
    # No stdin available (e.g. non-interactive execution); skip rather than fail.
    # NOTE(review): assumes the kernel's "stdin not supported" error derives
    # from NotImplementedError - confirm for the front end in use.
    user_hours = None
    print("No interactive input available; skipping the user prediction.")

if user_hours is not None:
    if np.isfinite(user_hours):
        # Same column layout as the training features, so scikit-learn does not
        # warn about missing feature names.
        user_features = pd.DataFrame([[user_hours]], columns=X.columns)
        user_pred = lr_model.predict(user_features)
        print(f"Predicted Score for {user_hours} hours: {user_pred[0]:.2f}")
    else:
        # float() accepts "nan" and "inf", which are not usable study hours.
        print("Invalid input! Please enter a numeric value.")

In [None]:
# BONUS: Polynomial Regression (Degree 2)
# Degree-2 expansion of the feature matrix; fit() returns the transformer itself
poly = PolynomialFeatures(degree=2).fit(X)
X_poly = poly.transform(X)

In [None]:
# Fit the polynomial model on the training split only. Training on the full
# dataset would include the X_test rows it is later scored on (data leakage),
# inflating its R² and making it incomparable with lr_model, which was fitted
# on X_train alone. `poly` is the already-fitted degree-2 transformer.
poly_model = LinearRegression()
poly_model.fit(poly.transform(X_train), y_train)

In [None]:
# Expand the held-out features with the fitted transformer, then predict
X_test_poly = poly.transform(X_test)
y_poly_pred = poly_model.predict(X_test_poly)

In [None]:
# R² of the polynomial model's predictions against the held-out scores
poly_r2 = r2_score(y_true=y_test, y_pred=y_poly_pred)
print(f"\nPolynomial Regression R² Score: {poly_r2:.2f}")

In [None]:
# Compare Linear vs Polynomial visually:
# all observations, the linear fit over the full feature range, and the
# polynomial model's predictions for the held-out rows
linear_fit_all = lr_model.predict(X)

fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(X, y, color='blue', label='Actual Data')
ax.plot(X, linear_fit_all, color='red', label='Linear Regression')
ax.scatter(X_test, y_poly_pred, color='green', label='Polynomial Prediction')
ax.set_title('Linear vs Polynomial Regression')
ax.set_xlabel('Study Hours')
ax.set_ylabel('Scores')
ax.legend()
plt.show()

In [None]:
# Re-create every figure of the notebook and write it to disk as a PNG.
# Figures are closed right after saving, so this cell displays nothing.

IMAGES_DIR = 'images'

# exist_ok=True creates the folder only when missing, without a separate
# exists() check that could race with another process.
os.makedirs(IMAGES_DIR, exist_ok=True)


def _save_and_close(fig, filename):
    """Save `fig` as IMAGES_DIR/filename, then close it to release its memory."""
    fig.savefig(os.path.join(IMAGES_DIR, filename))
    plt.close(fig)


# -----------------------------
# Save plots

# 1. Histogram for Hours and Scores
fig, (ax_hours, ax_scores) = plt.subplots(1, 2, figsize=(12, 5))
sns.histplot(df['Hours'], bins=15, kde=True, color='blue', ax=ax_hours)
ax_hours.set_title('Distribution of Study Hours')
sns.histplot(df['Scores'], bins=15, kde=True, color='green', ax=ax_scores)
ax_scores.set_title('Distribution of Scores')
_save_and_close(fig, 'histograms.png')

# 2. Scatter Plot (Hours vs Scores)
fig, ax = plt.subplots(figsize=(6, 5))
sns.scatterplot(x='Hours', y='Scores', data=df, color='blue', s=70, ax=ax)
ax.set_title('Hours vs Scores')
ax.set_xlabel('Study Hours')
ax.set_ylabel('Exam Score')
_save_and_close(fig, 'scatter_plot.png')

# 3. Correlation Heatmap
# Numeric columns only: DataFrame.corr() raises on non-numeric columns in
# recent pandas versions.
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
_save_and_close(fig, 'heatmap.png')

# 4. Actual vs Predicted
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(X_test, y_test, color='blue', label='Actual')
ax.plot(X_test, y_pred, color='red', linewidth=2, label='Predicted')
ax.set_title('Actual vs Predicted Scores')
ax.set_xlabel('Study Hours')
ax.set_ylabel('Scores')
ax.legend()
_save_and_close(fig, 'actual_vs_predicted.png')

# 5. Residual Analysis
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(residuals, bins=10, kde=True, color='purple', ax=ax)
ax.set_title('Residual Distribution')
ax.set_xlabel('Residual (Actual - Predicted)')
_save_and_close(fig, 'residual_distribution.png')

# 6. Linear vs Polynomial Regression
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(X, y, color='blue', label='Actual Data')
ax.plot(X, lr_model.predict(X), color='red', label='Linear Regression')
ax.scatter(X_test, y_poly_pred, color='green', label='Polynomial Prediction')
ax.set_title('Linear vs Polynomial Regression')
ax.set_xlabel('Study Hours')
ax.set_ylabel('Scores')
ax.legend()
_save_and_close(fig, 'linear_vs_polynomial.png')

print("✅ All plots have been saved in the 'images' folder.")
