In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Multiple linear regression: predict 'math score' from the reading/writing
# scores plus three categorical columns of the student-performance spreadsheet.

# Column roles. Declared before loading so the same lists drive both the
# missing-value filter and the feature selection below.
numerical_features = ['reading score', 'writing score']
categorical_features = ['gender', 'lunch', 'test preparation course']
target_column = 'math score'

# Load Excel data
df = pd.read_excel('StudentsPerformance.xlsx')

# Drop rows with missing values, but only look at the columns the model
# actually uses. A blanket dropna() would also discard rows whose only gap is
# in a column that is never read here, needlessly shrinking the data set.
df = df.dropna(subset=numerical_features + categorical_features + [target_column])

# Features (X) and Target (y)
X = df[numerical_features + categorical_features]
y = df[target_column]

# Train-Test Split: 80% train / 20% test; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessor: standardize the numeric columns and one-hot encode the
# categorical ones. drop='first' removes one indicator per category so the
# encoded columns are not perfectly collinear with the model's intercept.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ]
)

# Preprocess features. The transformer is fitted on the training split only
# and then applied unchanged to the test split, so no test-set statistics leak
# into the preprocessing.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Model
model = LinearRegression()
model.fit(X_train_processed, y_train)

# Predictions on the held-out test split
y_pred = model.predict(X_test_processed)

# Evaluation on the held-out test split
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

print("\nSummary:")
print("This Multiple Linear Regression model was trained to predict math scores based on reading score, writing score, and demographics.")


Mean Squared Error: 28.936701731223728
R^2 Score: 0.8810845237281756

Summary:
This Multiple Linear Regression model was trained to predict math scores based on reading score, writing score, and demographics.
