# Data Science and Machine Learning Lab Assignment

This notebook contains solutions for the lab questions on linear regression and gradient descent.

**Student:** [Your Name]  
**Date:** December 21, 2025

## Importing Required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

---
# Question 1: Simple Linear Regression

We're given data about area (in sq.km) and corresponding prices. The task is to:
1. Perform simple linear regression using Least Squares method
2. Predict the price for an area of 2500 sq.km
3. Plot the line of regression

### Part (a): Fitting the Linear Regression Model

In [None]:
# Observations copied from the assignment table: area and the matching price.
area = np.array([1000, 2000, 2900, 2900, 3100, 3200])
price = np.array([300000, 424000, 433000, 483000, 497564, 521856])

# sklearn wants a 2-D feature matrix, so the 1-D area vector is turned
# into a single-column matrix; the target stays 1-D.
y = price
X = area.reshape(len(area), 1)

# fit() returns the fitted estimator, so construction and fitting are chained.
model = LinearRegression().fit(X, y)

# Report the fitted slope, intercept and the resulting line equation.
for line in (
    "Simple Linear Regression Results:",
    f"Slope (coefficient): {model.coef_[0]:.2f}",
    f"Intercept: {model.intercept_:.2f}",
    f"\nRegression Equation: Price = {model.intercept_:.2f} + {model.coef_[0]:.2f} × Area",
):
    print(line)

The model has been fitted successfully. The slope shows how much the price increases per unit increase in area.

### Part (b): Predicting Price for Area = 2500 sq.km

In [None]:
# Area to predict for in part (b).
area_predict = 2500

# predict() takes a 2-D input (rows = samples, columns = features), so the
# single query value is wrapped as one row with one column. The result is a
# length-1 array, hence the [0] when printing.
query = [[area_predict]]
price_predicted = model.predict(query)

print(f"For area = {area_predict} sq.km")
print(f"Predicted price = ${price_predicted[0]:,.2f}")

### Part (c): Plotting the Line of Regression

In [None]:
# One figure with a single Axes: observed points, fitted line, and the
# part (b) prediction highlighted as a star.
fig_q1, ax_q1 = plt.subplots(figsize=(10, 6))

ax_q1.scatter(area, price, color='blue', s=100, label='Actual data points')
ax_q1.plot(area, model.predict(X), color='red', linewidth=2, label='Regression line')
# zorder=5 keeps the predicted point drawn on top of the line.
ax_q1.scatter(area_predict, price_predicted, color='green', s=150, marker='*',
              label=f'Predicted point (Area={area_predict})', zorder=5)

ax_q1.set_xlabel('Area (sq.km)', fontsize=12)
ax_q1.set_ylabel('Price ($)', fontsize=12)
ax_q1.set_title('Simple Linear Regression: Area vs Price', fontsize=14, fontweight='bold')
ax_q1.legend()
ax_q1.grid(True, alpha=0.3)

fig_q1.tight_layout()
plt.show()

In [None]:
# Goodness of fit for the simple regression, computed on the same points
# the model was fitted to (X, y).
y_pred = model.predict(X)
r2 = r2_score(y, y_pred)
mse = mean_squared_error(y, y_pred)
# RMSE is in the same unit as price, which makes it easier to interpret
# than the MSE.
rmse = np.sqrt(mse)

print("Model Performance:")
print(f"R-squared: {r2:.4f}")
# MSE is measured in squared dollars, so it must not carry a "$" prefix.
print(f"Mean Squared Error: {mse:,.2f} ($^2)")
print(f"Root Mean Squared Error: ${rmse:,.2f}")

The R-squared value tells us how well our model explains the variance in price based on area. A value close to 1 means the model fits well.

---
# Question 2: Optimize Using Gradient Descent

Now we'll implement gradient descent from scratch to find the optimal line of regression. This is an iterative optimization method that minimizes the cost function.

### Understanding Gradient Descent

Gradient descent works by:
- Starting with initial parameter values (θ₀ and θ₁); the implementation below initializes both to 0 rather than to random values
- Computing the gradient (derivative) of the cost function
- Updating parameters in the direction that reduces cost
- Repeating until convergence

In [None]:
# Standardize both variables (subtract the mean, divide by the standard
# deviation) so gradient descent behaves better on them.
area_mean, area_std = area.mean(), area.std()
price_mean, price_std = price.mean(), price.std()

X_norm = (area - area_mean) / area_std
y_norm = (price - price_mean) / price_std

def gradient_descent(X, y, learning_rate=0.01, iterations=1000, verbose=True):
    """
    Fit the line y ≈ theta0 + theta1 * X by batch gradient descent on the MSE.

    Both parameters start at 0 and are updated together on every iteration
    using the gradient computed from the full data set.

    Parameters:
    X: input feature values (array-like; plain lists are accepted)
    y: target values (array-like with the same shape as X)
    learning_rate: step size for parameter updates
    iterations: number of iterations to run
    verbose: when True (default), print the cost every 200 iterations

    Returns:
    theta0, theta1, cost_history
        cost_history[i] is the MSE of the parameters used at the START of
        iteration i, i.e. before that iteration's update is applied.

    Raises:
    ValueError: if X and y have different shapes or contain no samples
    """
    # Coerce to float arrays so lists work too; with a plain list,
    # `theta1 * X` would be list repetition instead of element-wise math.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    if X.shape != y.shape:
        raise ValueError(
            f"X and y must have the same shape, got {X.shape} and {y.shape}"
        )

    m = len(y)
    if m == 0:
        raise ValueError("X and y must contain at least one sample")

    theta0 = 0.0  # intercept
    theta1 = 0.0  # slope

    cost_history = []

    for i in range(iterations):
        # residuals of the current parameters
        y_pred = theta0 + theta1 * X
        error = y - y_pred

        # partial derivatives of the MSE with respect to theta0 and theta1
        d_theta0 = (-2/m) * np.sum(error)
        d_theta1 = (-2/m) * np.sum(error * X)

        # simultaneous update of both parameters
        theta0 = theta0 - learning_rate * d_theta0
        theta1 = theta1 - learning_rate * d_theta1

        # cost (MSE) of the pre-update parameters
        cost = np.mean(error ** 2)
        cost_history.append(cost)

        # print progress every 200 iterations
        if verbose and (i + 1) % 200 == 0:
            print(f"Iteration {i+1}: Cost = {cost:.6f}")

    return theta0, theta1, cost_history

In [None]:
# Run gradient descent on the standardized area/price data.
print("Running Gradient Descent...")
theta0_gd, theta1_gd, costs = gradient_descent(X_norm, y_norm, learning_rate=0.1, iterations=1000)

print("\nFinal Results (on normalized data):")
print(f"θ₀ (intercept): {theta0_gd:.4f}")
print(f"θ₁ (slope): {theta1_gd:.4f}")
print(f"Final cost: {costs[-1]:.6f}")

# The parameters above describe the line in standardized units:
#   (price - mean_p) / std_p = θ₀ + θ₁ * (area - mean_a) / std_a
# Solving for price gives the equivalent line in the original units, which
# can be compared directly with the least-squares line from Question 1.
slope_gd = theta1_gd * price.std() / area.std()
intercept_gd = price.mean() + price.std() * theta0_gd - slope_gd * area.mean()

print("\nEquivalent regression line in original units:")
print(f"Price = {intercept_gd:.2f} + {slope_gd:.2f} × Area")

### Visualizing Cost Reduction Over Iterations

In [None]:
# Cost recorded at each gradient descent iteration, plotted against the
# iteration index.
fig_cost, ax_cost = plt.subplots(figsize=(10, 6))

ax_cost.plot(costs, linewidth=2)
ax_cost.set_xlabel('Iteration Number', fontsize=12)
ax_cost.set_ylabel('Cost (MSE)', fontsize=12)
ax_cost.set_title('Gradient Descent: Cost Function Over Iterations', fontsize=14, fontweight='bold')
ax_cost.grid(True, alpha=0.3)

fig_cost.tight_layout()
plt.show()

The cost decreases rapidly in the initial iterations and then plateaus, showing that the algorithm has converged to the optimal parameters.

---
# Question 3: Multiple Linear Regression

In this question, we have multiple predictor variables (Item, Education, Gender) and need to predict Salary.

### Loading and Exploring the Data

In [None]:
# The assignment table, written row by row and then pivoted into the
# column -> values mapping that the DataFrame is built from.
columns = ['Item', 'Education', 'Salary', 'Gender']
rows = [
    (5, 'Bachelor', 135000, 'F'),
    (7, 'Bachelor', 95000, 'M'),
    (8, 'Master', 55000, 'F'),
    (4, 'Master', 75000, 'M'),
]
data = {name: list(values) for name, values in zip(columns, zip(*rows))}

df = pd.DataFrame(data)

print("Original Dataset:")
print(df)
print(f"\nDataset shape: {df.shape}")

### Encoding Categorical Variables

Machine learning models need numerical input, so we convert Education and Gender to numbers:
- Education: Bachelor = 0, Master = 1
- Gender: Female = 0, Male = 1

In [None]:
# Binary codes for the two categorical columns; each encoded column is
# added next to its source column, which is left unchanged.
education_codes = {'Bachelor': 0, 'Master': 1}
gender_codes = {'F': 0, 'M': 1}

df['Education_encoded'] = df['Education'].map(education_codes)
df['Gender_encoded'] = df['Gender'].map(gender_codes)

print("Data after encoding:")
print(df)

### Fitting the Multiple Linear Regression Model

In [None]:
# Numeric predictors and the response. The column order here fixes the
# order of the entries in mlr_model.coef_ printed below.
feature_columns = ['Item', 'Education_encoded', 'Gender_encoded']
y_multi = df['Salary']
X_multi = df[feature_columns]

# fit() returns the fitted estimator, so construction and fitting are chained.
mlr_model = LinearRegression().fit(X_multi, y_multi)

# Report each coefficient, the intercept, and the assembled equation.
report_lines = [
    "Multiple Linear Regression Results:",
    "="*50,
    f"\nCoefficients:",
    f"  Item coefficient: {mlr_model.coef_[0]:.2f}",
    f"  Education coefficient: {mlr_model.coef_[1]:.2f}",
    f"  Gender coefficient: {mlr_model.coef_[2]:.2f}",
    f"\nIntercept: {mlr_model.intercept_:.2f}",
    f"\nRegression Equation:",
    f"Salary = {mlr_model.intercept_:.2f} + {mlr_model.coef_[0]:.2f}×Item + {mlr_model.coef_[1]:.2f}×Education + {mlr_model.coef_[2]:.2f}×Gender",
]
for line in report_lines:
    print(line)

### Making Predictions and Evaluating the Model

In [None]:
# Predict on the same rows the model was fitted to.
y_pred_multi = mlr_model.predict(X_multi)

print("Actual vs Predicted Salaries:")
print("="*50)

# Side-by-side table; Difference is actual minus predicted for each row.
comparison_df = pd.DataFrame({'Actual': y_multi, 'Predicted': y_pred_multi})
comparison_df['Difference'] = comparison_df['Actual'] - comparison_df['Predicted']
print(comparison_df)

In [None]:
# Model evaluation metrics. y_pred_multi comes from predicting on X_multi,
# the same rows the model was fitted to, so these are training-set metrics.
r2_multi = r2_score(y_multi, y_pred_multi)
mse_multi = mean_squared_error(y_multi, y_pred_multi)
# RMSE brings the error back to the unit of Salary.
rmse_multi = np.sqrt(mse_multi)

print("\nModel Performance Metrics:")
print("="*50)
print(f"R-squared: {r2_multi:.4f}")
# MSE is measured in squared dollars, so it must not carry a "$" prefix.
print(f"Mean Squared Error: {mse_multi:,.2f} ($^2)")
print(f"Root Mean Squared Error: ${rmse_multi:,.2f}")

### Visualizing the Results

In [None]:
# Residual = actual salary minus predicted salary, one value per row.
residuals = y_multi - y_pred_multi

# Two panels side by side: prediction quality (left), residuals (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_fit, ax_resid = axes

# Left panel: points on the dashed diagonal are predicted exactly.
salary_lo, salary_hi = y_multi.min(), y_multi.max()
ax_fit.scatter(y_multi, y_pred_multi, s=100, alpha=0.7, color='blue')
ax_fit.plot([salary_lo, salary_hi], [salary_lo, salary_hi],
            'r--', linewidth=2, label='Perfect prediction')
ax_fit.set_xlabel('Actual Salary ($)', fontsize=12)
ax_fit.set_ylabel('Predicted Salary ($)', fontsize=12)
ax_fit.set_title('Actual vs Predicted Salaries', fontsize=14, fontweight='bold')
ax_fit.legend()
ax_fit.grid(True, alpha=0.3)

# Right panel: residuals against predictions, with a zero reference line.
ax_resid.scatter(y_pred_multi, residuals, s=100, alpha=0.7, color='green')
ax_resid.axhline(y=0, color='r', linestyle='--', linewidth=2)
ax_resid.set_xlabel('Predicted Salary ($)', fontsize=12)
ax_resid.set_ylabel('Residuals ($)', fontsize=12)
ax_resid.set_title('Residual Plot', fontsize=14, fontweight='bold')
ax_resid.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Summary

We successfully completed all three questions:

1. **Simple Linear Regression**: Built a model to predict price from area, achieving good fit with the data
2. **Gradient Descent**: Implemented the optimization algorithm from scratch and visualized how it converges
3. **Multiple Linear Regression**: Created a model with multiple predictors to estimate salary

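All models were evaluated on their training data using R-squared and MSE. Note that the Question 3 model fits four parameters (an intercept plus three coefficients) to only four observations, so it reproduces the training salaries exactly; its R-squared of essentially 1 and near-zero residuals therefore say nothing about how well it would predict new data.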