<a href="https://colab.research.google.com/github/itripathi423-a11y/Concept-of-AI/blob/master/Worksheet_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


# ----------- TO-DO 1: Load & Observe Data -----------
# Reads "student.csv" from the current working directory. The code below
# selects the Math, Reading and Writing columns from it.
data = pd.read_csv("student.csv")

print("Top 5 rows:\n", data.head())
print("\nBottom 5 rows:\n", data.tail())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
data.info()
print("\nDataset Description:\n", data.describe())


# ----------- Split Features & Labels -----------
# Math and Reading scores are the model inputs; the Writing score is the target.
X = data[['Math', 'Reading']].values
Y = data['Writing'].values


# ----------- TO-DO 3: Train-Test Split -----------
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)


# ----------- TO-DO 4: Cost Function (MSE) -----------
def cost_function(X, Y, W):
    """Return the halved mean squared error of the linear model ``X . W``.

    Args:
        X: Feature matrix; each row is one sample.
        Y: Target values, one per row of ``X``.
        W: Weight vector applied to the columns of ``X``.

    Returns:
        ``sum((X . W - Y) ** 2) / (2 * len(Y))``.
    """
    sample_count = len(Y)
    residuals = np.dot(X, W) - Y
    squared_error_total = np.sum(np.square(residuals))
    # The 1/2 factor is part of the cost definition used throughout this file.
    return (1 / (2 * sample_count)) * squared_error_total


# ----------- TO-DO 5: Test Cost Function -----------
# Sanity check: with W = [1, 1] every row's prediction (1+2, 3+4, 5+6) equals
# its label (3, 7, 11), so the cost is expected to be zero.
X_test_case = np.array([[1, 2], [3, 4], [5, 6]])
Y_test_case = np.array([3, 7, 11])
W_test_case = np.array([1, 1])

cost = cost_function(X_test_case, Y_test_case, W_test_case)

# The cost is a float, so compare against zero with a tolerance instead of
# exact equality; this keeps the check meaningful if the test case is ever
# changed to data that introduces rounding error.
if np.isclose(cost, 0.0):
    print("\nProceed Further")
else:
    print("\nSomething went wrong")

print("Cost function output:", cost)


# ----------- TO-DO 6: Gradient Descent -----------
def gradient_descent(X, Y, W, alpha, iterations):
    """Run batch gradient descent for a linear model and track the cost.

    Args:
        X: Feature matrix; each row is one sample.
        Y: Target values, one per row of ``X``.
        W: Initial weight vector. It is not modified in place; each step
            rebinds the local name to a new array.
        alpha: Learning rate (step size) applied to the gradient.
        iterations: Number of update steps to perform.

    Returns:
        A ``(W, cost_history)`` tuple: the weights after the last step and a
        list holding the cost computed after each update.
    """
    sample_count = len(Y)
    cost_history = []

    for _ in range(iterations):
        residuals = np.dot(X, W) - Y
        # Gradient of the halved-MSE cost with respect to the weights.
        gradient = (1 / sample_count) * np.dot(X.T, residuals)
        W = W - alpha * gradient
        cost_history.append(cost_function(X, Y, W))

    return W, cost_history


# ----------- TO-DO 7: Gradient Descent Test -----------
# Smoke test of gradient_descent on synthetic data (100 rows, 3 columns).
# The global NumPy RNG is seeded first so the three draws below are
# reproducible; the draw order (X, then Y, then W) determines the values.
np.random.seed(0)
X_rand = np.random.rand(100, 3)
Y_rand = np.random.rand(100)
W_rand = np.random.rand(3)

# Step size and number of update steps handed to gradient_descent.
alpha = 0.01
iterations = 1000

final_params, cost_history = gradient_descent(
    X_rand, Y_rand, W_rand, alpha, iterations
)

# cost_history[-1] is the cost recorded after the final update step.
print("\nFinal Parameters (Test):", final_params)
print("Final Cost (Test):", cost_history[-1])


# ----------- TO-DO 8: RMSE Function -----------
def rmse(Y, Y_pred):
    """Return the root mean squared error between targets and predictions.

    Args:
        Y: Actual target values.
        Y_pred: Predicted values, element-wise comparable with ``Y``.

    Returns:
        The square root of the mean of the squared differences.
    """
    errors = Y - Y_pred
    mean_squared_error = np.mean(np.square(errors))
    return np.sqrt(mean_squared_error)


# ----------- TO-DO 9: R² Function -----------
def r2(Y, Y_pred):
    """Return the coefficient of determination (R-squared).

    Computed as ``1 - ss_res / ss_tot``, where ``ss_res`` is the sum of
    squared residuals and ``ss_tot`` is the sum of squared deviations of
    ``Y`` from its mean.

    Args:
        Y: Actual target values.
        Y_pred: Predicted values, element-wise comparable with ``Y``.

    Returns:
        The R-squared score. When ``Y`` is constant (``ss_tot == 0``) the
        ratio is undefined; in that case 1.0 is returned for a perfect
        prediction and 0.0 otherwise, instead of NaN or -inf.
    """
    mean_y = np.mean(Y)
    ss_tot = np.sum((Y - mean_y) ** 2)
    ss_res = np.sum((Y - Y_pred) ** 2)

    if ss_tot == 0:
        # Constant targets: avoid dividing by zero (which would produce
        # NaN/-inf plus a runtime warning) and return a finite score,
        # matching scikit-learn's force_finite convention.
        return 1.0 if ss_res == 0 else 0.0

    return 1 - (ss_res / ss_tot)


# ----------- TO-DO 10: Main Function -----------
def main():
    """Train and evaluate the linear model on the student score data.

    Loads "student.csv", uses the Math and Reading columns to predict the
    Writing column, fits weights by gradient descent on an 80/20 train/test
    split (no intercept term is added), and prints the learned weights, the
    first ten cost values, and the RMSE and R-squared on the test split.
    """
    frame = pd.read_csv("student.csv")

    features = frame[['Math', 'Reading']].values
    target = frame['Writing'].values

    split = train_test_split(features, target, test_size=0.2, random_state=42)
    train_features, test_features, train_target, test_target = split

    # Start from all-zero weights, one per feature column.
    initial_weights = np.zeros(train_features.shape[1])
    learning_rate = 0.00001
    step_count = 1000

    W_optimal, cost_history = gradient_descent(
        train_features, train_target, initial_weights, learning_rate, step_count
    )

    # Evaluate on the held-out rows only.
    predictions = np.dot(test_features, W_optimal)
    model_rmse = rmse(test_target, predictions)
    model_r2 = r2(test_target, predictions)

    report = (
        ("Final Weights:", W_optimal),
        ("Cost History (First 10):", cost_history[:10]),
        ("RMSE on Test Set:", model_rmse),
        ("R-Squared on Test Set:", model_r2),
    )

    print("\n========== FINAL MODEL RESULTS ==========")
    for label, value in report:
        print(label, value)


# ----------- Run Program -----------
# Call main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

Top 5 rows:
    Math  Reading  Writing
0    48       68       63
1    62       81       72
2    79       80       78
3    76       83       79
4    59       64       62

Bottom 5 rows:
      Math  Reading  Writing
995    72       74       70
996    73       86       90
997    89       87       94
998    83       82       78
999    66       66       72

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Math     1000 non-null   int64
 1   Reading  1000 non-null   int64
 2   Writing  1000 non-null   int64
dtypes: int64(3)
memory usage: 23.6 KB
None

Dataset Description:
               Math      Reading      Writing
count  1000.000000  1000.000000  1000.000000
mean     67.290000    69.872000    68.616000
std      15.085008    14.657027    15.241287
min      13.000000    19.000000    14.000000
25%      58.000000    60.750000    58.000000
50%      