In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Goal: predict the marks of the students in the test data.

In [38]:
# Read the training spreadsheet into a DataFrame
data = pd.read_excel("D:\\IIT\\Extras\\Learner's Space\\ML\\Assignment1\\Training data.xlsx")

# Every column except the last is a feature; convert directly to a numpy array
x_train = np.array(data.iloc[:, :-1])

# The last column is the target (marks), stored as an (n_samples, 1) column vector
y_train = np.array(data.iloc[:, -1]).reshape(-1, 1)

# Function to plot features against target
def plot_features_vs_target(data):
    """Show one scatter plot per feature column against the target column.

    The last column of ``data`` is treated as the target; each of the other
    columns is plotted against it in its own figure.
    """
    column_names = data.columns
    feature_names, target_name = column_names[:-1], column_names[-1]
    target_values = data[target_name]
    for name in feature_names:
        plt.figure()
        plt.scatter(data[name], target_values)
        plt.title(f'{name} vs {target_name}')
        plt.xlabel(name)
        plt.ylabel(target_name)
        plt.show()

# Uncomment to visualize feature-target relationships
# plot_features_vs_target(data)

# Function for feature encoding and preprocessing
def feature_changing(x_data):
    """Label-encode the 'internet' and 'sex' columns and return a numpy array.

    Parameters
    ----------
    x_data : numpy.ndarray or pandas.DataFrame
        Feature table. An ndarray is wrapped in a DataFrame with the eight
        fixed column names below, so it must have exactly eight columns in
        that order. A DataFrame must already contain 'internet' and 'sex'.

    Returns
    -------
    numpy.ndarray
        The feature table with 'internet' and 'sex' replaced by integer codes.

    Notes
    -----
    The input is never modified: the encoding is applied to a copy.
    A fresh LabelEncoder is fitted on every call, so the codes depend on the
    set of values present in that particular call.
    """
    if isinstance(x_data, np.ndarray):
        x_data = pd.DataFrame(
            x_data,
            columns=['internet', 'sex', 'traveltime', 'studytime', 'freetime', 'absences', 'age', 'iq'],
            copy=True,
        )
    else:
        # Copy so the caller's DataFrame keeps its original (unencoded) columns.
        x_data = x_data.copy()

    for column in ('internet', 'sex'):
        x_data[column] = LabelEncoder().fit_transform(x_data[column])

    return x_data.to_numpy()

# Apply feature changes
# Replaces the raw training features with the encoded numpy array returned by feature_changing.
x_train = feature_changing(x_train)

In [39]:
# Function for feature scaling
def z_score(x_data):
    """Standardize each column to zero mean and unit standard deviation.

    Parameters
    ----------
    x_data : array-like
        Numeric data; statistics are computed along axis 0 (per column).

    Returns
    -------
    tuple
        ``(scaled, x_std, x_mean)`` where ``scaled`` is ``(x_data - x_mean) / x_std``.
        ``x_std`` is the divisor that was actually used, so applying
        ``(other - x_mean) / x_std`` to new data reproduces the same transform.
    """
    x_mean = np.mean(x_data, axis=0)
    x_std = np.std(x_data, axis=0)
    # A constant column has zero spread; dividing by it would give NaN/inf.
    # Use 1 instead so such a column is simply centred to 0.
    x_std = np.where(x_std == 0, 1.0, x_std)
    x_data = (x_data - x_mean) / x_std

    return x_data, x_std, x_mean

# Cast the training features to float64 and standardize them in one step.
# x_std and x_mean are kept so the test data can be scaled the same way later.
x_train, x_std, x_mean = z_score(x_train.astype(np.float64))


In [40]:
# Define cost function (mean squared error)
def cost(x_train, y_train, w, b):
    """Return half the mean squared error of the linear model ``x_train @ w + b``.

    Parameters
    ----------
    x_train : array-like
        Feature matrix, one row per sample.
    y_train : array-like
        Target values, one per sample. May be 1-D or a column vector.
    w : array-like
        Model weights.
    b : scalar or array-like
        Model bias, added to every prediction.

    Returns
    -------
    float
        ``sum((prediction - target) ** 2) / (2 * m)`` with ``m = len(y_train)``.
    """
    m = len(y_train)
    y_pred = np.dot(x_train, w) + b
    y_true = np.asarray(y_train)
    # Subtracting an (m,) target from an (m, 1) prediction (or the reverse) would
    # broadcast to an (m, m) matrix and silently inflate the cost, so bring the
    # target to the prediction's shape whenever the element counts agree.
    if y_true.shape != y_pred.shape and y_true.size == y_pred.size:
        y_true = y_true.reshape(y_pred.shape)
    loss = (1 / (2 * m)) * np.sum((y_pred - y_true) ** 2)
    return loss


In [41]:
# Define gradient descent function
def gradient_descent(x_train, y_train, w, b, learning_rate=0.01, num_iterations=1000):
    """Run batch gradient descent for a linear model and return the updated parameters.

    Parameters
    ----------
    x_train : array-like
        Feature matrix, one row per sample.
    y_train : array-like
        Target values, one per sample. May be 1-D or a column vector.
    w : array-like
        Starting weights. Not modified; the updated weights are returned.
    b : scalar or array-like
        Starting bias. Not modified; the updated bias is returned.
    learning_rate : float, optional
        Step size applied to each gradient.
    num_iterations : int, optional
        Number of update steps to perform.

    Returns
    -------
    tuple
        ``(w, b)`` after ``num_iterations`` updates.

    Notes
    -----
    Prints the cost (via ``cost``) every 100th iteration, starting at iteration 0.
    """
    m = len(y_train)
    y_true = np.asarray(y_train)
    for i in range(num_iterations):
        y_pred = np.dot(x_train, w) + b
        # Keep target and prediction the same shape so the residual is one value
        # per sample rather than an (m, m) broadcast.
        if y_true.shape != y_pred.shape and y_true.size == y_pred.size:
            y_true = y_true.reshape(y_pred.shape)
        residual = y_pred - y_true
        dw = (1 / m) * np.dot(x_train.T, residual)
        db = (1 / m) * np.sum(residual)
        # Out-of-place updates: the arrays passed in by the caller stay untouched.
        w = w - learning_rate * dw
        b = b - learning_rate * db
        if i % 100 == 0:
            current_cost = cost(x_train, y_true, w, b)
            print(f"Iteration {i}, Cost: {current_cost}")
    return w, b


In [42]:
# Initialize weights and bias
# The seed is fixed so the random starting point is the same on every run.
np.random.seed(2147483647)
w = np.random.randn(x_train.shape[1], 1)
b = np.random.randn(1)

# Train the model: call gradient_descent repeatedly until the cost changes by
# no more than 0.00001 between two consecutive calls.
old_cost = float('inf')
current_cost = cost(x_train, y_train, w, b)
iteration = 0

while abs(old_cost - current_cost) > 0.00001:
    old_cost = current_cost
    w, b = gradient_descent(x_train, y_train, w, b)
    current_cost = cost(x_train, y_train, w, b)
    iteration += 1
    print(f"Iteration {iteration}, Cost: {current_cost}")

print(f"Final Cost: {current_cost}")

# Load test data (read the workbook once, then split features from the target column)
test_data = pd.read_excel("D:\\IIT\\Extras\\Learner's Space\\ML\\Assignment1\\Test data.xlsx")
x_predict = test_data.iloc[:, :-1].to_numpy()
ans = test_data.iloc[:, -1].to_numpy()

# Apply same preprocessing steps as for the training data: encode, cast to
# float64, then scale with the mean/std computed on the training set.
x_predict = feature_changing(x_predict).astype(np.float64)
x_predict = (x_predict - x_mean) / x_std

# Predict marks
y_predict = np.dot(x_predict, w) + b

# Evaluate accuracy: percentage of predictions within 0.5 marks of the true value
accuracy = np.sum(np.abs(y_predict.flatten() - ans) < 0.5) / len(ans) * 100
accuracy = round(accuracy, 2)
ok = 'Congratulations' if accuracy > 95 else 'Optimization required'
print(f"{ok}, your accuracy is {accuracy}%")


Iteration 0, Cost: 2444.0423300034918
Iteration 100, Cost: 327.30545449819175
Iteration 200, Cost: 43.837315566020074
Iteration 300, Cost: 5.871893770939658
Iteration 400, Cost: 0.7866032490520842
Iteration 500, Cost: 0.1053876738704565
Iteration 600, Cost: 0.014125032523688439
Iteration 700, Cost: 0.0018974593221238183
Iteration 800, Cost: 0.000259036067689191
Iteration 900, Cost: 3.94771402058059e-05
Iteration 1, Cost: 1.0144580425993177e-05
Iteration 0, Cost: 1.005212890284038e-05
Iteration 100, Cost: 6.1082372454842815e-06
Iteration 200, Cost: 5.579573698363449e-06
Iteration 300, Cost: 5.5087001473968045e-06
Iteration 400, Cost: 5.4991974881886884e-06
Iteration 500, Cost: 5.497923194644101e-06
Iteration 600, Cost: 5.497752285136142e-06
Iteration 700, Cost: 5.497729358171435e-06
Iteration 800, Cost: 5.497726281902835e-06
Iteration 900, Cost: 5.4977258690304504e-06
Iteration 2, Cost: 5.4977258137746695e-06
Final Cost: 5.4977258137746695e-06
Congratulations, your accuracy is 100.0%
