## Importing Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Loading the CSV file into a pandas DataFrame

In [None]:
# Read the car-price dataset from the CSV file in the working directory
# and print the resulting DataFrame for a first look at its contents.
data = pd.read_csv(filepath_or_buffer="CarPrice_Assignment.csv")
print(data)

## Before we begin linear regression, we must pre-process the data so that it is more suitable for the regression model
### Using the correlation matrix to drop irrelevant and highly correlated features

In [None]:
# Keep only the numeric columns and compute their pairwise correlations.
data_num = data.select_dtypes(include=['number'])
cor = data_num.corr()

# Draw the correlation matrix as an annotated heatmap on a 12x10 figure.
plt.figure(figsize=(12, 10))
sns.heatmap(cor, cmap='viridis', annot=True)
def correlationfeattofeat(dataset, threshold = 0.9):
    """Return the columns that are highly correlated with an earlier column.

    The pairwise correlation matrix of ``dataset`` is scanned below its
    diagonal: a column is reported when the absolute correlation between it
    and any column positioned before it exceeds ``threshold``. The first
    column of each correlated pair is therefore never reported by that pair.

    Parameters:
        dataset: DataFrame whose columns are compared with each other.
        threshold: absolute correlation above which a column is reported.

    Returns:
        A set with the names of the reported columns.
    """
    pairwise = dataset.corr()
    names = pairwise.columns
    return {
        name
        for row, name in enumerate(names)
        # Only look at columns to the left of the diagonal for this row.
        if any(abs(pairwise.iloc[row, col]) > threshold for col in range(row))
    }
def correlationfeattotarget(dataset, threshold = 0.1, target = 'price'):
    """Return the columns that are only weakly correlated with the target.

    Parameters:
        dataset: DataFrame containing the feature columns and the target.
        threshold: a column is reported when the absolute value of its
            correlation with the target is strictly below this value.
        target: name of the target column. Defaults to 'price', which was
            previously hard-coded.

    Returns:
        A set with the names of the weakly correlated columns. Columns whose
        correlation with the target is NaN (for example constant columns)
        are not reported, because NaN never compares below the threshold.

    Raises:
        KeyError: if ``target`` is not a column of the correlation matrix.
    """
    # One row of the correlation matrix holds every column's correlation
    # with the target, so the comparison can be done in a single step.
    target_corr = dataset.corr().loc[target].abs()
    return set(target_corr.index[target_corr < threshold])

# Columns with almost no linear relationship to the price.
corr_features_irrelevant = correlationfeattotarget(data_num)
print(corr_features_irrelevant)
# Columns that largely duplicate the information of an earlier column.
corr_features_correlating = correlationfeattofeat(data_num)
print(corr_features_correlating)
data_num_proc1 = data_num.drop(columns=list(corr_features_irrelevant))
# Both sets are computed on the full numeric frame, so a column can appear in
# both; errors='ignore' keeps the second drop from failing on a column that
# is already gone. The target must never be removed as a "redundant feature",
# because the price column is still needed afterwards.
data_num_proc2 = data_num_proc1.drop(
    columns=list(corr_features_correlating - {'price'}),
    errors='ignore',
)



### Encoding the categorical data and standardizing the numerical data

In [4]:
# Remove the target ('price') and the row identifier ('car_ID') from the
# numeric predictors. errors='ignore' because either column may already have
# been removed by the correlation-based filtering.
data_num_proc3 = data_num_proc2.drop(columns=['price', 'car_ID'], errors='ignore')
y = data['price']
data_cat = data.select_dtypes(include=['object'])

# Standardize every numeric predictor to zero mean and unit variance.
# NOTE(review): the statistics are computed on the full dataset, before the
# train/test split, so the test rows influence the scaling.
mean = data_num_proc3.mean(axis=0)
# A constant column has a standard deviation of 0 and would turn into NaN
# (0/0); dividing by 1 instead leaves such a column at all zeros.
std = data_num_proc3.std(axis=0).replace(0, 1)
data_num_norm = (data_num_proc3 - mean) / std

# One-hot encode the categorical columns, dropping the first level of each,
# and convert boolean indicator columns to 0/1 integers.
data_cat_proc = pd.get_dummies(data_cat, drop_first=True)
boolean_cols = data_cat_proc.columns[data_cat_proc.dtypes == 'bool']
data_cat_proc[boolean_cols] = data_cat_proc[boolean_cols].astype(int)

# Final design matrix: standardized numeric columns next to the indicators.
data_proc = pd.concat([data_num_norm, data_cat_proc], axis=1)

### Splitting given data into a dataset for training and another for testing

In [None]:

# Fraction of the rows held out for testing.
test_size = 0.3
# Fixed seed so the split, and every metric derived from it, is reproducible
# from run to run.
random_seed = 42

n_samples = data_proc.shape[0]

n_test_samples = int(n_samples * test_size)
print(n_test_samples)

# Shuffle the row positions with a local generator; the global NumPy random
# state is left untouched.
rng = np.random.default_rng(random_seed)
indices = rng.permutation(n_samples)

# Split the data into training and testing sets: the first n_test_samples
# shuffled positions form the test set, the remainder the training set.
train_indices = indices[n_test_samples:]
test_indices = indices[:n_test_samples]

X_train = data_proc.iloc[train_indices]
y_train = y.iloc[train_indices]

X_test = data_proc.iloc[test_indices]
y_test = y.iloc[test_indices]

# Collects the mean squared error of every training iteration.
mse_values = []

## Performing Linear Regression
### Creating a class LinearRegression, setting its properties and defining its functions

In [6]:
class LinearRegression:
    """Linear regression trained with batch gradient descent on the MSE.

    Parameters:
        lr: learning rate, i.e. the step size of every gradient update.
        n_iters: maximum number of gradient-descent iterations.
        tol: early-stopping tolerance. Training stops as soon as the MSE
            improves by less than ``tol`` between two consecutive iterations
            (an increasing MSE also stops training). The value is absolute,
            so the default of 1e3 only makes sense for targets with a large
            scale; pass a much smaller value for small-scale targets.

    Attributes:
        weights: one coefficient per feature, or None before ``fit``.
        bias: the intercept.
        mse_history: the MSE recorded at every iteration of the last ``fit``.
    """

    def __init__(self, lr = 0.01, n_iters = 600, tol = 1e3):
        self.lr = lr
        self.n_iters = n_iters
        self.tol = tol
        self.weights = None
        self.bias = 0
        self.mse_history = []

    def fit(self, X, y):
        """Fit weights and bias to ``X`` (n_samples x n_features) and ``y``.

        Both inputs may be NumPy arrays or pandas objects; they are converted
        to float arrays first. Returns the fitted instance.
        """
        # Plain float arrays avoid object-dtype surprises from mixed-type
        # DataFrames; a column-vector target is flattened to one dimension.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).reshape(-1)
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        self.mse_history = []
        # Backward compatibility: when a module-level list named `mse_values`
        # exists, every MSE is appended to it as well, so code that reads
        # that list keeps working. The class no longer requires it.
        shared_history = globals().get('mse_values')
        prev_mse = float('inf')
        for i in range(self.n_iters):
            residuals = np.dot(X, self.weights) + self.bias - y
            # Gradients of the (halved) MSE with respect to weights and bias.
            delw = (1/n_samples)*np.dot(X.T, residuals)
            delb = (1/n_samples)*np.sum(residuals)
            self.weights = self.weights - self.lr*delw
            self.bias = self.bias - self.lr*delb
            # The recorded MSE belongs to the parameters before this update.
            mse = np.mean(residuals ** 2)
            self.mse_history.append(mse)
            if isinstance(shared_history, list):
                shared_history.append(mse)
            # Stop once the improvement over the previous iteration is small.
            if i > 0 and prev_mse - mse < self.tol:
                break
            prev_mse = mse
        return self

    def predict(self, X):
        """Return the predicted targets for the rows of ``X``."""
        return np.dot(np.asarray(X, dtype=float), self.weights) + self.bias

### Creating an object to run linear regression on the data

In [None]:
# Train the model on the training set and predict the held-out prices.
regressor = LinearRegression()
regressor.fit(X_train,y_train)
predictions = regressor.predict(X_test)
print(predictions)

# Scatter the predictions against the actual prices.
plt.scatter(y_test, predictions)
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.title('Actual vs Predicted Prices')
# Dotted y = x reference line (perfect predictions). Its end points cover the
# full range of both the actual and the predicted values, so it spans every
# plotted point.
line_min = min(min(y_test), min(predictions))
line_max = max(max(y_test), max(predictions))
plt.plot([line_min, line_max], [line_min, line_max], 'k:')
plt.show()

## Depiction of change in MSE over iterations

In [None]:
# Label the figure first, then draw the MSE recorded at each training
# iteration and display the plot.
plt.title('MSE vs Iteration')
plt.xlabel('Iteration')
plt.ylabel('Mean Square Error')
plt.plot(mse_values)
plt.show()

## Calculating R2 score

In [None]:

# R2 = 1 - SS_res / SS_tot, evaluated on the test set.
# SS_tot: squared deviations of the actual prices from their mean.
total_sum = np.sum(np.square(y_test - np.mean(y_test)))
# SS_res: squared differences between actual and predicted prices.
residual_sum = np.sum(np.square(y_test - predictions))
r2 = 1 - residual_sum / total_sum
print(r2)