# Dataset 

[Vehicle Dataset](https://www.kaggle.com/datasets/pritech/vehicle-silhouettes)

In [1]:
import numpy as np
import pandas as pd

In [2]:
# load the dataset

filename = './dataset/vehicle.csv'

vehicle_df = pd.read_csv(filename)

vehicle_df.head()

Unnamed: 0,compactness,circularity,distance_circularity,radius_ratio,pr.axis_aspect_ratio,max.length_aspect_ratio,scatter_ratio,elongatedness,pr.axis_rectangularity,max.length_rectangularity,scaled_variance,scaled_variance.1,scaled_radius_of_gyration,scaled_radius_of_gyration.1,skewness_about,skewness_about.1,skewness_about.2,hollows_ratio,class
0,95,48.0,83.0,178.0,72.0,10,162.0,42.0,20.0,159,176.0,379.0,184.0,70.0,6.0,16.0,187.0,197,van
1,91,41.0,84.0,141.0,57.0,9,149.0,45.0,19.0,143,170.0,330.0,158.0,72.0,9.0,14.0,189.0,199,van
2,104,50.0,106.0,209.0,66.0,10,207.0,32.0,23.0,158,223.0,635.0,220.0,73.0,14.0,9.0,188.0,196,car
3,93,41.0,82.0,159.0,63.0,9,144.0,46.0,19.0,143,160.0,309.0,127.0,63.0,6.0,10.0,199.0,207,van
4,85,44.0,70.0,205.0,103.0,52,149.0,45.0,19.0,144,241.0,325.0,188.0,127.0,9.0,11.0,180.0,183,bus


# Preprocessing

In [3]:
vehicle_df.isna().sum()

compactness                    0
circularity                    5
distance_circularity           4
radius_ratio                   6
pr.axis_aspect_ratio           2
max.length_aspect_ratio        0
scatter_ratio                  1
elongatedness                  1
pr.axis_rectangularity         3
max.length_rectangularity      0
scaled_variance                3
scaled_variance.1              2
scaled_radius_of_gyration      2
scaled_radius_of_gyration.1    4
skewness_about                 6
skewness_about.1               1
skewness_about.2               1
hollows_ratio                  0
class                          0
dtype: int64

In [4]:
vehicle_df = vehicle_df.dropna()

# X, y split and one hot encoding of y

In [5]:
X = vehicle_df.iloc[:,0:-1]
y = vehicle_df.iloc[:, -1]

unique_classes = y.unique()

# Initialize an empty dataframe with column names as unique classes
one_hot_df = pd.DataFrame(0, index=y.index, columns=unique_classes)

# set the value to 1 where the label matches
for value in unique_classes:
    one_hot_df[value] = (y == value).astype(int)

y = one_hot_df

# Train, test split

In [6]:
# Shuffle the dataset while maintaining the correspondence between X and y
shuffled_indices = np.random.permutation(len(X))

# Apply the shuffled indices to both X and y
X_shuffled = X.iloc[shuffled_indices].values
y_shuffled = y.iloc[shuffled_indices].values

# 80% train, 20% test

# X
X_train, X_test = np.split(X_shuffled, [int(0.8 * len(X_shuffled))])

# y
y_train, y_test = np.split(y_shuffled, [int(0.8 * len(y_shuffled))])

# Logistic Regressor

In [17]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def logistic_loss(y_true, y_pred):
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    
class LogisticRegression:
    
    def __init__(self, weights=[], alpha=0.01, max_iters=100, threshold=1e-6):
        self.weights = weights;
        self.alpha = alpha;
        self.max_iters = max_iters;
        self.threshold = threshold;

    def set_threshold(self, threshold):
        self.threshold = threshold

    def set_max_iters(self, max_iters):
        self.max_iters = max_iters

    def get_weights(self):
        return self.weights

    def train(self, X, Y, alpha = 0.01, max_iters = None, print_loss_iter = 100):
        """
        - X: Training data (features).
        - Y: Target variable (labels).
        - alpha: Learning rate (default = 0.01).
        - max_iters: Maximum number of iterations for training. If None, use stopping criteria
        (e.g., when the loss is constant for the last 3 epochs).
        - print_loss_iter: Print the loss every n iteration (default = 100)
        - If max_iters is not provided, stop when the change in loss falls below a defined threshold.
        """
        
        prevLoss = float('inf')
        
        # setting the class attributes
        self.alpha = alpha
        self.max_iters = max_iters

        # pre-prend a column of 1's in X
        ones_col = (np.ones(len(X))).reshape(-1, 1)
        X = np.hstack((ones_col, X))

        # initialize weights
        self.weights = np.zeros(X.shape[1]).reshape(-1, 1)

        # if max_iters is not provided, fall back to the pre-defined threshold
        for num_iters in range(max_iters if max_iters is not None else 1_000_000):
            
            # multiply with weights and fit through a sigmoid activation function to get prediction
            y_pred = sigmoid(np.dot(X, self.weights))
    
            # get loss
            tot_err = logistic_loss(Y, y_pred)
    
            # update weights based on gradient descent
            self.weights[0] -= (alpha * (2 * (np.mean(y_pred - Y))))
    
            for i in range(len(self.weights)):
                if i != 0: # first weight has been updated
                    self.weights[i] -= (alpha * (2 * (np.mean(np.dot(X.T, (y_pred - Y)))))) 
    
            # multiply with weights to get prediction
            y_pred = sigmoid(np.dot(X, self.weights))
    
            # get loss
            tot_err = logistic_loss(Y, y_pred)

            if num_iters % print_loss_iter == 0:
                print(f"Error on iteration {num_iters}: {tot_err}")
            
            # Check for convergence
            if max_iters is None and abs(prevLoss - tot_err) < self.threshold:
                print("Converged according to the predefined threshold")
                break

            prevLoss = tot_err

    def predict(self, X_test):
        return np.round(sigmoid(np.dot(X_test, self.weights[1:]) + self.weights[0]), 3)

    def predict_class(self, X_test):
        return np.round(sigmoid(np.dot(X_test, self.weights[1:]) + self.weights[0]))

# Training based on One vs All Strategy

In [8]:
lr1 = LogisticRegression()
lr1.train(X=X_train, Y=y_train[:, 0], alpha=0.00000001, print_loss_iter=5)

lr2 = LogisticRegression()
lr2.train(X=X_train, Y=y_train[:, 1], alpha=0.00000001, print_loss_iter=5)

lr3 = LogisticRegression()
lr3.train(X=X_train, Y=y_train[:, 2], alpha=0.00000001, print_loss_iter=5)

Error on iteration 0: 0.5715486288832438
Error on iteration 5: 0.5606793093256582
Converged according to the predefined threshold
Error on iteration 0: 0.6931076427375129
Converged according to the predefined threshold
Error on iteration 0: 0.5745235035159724
Error on iteration 5: 0.5640425810198849
Converged according to the predefined threshold


# Test and Predict confidence scores for each class

In [9]:
y_pred1 = lr1.predict(X_test)

In [10]:
y_pred2 = lr2.predict(X_test)

In [11]:
y_pred3 = lr3.predict(X_test)

# Finalize the prediction by taking the highest confidence score

In [12]:
predictions = np.stack([y_pred1, y_pred2, y_pred3], axis=1)

final_predictions = np.argmax(predictions, axis=1)

# Accuracy

In [13]:
y_test_labels = np.argmax(y_test, axis=1)

correct_predictions = 0

for i in range(len(y_test_labels)):
    if final_predictions[i] == y_test_labels[i]:
        correct_predictions += 1

accuracy = correct_predictions / len(y_test_labels)

print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 52.15%


# Get weights of each Logistic Regressor

In [14]:
lr1.get_weights()

array([[-7.88699097e-09],
       [-5.23057683e-04],
       [-5.23057683e-04],
       [-5.23057683e-04],
       [-5.23057683e-04],
       [-5.23057683e-04],
       [-5.23057683e-04],
       [-5.23057683e-04],
       [-5.23057683e-04],
       [-5.23057683e-04],
       [-5.23057683e-04],
       [-5.23057683e-04],
       [-5.23057683e-04],
       [-5.23057683e-04],
       [-5.23057683e-04],
       [-5.23057683e-04],
       [-5.23057683e-04],
       [-5.23057683e-04],
       [-5.23057683e-04]])

In [15]:
lr2.get_weights()

array([[1.21322302e-10],
       [8.47902542e-06],
       [8.47902542e-06],
       [8.47902542e-06],
       [8.47902542e-06],
       [8.47902542e-06],
       [8.47902542e-06],
       [8.47902542e-06],
       [8.47902542e-06],
       [8.47902542e-06],
       [8.47902542e-06],
       [8.47902542e-06],
       [8.47902542e-06],
       [8.47902542e-06],
       [8.47902542e-06],
       [8.47902542e-06],
       [8.47902542e-06],
       [8.47902542e-06],
       [8.47902542e-06]])

In [16]:
lr3.get_weights()

array([[-7.77485142e-09],
       [-5.15279239e-04],
       [-5.15279239e-04],
       [-5.15279239e-04],
       [-5.15279239e-04],
       [-5.15279239e-04],
       [-5.15279239e-04],
       [-5.15279239e-04],
       [-5.15279239e-04],
       [-5.15279239e-04],
       [-5.15279239e-04],
       [-5.15279239e-04],
       [-5.15279239e-04],
       [-5.15279239e-04],
       [-5.15279239e-04],
       [-5.15279239e-04],
       [-5.15279239e-04],
       [-5.15279239e-04],
       [-5.15279239e-04]])