# Necessary installations

In [None]:
!pip install numpy
!pip install matplotlib
!pip install ucimlrepo
!pip install scikit-learn

# Import Statements

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import statistics
import random
import time  # Importing time module for tracking elapsed time
from ucimlrepo import fetch_ucirepo
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, classification_report

# Defining the algorithm functions



In [None]:
def QuantileRK(A, b, q, t, N, correct_labels, labels, numMislabelled, numDataPoints):
    """Run ``N`` steps of a quantile-thresholded randomized row update for ``A x <= b``.

    Starting from a uniformly random ``x`` in ``[0, 1)^n``, every iteration picks
    one row ``k`` of ``A``, measures its violation ``e = max(a_k . x - b_k, 0)``
    and applies ``x <- x - e * a_k`` only when ``e`` does not exceed the
    ``q``-quantile of the non-negative residuals of ``t`` rows sampled with
    replacement.

    Parameters
    ----------
    A : array-like, shape (m, n)
        Coefficient matrix of the inequality system.
    b : array-like, shape (m,)
        Right-hand side of the inequality system.
    q : float
        Quantile level handed to ``np.quantile``.
    t : int
        Number of rows sampled (with replacement) for the quantile estimate.
    N : int
        Number of iterations.
    correct_labels, labels : array-like, length m
        Rows where the two agree are treated as uncorrupted; they are used only
        for the progress metric, never for the update itself.
    numMislabelled, numDataPoints : int
        The progress metric is normalised by ``numDataPoints - numMislabelled``.

    Returns
    -------
    x : numpy.ndarray, shape (n,)
        Final iterate.
    residuals : list of float
        For each iteration, the percentage of uncorrupted rows with
        ``a_i . x > b_i``, measured before that iteration's update.
    cumulative_times : list of float
        Running total, in seconds, of the time spent in each iteration.

    Notes
    -----
    * ``random`` is re-seeded with 0 on every call, so the sequence of picked
      rows ``k`` repeats across calls. NumPy's global generator (initial ``x``
      and the sampled rows) is not seeded here.
    * The step is ``e * a_k`` with no division by ``||a_k||^2``.
    * NOTE(review): the timed span of each iteration also covers the progress
      metric, not only the update step -- confirm this is intended.
    """
    # Accept plain lists as well as arrays for everything that gets indexed or compared.
    A = np.asarray(A)
    b = np.asarray(b)
    m, n = A.shape
    x = np.random.rand(n)  # Initial guess for the solution
    residuals = []
    cumulative_times = []

    # Rows whose label agrees with the reference label are the "uncorrupted" ones.
    agree = np.asarray(correct_labels) == np.asarray(labels)
    A_uncorrupted = A[agree]
    b_uncorrupted = b[agree]

    random.seed(0)  # Fixes the sequence of picked rows k

    cumulative_time = 0
    for _ in range(N):
        # perf_counter is a monotonic, high-resolution clock meant for measuring intervals.
        start_time = time.perf_counter()

        # Progress metric: share of uncorrupted inequalities the current x violates.
        violated = A_uncorrupted @ x > b_uncorrupted
        residuals.append(np.count_nonzero(violated) / (numDataPoints - numMislabelled) * 100)

        # ALGORITHM IS FROM HERE DOWN
        # Rows used to estimate the residual quantile (sampled with replacement).
        sampled_indices = np.random.choice(m, t, replace=True)

        # Row to project onto; randrange avoids building np.arange(m) on every iteration.
        k = random.randrange(m)
        a_k = A[k]

        # Violation of the picked inequality (zero when it is satisfied).
        e = max(np.dot(x, a_k) - b[k], 0.0)

        # Non-negative residuals of the sampled rows.
        sampled_residuals = np.maximum(A[sampled_indices] @ x - b[sampled_indices], 0)

        # Only step when the violation is not larger than the q-quantile.
        if e <= np.quantile(sampled_residuals, q):
            x = x - e * a_k

        cumulative_time += time.perf_counter() - start_time
        cumulative_times.append(cumulative_time)

    return x, residuals, cumulative_times

def perceptronModified(data, labels, q, t, N, correct_labels, numMislabelled, numDataPoints):
    """Build the inequality system ``-labels[i] * data[:, i] . x <= 0`` and solve it with QuantileRK.

    Parameters
    ----------
    data : array-like, shape (n_features, m)
        One column per sample.
    labels : array-like, shape (m,)
        One label per sample; each sample's row of the system is scaled by it.
    q, t, N, correct_labels, numMislabelled, numDataPoints
        Forwarded unchanged to ``QuantileRK``.

    Returns
    -------
    tuple
        ``(x, residuals, cumulative_times)`` exactly as returned by ``QuantileRK``.
    """
    data = np.asarray(data)
    labels = np.asarray(labels)
    m = data.shape[1]

    # Row i of the system is -labels[i] * (sample i). Scaling the rows by
    # broadcasting gives the same matrix as -(diag(labels) @ data.T) without
    # materialising a dense m x m diagonal matrix.
    X_tilda = -(labels[:, np.newaxis] * data.T)

    return QuantileRK(X_tilda, np.zeros((m,)), q, t, N, correct_labels, labels, numMislabelled, numDataPoints)

# Sampled Indices: Timing them

# Real World Data Set: Banknote Authentication

In [None]:
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Download the Banknote Authentication data set (UCI repository id 267)
banknote_data = fetch_ucirepo(id=267)

# Pull features and targets out as NumPy arrays; the targets are flattened to 1-D
X = banknote_data.data.features.values
y = banknote_data.data.targets.values.ravel()

# Apply the affine map 2*y - 1 to the targets (sends 0 to -1 and 1 to +1)
y = 2 * y - 1

# Standardise the features and keep a transposed copy for the algorithm
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_T = X_scaled.T

# We set the SVM baseline

In [None]:
# Fit a linear-kernel SVM on every sample (no train/test split) and use it
# to label those same samples.
svm = SVC(kernel='linear').fit(X_scaled, y)
y_pred = svm.predict(X_scaled)

# Compare the SVM's labels with the data set's labels
accuracy = accuracy_score(y, y_pred)
report = classification_report(y, y_pred)

print(f"Accuracy of SVM classifier with linear kernel: {accuracy * 100:.2f}%")
print("Classification Report:")
print(report)

# Positions where the two labelings disagree, and the disagreement rate
incorrectly_classified = np.flatnonzero(y != y_pred)
misclassified = 1 - accuracy

In [None]:
# Class-balance check: count how many entries of y equal +1 and how many equal -1.
count_ones = (y == 1).sum()
count_minus_ones = (y == -1).sum()

print(f"Number of 1s in y: {count_ones}")
print(f"Number of -1s in y: {count_minus_ones}")

In [None]:
# Experiment configuration for the banknote runs.
data = X_scaled_T                             # one column per sample
labels = y                                    # labels the inequality system is built from
correct_labels = y_pred                       # reference labels: the linear SVM's predictions
numDataPoints = data.shape[1]                 # number of samples
numMislabelled = len(incorrectly_classified)  # samples where y and y_pred disagree
N = 5000                                      # iterations per run
t = numDataPoints                             # rows sampled for the quantile estimate

In [None]:
quantile_list = [0.85, 0.9, 0.95, 0.97]
nTrials = 10      # number of runs averaged per quantile
residual_list = []  # one trial-averaged residual curve per quantile, in quantile_list order

for q in quantile_list:
    # Residual curve (second return value) of each of the nTrials runs at this quantile
    trial_curves = [
        perceptronModified(data, labels, q, numDataPoints, N, correct_labels, numMislabelled, numDataPoints)[1]
        for _ in range(nTrials)
    ]
    # Average across runs, iteration by iteration
    residual_list.append(np.mean(trial_curves, axis=0))

In [None]:
import numpy as np
import matplotlib.pyplot as plt


line_styles = ['-', '--', ':', '-.']

plt.figure(figsize=(8, 6))

# One curve per quantile, drawn against the iteration index
for quantile, style, curve in zip(quantile_list, line_styles, residual_list):
    plt.plot(np.arange(len(curve)), curve, label=f'{quantile}', linestyle=style, linewidth=3)

# Axis ranges and ticks.
# NOTE(review): setting explicit y-ticks from 0 to 100 after ylim(20, 80) may
# widen the visible range beyond (20, 80) -- confirm which range is intended.
plt.xlim(0, N)
plt.ylim(20, 80)
plt.yticks(np.arange(0, 110, 10), fontsize=14)
plt.xticks(fontsize=14)

# Labels, legend and grid
plt.xlabel('Iterations', fontsize=16)
plt.ylabel('Percent of Misclassified Inequalities', fontsize=16)
plt.legend(title='Quantile', fontsize=14, title_fontsize=16)
plt.grid(True)

# Write the figure to disk, then display it
plt.savefig('banknotes_data_final_plot.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Sample sizes t compared in the timing experiment. The largest one is the
# full data set, so it is taken from numDataPoints instead of repeating the
# row count (1372 for the banknote data) as a literal.
sampled_indices_list = [450, 900, numDataPoints]

# Line style and marker per sample size, used by the timing plots
line_styles = ['-', '--', ':']
markers = ['o', 's', '*']

In [None]:
# Trial-averaged cumulative-time and error curves, kept both in order
# (lists) and keyed by sample size t (dicts).
residual_list, residual_errors = [], []
residual_dict, error_dict = {}, {}

for t in sampled_indices_list:
    time_runs, error_runs = [], []
    for trial in range(nTrials):
        solution, error_curve, time_curve = perceptronModified(
            data, labels, 0.97, t, N, correct_labels, numMislabelled, numDataPoints
        )
        time_runs.append(time_curve)
        error_runs.append(error_curve)

    # Average over the runs, iteration by iteration
    residual_dict[t] = np.mean(time_runs, axis=0)
    error_dict[t] = np.mean(error_runs, axis=0)
    residual_list.append(residual_dict[t])
    residual_errors.append(error_dict[t])

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))

# Error against cumulative time, one curve per sample size; styles and
# markers cycle if there are more sample sizes than entries.
for idx, t in enumerate(sampled_indices_list):
    plt.plot(
        residual_dict[t],
        error_dict[t],
        label=f'{t}',
        linestyle=line_styles[idx % len(line_styles)],
        marker=markers[idx % len(markers)],
        linewidth=3,
    )

plt.xlabel('Time (in seconds)', fontsize=16)
plt.ylabel('Percent of Misclassified Inequalities', fontsize=16)
plt.legend(title='Number Sampled Indices', fontsize=14, title_fontsize=16, loc='upper right')
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.grid(True, which='both', axis='both')

# Anchor both axes at zero
plt.xlim(left=0)
plt.ylim(bottom=0)

# Write the figure to a PNG file, then display it
plt.savefig('time_sampled_indices_q_equals_0.97_5000_banknotes_final_iterations.png', dpi=300, bbox_inches='tight')
plt.show()

# Occupancy dataset

In [None]:
# Fetch the Occupancy Detection data set (UCI repository id 357)
occupancy_detection = fetch_ucirepo(id=357)

# Work on a new frame without the 'date' column (drop() returns a new DataFrame)
occupancy_features = occupancy_detection.data.features.drop(columns=['date'])

# Convert the measurement columns to numeric dtype; unparseable entries become NaN.
# Whole-column assignment is used on purpose: with pandas >= 2.0,
# `df.loc[:, col] = values` writes into the existing column in place and can
# leave it as object dtype.
numeric_columns = ['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio']
for column in numeric_columns:
    occupancy_features[column] = pd.to_numeric(occupancy_features[column], errors='coerce')

# Store the cleaned frame back where the following cells read it from
occupancy_detection.data.features = occupancy_features

In [None]:
# Features and targets of the occupancy data set
X = occupancy_detection.data.features
y = occupancy_detection.data.targets

# Remove rows 8143 and 10809 from both frames (the original notebook
# describes these as the NaN rows).
rows_to_drop = [8143, 10809]
X_clean = X.drop(index=rows_to_drop)

# Targets as a flat NumPy array, mapped through 2*y - 1 (sends 0 to -1 and 1 to +1)
y_clean = 2 * y.drop(index=rows_to_drop).values.ravel() - 1

# We set the SVM baseline again

In [None]:
# Standardise the cleaned features and keep a transposed copy for the algorithm
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_clean)
X_scaled_T = X_scaled.T

# Fit a linear-kernel SVM on every sample (no train/test split) and use it
# to label those same samples.
svm = SVC(kernel='linear').fit(X_scaled, y_clean)
y_pred = svm.predict(X_scaled)

# Compare the SVM's labels with the data set's labels
accuracy = accuracy_score(y_clean, y_pred)
report = classification_report(y_clean, y_pred)

print(f"Accuracy of SVM classifier with linear kernel: {accuracy * 100:.2f}%")
print("Classification Report:")
print(report)

# Positions where the linear SVM disagrees with the data set's labels
incorrectly_classified = np.flatnonzero(y_clean != y_pred)

In [None]:
# Experiment configuration for the occupancy runs.
data = X_scaled_T                             # one column per sample
labels = y_pred                               # labels the inequality system is built from (here: the SVM's predictions)
correct_labels = y_clean                      # reference labels (here: the data set's own labels)
numDataPoints = data.shape[1]                 # number of samples
numMislabelled = len(incorrectly_classified)  # samples where y_clean and y_pred disagree
N = 5000                                      # iterations per run
t = numDataPoints                             # rows sampled for the quantile estimate
# NOTE(review): the banknote configuration assigns these two roles the other
# way round (labels = y, correct_labels = y_pred) -- confirm which is intended.

# Accuracy Plots

In [None]:
quantile_list = [0.85, 0.9, 0.92, 0.96]
residual_list = []  # one trial-averaged residual curve per quantile, in quantile_list order

for q in quantile_list:
    # Residual curve (second return value) of each of the nTrials runs at this quantile
    trial_curves = [
        perceptronModified(data, labels, q, numDataPoints, N, correct_labels, numMislabelled, numDataPoints)[1]
        for _ in range(nTrials)
    ]
    # Average across runs, iteration by iteration
    residual_list.append(np.mean(trial_curves, axis=0))

In [None]:
line_styles = ['-', '--', ':', '-.']

plt.figure(figsize=(8, 6))

# One curve per quantile, drawn against the iteration index
for quantile, style, curve in zip(quantile_list, line_styles, residual_list):
    plt.plot(np.arange(len(curve)), curve, label=f'{quantile}', linestyle=style, linewidth=3)

# Axis ranges and ticks.
# NOTE(review): setting explicit y-ticks from 0 to 100 after ylim(20, 80) may
# widen the visible range beyond (20, 80) -- confirm which range is intended.
plt.xlim(0, N)
plt.ylim(20, 80)
plt.yticks(np.arange(0, 110, 10), fontsize=14)
plt.xticks(fontsize=14)

# Labels, legend and grid
plt.xlabel('Iterations', fontsize=16)
plt.ylabel('Percent of Misclassified Inequalities', fontsize=16)
plt.legend(title='Quantile', fontsize=14, title_fontsize=16)
plt.grid(True)

# Write the figure to disk, then display it
plt.savefig('Occupancy_final_plot.png', dpi=300, bbox_inches='tight')
plt.show()


# Timing the iterations

In [None]:
# Sample sizes t compared in the occupancy timing experiment. The largest one
# is the full data set, so it is taken from numDataPoints instead of repeating
# the row count (20560 for the cleaned occupancy data) as a literal.
sampled_indices_list = [5000, 15000, numDataPoints]

In [None]:
# Trial-averaged cumulative-time and error curves, kept both in order
# (lists) and keyed by sample size t (dicts).
residual_list, residual_errors = [], []
residual_dict, error_dict = {}, {}

for t in sampled_indices_list:
    time_runs, error_runs = [], []
    for trial in range(nTrials):
        solution, error_curve, time_curve = perceptronModified(
            data, labels, 0.96, t, N, correct_labels, numMislabelled, numDataPoints
        )
        time_runs.append(time_curve)    # cumulative time per iteration
        error_runs.append(error_curve)  # error metric per iteration

    # Average over the runs, iteration by iteration
    residual_dict[t] = np.mean(time_runs, axis=0)
    error_dict[t] = np.mean(error_runs, axis=0)
    residual_list.append(residual_dict[t])
    residual_errors.append(error_dict[t])

In [None]:
plt.figure(figsize=(8, 6))

# Error against cumulative time, one curve per sample size; styles and
# markers cycle if there are more sample sizes than entries.
for idx, t in enumerate(sampled_indices_list):
    plt.plot(
        residual_dict[t],
        error_dict[t],
        label=f'{t}',
        linestyle=line_styles[idx % len(line_styles)],
        marker=markers[idx % len(markers)],
        linewidth=3,
    )

plt.xlabel('Time (in seconds)', fontsize=16)
plt.ylabel('Percent of Misclassified Inequalities', fontsize=16)
plt.legend(title='Number Sampled Indices', fontsize=14, title_fontsize=16, loc='upper right')
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.grid(True, which='both', axis='both')

# Anchor both axes at zero
plt.xlim(left=0)
plt.ylim(bottom=0)

# Write the figure to a high-resolution PNG, then display it
plt.savefig('time_sampled_indices_q_equals_0.96_5000_Occupancy_iterations.png', dpi=300, bbox_inches='tight')
plt.show()