In [None]:
import numpy as np
import pandas as pd
import platform
import os
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [None]:
data_direc = os.getcwd() + "/"
if platform.system() == "Windows":
    data_direc = data_direc.replace("/", "\\")
train = pd.read_csv(data_direc + "arcene_train.csv",header = None)
true_train_labels = pd.read_csv(data_direc + "arcene_train_labels.csv")
test = pd.read_csv(data_direc + "arcene_valid.csv",header = None)
test_labels = pd.read_csv(data_direc + "arcene_valid_labels.csv")

In [None]:
def loss_function(y, pred):
    y = 2 * y - 1
    return np.sum(np.log(1 + np.exp(-y * pred)))

# Normalize the data
scaler = StandardScaler()
train_normalized = scaler.fit_transform(train)
test_normalized = scaler.transform(test)

# Define parameters
s = 0.001
mu = 300 
N_iter = 300
k_values = [600]
class_probabilities = np.zeros(len(true_train_labels)).reshape(-1, 1)

# Initialize lists to store results
train_losses = []
test_losses = []
train_errors = []
test_errors = []

train_labels = (true_train_labels + 1) / 2

for k in k_values:
    train_losses_iter = []
    for _ in range(k):
        best_loss = float('inf')
        best_predictor = None
        for feature_idx in range(train.shape[1]):
            # Fit univariate linear regressor
            X_feature = np.array(train.iloc[:, feature_idx]).reshape(-1, 1)
            exp_term = -1*np.array(class_probabilities)
            exp_term = np.clip(exp_term, -500, 500)  # Limit the range of the exponent to prevent overflow
            p = np.ones(len(train_labels)).reshape(-1,1) / (np.ones(len(train_labels)).reshape(-1,1) + np.exp(exp_term))
            w = p*(np.ones(len(train_labels)).reshape(-1,1) - p)
            # Handle division by zero or close to zero values in w
            w_mask = np.isclose(w, 0)
            w[w_mask] = 1  # Replace zero or close to zero values with 1
            
            z = np.zeros_like(train_labels)  # Initialize z array
            z[~w_mask] = ((np.array(train_labels)[~w_mask]) - p[~w_mask]) / w[~w_mask]
            
            temp_x = X_feature * w.reshape(-1, 1)  # Reshape w to match the shape of X_feature
            lr = LinearRegression().fit(temp_x, z)
            pred = lr.predict(X_feature)

            # Calculate loss
            loss = loss_function(train_labels, pred)
            if loss.iloc[0] < best_loss:
                print("better :)")
                best_loss = loss.iloc[0]
                best_predictor = lr

        train_losses_iter.append(loss_function(train_labels, class_probabilities))  # Append the training loss at each iteration
        # Update class probabilities
        class_probabilities += best_predictor.predict(X_feature).reshape(-1, 1)

    train_losses.append(train_losses_iter)
    # Calculate training and test loss
    train_loss = loss_function(train_labels, class_probabilities)
    test_loss = loss_function(test_labels, class_probabilities)

    # Calculate training and test error
    train_pred = np.sign(class_probabilities)
    test_pred = np.sign(class_probabilities)
    train_error = 1 - accuracy_score(train_labels, train_pred)
    test_error = 1 - accuracy_score(test_labels, test_pred)
        
    train_losses.append(train_losses_iter)  # Append the list of training losses for this k to train_losses

    test_losses.append(test_loss)
    train_errors.append(train_error)
    test_errors.append(test_error)
    print(f"{k} done")

# Plot training loss vs iteration number for k = 600
plt.plot(range(1, 601), train_losses[-1])
plt.xlabel('Iteration')
plt.ylabel('Training Loss')
plt.title('Training Loss vs Iteration Number for k = 600')
plt.show()

# Report misclassification errors on the training and test sets for all values of k
error_table = pd.DataFrame({'k': k_values, 'Train Error': train_errors, 'Test Error': test_errors})
print(error_table)

# Plot misclassification errors on the training and test sets vs k
plt.plot(k_values, train_errors, label='Train Error')
plt.plot(k_values, test_errors, label='Test Error')
plt.xlabel('Number of Boosting Iterations (k)')
plt.ylabel('Misclassification Error')
plt.title('Misclassification Error vs Number of Boosting Iterations')
plt.legend()
plt.show()

In [None]:
data_direc = os.getcwd() + "/"
if platform.system() == "Windows":
    data_direc = data_direc.replace("/", "\\")
train = pd.read_csv(data_direc + "gisette_train.csv")
train_labels = pd.read_csv(data_direc + "gisette_train_labels.csv")
test = pd.read_csv(data_direc + "gisette_valid.csv")
test_labels = pd.read_csv(data_direc + "gisette_valid_labels.csv")
train = np.delete(train, 5000, axis=1)
test = np.delete(test, 5000, axis=1)

In [None]:
def loss_function(y, pred):
    return np.log(1 + np.exp(-2 * y * pred)).mean()

# Normalize the data
scaler = StandardScaler()
train_normalized = scaler.fit_transform(train)
test_normalized = scaler.transform(test)

# Define parameters
s = 0.001
mu = 300 
N_iter = 300
k_values = [600]
class_probabilities = np.zeros(len(true_train_labels)).reshape(-1, 1)

# Initialize lists to store results
train_losses = []
test_losses = []
train_errors = []
test_errors = []

train_labels = (true_train_labels + 1) / 2

for k in k_values:
    train_losses_iter = []
    for _ in range(k):
        best_loss = float('inf')
        best_predictor = None
        for feature_idx in range(train.shape[1]):
            # Fit univariate linear regressor
            X_feature = np.array(train.iloc[:, feature_idx]).reshape(-1, 1)
            exp_term = -1*np.array(class_probabilities)
            exp_term = np.clip(exp_term, -500, 500)  # Limit the range of the exponent to prevent overflow
            p = np.ones(len(train_labels)).reshape(-1,1) / (np.ones(len(train_labels)).reshape(-1,1) + np.exp(exp_term))
            print((np.ones(len(train_labels)).reshape(-1,1) + np.exp(exp_term)))
            w = p*(np.ones(len(train_labels)).reshape(-1,1) - p)
            # Handle division by zero or close to zero values in w
            w_mask = np.isclose(w, 0)
            w[w_mask] = 1  # Replace zero or close to zero values with 1
            
            z = np.zeros_like(train_labels)  # Initialize z array
            z[~w_mask] = ((np.array(train_labels)[~w_mask]) - p[~w_mask]) / w[~w_mask]
            
            temp_x = X_feature * w.reshape(-1, 1)  # Reshape w to match the shape of X_feature
            lr = LinearRegression().fit(temp_x, z)
            pred = lr.predict(X_feature)

            # Calculate loss
            loss = loss_function(train_labels, pred)
            if loss.iloc[0] < best_loss:
                best_loss = loss.iloc[0]
                best_predictor = lr
            




        train_losses_iter.append(loss_function(train_labels, class_probabilities))  # Append the training loss at each iteration
        # Update class probabilities
        class_probabilities += best_predictor.predict(X_feature)

    train_losses.append(train_losses_iter)
    # Calculate training and test loss
    train_loss = loss_function(train_labels, class_probabilities)
    test_loss = loss_function(test_labels, class_probabilities)

    # Calculate training and test error
    train_pred = np.where(class_probabilities >= 0, 1, 0)
    test_pred = np.where(class_probabilities >= 0, 1, 0)
    train_error = 1 - accuracy_score(train_labels, train_pred)
    test_error = 1 - accuracy_score(test_labels, test_pred)
        
    train_losses.append(train_losses_iter)  # Append the list of training losses for this k to train_losses

    test_losses.append(test_loss)
    train_errors.append(train_error)
    test_errors.append(test_error)
    print(f"{k} done")

# Plot training loss vs iteration number for k = 600
plt.plot(range(1, 601), train_losses[-1])
plt.xlabel('Iteration')
plt.ylabel('Training Loss')
plt.title('Training Loss vs Iteration Number for k = 600')
plt.show()

# Report misclassification errors on the training and test sets for all values of k
error_table = pd.DataFrame({'k': k_values, 'Train Error': train_errors, 'Test Error': test_errors})
print(error_table)

# Plot misclassification errors on the training and test sets vs k
plt.plot(k_values, train_errors, label='Train Error')
plt.plot(k_values, test_errors, label='Test Error')
plt.xlabel('Number of Boosting Iterations (k)')
plt.ylabel('Misclassification Error')
plt.title('Misclassification Error vs Number of Boosting Iterations')
plt.legend()
plt.show()

In [10]:
class_probabilities.shape

(100, 1)

In [None]:
data_direc = os.getcwd() + "/"
if platform.system() == "Windows":
    data_direc = data_direc.replace("/", "\\")
train = pd.read_csv(data_direc + "gisette_train.csv")
train_labels = pd.read_csv(data_direc + "gisette_train_labels.csv")
test = pd.read_csv(data_direc + "gisette_valid.csv")
test_labels = pd.read_csv(data_direc + "gisette_valid_labels.csv")
train = np.delete(train, 5000, axis=1)
test = np.delete(test, 5000, axis=1)