In [26]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
import seaborn as sns

In [27]:
try:
    import google.colab
    %pip install numpy pandas matplotlib scikit-learn
except ImportError:
    pass


In [None]:
data_path = 'SAheart.data'

if os.path.exists(data_path):
    os.remove(data_path)
!wget https://hastie.su.domains/ElemStatLearn/datasets/SAheart.data

data = pd.read_csv(data_path)
data = data.drop('row.names', axis=1)
data['famhist'] = data['famhist'].map({'Absent': 0, 'Present': 1})
data = data.sample(frac=1, random_state=2).reset_index(drop=True)

display(data.describe().round(2))
data.info()
data.head()

In [None]:
# Separate the target column ('chd') from the predictor columns.
y = data['chd']
X = data.drop(columns=['chd'])

# Keep the predictor names around; they are displayed as this cell's output.
features = X.columns
features

In [None]:
# Pairwise scatter/density plots of every column, coloured by the 'chd' target.
sns.pairplot(data=data, hue='chd')
plt.show()

In [None]:
# Split the data into train (80%), validation (10%), and test (10%).
# Both splits are seeded so that every downstream metric is reproducible
# after a kernel restart.
SPLIT_SEED = 2
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=SPLIT_SEED
)

# Standardize the features. The scaler is fitted on the training split only and
# then applied unchanged to validation and test, so no statistics leak from them.
# Note: from here on X_train / X_val / X_test are NumPy arrays, not DataFrames.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Preview the first rows only instead of dumping the full training arrays.
X_train[:5], y_train.values[:5]

In [None]:
class LogisticRegressionSGD:
    """Binary logistic regression trained by gradient ascent on the log-likelihood.

    The model is ``sigmoid(X @ weights)``; no intercept term is fitted.

    Parameters
    ----------
    learning_rate : float
        Step size of every weight update.
    C : float
        Penalty factor used by the 'l1' / 'l2' step (larger means more shrinkage).
    num_steps : int or None
        Number of outer training steps run by ``fit``. ``None`` means one step
        per row of the data passed to ``fit``.
    regularization : {'l1', 'l2', None}
        Penalty applied once at the end of every outer step.
    approach : {'vectorized', 'stochastic'}
        'vectorized' does one full-batch update per step; 'stochastic' does one
        update per sample (a full pass over the data) per step.

    Attributes
    ----------
    weights : numpy.ndarray or None
        Learned coefficients, ``None`` until ``fit`` is called.
    log_likelihood_values : list of float
        Training log-likelihood recorded after every outer step of the most
        recent ``fit`` call.
    """

    _APPROACHES = ('vectorized', 'stochastic')
    _REGULARIZATIONS = (None, 'l1', 'l2')

    def __init__(self, learning_rate=0.01, C=0.01, num_steps=None, regularization=None, approach='vectorized'):
        self.learning_rate = learning_rate  # Step size for updating weights
        self.C = C  # Regularization strength (L1 or L2 penalty factor)
        # Kept as given; a None default is resolved inside fit() from the data
        # passed there, so the class never depends on a notebook-global X.
        self.num_steps = num_steps
        self.regularization = regularization  # 'l1', 'l2', or None
        self.approach = approach  # 'vectorized' or 'stochastic'
        self.weights = None
        self.log_likelihood_values = []

    def sigmoid(self, z):
        """Return the logistic function S(z) = 1 / (1 + exp(-z)), element-wise."""
        # Clipping keeps exp() inside the float64 range, so extreme inputs
        # saturate towards 0 or 1 without an overflow warning.
        z = np.clip(z, -700, 700)
        return 1 / (1 + np.exp(-z))

    def predict(self, X):
        """Return the predicted probability of class 1 for each row of ``X``.

        Note that this returns probabilities, not labels; threshold the result
        (e.g. ``>= 0.5``) to obtain class predictions.
        """
        return self.sigmoid(np.dot(X, self.weights))

    def compute_log_likelihood(self, X, y):
        """Return sum(y * log(p) + (1 - y) * log(1 - p)) for the current weights."""
        predictions = self.predict(X)
        # 1e-9 is added inside both logarithms to prevent log(0)
        return np.sum(y * np.log(predictions + 1e-9) + (1 - y) * np.log(1 - predictions + 1e-9))

    def fit(self, X, y):
        """Train the weights on ``X`` (n_samples, n_features) and 0/1 labels ``y``.

        Every call starts from zero weights and an empty log-likelihood
        history, so the same instance can be refitted safely.

        Raises
        ------
        ValueError
            If ``approach`` or ``regularization`` is not one of the supported
            values (previously such values were silently ignored).
        """
        if self.approach not in self._APPROACHES:
            raise ValueError(f"approach must be one of {self._APPROACHES}, got {self.approach!r}")
        if self.regularization not in self._REGULARIZATIONS:
            raise ValueError(
                f"regularization must be one of {self._REGULARIZATIONS}, got {self.regularization!r}"
            )

        # Work on plain arrays so positional indexing (X[i], y[i]) is well defined
        # even when a list or a pandas object is passed in.
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]
        n_steps = n_samples if self.num_steps is None else self.num_steps

        # Initialize weights with zeros and drop the history of any earlier fit
        self.weights = np.zeros(X.shape[1])
        self.log_likelihood_values = []

        for step in range(n_steps):
            # Reshuffle the rows with a generator seeded by the step index:
            # training is deterministic, and the order differs from step to step.
            order = np.arange(n_samples)
            np.random.RandomState(step).shuffle(order)
            X, y = X[order], y[order]

            if self.approach == 'vectorized':
                # Full-batch update: average gradient over all samples
                predictions = self.predict(X)
                errors = y - predictions
                gradient = np.dot(X.T, errors) / n_samples
                self.weights += self.learning_rate * gradient
            else:
                # Stochastic update: one weight update per sample
                for i in range(n_samples):
                    prediction = self.predict(X[i])
                    error = y[i] - prediction
                    gradient = X[i] * error
                    self.weights += self.learning_rate * gradient

            # Apply the regularization penalty once per outer step
            if self.regularization == 'l1':
                # Soft-thresholding: shrink every weight towards zero by learning_rate * C
                self.weights = np.sign(self.weights) * np.maximum(0, np.abs(self.weights) - self.learning_rate * self.C)
            elif self.regularization == 'l2':
                # Weight decay: scale every weight by (1 - learning_rate * C)
                self.weights -= self.learning_rate * self.C * self.weights

            self.log_likelihood_values.append(self.compute_log_likelihood(X, y))

    def plot_log_likelihood(self):
        """Plot the recorded log-likelihood per outer step to visualize convergence."""
        plt.plot(range(len(self.log_likelihood_values)), self.log_likelihood_values)
        plt.xlabel('Iteration')
        plt.ylabel('Log Likelihood')
        plt.title('Log Likelihood Over Iterations')
        plt.show()

# Hyper-parameters for the hand-written model
learning_rate, C = 0.01, 0.01
num_steps, approach = 50, 'stochastic'

# Fit the SGD logistic regression with the regularization step switched off
model_no_reg = LogisticRegressionSGD(
    learning_rate=learning_rate,
    C=C,
    num_steps=num_steps,
    regularization=None,
    approach=approach,
)
model_no_reg.fit(X_train, y_train.values)

# Report the final training log-likelihood and show the convergence curve
print("no_reg", model_no_reg.log_likelihood_values[-1])
model_no_reg.plot_log_likelihood()

# predict() yields probabilities; threshold at 0.5 to get class labels
test_probabilities = model_no_reg.predict(X_test)
y_pred = test_probabilities >= 0.5
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

In [None]:
from sklearn.linear_model import SGDClassifier

# Baseline: logistic regression trained with scikit-learn's SGDClassifier
# (loss='log_loss'). random_state is fixed because the estimator shuffles the
# training data between epochs; without a seed the accuracy changes on every run.
sgd_logistic = SGDClassifier(loss='log_loss', max_iter=1000, random_state=2)
sgd_logistic.fit(X_train, y_train)

# Evaluate on the held-out test split
y_pred = sgd_logistic.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

In [None]:
# Manual forward stepwise feature selection, scored on a single held-out
# validation split (no cross-validation is performed).
def forward_stepwise_selection(X_train, y_train, X_val, y_val, features, model, score_fn=None):
    """Greedily pick the features that improve the validation score.

    Each round refits ``model`` once per remaining feature (added to the
    features selected so far), scores the thresholded validation predictions,
    and keeps the feature with the highest score. The search stops when no
    candidate strictly beats the best score seen so far (initially 0) or when
    no features are left.

    Parameters
    ----------
    X_train, X_val : array-like of shape (n_samples, n_features)
        Feature matrices whose columns are ordered like ``features``.
    y_train, y_val : array-like of shape (n_samples,)
        Labels; ``y_train`` may be a pandas Series, a NumPy array or a list.
    features : sequence of str
        Column names for ``X_train`` / ``X_val``.
    model : object
        Must provide ``fit(X, y)`` and ``predict(X)``, where ``predict`` returns
        values that are thresholded at 0.5 to obtain class labels. The model is
        refitted in place many times.
    score_fn : callable, optional
        ``score_fn(y_true, y_pred) -> float`` where higher is better.
        Defaults to ``accuracy_score``.

    Returns
    -------
    list
        The selected feature names, in the order they were added.
    """
    if score_fn is None:
        score_fn = accuracy_score

    feature_names = list(features)
    # Build the labelled frames once; they do not change between candidates.
    train_frame = pd.DataFrame(X_train, columns=feature_names)
    val_frame = pd.DataFrame(X_val, columns=feature_names)
    y_train_values = np.asarray(y_train)

    best_score = 0
    selected_features = []
    available_features = list(feature_names)

    while available_features:
        best_feature = None
        for feature in available_features:
            current_features = selected_features + [feature]

            model.fit(train_frame[current_features].values, y_train_values)
            predictions = model.predict(val_frame[current_features].values) >= 0.5
            score = score_fn(y_val, predictions)

            # Strict improvement only: ties keep the earlier candidate
            if score > best_score:
                best_score = score
                best_feature = feature

        if best_feature is None:
            # No remaining feature improves the validation score
            break
        selected_features.append(best_feature)
        available_features.remove(best_feature)

    return selected_features

# Model that is refitted for every candidate feature subset
model = LogisticRegressionSGD(
    learning_rate=0.01,
    C=0.01,
    num_steps=100,
    regularization=None,
    approach='stochastic',
)

# Greedy search over the standardized training/validation splits
selected_features = forward_stepwise_selection(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    features=features,
    model=model,
)

print("Selected Features: ", selected_features)