In [4]:
import pandas as pd
import numpy as np
#Perform basic dimension check

#Read in full toxicity dataset
data = pd.read_csv('toxicity_data.csv')

#Output dimensions of the data
print("Dimensions of toxicity data:")
print("Number of rows:", data.shape[0])
print("Number of columns:", data.shape[1])

Dimensions of toxicity data:
Number of rows: 171
Number of columns: 1204


In [5]:
def sigmoid(z):
    return 1/(1+np.exp(-z))

In [11]:
def test_sigmoid():
    #Sample case
    ]
    test_inputs = [0, 1, -1, 10, -10]
    expected_outputs = [0.5, 0.73105858, 0.26894142, 0.9999546, 4.53978687e-05]
    for input_val, expected_output in zip(test_inputs, expected_outputs):
        output = sigmoid(input_val)
        assert np.isclose(output, expected_output), f"Test failed: Input {input_val}, Expected {expected_output}, Got {output}"

    print("Sigmoid function test passed.")
test_sigmoid()

Sigmoid function test passed.


In [25]:
def logistic_regression(X, y, learning_rate=0.001, n_iters=1000):
    """
    Fit logistic regression model to the data.
    """
    n_samples, n_features = X.shape

    # Initialize parameters
    weights = np.zeros(n_features)
    bias = 0

    # Gradient descent
    for _ in range(n_iters):
        # Linear combination of weights and features, plus bias
        linear_model = np.dot(X, weights) + bias
        # Apply sigmoid function
        y_predicted = sigmoid(linear_model)

        # Compute gradients
        dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y))  # Derivative w.r.t weights
        db = (1 / n_samples) * np.sum(y_predicted - y)         # Derivative w.r.t bias

        # Update parameters
        weights -= learning_rate * dw
        bias -= learning_rate * db

    return weights, bias

def logistic_regression_predict(X, weights, bias):
    """
    Predict using logistic regression model.
    """
    linear_model = np.dot(X, weights) + bias
    y_predicted = sigmoid(linear_model)
    y_predicted_cls = [1 if i > 0.5 else 0 for i in y_predicted]
    return np.array(y_predicted_cls)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def select_best_feature(X, y, selected_features):
    remaining_features = [i for i in range(X.shape[1]) if i not in selected_features]
    best_score = -np.inf
    best_feature = None

    for feature in remaining_features:
        X_temp = X[:, selected_features + [feature]]
        weights, bias, _ = logistic_regression(X_temp, y)

        linear_model = np.dot(X_temp, weights) + bias
        y_predicted = sigmoid(linear_model)

        score = compute_score(y, y_predicted)
        if score > best_score:
            best_score = score
            best_feature = feature

    return best_feature

def compute_score(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

In [16]:
def test_compute_score():
    #Test cases
    y_true = np.array([0, 1, 0, 1, 1])
    y_pred = np.array([0, 1, 0, 0, 1])
    expected_output = 0.8

    #Run
    output = compute_score(y_true, y_pred)
    assert np.isclose(output, expected_output), f"Test failed: Expected {expected_output}, Got {output}"

    print("Compute score function test passed.")
test_compute_score()

Compute score function test passed.


In [31]:
#Toxicity data has class variable (toxic vs nontoxic) as target variable
X = data.drop('Class', axis=1) 
y = data['Class']

#Print XPortion
print("Input features for logistic regression (X):")
print(X.head())
#Print YPortion
print("Target variable for logistic regression:")
print(y.head())

#Convert target variable numeric
# Assuming 'y' is your array of strings containing 'NonToxic' or 'Toxic'
y_numeric = [0 if label == 'NonToxic' else 1 for label in y]

#Print YNumericPortion
print("Target variable for logistic regression:")
print(y_numeric)

Input features for logistic regression (X):
   MATS3v  nHBint10  MATS3s  MATS3p  nHBDon_Lipinski  minHBint8  MATS3e  \
0  0.0908         0  0.0075  0.0173                0        0.0 -0.0436   
1  0.0213         0  0.1144 -0.0410                0        0.0  0.1231   
2  0.0018         0 -0.0156 -0.0765                2        0.0 -0.1138   
3 -0.0251         0 -0.0064 -0.0894                3        0.0 -0.0747   
4  0.0135         0  0.0424 -0.0353                0        0.0 -0.0638   

   MATS3c  minHBint2  MATS3m  ...   WTPT-3   WTPT-4   WTPT-5  ETA_EtaP_L  \
0  0.0409        0.0  0.1368  ...   0.0000   0.0000   0.0000      0.1780   
1 -0.0316        0.0  0.1318  ...  28.2185   8.8660  19.3525      0.1739   
2 -0.1791        0.0  0.0615  ...  33.1064   5.2267  27.8796      0.1688   
3 -0.1151        0.0  0.0361  ...  32.5232   7.7896  24.7336      0.1702   
4  0.0307        0.0  0.0306  ...  32.0726  12.3240  19.7486      0.1789   

   ETA_EtaP_F  ETA_EtaP_B  nT5Ring  SHdNH  ETA_d

In [17]:
#Function for splitting data into training/testing sections
def train_test_split(X, y, test_size=0.2, random_state=None):
    """Split the dataset into training and testing sets."""
    if random_state is not None:
        np.random.seed(random_state)

    #Shuffle indices
    indices = np.arange(len(X))
    np.random.shuffle(indices)

    #Calculate the number of samples for the testing set
    num_test_samples = int(len(X) * test_size)

    #Split the shuffled indices into training and testing indices
    test_indices = indices[:num_test_samples]
    train_indices = indices[num_test_samples:]

    #Split the dataset into training and testing sets
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    return X_train, X_test, y_train, y_test

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.2, random_state=42)
#Test train/test split
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (137, 1203)
Shape of X_test: (34, 1203)
Shape of y_train: (137,)
Shape of y_test: (34,)


In [27]:
#Switch hyperparameters up
learning_rate = 0.001
num_iterations_list = [100]

#Train the logistic regression model and evaluate accuracy for each number of iterations
accuracy_scores = []
for num_iterations in num_iterations_list:
    #Training step
    weights, bias = logistic_regression(X_train, y_train, learning_rate, num_iterations)
    
    #Make predictions on the test data
    y_pred = predict(X_test, weights, bias)
    
    #Calculate accuracy
    acc = accuracy(y_test, y_pred)
    accuracy_scores.append(acc)

#Plot accuracy over iterations
plt.plot(num_iterations_list, accuracy_scores, marker='o')
plt.xlabel('Number of Iterations')
plt.ylabel('Accuracy')
plt.title('Accuracy of Logistic Regression Model')
plt.grid(True)
plt.show()

TypeError: unsupported operand type(s) for -: 'float' and 'str'