## Extracting the preprocessed data

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the CSV file
df = pd.read_csv("glove_features_and_labels.csv")

# Separate features (X) and labels (y)
X = df.drop(columns=['label']).values  # Drop the label column for features
y = df['label'].values  # Extract the label column

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Verify the shapes
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

num_negatives = (y_train == 0).sum()
num_positives = (y_train == 1).sum()

print("Zeros: " + str(num_negatives))
print("Ones: " + str(num_positives))

pos_weight_value = num_negatives / num_positives

print("Ratio: " + str(pos_weight_value))

Shape of X_train: (1495, 202)
Shape of X_test: (642, 202)
Shape of y_train: (1495,)
Shape of y_test: (642,)
Zeros: 684
Ones: 811
Ratio: 0.843403205918619


In [2]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
from sklearn.metrics import accuracy_score

# Define the model
model = RandomForestClassifier()

# Define the distribution of hyperparameters
param_dist = {
    'n_estimators': randint(50, 500),
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': randint(2, 11)
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=100, cv=3, n_jobs=-1, verbose=2, random_state=42)

# Fit the random search
random_search.fit(X_train, y_train)

# Print the best hyperparameters found
print("Best parameters:", random_search.best_params_)

best_model = random_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Compute accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)

print(f"Test set accuracy: {accuracy:.4f}")

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 394}
Test set accuracy: 0.7726


In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from scipy.stats import randint
import numpy as np
from sklearn.metrics import accuracy_score

# Define the model
mlp = MLPClassifier(max_iter=1000, random_state=42)

# Define the parameter distribution
param_dist = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50), (150, 100), (200, 100), (300, 150)],  
    'activation': ['tanh', 'relu'],  
    'solver': ['adam', 'sgd', 'lbfgs'],  
    'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1],  
    'learning_rate': ['constant', 'invscaling', 'adaptive'],  
    'learning_rate_init': [0.0001, 0.001, 0.01, 0.1, 0.5, 1.0], 
    'batch_size': [16, 32, 64, 128, 256] 
}

# Perform Randomized Search with cross-validation
random_search = RandomizedSearchCV(mlp, param_distributions=param_dist, n_iter=50, cv=5, random_state=42, n_jobs=-1)

# Fit the random search model
random_search.fit(X_train, y_train)

# Print the best parameters found
print("Best hyperparameters found: ", random_search.best_params_)

# Use the best model to make predictions on the test set
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate accuracy

accuracy = accuracy_score(y_test, y_pred)
print(f"Test set accuracy: {accuracy:.4f}")

Best hyperparameters found:  {'solver': 'lbfgs', 'learning_rate_init': 0.001, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (100,), 'batch_size': 256, 'alpha': 0.0001, 'activation': 'relu'}
Test set accuracy: 0.7897


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
