### Task 1 ###
Logistic Regression

#### Our implementation of Logistic Regression

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

def split_data(df_feature, df_target, random_state=None, test_size=0.5):
    """Randomly partition features and targets into train and test subsets.

    Parameters
    ----------
    df_feature : pd.DataFrame
        Feature table; its index labels identify the rows being split.
    df_target : pd.DataFrame or pd.Series
        Targets indexed by the same labels as ``df_feature``.
    random_state : int, optional
        Seed for the sampling. A private ``RandomState`` is used so the
        process-wide NumPy RNG is left untouched; for a given seed the drawn
        rows are the same as with ``np.random.seed(seed)`` followed by
        ``np.random.choice``. When ``None`` the global RNG is used.
    test_size : float
        Fraction of rows assigned to the test subset. The test row count is
        ``int(len(index) * test_size)``.

    Returns
    -------
    tuple
        ``(df_feature_train, df_feature_test, df_target_train, df_target_test)``.
        Train rows come back in sorted index order (``np.setdiff1d``); test
        rows come back in the order they were drawn.
    """
    indexes = df_feature.index.to_numpy()
    # A local generator avoids reseeding np.random for the rest of the notebook.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    n_test = int(len(indexes) * test_size)
    test_index = rng.choice(indexes, n_test, replace=False)
    train_index = np.setdiff1d(indexes, test_index)

    # Row-only .loc works for both DataFrame and Series targets.
    df_feature_train = df_feature.loc[train_index]
    df_feature_test = df_feature.loc[test_index]
    df_target_train = df_target.loc[train_index]
    df_target_test = df_target.loc[test_index]

    return df_feature_train, df_feature_test, df_target_train, df_target_test

def normalize_z(dfin, columns_means=None, columns_stds=None):
    """Z-score every column of ``dfin``.

    Parameters
    ----------
    dfin : pd.DataFrame
        Data to standardise.
    columns_means, columns_stds : pd.Series, optional
        Per-column statistics to reuse (e.g. the ones computed on the training
        split). Whichever is ``None`` is computed from ``dfin`` itself.

    Returns
    -------
    tuple
        ``(dfout, means, stds)`` where ``stds`` has any exact zero swapped for
        ``1e-5`` so that constant columns do not divide by zero.
    """
    means = dfin.mean(axis=0) if columns_means is None else columns_means
    stds = dfin.std(axis=0) if columns_stds is None else columns_stds

    # Guard against division by zero on constant columns.
    safe_stds = stds.replace(0, 1e-5)

    centered = dfin - means
    return centered / safe_stds, means, safe_stds

def prepare_feature(df_feature):
    """Build the design matrix: a leading column of ones plus the features.

    A ``pd.DataFrame`` is converted with ``to_numpy()``; any other input is
    used as-is and must be a 2-D array. The returned array has one more
    column than the input, the first being the intercept term.
    """
    np_feature = df_feature.to_numpy() if isinstance(df_feature, pd.DataFrame) else df_feature
    intercept = np.ones((np_feature.shape[0], 1))
    return np.concatenate((intercept, np_feature), axis=1)

def prepare_target(df_target):
    """Return the targets as a flat 1-D NumPy array.

    Accepts a ``pd.DataFrame``, a ``pd.Series`` or an array-like. The input is
    converted with ``np.asarray`` before flattening, so a Series no longer
    goes through the deprecated ``Series.ravel()`` method.
    """
    return np.asarray(df_target).ravel()

def calc_logreg(X, beta):
    """Return the logistic (sigmoid) of the linear predictor ``X @ beta``.

    Parameters
    ----------
    X : array-like
        Design matrix (rows are samples).
    beta : array-like
        Coefficient vector.

    Returns
    -------
    np.ndarray
        Probabilities in ``[0, 1]``, one per row of ``X``.

    Notes
    -----
    ``1 / (1 + exp(-z))`` overflows in ``exp`` for large negative ``z`` and
    emits a RuntimeWarning. The equivalent ``exp(-log(1 + exp(-z)))`` is
    evaluated via ``np.logaddexp``, which stays finite for any ``z``.
    """
    z = np.dot(X, beta)
    p_x = np.exp(-np.logaddexp(0, -z))
    return p_x

def compute_cost_linreg(beta, X, y):
    """Mean binary cross-entropy of the logistic model (despite the name).

    Predicted probabilities are clipped to ``[1e-5, 1 - 1e-5]`` before the
    logarithm so the cost stays finite. Rows where ``y == 1`` contribute
    ``log(p)``; every other row contributes ``log(1 - p)``.
    """
    epsilon = 1e-5
    prob = np.clip(calc_logreg(X, beta), epsilon, 1 - epsilon)
    is_positive = (y == 1)
    # Pick the likelihood of the observed class, then take its log.
    log_likelihood = np.log(np.where(is_positive, prob, 1 - prob))
    return -np.mean(log_likelihood)

def gradient_descent_logreg(X, y, beta, alpha, num_iters, log_every=50):
    """Fit logistic-regression coefficients with batch gradient descent.

    Parameters
    ----------
    X : np.ndarray
        Design matrix.
    y : np.ndarray
        1-D array of 0/1 labels; ``y.shape[0]`` is the sample count.
    beta : array-like
        Initial coefficients. The input is copied to a float array, so the
        caller's object is never modified and integer initial values work.
    alpha : float
        Learning rate.
    num_iters : int
        Number of gradient steps.
    log_every : int, optional
        Print the cost and the first five coefficients every ``log_every``
        iterations (default 50, the previous hard-coded value). Pass ``0`` or
        ``None`` to silence the progress output.

    Returns
    -------
    tuple
        ``(beta, J_storage)``: the fitted coefficients and the cost recorded
        after each update.
    """
    m = y.shape[0]
    # Work on a private float copy; the in-place update below must not leak
    # into the array the caller passed in.
    beta = np.array(beta, dtype=float)
    J_storage = np.zeros(num_iters)
    for i in range(num_iters):
        error = calc_logreg(X, beta) - y
        gradient = np.dot(X.T, error) / m
        beta -= alpha * gradient
        J_storage[i] = compute_cost_linreg(beta, X, y)

        if log_every and i % log_every == 0:
            print(f"Iteration {i}: Cost {J_storage[i]}, Beta {beta[:5]}")  # Print the first 5 beta values for readability

    return beta, J_storage

def predict_norm(X, beta):
    """Classify rows of an already-prepared design matrix.

    Returns 1 where the predicted probability is at least 0.5, otherwise 0.
    """
    is_positive = calc_logreg(X, beta) >= 0.5
    return np.where(is_positive, 1, 0)

def predict_logreg(df_feature, beta, means=None, stds=None):
    """Predict 0/1 labels for raw (un-normalised) features.

    The features are z-scored with ``normalize_z`` using ``means``/``stds``
    (computed from ``df_feature`` itself when omitted), given an intercept
    column by ``prepare_feature``, and thresholded by ``predict_norm``.
    """
    normalized = normalize_z(df_feature, means, stds)[0]
    design_matrix = prepare_feature(normalized)
    return predict_norm(design_matrix, beta)

# Read the TF-IDF training table
df = pd.read_csv('./data/train_tfidf_features.csv')

# Predictors are every column except the label and the row id;
# the label is kept as a one-column DataFrame.
df_feature = df.drop(columns=['label', 'id'])
target = df[['label']]

# 70/30 random split
df_feature_train, df_feature_test, df_target_train, df_target_test = split_data(
    df_feature, target, random_state=42, test_size=0.3
)

# Standardise with statistics taken from the training rows only
df_feature_train_norm, means, stds = normalize_z(df_feature_train)

X = prepare_feature(df_feature_train_norm)
y = prepare_target(df_target_train)

# Gradient descent from an all-zero start
alpha, num_iters = 0.01, 125
beta = np.zeros(X.shape[1])
beta, J_storage = gradient_descent_logreg(X, y, beta, alpha, num_iters)

# The held-out rows are normalised with the training means/stds
predictions = predict_logreg(df_feature_test, beta, means, stds)

# Report the fitted coefficients and the last recorded cost
print(f"Final Beta: {beta}")
print(f"Final Cost: {J_storage[-1]}")

cm = confusion_matrix(df_target_test, predictions)
print(cm)

f1_score(df_target_test, predictions, average='macro')

Iteration 0: Cost 0.6908383573948907, Beta [-1.15761909e-03  1.92035294e-05 -8.05249720e-05  2.22213018e-05
  2.93724623e-05]
Iteration 50: Cost 0.6003190302308883, Beta [-0.05551384  0.00064915 -0.00366712  0.00099683  0.00128311]
Iteration 100: Cost 0.5424007585248339, Beta [-0.10365702  0.00078116 -0.00655588  0.00174863  0.00219551]
Final Beta: [-1.24851828e-01  7.20731156e-04 -7.75337259e-03 ... -3.21755855e-03
 -7.34490599e-05  3.77841653e-03]
Final Cost: 0.5215186010762247
[[2380  846]
 [ 716 1213]]


np.float64(0.6806256320368889)

Macro F1 Score for our Logistic Regression model: 0.68062...

#### SKLearn implementation of Logistic Regression

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd

# Load the dataset
df = pd.read_csv('./data/train_tfidf_features.csv')

df_feature = df.drop('label', axis=1)
df_feature = df_feature.drop('id', axis=1)
target = pd.DataFrame(df['label'])


df_feature_train, df_feature_test, df_target_train, df_target_test = split_data(df_feature, target, random_state=42, test_size=0.3)

model = LogisticRegression()
# scikit-learn expects y with shape (n_samples,). Passing the one-column
# DataFrame triggers a DataConversionWarning, so flatten it first.
model.fit(df_feature_train, np.ravel(df_target_train))
predictions = model.predict(df_feature_test)

# Macro F1 on the held-out split
f1_score(df_target_test, predictions, average='macro')

  y = column_or_1d(y, warn=True)


np.float64(0.6804009001282998)

Macro F1 Score for SKLearn implementation: 0.68040...

### Task 2 ###
PCA and KNN

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler

PCA:

PCA in sklearn takes in these arguments:

1. n_components: int/float
    - Number of components to keep. By default, all components are kept
    
https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA

In [None]:
# `df_target` is not defined until the Task 3 cells, so referencing it here
# raises NameError on a top-to-bottom run. Use the `target` frame built in
# Task 1 instead.
# NOTE(review): if a stratified split was intended here, confirm which
# `split_data` definition and which target object were in scope when the
# recorded scores were produced.
df_feature_train, df_feature_test, df_target_train, df_target_test = split_data(df_feature, target, random_state=42, test_size=0.3)

# Flatten the labels to 1-D, the shape scikit-learn estimators and metrics expect.
df_target_train = np.ravel(df_target_train)
df_target_test = np.ravel(df_target_test)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler()
df_feature_train_scaled = scaler.fit_transform(df_feature_train)
df_feature_test_scaled = scaler.transform(df_feature_test)

KNN:

Most importantly, KNeighborsClassifier in sklearn takes in these arguments: 
1. n_neighbors: int
    - the number of neighbours that we will be comparing to for us to determine how to classify the identified point
2. weights: ['uniform', 'distance']
    - uniform: All points in each neighborhood are weighted equally
    - distance: Weigh points by the inverse of their distance. --> Closer neighbors of a query point will have greater influence than neighbors which are further away
3. metric: str
    - metric to use for distance computation. Default is minkowski

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier


In [None]:
n_components = [2000, 1000, 500, 100]
weight_options = ['uniform', 'distance']
# 'minkowski' with scikit-learn's default p=2 is the Euclidean distance, which
# is why those two rows report identical scores.
metric_options = ['minkowski', 'euclidean', 'manhattan']

# One list of scores per "<weights>, <metric>" pair; position i in each list
# corresponds to n_components[i].
f1_dict = {f'{weights}, {metric}': [0] * len(n_components)
           for weights in weight_options
           for metric in metric_options}

for i, n in enumerate(n_components):
    # Seeded because PCA may select the randomized SVD solver at these sizes,
    # which would otherwise make the projection differ between runs.
    pca = PCA(n_components=n, random_state=42)
    df_feature_train_pca = pca.fit_transform(df_feature_train_scaled)
    df_feature_test_pca = pca.transform(df_feature_test_scaled)

    # The PCA projection is computed once per n and reused for every KNN variant.
    for weights in weight_options:
        for metric in metric_options:
            knn = KNeighborsClassifier(n_neighbors=2, weights=weights, metric=metric)
            knn.fit(df_feature_train_pca, df_target_train)
            y_pred = knn.predict(df_feature_test_pca)

            macro_f1 = f1_score(df_target_test, y_pred, average='macro')
            print(f"PCA Components: {n}, weight: {weights}, metrics: {metric}, Macro F1 Score: {macro_f1}")
            f1_dict[f'{weights}, {metric}'][i] = macro_f1
    # submission = pd.DataFrame({'Id': np.arange(len(y_pred)), 'Predicted': y_pred})
    # submission.to_csv(f'./knn_submissions/knn_pca_{n}_components_submission.csv', index=False)
    
    
    


PCA Components: 2000, weight: uniform, metrics: minkowski, Macro F1 Score: 0.5501698573624343

PCA Components: 2000, weight: uniform, metrics: euclidean, Macro F1 Score: 0.5501698573624343

PCA Components: 2000, weight: uniform, metrics: manhattan, Macro F1 Score: 0.5524122193964222

PCA Components: 2000, weight: distance, metrics: minkowski, Macro F1 Score: 0.5392491953449301

PCA Components: 2000, weight: distance, metrics: euclidean, Macro F1 Score: 0.5392491953449301

PCA Components: 2000, weight: distance, metrics: manhattan, Macro F1 Score: 0.5435481359602898


PCA Components: 1000, weight: uniform, metrics: minkowski, Macro F1 Score: 0.5509358763631034

PCA Components: 1000, weight: uniform, metrics: euclidean, Macro F1 Score: 0.5509358763631034

PCA Components: 1000, weight: uniform, metrics: manhattan, Macro F1 Score: 0.5508372124746364

PCA Components: 1000, weight: distance, metrics: minkowski, Macro F1 Score: 0.5501770344051041

PCA Components: 1000, weight: distance, metrics: euclidean, Macro F1 Score: 0.5501770344051041

PCA Components: 1000, weight: distance, metrics: manhattan, Macro F1 Score: 0.5495024023827315


PCA Components: 500, weight: uniform, metrics: minkowski, Macro F1 Score: 0.5537058356694703

PCA Components: 500, weight: uniform, metrics: euclidean, Macro F1 Score: 0.5537058356694703

PCA Components: 500, weight: uniform, metrics: manhattan, Macro F1 Score: 0.5517738757177308

PCA Components: 500, weight: distance, metrics: minkowski, Macro F1 Score: 0.5605568943455942

PCA Components: 500, weight: distance, metrics: euclidean, Macro F1 Score: 0.5605568943455942

PCA Components: 500, weight: distance, metrics: manhattan, Macro F1 Score: 0.560207813194894


PCA Components: 100, weight: uniform, metrics: minkowski, Macro F1 Score: 0.5486275090215184

PCA Components: 100, weight: uniform, metrics: euclidean, Macro F1 Score: 0.5486275090215184

PCA Components: 100, weight: uniform, metrics: manhattan, Macro F1 Score: 0.5452027672093189

PCA Components: 100, weight: distance, metrics: minkowski, Macro F1 Score: 0.5817419006772417

PCA Components: 100, weight: distance, metrics: euclidean, Macro F1 Score: 0.5817419006772417

PCA Components: 100, weight: distance, metrics: manhattan, Macro F1 Score: 0.5765390074323072


In [4]:
# PCA components None

{'uniform, minkowski': np.float64(0.45174245826512327),
 'uniform, euclidean': np.float64(0.45174245826512327),
 'uniform, manhattan': np.float64(0.43015677039496125),
 'distance, minkowski': np.float64(0.44654508929493897),
 'distance, euclidean': np.float64(0.44654508929493897),
 'distance, manhattan': np.float64(0.4248016097529777)}

{'uniform, minkowski': np.float64(0.45174245826512327),
 'uniform, euclidean': np.float64(0.45174245826512327),
 'uniform, manhattan': np.float64(0.43015677039496125),
 'distance, minkowski': np.float64(0.44654508929493897),
 'distance, euclidean': np.float64(0.44654508929493897),
 'distance, manhattan': np.float64(0.4248016097529777)}

In [5]:
pca = 2000, 1000, 500, 100

{'uniform, minkowski': [np.float64(0.5501698573624343),
                        np.float64(0.5509358763631034),
                        np.float64(0.5537058356694703),
                        np.float64(0.5486275090215184)],

                        
 'uniform, euclidean': [np.float64(0.5501698573624343),
                        np.float64(0.5509358763631034),
                        np.float64(0.5537058356694703),
                        np.float64(0.5486275090215184)],


 'uniform, manhattan': [np.float64(0.5524122193964222),
                        np.float64(0.5508372124746364),
                        np.float64(0.5517738757177308),
                        np.float64(0.5452027672093189)],


 'distance, minkowski': [np.float64(0.5392491953449301),
                        np.float64(0.5501770344051041),
                        np.float64(0.5605568943455942),
                        np.float64(0.5817419006772417)],


 'distance, euclidean': [np.float64(0.5392491953449301),
                        np.float64(0.5501770344051041),
                        np.float64(0.5605568943455942),
                        np.float64(0.5817419006772417)],


 'distance, manhattan': [np.float64(0.5435481359602898),
                        np.float64(0.5495024023827315),
                        np.float64(0.560207813194894),
                        np.float64(0.5765390074323072)]}

{'uniform, minkowski': [np.float64(0.5501698573624343),
  np.float64(0.5509358763631034),
  np.float64(0.5537058356694703),
  np.float64(0.5486275090215184)],
 'uniform, euclidean': [np.float64(0.5501698573624343),
  np.float64(0.5509358763631034),
  np.float64(0.5537058356694703),
  np.float64(0.5486275090215184)],
 'uniform, manhattan': [np.float64(0.5524122193964222),
  np.float64(0.5508372124746364),
  np.float64(0.5517738757177308),
  np.float64(0.5452027672093189)],
 'distance, minkowski': [np.float64(0.5392491953449301),
  np.float64(0.5501770344051041),
  np.float64(0.5605568943455942),
  np.float64(0.5817419006772417)],
 'distance, euclidean': [np.float64(0.5392491953449301),
  np.float64(0.5501770344051041),
  np.float64(0.5605568943455942),
  np.float64(0.5817419006772417)],
 'distance, manhattan': [np.float64(0.5435481359602898),
  np.float64(0.5495024023827315),
  np.float64(0.560207813194894),
  np.float64(0.5765390074323072)]}

### Task 3 ###

#### Model 1 (Bernoulli Naive Bayes) ####

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.decomposition import PCA
import pandas as pd
from imblearn.over_sampling import SMOTE

# Read the TF-IDF training table
df = pd.read_csv('./data/train_tfidf_features.csv')

# Predictors are every column except the label and the row id
df_feature = df.drop(columns=['label', 'id'])
df_target = df['label']

# Stratified 80/20 hold-out split
X_train, X_test, y_train, y_test = train_test_split(
    df_feature,
    df_target,
    test_size=0.2,
    random_state=42,
    stratify=df_target,
)

# Bernoulli Naive Bayes classifier
bnb = BernoulliNB(alpha=1, binarize=0.1)

# alpha is the smoothing parameter. It handles zero probabilities, which occur when a feature has never been
# observed together with a particular class label during training. A single zero would otherwise force the
# whole probability product to zero, so a small amount is added to every feature-class count before the
# probabilities are calculated. In our case that amount is 1.
# We chose alpha=1 because a larger alpha means more smoothing, which can help to avoid overfitting. This is
# useful for highly imbalanced datasets or sparse features.

# binarize is the threshold used to turn the input features into 0/1 values: a feature value greater than
# the threshold becomes 1, anything else becomes 0.

# Train on the training split
bnb.fit(X_train, y_train)

# Predict the held-out split
y_pred = bnb.predict(X_test)

# Per-class report followed by the macro-averaged F1
print(classification_report(y_test, y_pred))
print("Macro F1 Score:", f1_score(y_test, y_pred, average='macro'))

              precision    recall  f1-score   support

           0       0.76      0.81      0.78      2127
           1       0.65      0.57      0.61      1310

    accuracy                           0.72      3437
   macro avg       0.71      0.69      0.70      3437
weighted avg       0.72      0.72      0.72      3437

Macro F1 Score: 0.6977701451503642


#### Model 2 (Multinomial Naive Bayes) ####

In [2]:
from sklearn.naive_bayes import MultinomialNB

# Read the TF-IDF training table
df = pd.read_csv('./data/train_tfidf_features.csv')

# Predictors are every column except the label and the row id
df_feature = df.drop(columns=['label', 'id'])
df_target = df['label']

# Stratified 80/20 hold-out split (same settings as the Bernoulli model)
X_train, X_test, y_train, y_test = train_test_split(
    df_feature,
    df_target,
    test_size=0.2,
    random_state=42,
    stratify=df_target,
)

# Multinomial Naive Bayes classifier with smoothing alpha=1
mnb = MultinomialNB(alpha=1)

# Train on the training split
mnb.fit(X_train, y_train)

# Predict the held-out split
y_pred = mnb.predict(X_test)

# Per-class report followed by the macro-averaged F1
print(classification_report(y_test, y_pred))
print("Macro F1 Score:", f1_score(y_test, y_pred, average='macro'))

              precision    recall  f1-score   support

           0       0.72      0.90      0.80      2127
           1       0.73      0.44      0.55      1310

    accuracy                           0.72      3437
   macro avg       0.72      0.67      0.67      3437
weighted avg       0.72      0.72      0.70      3437

Macro F1 Score: 0.6739739021610032


Initially, we believed that multinomial naive bayes would be the best model for our use case. The SKLearn documentation mentions: 

"The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work."

Which gave us confidence that this would be one of the better models. However, when we actually tried using it, we were disappointed by its performance and surprised to find that Bernoulli Naive Bayes worked better instead.

This is probably due to the fact that the dataset has very sparse features. We believe that the simplicity of binary features led to better performance by focusing attention on the presence of features across the corpus rather than their varied weights.

It could also be considered that by reducing the complexity of the feature space from many different unique tf-idf scores to just 1 or 0, it helped prevent overfitting. 

#### Model 3 (RandomForestClassifier) ####

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, recall_score, make_scorer, f1_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import pandas as pd
import numpy as np

def prepare_feature(df_feature):
    """Return the features as a NumPy array, with no intercept column added.

    A ``pd.DataFrame`` is converted with ``to_numpy()``; anything else is
    returned unchanged.

    NOTE: this redefinition shadows the Task 1 ``prepare_feature``, which
    prepends a column of ones. Once this cell has run, re-running the Task 1
    logistic-regression cells without re-running their definitions would
    silently drop the intercept.
    """
    if not isinstance(df_feature, pd.DataFrame):
        return df_feature
    return df_feature.to_numpy()

def prepare_target(df_target):
    """Return the targets as a flat 1-D NumPy array.

    Accepts a ``pd.DataFrame``, a ``pd.Series`` or an array-like. This cell
    passes ``df['label']`` (a Series); calling ``.ravel()`` directly on a
    Series is deprecated in pandas and produced the warning shown in this
    cell's output. Converting with ``np.asarray`` first avoids it.
    """
    return np.asarray(df_target).ravel()

def split_data(X, y, test_size=0.3, random_state=42):
    """Stratified train/test split via ``train_test_split``.

    Returns ``(X_train, X_test, y_train, y_test)``, stratified on ``y``.

    NOTE: this redefinition shadows the Task 1 ``split_data``. It has
    different defaults (``test_size=0.3``, ``random_state=42``) and a
    different positional parameter order after ``y``.
    """
    split_options = dict(test_size=test_size, random_state=random_state, stratify=y)
    return train_test_split(X, y, **split_options)

# Read the TF-IDF training table
df = pd.read_csv('./data/train_tfidf_features.csv')

# Predictors are every column except the label and the row id
df_feature = df.drop(columns=['label', 'id'])
df_target = df['label']

# Convert both to plain NumPy arrays
prepared_feature = prepare_feature(df_feature)
prepared_target = prepare_target(df_target)

# Stratified 70/30 split
X_train, X_test, y_train, y_test = split_data(
    prepared_feature, prepared_target, random_state=42, test_size=0.3
)

# Random forest with the settings chosen by the search described below
rf = RandomForestClassifier(n_estimators=400, random_state=42, bootstrap=False)

# n_estimators: number of trees in the forest. More trees can improve performance but increase training time.
# bootstrap: whether each tree is built from a bootstrap sample. With bootstrap=False every tree is built from
#     the entire training set.
# random_state: seed for the random number generator. Because bootstrap=False, it only controls the random
#     choice of candidate features at each split, so the same seed gives the same feature subsets across runs.
#     It does not affect which samples are used, since every tree sees the whole training set.

# Hyperparameter values tested:
#     'n_estimators': [100, 200, 300, 400],
#     'max_depth': [None, 10, 20, 30, 40, 50],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['auto', 'sqrt'],
#     'bootstrap': [True, False]
# Best combination found: n_estimators=400, max_depth=None, min_samples_leaf=1, max_features='sqrt', bootstrap=False

# max_features='sqrt': the number of features considered for a split is the square root of the total number of
#     features. A fresh random subset of that size is drawn at every node, so it can differ from the subsets
#     used at earlier or later splits in the same tree.
# max_depth=None: tree depth is not limited. A tree keeps growing until all leaves are pure or until all leaves
#     contain fewer than min_samples_split samples.
# min_samples_leaf=1: a leaf node must contain at least one sample.

# The best values were found with RandomizedSearchCV and turned out to match the defaults, except that
# n_estimators was raised to 400 and bootstrap was set to False.

# Train on the training split
rf.fit(X_train, y_train)

# Predict the held-out split
predictions = rf.predict(X_test)

# Per-class report
print(classification_report(y_test, predictions))

# Macro-averaged F1, rounded to two decimals in the printout
test_macro_f1 = f1_score(y_test, predictions, average='macro')
print(f"Test macro F1 score: {test_macro_f1:.2f}")


  return np_target.ravel()


              precision    recall  f1-score   support

           0       0.75      0.81      0.78      3190
           1       0.64      0.56      0.60      1966

    accuracy                           0.71      5156
   macro avg       0.70      0.68      0.69      5156
weighted avg       0.71      0.71      0.71      5156

Test macro F1 score: 0.69
Test macro F1 score: 0.69


While the macro f1 score was 0.69 here, in the public leaderboard, the score was slightly below 0.71. This was the first model that helped us get past the public blue line!

#### Model 4 (LightGBM) ####

In [1]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import f1_score, classification_report
from scipy.stats import uniform, randint

# Load the dataset (same ./data/ location used by every other cell in this notebook)
df = pd.read_csv('./data/train_tfidf_features.csv')

# Prepare the data
df_feature = df.drop(['label', 'id'], axis=1)
df_target = df['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_feature, df_target, test_size=0.2, random_state=42, stratify=df_target)

# Define the refined parameter distribution for RandomizedSearchCV.
# scipy conventions: randint(low, high) draws integers in [low, high);
# uniform(loc, scale) draws floats in [loc, loc + scale].
param_dist = {
    'n_estimators': randint(460, 490),  # integers 460..489
    # NOTE(review): this samples [0.14, 0.32], not [0.14, 0.18]; confirm the intended upper bound.
    'learning_rate': uniform(0.14, 0.18),
    'num_leaves': randint(85, 100),  # integers 85..99
    'max_depth': randint(8, 10),  # 8 or 9
    'subsample': uniform(0.98, 0.02),  # floats in [0.98, 1.00]
    'colsample_bytree': uniform(0.98, 0.02)  # floats in [0.98, 1.00]
}

# Initialize the LightGBM model with class weights.
# subsample_freq=1 turns row bagging on at every iteration. With the default
# subsample_freq=0, LightGBM ignores `subsample`, so that search dimension
# would have no effect.
lgbm = LGBMClassifier(objective='binary', random_state=42, class_weight='balanced', subsample_freq=1)

# Use RandomizedSearchCV to find the best hyperparameters within the refined range
random_search = RandomizedSearchCV(estimator=lgbm,
                                   param_distributions=param_dist,
                                   scoring='f1_macro',
                                   n_iter=30,
                                   cv=5,
                                   verbose=2,
                                   n_jobs=-1,
                                   random_state=42)

# Run the search (5-fold CV over 30 sampled candidates) on the training split
random_search.fit(X_train, y_train)

# Print the best parameters found by RandomizedSearchCV
print(f'Best parameters found: {random_search.best_params_}')

# Use the best estimator (refit on the full training split) to predict the held-out split
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate and print the Macro F1 Score
macro_f1 = f1_score(y_test, y_pred, average='macro')
print(f'Macro F1 Score after RandomizedSearchCV: {macro_f1}')

# Print the classification report
print(classification_report(y_test, y_pred, target_names=['Non-Hateful', 'Hateful']))


Fitting 5 folds for each of 30 candidates, totalling 150 fits
[LightGBM] [Info] Number of positive: 4193, number of negative: 6805
[LightGBM] [Info] Number of positive: 4193, number of negative: 6805
[LightGBM] [Info] Number of positive: 4192, number of negative: 6805
[LightGBM] [Info] Number of positive: 4193, number of negative: 6805
[LightGBM] [Info] Number of positive: 4193, number of negative: 6805
[LightGBM] [Info] Number of positive: 4192, number of negative: 6805
[LightGBM] [Info] Number of positive: 4193, number of negative: 6804
[LightGBM] [Info] Number of positive: 4193, number of negative: 6804
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.144086 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 18883
[LightGBM] [Info] Number of data points in the train set: 10997, number of used features: 805
[LightGBM] [Info] [binary:Boost

LightGBM was another model that we attempted to optimise, as initial cross-validation testing suggested that it performed relatively well even without optimisation. LightGBM is known as a more lightweight gradient boosting method, and is meant to work well with high-dimensional data, including datasets with a large number of features. After multiple iterations of optimisation of the parameters shown above, the model managed to achieve a macro F1 score of approximately 0.70.

Here is a breakdown of what each of these LightGBM hyperparameters means and how they affect the model:

1. n_estimators is the number of boosting rounds or the number of trees in the ensemble. More trees can improve model performance but also increase the risk of overfitting

2. learning_rate: step size shrinkage used in each boosting step. A smaller learning rate means the model is updated more slowly, requiring more boosting rounds (trees) to converge

3. num_leaves: maximum number of leaves per tree. More leaves increase the complexity of the model, allowing it to capture more details but also increasing the risk of overfitting

4. max_depth: maximum depth of each tree. Deeper trees can model more complex relationships but also increase the risk of overfitting. 

5. subsample: fraction of samples to be used for fitting individual base learners. Using a subsample of the data can prevent overfitting by adding randomness

6. colsample_bytree: fraction of features to be used for fitting individual base learners. Using a subset of features for each tree can prevent overfitting by adding randomness

In order to optimise the model, RandomizedSearchCV was run over multiple iterations, deriving the parameter values that returned the best macro F1 scores each time. After finding out the parameter values, we would run the RandomizedSearchCV over a new range of values, now centered around the new parameter values derived previously. This is how we optimised the LightGBM model.

#### Model 5 (XGBoost) ####

In [3]:
import pandas as pd

# Read the TF-IDF training table
df = pd.read_csv('./data/train_tfidf_features.csv')

# Predictors are every column except the label and the row id
df_feature = df.drop(columns=['label', 'id'])
df_target = df['label']

# Absolute and percentage frequency of every label value
class_counts = df_target.value_counts()
print(class_counts)
print(class_counts / len(df_target) * 100)

# The same two figures for the positive class (label == 1) only
count_label_1 = df_target.eq(1).sum()
percentage_label_1 = count_label_1 / len(df_target) * 100
print(f"Count of '1' in label: {count_label_1}")
print(f"Percentage of '1' in label: {percentage_label_1:.2f}%")

label
0    10633
1     6551
Name: count, dtype: int64
label
0    61.877328
1    38.122672
Name: count, dtype: float64
Count of '1' in label: 6551
Percentage of '1' in label: 38.12%


We will employ the SMOTE resampling technique to reach a balance between the 0 and 1 classes.

Originally, when we did XGBoost without the resampling technique, the performance was not the best. After introducing the resampling technique, it produced better results.

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Two-stage pipeline: PCA projection followed by an XGBoost classifier
pca_step = ('pca', PCA())
classifier_step = ('classifier', xgb.XGBClassifier(objective='binary:logistic', learning_rate=0.1))
pipeline = Pipeline([pca_step, classifier_step])

# Search grid; keys use the "<step>__<parameter>" pipeline naming
param_grid = dict(
    pca__n_components=[2000, 1000, 500, 100],  # PCA components
    classifier__n_estimators=[50, 100, 200, 300],  # number of trees, i.e. boosting rounds
    classifier__max_depth=[3, 4, 5, 6, 7],  # maximum depth of each tree
)

# Scorer for the macro-averaged F1
macro_f1_scorer = make_scorer(f1_score, average='macro')

# Exhaustive 3-fold search over the grid
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring=macro_f1_scorer,
    cv=3,
    verbose=2,
)

# Uses the scaled training features and labels left in scope by an earlier cell
grid_search.fit(df_feature_train_scaled, df_target_train)

# Report the winning combination and its cross-validated score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation macro F1 score: {:.2f}".format(grid_search.best_score_))

# Score the refit best pipeline on the held-out split
best_model = grid_search.best_estimator_
predictions = best_model.predict(df_feature_test_scaled)
test_macro_f1 = f1_score(df_target_test, predictions, average='macro')
print(f"Test macro F1 score: {test_macro_f1:.2f}")

In [None]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, make_scorer, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
# Sampler-aware pipeline; aliased so it does not shadow sklearn's Pipeline.
from imblearn.pipeline import Pipeline as ImbPipeline

prepared_features = prepare_feature(df_feature)
prepared_target = prepare_target(df_target)

# Split the data
# NOTE(review): the split_data defined at the top of the notebook indexes its inputs via
# `.index` / `.loc`, which NumPy arrays do not provide — confirm a NumPy-compatible
# split_data is in scope when this cell runs.
df_feature_train, df_feature_test, df_target_train, df_target_test = split_data(prepared_features, prepared_target, random_state=42, test_size=0.3)

# Handle class imbalance with SMOTE.
# SMOTE oversamples the minority class: it picks a minority sample, finds its k nearest
# minority-class neighbours, picks one neighbour at random and creates a synthetic point
# at a random position on the segment between them:
#     new = sample + u * (neighbour - sample),  u drawn from [0, 1]
smote = SMOTE(random_state=42)

# Standardize the data
scaler = StandardScaler()

model = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False, n_estimators=200, learning_rate=0.1)

# SMOTE and the scaler live INSIDE the pipeline so that, during cross-validation, they are
# fit on each training fold only. Resampling/scaling the whole training split before the
# search would let synthetic points derived from validation-fold rows leak into the
# training folds and inflate the CV score. imblearn's Pipeline applies the sampler during
# fit only; predict() passes data through the scaler and classifier without resampling.
resampling_pipeline = ImbPipeline([
    ('smote', smote),
    ('scaler', scaler),
    ('classifier', model),
])

# Search space; keys are prefixed with the pipeline step name ('classifier__').
param_distributions = {
    'classifier__max_depth': [5, 6, 7, 8],
    'classifier__scale_pos_weight': [1, 2],  # ratio of the number of negative class to the positive class. Setting it to 1 means no scaling is applied, which is appropriate when classes are balanced. When classes are imbalanced, this parameter can be set to a value that compensates for the imbalance by giving more weight to the minority class
    'classifier__min_child_weight': [1, 2],  # corresponds to the minimum number of instances needed to make a further partition on a leaf node. Higher values prevent the model from learning relations which might be highly specific to the particular sample selected for a tree.
    'classifier__gamma': [0, 0.1, 0.2, 0.3, 0.4],  # node is split only when the resulting split gives a positive reduction in the loss function. Gamma specifies the minimum loss reduction required to make a split.
    'classifier__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],  # Defines the fraction of samples (rows) to be randomly sampled for each tree. Sampling is done without replacement.
    'classifier__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0]  # specifies the fraction of features (columns) to be randomly sampled for each tree
}

macro_f1_scorer = make_scorer(f1_score, average='macro')
random_search = RandomizedSearchCV(resampling_pipeline, param_distributions, n_iter=100, scoring=macro_f1_scorer, cv=3, verbose=1, random_state=42)
# Fit on the ORIGINAL (un-resampled, un-scaled) training split; the pipeline handles both steps per fold.
random_search.fit(df_feature_train, df_target_train)

print("Best score:", random_search.best_score_)
print("Best parameters:", random_search.best_params_)

# Evaluate the best model on the test set.
# best_estimator_ is the whole pipeline refit on the full training split, so it takes the
# raw test features and scales them itself.
best_model = random_search.best_estimator_
predictions = best_model.predict(df_feature_test)
test_macro_f1 = f1_score(df_target_test, predictions, average='macro')
print(f"Test macro F1 score: {test_macro_f1:.2f}")

# Print a detailed classification report
print(classification_report(df_target_test, predictions))

In [None]:
# XGBoost classifier rebuilt with the best hyper-parameters reported by the
# randomized search in the previous cell. The estimator is only constructed
# here; it is not fitted in this cell.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    n_estimators=200,
    learning_rate=0.1,
    max_depth=8,
    min_child_weight=1,
    gamma=0.4,
    subsample=0.7,
    colsample_bytree=1.0,
    scale_pos_weight=1,
)

# Recorded result: F1 score of 0.71.
# NOTE(review): the original note says "on the train test set" — presumably the
# held-out test split of the training data; confirm which split this refers to.

#### Model 6 (Bagging Classifier) ####

#### Model 7 (AdaBoost) ####

#### Model 8 (Gaussian NB) ####

---

#### Final Model (Ensemble Model) ####
Bernoulli Naive Bayes, LightGBM, RandomForestClassifier

Testing our final model on a held-out 20% split of the train set

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier

# Load the dataset
df = pd.read_csv('./data/train_tfidf_features.csv')

# Prepare the data: every column except the label and the row id is a feature.
X = df.drop(['label', 'id'], axis=1)
y = df['label']

# The features are deliberately used unscaled: the ensemble below is fitted on X as-is
# (and the submission cell feeds it the raw test features). The previous version
# of this cell computed a StandardScaler transform that was never used, and rebound the
# notebook-wide names `df_target` / `scaler`; both have been removed.

# Split the data into training and validation sets (80/20), stratified on the label so
# both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize the three base models of the ensemble.
# Bernoulli Naive Bayes: feature values above `binarize` are treated as "present".
bnb = BernoulliNB(alpha=1,  binarize=0.1)
rf = RandomForestClassifier(n_estimators=375, random_state=42, bootstrap=True)
# NOTE(review): these values look like the output of an earlier hyper-parameter search —
# confirm their provenance.
lgbm_params = {
    'colsample_bytree': 0.9918482913772408,
    'learning_rate': 0.1483610742895996,
    'max_depth': 8,
    'n_estimators': 466,
    'num_leaves': 89,
    'subsample': 0.9890099850393909
}
lgbm = LGBMClassifier(objective='binary', random_state=42, class_weight='balanced', **lgbm_params)

# Define the ensemble model using VotingClassifier.
# voting='soft' averages the predicted class probabilities of the three models.
ensemble = VotingClassifier(estimators=[('bnb', bnb), ('rf', rf), ('lgbm', lgbm)], voting='soft')

# Fit on the training part, then predict on the held-out validation part
ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)

# Print classification report and macro F1 score
print(classification_report(y_test, y_pred))
print("Macro F1 Score:", f1_score(y_test, y_pred, average='macro'))


Retraining the model on the entire train set

In [None]:
# Read the test file once; keep the ids for the submission and use the remaining
# columns as features (no in-place mutation, so the raw frame stays intact).
test_raw = pd.read_csv('./data/test_tfidf_features.csv')
test_id = test_raw['id']
df_test = test_raw.drop(columns='id')

# Fresh ensemble with the same three base-model configurations, fitted on the ENTIRE
# training set (X, y) rather than only the 80% training split.
ensemble_submission = VotingClassifier(estimators=[('bnb', bnb), ('rf', rf), ('lgbm', lgbm)], voting='soft')
ensemble_submission.fit(X, y)

# Predict the test labels and write the submission file with columns: id, label
predictions = ensemble_submission.predict(df_test)
predictions_df = pd.DataFrame(predictions, columns=['label'])
# Both pieces carry the default 0..n-1 index from the same read, so they align row by row.
submission_df = pd.concat([test_id, predictions_df], axis=1)
submission_df.to_csv('submission.csv', index=False)