<a href="https://colab.research.google.com/github/ke-wen/ke-wen/blob/main/posters/sentence_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
os.chdir("/content/drive/MyDrive/AP_FYP")
!pwd


/content/drive/MyDrive/AP_FYP


In [3]:
import pandas as pd
# Load the Dataset
dataset_path = 'data/HateSpeechDatasetBalanced.csv'
data = pd.read_csv(dataset_path)
print('The shape of the dataset is:', data.shape)

The shape of the dataset is: (726119, 2)


In [4]:
df = data.sample(frac=0.1, random_state=42)
df

Unnamed: 0,Content,Label
615013,you should be deeply embarrassed... by not ful...,1
75913,do not make me make you fall in love with a bi...,0
523130,trump america is anti immigrant sexual activit...,1
682117,you guys are clearly a pole smoker please get ...,1
395535,oh come along the only reason people like stri...,0
...,...,...
386016,hi just a means cool a k a to say you that the...,0
439019,kiss i am not forgetting to update them its ju...,0
394958,mediation committee oppose per user page attac...,0
308990,pictures or it did not happen got to see so ma...,0


In [5]:
pip install -U sentence-transformers



In [6]:
from sklearn.model_selection import train_test_split

# Split the dataset into train and test sets
train_data, test_data, train_labels, test_labels = train_test_split(df["Content"], df['Label'], test_size=0.1, random_state=42)

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sentence_transformers import SentenceTransformer
import torch

# Make sure model run in GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)
def generate_embeddings(sentences):
    return model.encode(sentences, convert_to_tensor=True, device=device)

train_embeddings = generate_embeddings(train_data.tolist())
test_embeddings = generate_embeddings(test_data.tolist())

# have to convert embeddings to CPU tensors because sklearn does not support GPU tensors
train_embeddings = train_embeddings.cpu().numpy()
test_embeddings = test_embeddings.cpu().numpy()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


## Try logistic regression, knn, SVM, Gradient Boosting, and Random Forest models

# 1. K-Nearest Neighbors (KNN)
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(train_embeddings, train_labels)
knn_predictions = knn.predict(test_embeddings)
knn_accuracy = accuracy_score(test_labels, knn_predictions)
print(f'KNN Accuracy: {knn_accuracy * 100:.2f}%')
print("KNN Classification Report:")
print(classification_report(test_labels, knn_predictions))


# 2. Logistic Regression
lr = LogisticRegression(max_iter=10000)
lr.fit(train_embeddings, train_labels)
lr_predictions = lr.predict(test_embeddings)
lr_accuracy = accuracy_score(test_labels, lr_predictions)
print(f'Logistic Regression Accuracy: {lr_accuracy * 100:.2f}%')
print("Logistic Regression Classification Report:")
print(classification_report(test_labels, lr_predictions))

# 3. Support Vector Machine (SVM)
svm = SVC(kernel='linear', max_iter=1000, random_state=42)
svm.fit(train_embeddings, train_labels)
svm_predictions = svm.predict(test_embeddings)
svm_accuracy = accuracy_score(test_labels, svm_predictions)
print(f'SVM Accuracy: {svm_accuracy * 100:.2f}%')
print("SVM Classification Report:")
print(classification_report(test_labels, svm_predictions))

# 4. Gradient Boosting (GB)
gb = GradientBoostingClassifier(n_estimators=30, learning_rate=0.05, max_depth=2, random_state=42)
gb.fit(train_embeddings, train_labels)
gb_predictions = gb.predict(test_embeddings)
gb_accuracy = accuracy_score(test_labels, gb_predictions)
print(f'Gradient Boosting Accuracy: {gb_accuracy * 100:.2f}%')
print("Gradient Boosting Classification Report:")
print(classification_report(test_labels, gb_predictions))

# 5. Random Forest (RF)
rf = RandomForestClassifier(n_estimators=30, random_state=42)
rf.fit(train_embeddings, train_labels)
rf_predictions = rf.predict(test_embeddings)
rf_accuracy = accuracy_score(test_labels, rf_predictions)
print(f'Random Forest Accuracy: {rf_accuracy * 100:.2f}%')
print("Random Forest Classification Report:")
print(classification_report(test_labels, rf_predictions))

KNN Accuracy: 79.45%
KNN Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.79      0.79      3621
           1       0.79      0.80      0.80      3641

    accuracy                           0.79      7262
   macro avg       0.79      0.79      0.79      7262
weighted avg       0.79      0.79      0.79      7262

Logistic Regression Accuracy: 78.78%
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.76      0.78      3621
           1       0.77      0.82      0.79      3641

    accuracy                           0.79      7262
   macro avg       0.79      0.79      0.79      7262
weighted avg       0.79      0.79      0.79      7262





SVM Accuracy: 62.35%
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.77      0.67      3621
           1       0.68      0.48      0.56      3641

    accuracy                           0.62      7262
   macro avg       0.64      0.62      0.62      7262
weighted avg       0.64      0.62      0.62      7262

Gradient Boosting Accuracy: 69.50%
Gradient Boosting Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.63      0.67      3621
           1       0.67      0.76      0.71      3641

    accuracy                           0.69      7262
   macro avg       0.70      0.69      0.69      7262
weighted avg       0.70      0.69      0.69      7262

Random Forest Accuracy: 74.99%
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.73      0.74      3621
           1       0.74      0.77      0.76  