<a href="https://colab.research.google.com/github/ke-wen/ke-wen/blob/main/posters/sample_dataset_word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
os.chdir("/content/drive/MyDrive/AP_FYP")
!pwd


/content/drive/MyDrive/AP_FYP


In [3]:
import pandas as pd

In [4]:
#Load the Dataset
dataset_path = 'data/HateSpeechDatasetBalanced.csv'
data = pd.read_csv(dataset_path)

In [5]:
df = data.sample(frac=0.1, random_state=42)
df

Unnamed: 0,Content,Label
615013,you should be deeply embarrassed... by not ful...,1
75913,do not make me make you fall in love with a bi...,0
523130,trump america is anti immigrant sexual activit...,1
682117,you guys are clearly a pole smoker please get ...,1
395535,oh come along the only reason people like stri...,0
...,...,...
386016,hi just a means cool a k a to say you that the...,0
439019,kiss i am not forgetting to update them its ju...,0
394958,mediation committee oppose per user page attac...,0
308990,pictures or it did not happen got to see so ma...,0


In [6]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [7]:
import nltk, re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# Let's get a list of stop words from the NLTK library
stop = stopwords.words('english')
print(stop)
# These words are important for our problem. We don't want to remove them.
excluding = ['against', 'not', 'don', "don't",'ain', 'aren', "aren't", 'couldn', "couldn't",
             'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't",
             'haven', "haven't", 'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't",
             'needn', "needn't",'shouldn', "shouldn't", 'wasn', "wasn't", 'weren',
             "weren't", 'won', "won't", 'wouldn', "wouldn't"]

# New stop word list
stop_words = [word for word in stop if word not in excluding]
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [8]:
snow = SnowballStemmer('english')

def process_text(texts):
    final_text_list=[]
    for sent in texts:

        # Check if the sentence is a missing value
        if isinstance(sent, str) == False:
            sent = ""

        filtered_sentence=[]

        sent = sent.lower() # Lowercase
        sent = sent.strip() # Remove leading/trailing whitespace
        sent = re.sub('\s+', ' ', sent) # Remove extra space and tabs
        sent = re.compile('<.*?>').sub('', sent) # Remove HTML tags/markups:

        for w in word_tokenize(sent):
            # We are applying some custom filtering here, feel free to try different things
            # Check if it is not numeric and its length>2 and not in stop words
            if(not w.isnumeric()) and (len(w)>2) and (w not in stop_words):
                # Stem and add to filtered list
                filtered_sentence.append(snow.stem(w))
        final_string = " ".join(filtered_sentence) #final string of cleaned words

        final_text_list.append(final_string)

    return final_text_list

In [9]:
from sklearn.model_selection import train_test_split

X=df[["Content"]]
Y=df["Label"]
X_training, X_test, y_training, y_test = train_test_split(X,
                                                  Y,
                                                  test_size=0.10,
                                                  shuffle=True,
                                                  random_state=324
                                                 )

X_train, X_val, y_train, y_val = train_test_split(X_training,
                                                  y_training,
                                                  test_size=0.10,
                                                  shuffle=True,
                                                  random_state=324
                                                 )



In [10]:
print("Processing the content fields")
train_text_list = process_text(X_train["Content"].tolist())
val_text_list = process_text(X_val["Content"].tolist())

Processing the content fields


In [11]:
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
import numpy as np

# Train Word2Vec model
sentences = [text.split() for text in train_text_list]
model_w2v = Word2Vec(sentences, vector_size=1000, window=5, min_count=1, workers=4)

def document_vector(doc):
  doc = [word for word in doc if word in model_w2v.wv]
  if not doc:
    return np.zeros(model_w2v.vector_size)
  return np.mean([model_w2v.wv[word] for word in doc], axis=0)

KNN

In [12]:
train_vectors = [document_vector(text.split()) for text in train_text_list]
val_vectors = [document_vector(text.split()) for text in val_text_list]


pipeline = Pipeline([
    ('classifier', KNeighborsClassifier(n_neighbors=10))
])

pipeline.fit(train_vectors, y_train)

In [13]:
from sklearn.metrics import accuracy_score, classification_report

# Make predictions on the validation set
y_pred = pipeline.predict(val_vectors)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy}")

print(classification_report(y_val, y_pred))

Accuracy: 0.7563886763580719
              precision    recall  f1-score   support

           0       0.75      0.74      0.75      3171
           1       0.76      0.77      0.76      3364

    accuracy                           0.76      6535
   macro avg       0.76      0.76      0.76      6535
weighted avg       0.76      0.76      0.76      6535



No Naive Bayes (MultinomialNB) because this model requires the input data features to be no negative.

Logistic Regression

In [14]:
pipeline = Pipeline([
    ('classifier', LogisticRegression(max_iter=10000))
])

pipeline.fit(train_vectors, y_train)

In [15]:
y_pred = pipeline.predict(val_vectors)

accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy}")

print(classification_report(y_val, y_pred))

Accuracy: 0.7545524100994644
              precision    recall  f1-score   support

           0       0.76      0.72      0.74      3171
           1       0.75      0.78      0.77      3364

    accuracy                           0.75      6535
   macro avg       0.75      0.75      0.75      6535
weighted avg       0.75      0.75      0.75      6535



Support Vector Machine (SVM)

In [16]:
from sklearn.svm import SVC

pipeline = Pipeline([
    ('classifier', SVC(kernel='linear', max_iter=10000))
])

pipeline.fit(train_vectors, y_train)
y_pred = pipeline.predict(val_vectors)



In [17]:
y_pred = pipeline.predict(val_vectors)

accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy}")

print(classification_report(y_val, y_pred))

Accuracy: 0.6610558530986993
              precision    recall  f1-score   support

           0       0.74      0.46      0.57      3171
           1       0.63      0.85      0.72      3364

    accuracy                           0.66      6535
   macro avg       0.68      0.66      0.64      6535
weighted avg       0.68      0.66      0.65      6535



Gradient Boosting (GB)

In [18]:
from xgboost import XGBClassifier

pipeline = Pipeline([
    ('classifier', XGBClassifier(tree_method='gpu_hist', n_estimators=100, max_depth=3))
])

pipeline.fit(train_vectors, y_train)


    E.g. tree_method = "hist", device = "cuda"



In [19]:
y_pred = pipeline.predict(val_vectors)

accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy}")

print(classification_report(y_val, y_pred))

Accuracy: 0.7739862280030605
              precision    recall  f1-score   support

           0       0.80      0.71      0.75      3171
           1       0.75      0.83      0.79      3364

    accuracy                           0.77      6535
   macro avg       0.78      0.77      0.77      6535
weighted avg       0.78      0.77      0.77      6535




    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Random Forest

In [20]:
from xgboost import XGBRFClassifier

pipeline = Pipeline([
    ('classifier', XGBRFClassifier(tree_method='gpu_hist', n_estimators=100, max_depth=10, random_state=42))
])

pipeline.fit(train_vectors, y_train)
y_pred = pipeline.predict(val_vectors)



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



In [21]:
y_pred = pipeline.predict(val_vectors)

accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy}")

print(classification_report(y_val, y_pred))

Accuracy: 0.7735271614384086
              precision    recall  f1-score   support

           0       0.81      0.69      0.75      3171
           1       0.75      0.85      0.79      3364

    accuracy                           0.77      6535
   macro avg       0.78      0.77      0.77      6535
weighted avg       0.78      0.77      0.77      6535

