## ***Import Libraries***

In [3]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import csr_matrix
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams
from collections import Counter
import re
import random
import matplotlib.pyplot as plt
import seaborn as sns
import html
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
df = pd.read_csv("labeled_data.csv")

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [6]:
df.drop(["Unnamed: 0","count","hate_speech","offensive_language","neither"],axis=1,inplace=True)

In [7]:
map_dic = {0:"hate_speech",1:"offensive_language",2:"neither"}
df["class"]=df["class"].map(map_dic)

## ***Data Cleaning***

In [8]:
def process(message):
    nopunc = [char for char in message if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    clean = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    
    return ' '.join(clean)

In [9]:
df["tweet"] = df["tweet"].apply(process)

In [10]:
def clean_tweet(tweet):
    # Unescape HTML entities (e.g., &amp; -> &, &#128536; -> emoji)
    tweet = html.unescape(tweet)
    # Remove mentions (@username)
    tweet = re.sub(r'@\w+', '', tweet)
    # Remove hashtags (#hashtag)
    tweet = re.sub(r'#\w+', '', tweet)
    # Remove URLs
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet)
    # Remove emojis and non-ASCII characters
    tweet = tweet.encode('ascii', 'ignore').decode('ascii')
    # Remove special characters, punctuation (except sentence enders), and digits
    tweet = re.sub(r'[^A-Za-z\s]', '', tweet)
    # Collapse multiple spaces into one
    tweet = re.sub(r'\s+', ' ', tweet).strip()
    return tweet

In [11]:
df['tweet'] = df['tweet'].apply(clean_tweet)

In [12]:
df["tweet"] = df["tweet"].str.lower()

In [13]:
df

Unnamed: 0,class,tweet
0,neither,rt mayasolovely woman shouldnt complain cleani...
1,offensive_language,rt mleew boy dats coldtyga dwn bad cuffin dat ...
2,offensive_language,rt urkindofbrand dawg rt sbabylife ever fuck b...
3,offensive_language,rt cganderson vivabased look like tranny
4,offensive_language,rt shenikaroberts shit hear might true might f...
...,...,...
24778,offensive_language,yous muthafin lie lifeasking pearls coreyemanu...
24779,neither,youve gone broke wrong heart baby drove rednec...
24780,offensive_language,young buck wanna eat dat nigguh like aint fuck...
24781,offensive_language,youu got wild bitches tellin lies


## ***Stemming***

In [14]:
porter = nltk.PorterStemmer()
def stem(message):
    stemmedArr = [porter.stem(term) for term in message.split(" ")]
    return ' '.join(stemmedArr)

In [15]:
df["tweet"]=df["tweet"].apply(stem)

In [16]:
df.head()

Unnamed: 0,class,tweet
0,neither,rt mayasolov woman shouldnt complain clean hou...
1,offensive_language,rt mleew boy dat coldtyga dwn bad cuffin dat h...
2,offensive_language,rt urkindofbrand dawg rt sbabylif ever fuck bi...
3,offensive_language,rt cganderson vivabas look like tranni
4,offensive_language,rt shenikarobert shit hear might true might fa...


# **GLOVE**

In [17]:
X = df["tweet"]
y = df["class"]

In [18]:
# Load GloVe embeddings
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            values = line.strip().split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

glove_path = 'glove.6B/glove.6B.100d.txt'
glove_embeddings = load_glove_embeddings(glove_path)
print(f"Loaded {len(glove_embeddings)} word vectors.")

Loaded 400000 word vectors.


In [19]:
def preprocess(text):
    return re.sub(r"[^\w\s]", "", text.lower()).split()

def vectorize_text(texts, embeddings, dim=100):
    vectors = []
    for sentence in texts:
        words = preprocess(sentence)
        word_vecs = [embeddings[w] for w in words if w in embeddings]
        if word_vecs:
            vectors.append(np.mean(word_vecs, axis=0))
        else:
            vectors.append(np.zeros(dim))  # for empty or OOV-only input
    return np.array(vectors)

# Example usage
X = vectorize_text(X, glove_embeddings, dim=100)
print(X.shape)

(24783, 100)


## ***XGBoost Classifier***

In [20]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [21]:
## ***XGBoost Classifier***
from sklearn.preprocessing import LabelEncoder

# Encode the string labels to integers
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
from xgboost import XGBClassifier
# Create and train the XGBoost model
xgb_clf = XGBClassifier(
    n_estimators=100,   # Number of boosting rounds
    learning_rate=0.1,  # Step size shrinkage used to prevent overfitting
    max_depth=5,        # Maximum depth of a tree
    random_state=42
)

# Train the model
xgb_clf.fit(X_train, y_train_encoded)

# Make predictions on test set
predicted_encoded = xgb_clf.predict(X_test)
predicted = label_encoder.inverse_transform(predicted_encoded)

# Calculate and display accuracy
print(f"Accuracy: {np.mean(predicted == y_test)}")

Accuracy: 0.8468056489576328


## ***Logistic Regression***

In [22]:
log_reg = LogisticRegression(max_iter=1000, class_weight='balanced')
log_reg.fit(X_train, y_train)

In [23]:
y_pred = log_reg.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7265635507733692

Classification Report:
                     precision    recall  f1-score   support

       hate_speech       0.20      0.63      0.31       427
           neither       0.56      0.79      0.66      1261
offensive_language       0.96      0.72      0.82      5747

          accuracy                           0.73      7435
         macro avg       0.57      0.71      0.59      7435
      weighted avg       0.85      0.73      0.76      7435



## ***Support Vector Machine Classifier***

In [24]:
svc = SVC(kernel='linear', class_weight='balanced', probability=True).fit(X_train, y_train)

In [25]:
y_pred = svc.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7238735709482179

Classification Report:
                     precision    recall  f1-score   support

       hate_speech       0.21      0.65      0.31       427
           neither       0.55      0.78      0.65      1261
offensive_language       0.96      0.72      0.82      5747

          accuracy                           0.72      7435
         macro avg       0.57      0.72      0.59      7435
      weighted avg       0.85      0.72      0.76      7435



## ***Gaussian Naive Bais***

In [26]:
gnb_clf = GaussianNB().fit(X_train, y_train)

In [27]:
y_pred = gnb_clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.771217215870881

Classification Report:
                     precision    recall  f1-score   support

       hate_speech       0.26      0.27      0.26       427
           neither       0.51      0.75      0.61      1261
offensive_language       0.91      0.81      0.86      5747

          accuracy                           0.77      7435
         macro avg       0.56      0.61      0.58      7435
      weighted avg       0.80      0.77      0.78      7435



In [28]:
"d"ad

SyntaxError: invalid syntax (1392802622.py, line 1)

In [30]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.initializers import Constant

embedding_dim = 100  # because you're using glove.6B.100d.txt
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1  # +1 for padding (index 0)

# Initialize embedding matrix
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Fill embedding matrix with GloVe vectors
for word, i in word_index.items():
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


# Example variables (adjust if needed)
vocab_size = glove_embeddings.shape[0]
embedding_dim = glove_embeddings.shape[1]
max_length = X_train.shape[1]

# Build LSTM model
model = Sequential([
    Embedding(input_dim=vocab_size,
              output_dim=embedding_dim,
              embeddings_initializer=Constant(embedding_matrix),
              input_length=max_length,
              trainable=False),  # set to True if you want to fine-tune embeddings
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))


NameError: name 'tokenizer' is not defined