# Assignment week 5

In [1]:
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

In [2]:
# Step 1: Load the data
# DATA_PATH is the CSV read into `data`; point it at your own copy of the file.
# Later cells read the 'comment' and 'isHate' columns from this frame.
DATA_PATH = 'full_data.csv'
data = pd.read_csv(DATA_PATH)

In [3]:
# Step 2: Text Preprocessing (Cleaning)
# Shared resources for clean_text, built once here instead of once per comment.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))  # a set, for fast membership tests

def clean_text(text):
    """Normalize one raw comment into a space-separated string of stemmed tokens.

    Pipeline: lowercase, delete every character that is not an ASCII letter
    or whitespace, tokenize, drop English stopwords, Porter-stem the rest.

    Parameters
    ----------
    text : str
        Raw comment. Any non-string value (for example the float NaN that
        pandas yields for an empty CSV cell, or None) is treated as an
        empty comment instead of raising.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; '' if nothing survives.
    """
    # Series.apply passes missing cells through as NaN/None; calling .lower()
    # on those would raise AttributeError and abort the whole column.
    if not isinstance(text, str):
        return ''
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters, punctuation, and numbers. Matches are deleted,
    # not replaced with a space, so "don't" becomes "dont".
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize the text
    words = word_tokenize(text)
    # Remove stopwords and apply stemming.
    # NOTE(review): punctuation is stripped before this check, so contractions
    # presumably no longer match apostrophe-bearing stopword entries - confirm
    # whether that is intended.
    cleaned_words = [ps.stem(word) for word in words if word not in stop_words]
    # Join the cleaned words back into a single string
    return ' '.join(cleaned_words)

# Clean every raw comment element-wise and keep the result next to the original text.
data['cleaned_comment'] = data['comment'].map(clean_text)

In [4]:
# Step 3: Train-Test Split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_comment'],
    data['isHate'],
    test_size=0.2,
    random_state=42,
)

In [5]:
# Step 4: Vectorization
vectorizer = CountVectorizer()
# The vocabulary is learned from the training comments only...
X_train_vec = vectorizer.fit_transform(X_train)
# ...and the test comments are encoded with that same fitted vocabulary.
X_test_vec = vectorizer.transform(X_test)

In [6]:
# Step 5: Train the classifiers
# Multinomial Naive Bayes on the count features; nb_alpha is passed as `alpha`.
nb_alpha = 1.0

# fit() returns the estimator itself, so construction and training can be chained.
nb_classifier = MultinomialNB(alpha=nb_alpha).fit(X_train_vec, y_train)
nb_classifier

In [7]:
# Decision tree on the same count features, with the depth capped at dt_max_depth.
dt_max_depth = 10
# Fix the seed: scikit-learn randomly permutes the candidate features at every
# split, so an unseeded tree can differ from run to run and the metrics reported
# below would not be reproducible. 42 matches the seed of the train/test split.
dt_random_state = 42

dt_classifier = DecisionTreeClassifier(max_depth=dt_max_depth, random_state=dt_random_state)
dt_classifier.fit(X_train_vec, y_train)

In [8]:
# Function to calculate metrics
def calculate_metrics(y_true, y_pred):
    """Score predicted labels against the true labels.

    Returns a 4-tuple ``(accuracy, precision, recall, f1)``, each computed by
    the corresponding scikit-learn scorer called with its default arguments.
    """
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_true, y_pred) for scorer in scorers)

In [9]:
# Predict on both splits so training and held-out performance can be compared.
nb_train_pred = nb_classifier.predict(X_train_vec)
dt_train_pred = dt_classifier.predict(X_train_vec)
nb_test_pred = nb_classifier.predict(X_test_vec)
dt_test_pred = dt_classifier.predict(X_test_vec)

# Calculate training accuracy, precision, recall, and F1 score
nb_train_accuracy, nb_train_precision, nb_train_recall, nb_train_f1 = calculate_metrics(y_train, nb_train_pred)
dt_train_accuracy, dt_train_precision, dt_train_recall, dt_train_f1 = calculate_metrics(y_train, dt_train_pred)

# Calculate testing accuracy, precision, recall, and F1 score
nb_test_accuracy, nb_test_precision, nb_test_recall, nb_test_f1 = calculate_metrics(y_test, nb_test_pred)
dt_test_accuracy, dt_test_precision, dt_test_recall, dt_test_f1 = calculate_metrics(y_test, dt_test_pred)

# Report. Each line names the split it belongs to: previously the train and test
# scores were printed under identical labels and could only be told apart by
# their position in the output.
metric_labels = ("Accuracy", "Precision", "Recall", "F1 Score")
results = (
    ("Naive Bayes", (
        ("Train", (nb_train_accuracy, nb_train_precision, nb_train_recall, nb_train_f1)),
        ("Test", (nb_test_accuracy, nb_test_precision, nb_test_recall, nb_test_f1)),
    )),
    ("Decision Tree", (
        ("Train", (dt_train_accuracy, dt_train_precision, dt_train_recall, dt_train_f1)),
        ("Test", (dt_test_accuracy, dt_test_precision, dt_test_recall, dt_test_f1)),
    )),
)

for position, (model_name, per_split) in enumerate(results):
    header = f"{model_name} Metrics:"
    # A blank line separates each model's section from the previous one.
    print(header if position == 0 else "\n" + header)
    for split_name, scores in per_split:
        for metric_label, score in zip(metric_labels, scores):
            print(f"{model_name} ({split_name}) - {metric_label}:", score)

Naive Bayes Metrics:
Naive Bayes - Accuracy: 0.9689027498725522
Naive Bayes - Precision: 0.985887384176764
Naive Bayes - Recall: 0.9428123508963261
Naive Bayes - F1 Score: 0.9638688547437372
Naive Bayes - Accuracy: 0.9516612690416217
Naive Bayes - Precision: 0.9689029657356752
Naive Bayes - Recall: 0.9193989071038251
Naive Bayes - F1 Score: 0.9435020328052712

Decision Tree Metrics:
Decision Tree - Accuracy: 0.8870363151108046
Decision Tree - Precision: 0.8169767441860465
Decision Tree - Recall: 0.9578079203871583
Decision Tree - F1 Score: 0.8818047755012394
Decision Tree - Accuracy: 0.8835312462516492
Decision Tree - Precision: 0.8115874855156431
Decision Tree - Recall: 0.9568306010928962
Decision Tree - F1 Score: 0.8782445141065831


In [10]:
# Function for single prediction
def predict_single_comment(comment):
    """Classify one raw comment with both trained models.

    The comment goes through the same clean_text preprocessing and the same
    fitted vectorizer as the training data.

    Returns a pair ``(nb_pred, dt_pred)``: the label predicted by the Naive
    Bayes classifier and the one predicted by the decision tree.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([clean_text(comment)])
    return nb_classifier.predict(features)[0], dt_classifier.predict(features)[0]

# Single prediction example
comment = "Like my 3rd grade teacher used to say, you either eat pussy, or you are one. what's it gonna be? @JBilinovich @JZolly23"
# Run both models on the sample comment and print one line per model.
nb_pred, dt_pred = predict_single_comment(comment)
for label, prediction in (("Naive Bayes Prediction:", nb_pred), ("Decision Tree Prediction:", dt_pred)):
    print(label, prediction)

Naive Bayes Prediction: 0
Decision Tree Prediction: 0


In [11]:
# predict_single_comment is already defined in the previous cell (In [10]).
# It is reused here instead of being redefined, so the notebook has a single
# definition and the two copies cannot drift apart.

# Single prediction example
comment = "We hate niggers, we hate faggots and we hate spics-kkk rally"
# Run both models on the sample comment and print one line per model.
nb_pred, dt_pred = predict_single_comment(comment)
for label, prediction in (("Naive Bayes Prediction:", nb_pred), ("Decision Tree Prediction:", dt_pred)):
    print(label, prediction)

Naive Bayes Prediction: 1
Decision Tree Prediction: 1
