<h1> NLP Application </h1>

<h2> by Nathan Dilla & John Haviland </h2>

<h3> Problem Statement </h3>


<h2> Dataset Overview </h2>
<h3> Purpose </h3>




<h3> Step 1: Import Libraries & Load Dataset </h3>



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Read the labelled tweets; the relative path is resolved against the
# current working directory.
data = pd.read_csv('hate_speech_dataset.csv')

# Plain-text summary statistics as computed by DataFrame.describe()
summary_stats = data.describe()
print(summary_stats)

<h3> Step 2: Data Preprocessing </h3>



In [None]:
# Remove rows with missing values
data.dropna(inplace=True)

# Strip @user handles first (so the handle text itself is removed), then drop
# every remaining character that is not an ASCII letter, digit, or whitespace.
# Both patterns are raw strings so that regex escapes such as \w and \s are not
# read as Python string escapes (newer Python versions warn about '\s').
data['tweet'] = data['tweet'].replace(to_replace=r'@\w+', value='', regex=True)
data['tweet'] = data['tweet'].replace(to_replace=r'[^A-Za-z0-9\s]+', value='', regex=True)

<h3> Step 3: Visualizing Sentiment Distribution </h3>

In [None]:
# Tally the tweets per sentiment label, then draw the tallies as a bar chart.
sentiment_counts = data['sentiment'].value_counts()

# barplot returns the Axes it drew on, so the labels and title are set on it directly.
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values).set(
    xlabel='Sentiment',
    ylabel='Count',
    title='Sentiment Distribution',
)
plt.show()

<h3> Step 4: Text Preprocessing </h3>

In [None]:
import string
from nltk.corpus import stopwords

# Fetch the NLTK data this notebook relies on: the stop-word list, plus the
# Punkt tokenizer data that nltk.word_tokenize() loads when it is called.
# Depending on the installed NLTK release the tokenizer looks for either
# 'punkt' or 'punkt_tab', so both are requested.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

def preprocess_text(text):
    """Return `text` as a single string of space-separated word tokens.

    HTML markup is dropped with BeautifulSoup, every character listed in
    ``string.punctuation`` is deleted, and the remainder is split into tokens
    with ``nltk.word_tokenize``. Stop words are kept; no stop-word filtering
    is applied.
    """
    plain_text = BeautifulSoup(text, 'html.parser').get_text()

    # A translation table that maps each punctuation character to "deleted".
    without_punctuation = plain_text.translate(str.maketrans('', '', string.punctuation))

    return ' '.join(nltk.word_tokenize(without_punctuation))

# Run every tweet through preprocess_text and keep the result in its own
# column, leaving the 'tweet' column as it was.
data['processed_text'] = data['tweet'].map(preprocess_text)


<h3> Step 5: TF-IDF Vectorization </h3>

In [None]:
y = data['sentiment']

# Learn the vocabulary and IDF weights from the training tweets only, so that
# nothing about the held-out tweets leaks into the features the model is later
# evaluated on. train_test_split chooses rows from the sample count and
# random_state alone, so repeating the Step 6 settings (test_size=0.2,
# random_state=42) selects exactly the rows that end up in X_train there.
# Keep the two calls in sync if either setting changes.
train_text, held_out_text = train_test_split(
    data['processed_text'], test_size=0.2, random_state=42
)
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(train_text)

# Encode every tweet with the train-fitted vectorizer; terms that occur only
# in held-out tweets are not in the vocabulary and are ignored.
X = tfidf_vectorizer.transform(data['processed_text'])

<h3> Step 6: Train-Test Split </h3>

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

<h3> Step 7: Model Training & Evaluation </h3>

In [None]:
# Fit a logistic-regression classifier with scikit-learn's default settings
# (fit() returns the estimator itself, so it can be bound in one step).
model = LogisticRegression().fit(X_train, y_train)

# Predict the held-out rows and report the share of predictions that match.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

<h3> Step 8: Model Testing </h3>

In [None]:
# Hand-written example texts, cleaned with the same preprocess_text function
# that produced the 'processed_text' column.
new_questions = list(map(preprocess_text, [
    "this tweet is so bad kill yourself",
    "wow what a great post",
    "i do not know how i feel about this",
]))

# Encode the examples with the already-fitted vectorizer (transform only, no
# refit) and print the predicted label for each one.
example_features = tfidf_vectorizer.transform(new_questions)
new_predictions = model.predict(example_features)
print(new_predictions)


<h3> Step 9: Prediction Evaluation </h3>

In [None]:
# Print the classification report for the held-out predictions
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

# Build the confusion matrix (rows = true labels, columns = predicted labels).
# Passing labels=model.classes_ pins the row/column order and the matrix size
# to the same label list used for the heatmap ticks below; without it the
# labels are derived from y_test/y_pred and could differ from model.classes_
# when a class is absent from the held-out rows.
confusion = confusion_matrix(y_test, y_pred, labels=model.classes_)
print("Confusion Matrix:")
print(confusion)

# Draw the same matrix as an annotated heatmap with integer cell counts
sns.heatmap(confusion, annot=True, cmap='Blues', fmt='d', xticklabels=model.classes_, yticklabels=model.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()


<h3> Analysis of our Findings </h3>



<h3> References </h3>
