In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt



In [3]:
# Download necessary NLTK data
nltk.download('stopwords')

# Initialize the stopwords and stemmer
stopword = set(stopwords.words("english"))
stemmer = SnowballStemmer("english")

# Load the dataset
df = pd.read_csv("twitter_data.csv")

# Map the 'class' column to the 'labels' column
df['labels'] = df['class'].map({
    0: "Hate Speech Detected",
    1: "Offensive language detected",
    2: "No hate and Offensive speech"
})

# Keep only the relevant columns
df = df[['tweet', 'labels']]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Define the text cleaning function
def clean(text):
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)  # Remove text inside square brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # Remove punctuation
    text = re.sub(r'\n', '', text)  # Remove newlines
    text = re.sub(r'\w*\d\w*', '', text)  # Remove words with digits
    text = [word for word in text.split(' ') if word not in stopword]  # Remove stopwords
    text = " ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]  # Apply stemming
    text = " ".join(text)
    return text

In [5]:
# Apply the clean function to the 'tweet' column
df["tweet"] = df["tweet"].apply(clean)

# Convert text and labels to numpy arrays
x = np.array(df["tweet"])
y = np.array(df["labels"])

# Vectorize the text data using CountVectorizer
cv = CountVectorizer()
x = cv.fit_transform(x)

# Encode labels (since they are string, DecisionTreeClassifier needs them numeric)
le = LabelEncoder()
y = le.fit_transform(y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

# Initialize and train the DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)


In [6]:
# Predict and evaluate the model
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy * 100:.2f}%\n")


Model Accuracy: 87.64%



In [7]:
# Function to predict new tweets
def predict_tweet(tweet):
    # Clean and vectorize the new tweet
    tweet = clean(tweet)  # Clean the tweet text
    tweet_vect = cv.transform([tweet])  # Transform the cleaned tweet into the same format as the training data
    prediction = clf.predict(tweet_vect)  # Make the prediction

    # Decode the numeric prediction to the corresponding label
    label = le.inverse_transform(prediction)

    return label[0]

In [8]:
# Function to display a pie chart of the predictions
def display_pie_chart(predictions):
    # Count the occurrences of each label in the predictions
    labels, counts = np.unique(predictions, return_counts=True)

    # Create a pie chart
    plt.figure(figsize=(7, 7))
    plt.pie(counts, labels=labels, autopct='%1.1f%%', startangle=140, colors=['#FF6666', '#FFD700', '#32CD32'])
    plt.title("Distribution of Predicted Labels")
    plt.axis('equal')  # Equal aspect ratio ensures the pie chart is circular.
    plt.show()

In [9]:
# Input: User enters a tweet
user_input = input("Enter a tweet for classification: ")

# Get the prediction for the entered tweet
result = predict_tweet(user_input)
print(f"\nPrediction: {result}\n")



Enter a tweet for classification: I will kill you

Prediction: Hate Speech Detected



In [10]:
# Input: User enters a tweet
user_input = input("Enter a tweet for classification: ")

# Get the prediction for the entered tweet
result = predict_tweet(user_input)
print(f"\nPrediction: {result}\n")


Enter a tweet for classification: You are an idiot

Prediction: No hate and Offensive speech



In [11]:
# Input: User enters a tweet
user_input = input("Enter a tweet for classification: ")

# Get the prediction for the entered tweet
result = predict_tweet(user_input)
print(f"\nPrediction: {result}\n")


Enter a tweet for classification: You are so pretty

Prediction: No hate and Offensive speech



In [12]:
# Input: User enters a tweet
user_input = input("Enter a tweet for classification: ")

# Get the prediction for the entered tweet
result = predict_tweet(user_input)
print(f"\nPrediction: {result}\n")


Enter a tweet for classification: Bastard

Prediction: No hate and Offensive speech

