In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [2]:
df = pd.read_csv('labeled_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [3]:
def label_race (row):
   if row['class'] == 2 :
      return 'Neither'
   if row['class'] == 1 :
      return 'Hate'
   if row['class'] == 0 :
      return 'Offensive'
   return 'Other'

df['label'] = df.apply (lambda row: label_race(row), axis=1)

In [4]:
df = df[['class','tweet', 'label']]
df.head()

Unnamed: 0,class,tweet,label
0,2,!!! RT @mayasolovely: As a woman you shouldn't...,Neither
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,Hate
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Hate
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,Hate
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,Hate


**DATA CLEANING**

In [5]:
import re

def clean_tweet(tweet):
    # Remove URLs
    tweet = re.sub(r'http\S+', '', tweet)
    # Remove mentions
    tweet = re.sub(r'@\S+', '', tweet)
    # Remove hashtags
    tweet = re.sub(r'#\S+', '', tweet)
    # Remove special characters and punctuation
    tweet = re.sub(r'[^\w\s]', '', tweet)
    # Convert to lowercase
    tweet = tweet.lower()
    # Remove extra whitespace
    tweet = re.sub(r'\s+', ' ', tweet).strip()
    return tweet

df['tweet'] = df['tweet'].apply(clean_tweet)

In [6]:
df.head(10)

Unnamed: 0,class,tweet,label
0,2,rt as a woman you shouldnt complain about clea...,Neither
1,1,rt boy dats coldtyga dwn bad for cuffin dat ho...,Hate
2,1,rt dawg rt you ever fuck a bitch and she start...,Hate
3,1,rt she look like a tranny,Hate
4,1,rt the shit you hear about me might be true or...,Hate
5,1,the shit just blows meclaim you so faithful an...,Hate
6,1,i can not just sit up and hate on another bitc...,Hate
7,1,cause im tired of you big bitches coming for u...,Hate
8,1,amp you might not get ya bitch back amp thats ...,Hate
9,1,hobbies include fighting mariam bitch,Hate


In [7]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [8]:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    words = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(lemmas)



In [9]:
df['tweet'] = df['tweet'].apply(lemmatize_text)

In [10]:
df.head()

Unnamed: 0,class,tweet,label
0,2,rt a a woman you shouldnt complain about clean...,Neither
1,1,rt boy dat coldtyga dwn bad for cuffin dat hoe...,Hate
2,1,rt dawg rt you ever fuck a bitch and she start...,Hate
3,1,rt she look like a tranny,Hate
4,1,rt the shit you hear about me might be true or...,Hate


**TRAIN DATASET**

In [24]:
x = df['tweet']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting to 5000 features for efficiency
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [25]:
# Train a Logistic Regression model
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train_tfidf, y_train)



In [19]:
# Predict on the test set
y_pred = logreg.predict(X_test_tfidf)

In [23]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred, target_names=["Hate", "Offensive", "Normal"])

print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_report_result)

Accuracy: 0.8915

Classification Report:
               precision    recall  f1-score   support

        Hate       0.91      0.96      0.94      3852
   Offensive       0.83      0.80      0.82       826
      Normal       0.55      0.16      0.25       279

    accuracy                           0.89      4957
   macro avg       0.76      0.64      0.67      4957
weighted avg       0.88      0.89      0.88      4957



In [26]:
def predict_label(text):
    # Clean and lemmatize the input text
    cleaned_text = clean_tweet(text)
    lemmatized_text = lemmatize_text(cleaned_text)

    # Vectorize the input text
    text_vectorized = vectorizer.transform([lemmatized_text])

    # Predict the label
    prediction = lr.predict(text_vectorized)[0]

    return prediction


input_string = input("Enter you comment")
predicted_label = predict_label(input_string)
print(predicted_label)

Enter you commentYou must be killed  BITCH 
Hate
