In [1]:
# Import necessary libraries
import pandas as pd
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import joblib


# Read the CSV file into a DataFrame
df = pd.read_csv('../../data/original_raw_data.csv')

# Removing the 'Unnamed: 0' column
df = df.drop('Unnamed: 0', axis=1)

# Initialize lemmatizer, stemmer and TweetTokenizer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
tweet_tokenizer = TweetTokenizer()

# Define stopwords
stop_words = set(stopwords.words('english'))

# Remove 'rt' as it refer to a re-tweet in the tweeet and is non important for our model
stop_words.update(['RT', 'I'])

def preprocess_text(text):
    # Tokenize the text
    word_tokens = tweet_tokenizer.tokenize(text)

    # Lemmatize the tokens, stem the tokens, remove stopwords and non-alphabetic tokens
    processed_tokens = [stemmer.stem(lemmatizer.lemmatize(w)) for w in word_tokens if w not in stop_words and w.isalpha()]

    # Join the tokens back into a single string and return it
    return ' '.join(processed_tokens)

# Apply the preprocessing function to the 'tweet' column
df['processed_tweet'] = df['tweet'].apply(preprocess_text)

# Prepare the text data
X_processed = df['processed_tweet']
y = df['class']

X_train_processed, X_test_processed, y_train_processed, y_test_processed = train_test_split(X_processed, y, test_size=0.3, random_state=42)

# Initialize vectorizer with optimal parameters
vectorizer = CountVectorizer(max_features=None, min_df=5, ngram_range=(1, 1), stop_words=None)

# Initialize AdaBoostClassifier
classifier = AdaBoostClassifier(random_state=42)

# Create a pipeline
pipe = Pipeline([
    ('count_vectorizer', vectorizer),
    ('adaboost', classifier)
])

# Start the timer
# start_time = time.time()

# Fit the pipeline on your training data
pipe.fit(X_train_processed, y_train_processed)

# Stop the timer and print the elapsed time
# elapsed_time = time.time() - start_time
# print(f"Model trained successfully. Time elapsed: {elapsed_time} seconds.")

# Use the trained model to make predictions on your test data
y_pred = pipe.predict(X_test_processed)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test_processed, y_pred)
print(f"Model accuracy: {accuracy}")

# Save the model to a file
joblib.dump(pipe, 'hate_speech_detector.joblib')

Model accuracy: 0.9011432414256894


['hate_speech_detector.joblib']