In [47]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [48]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [49]:
df = pd.read_csv("data/processed_data.csv")
df.head()

Unnamed: 0,text,label
0,We stayed for a one night getaway with family ...,0
1,Triple A rate with upgrade to view room was le...,0
2,This comes a little late as I'm finally catchi...,0
3,The Omni Chicago really delivers on all fronts...,0
4,I asked for a high floor away from the elevato...,0


In [50]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

In [51]:
def preprocess_text(text):
    text = str(text)
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

df["cleaned_text"] = df["text"].apply(preprocess_text)

In [52]:
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
tfidf_features = tfidf.fit_transform(df["cleaned_text"])

In [53]:
def get_sentiment(text):
    return TextBlob(text).sentiment.polarity

df["sentiment"] = df["cleaned_text"].apply(get_sentiment)

In [None]:
def count_exclamations(text):
    return str(text).count("!")

df["exclamation_count"] = df["text"].apply(count_exclamations)

In [None]:
X = pd.concat([
    pd.DataFrame(tfidf_features.toarray()),
    df[["sentiment", "exclamation_count"]],
], axis=1)

y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [56]:
model = XGBClassifier(eval_metric='logloss')
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.82      0.87      0.84      4314
           1       0.86      0.80      0.83      4293

    accuracy                           0.84      8607
   macro avg       0.84      0.84      0.84      8607
weighted avg       0.84      0.84      0.84      8607



In [57]:
if hasattr(model, 'feature_importances_'):
    feature_importances = sorted(
        zip(model.feature_importances_, X.columns), 
        reverse=True
    )
    print("\nTop 20 Features:")
    for importance, feature in feature_importances[:20]:
        print(f"{feature}: {importance:.4f}")


Top 20 Features:
689: 0.0201
1348: 0.0154
4927: 0.0096
3257: 0.0091
4784: 0.0088
4433: 0.0084
111: 0.0076
2088: 0.0072
4849: 0.0069
3124: 0.0064
114: 0.0062
2781: 0.0061
500: 0.0061
466: 0.0061
28: 0.0061
3575: 0.0060
128: 0.0058
4492: 0.0058
2399: 0.0056
921: 0.0056


In [58]:
import joblib
joblib.dump(model, 'models/bot_detection_model.pkl')
joblib.dump(tfidf, 'models/tfidf_vectorizer.pkl')

['models/tfidf_vectorizer.pkl']