1. Loads labeled text data (examples of messages labeled as human, content_bot, follower_bot, etc.).
2. Cleans and maps labels to numbers.
3. Splits data into training and test sets.
4. Tries three different feature+model approaches:
    - Character-level n-gram features with XGBoost.
    - Hand-crafted/custom numeric features with XGBoost.
    - Word pattern features with a Random Forest.
5. Compares accuracies on the test set and saves the best model and its vectorizer/extractor for later use.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import re
import numpy as np

In [3]:
# Load the labelled messages, discard rows missing either the text or the
# label, and normalise label spelling (surrounding whitespace removed, lower-cased).
df = (
    pd.read_csv("text_bot_training.csv")
    .dropna(subset=["text", "label"])
    .assign(label=lambda frame: frame["label"].str.strip().str.lower())
)

In [4]:
# Label mapping: class name -> integer id used as the training target.
label_map = {
    "human": 0,
    "content_bot": 1,
    "follower_bot": 2,
    "spam_bot": 3,
    "customer_service_bot": 4
}

# Series.map yields NaN for any label that is not a key of label_map, which
# turns the whole column into floats. Drop those rows, then cast back to int
# so the targets are always integer class ids.
# NOTE: this cell is not idempotent - re-running it on already-mapped labels
# maps every value to NaN and drops every row; re-run the loading cell first.
df = (
    df.assign(label=df["label"].map(label_map))
    .dropna(subset=["label"])
    .astype({"label": int})
)

In [5]:
# Summarise what survived cleaning: frame dimensions and rows per class id.
label_counts = df["label"].value_counts().sort_index()

print("Cleaned data shape:", df.shape)
print("Label distribution:")
print(label_counts)

Cleaned data shape: (250, 2)
Label distribution:
label
0    50
1    50
2    50
3    50
4    50
Name: count, dtype: int64


In [6]:
# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
texts, labels = df["text"], df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

In [7]:
print("CHARACTER-LEVEL FEATURES")

# Counts of character 3- to 5-grams over lower-cased text, with the
# vocabulary capped at 1000 entries.
char_ngram_settings = dict(
    analyzer='char',
    ngram_range=(3, 5),
    max_features=1000,
    lowercase=True,
)
char_vectorizer = CountVectorizer(**char_ngram_settings)

# The vocabulary is learned from the training texts only; the test texts are
# projected onto that same vocabulary.
X_train_char = char_vectorizer.fit_transform(X_train)
X_test_char = char_vectorizer.transform(X_test)

print(f"Character features shape: {X_train_char.shape}")

CHARACTER-LEVEL FEATURES
Character features shape: (200, 1000)


In [8]:
# Multi-class XGBoost classifier trained on the character n-gram counts.
char_xgb_params = {
    "objective": "multi:softprob",
    "num_class": 5,
    "eval_metric": "mlogloss",
    "random_state": 42,
    "n_estimators": 100,
}
char_model = XGBClassifier(**char_xgb_params)
char_model.fit(X_train_char, y_train)

# Score on the held-out texts.
y_pred_char = char_model.predict(X_test_char)
char_accuracy = accuracy_score(y_test, y_pred_char)
print("Character-level Accuracy:", char_accuracy)

char_report = classification_report(
    y_test, y_pred_char, target_names=label_map.keys()
)
print("Character-level Classification Report:\n", char_report)

Character-level Accuracy: 0.84
Character-level Classification Report:
                       precision    recall  f1-score   support

               human       1.00      1.00      1.00        10
         content_bot       1.00      1.00      1.00        10
        follower_bot       0.90      0.90      0.90        10
            spam_bot       0.70      0.70      0.70        10
customer_service_bot       0.60      0.60      0.60        10

            accuracy                           0.84        50
           macro avg       0.84      0.84      0.84        50
        weighted avg       0.84      0.84      0.84        50



In [9]:
print("\nCUSTOM FEATURES")
def extract_bot_features(texts):
    """Turn an iterable of messages into a matrix of hand-crafted numeric features.

    Every item is coerced with ``str()`` once, so non-string values (numbers,
    ``None``) are featurised from their string form instead of raising.

    Parameters
    ----------
    texts : iterable
        Messages to featurise; items may be of any type.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_texts, 15)``. Columns, in order:
        text_length, word_count, char_density, underscore_count,
        has_underscore, has_http, has_digits, bot_keywords,
        unique_word_ratio, avg_word_length, uppercase_ratio, follower_terms,
        spam_terms, content_terms, service_terms. Empty input yields shape
        ``(0, 15)``.
    """
    # Must equal the number of entries in feature_dict below.
    n_features = 15

    # Keyword vocabularies, built once per call rather than once per word.
    # Matching is exact on whitespace-separated, lower-cased tokens, so a
    # token with attached punctuation (e.g. "free!") does not count.
    bot_keywords = frozenset([
        'bot', 'auto', 'follow', 'like', 'click', 'free', 'win', 'update',
        'news', 'alert', 'offer', 'deal', 'prize', 'boost', 'growth',
        'support', 'help', 'service', 'customer', 'tracking', 'order'
    ])
    follower_terms = frozenset(['follow', 'follower', 'growth', 'boost', 'like', 'instagram', 'twitter'])
    spam_terms = frozenset(['free', 'win', 'prize', 'click', 'offer', 'deal', 'limited'])
    content_terms = frozenset(['news', 'update', 'alert', 'trend', 'daily', 'report'])
    service_terms = frozenset(['support', 'help', 'service', 'customer', 'order', 'tracking'])

    features = []
    for raw_text in texts:
        # Coerce once so every feature below sees the same string.
        text = str(raw_text)
        text_lower = text.lower()
        words = text_lower.split()

        feature_dict = {
            # Structural features
            'text_length': len(text),
            'word_count': len(words),
            # Total characters divided by non-space characters (max() guards against division by zero).
            'char_density': len(text) / max(1, len(text.replace(' ', ''))),

            # Bot pattern indicators
            'underscore_count': text_lower.count('_'),
            'has_underscore': int('_' in text_lower),
            'has_http': int('http' in text_lower),
            'has_digits': int(any(char.isdigit() for char in text)),

            # Keyword patterns
            'bot_keywords': sum(1 for word in words if word in bot_keywords),

            # Linguistic features
            'unique_word_ratio': len(set(words)) / max(1, len(words)),
            'avg_word_length': np.mean([len(word) for word in words]) if words else 0,
            'uppercase_ratio': sum(1 for char in text if char.isupper()) / max(1, len(text)),

            # Specific bot type indicators
            'follower_terms': sum(1 for word in words if word in follower_terms),
            'spam_terms': sum(1 for word in words if word in spam_terms),
            'content_terms': sum(1 for word in words if word in content_terms),
            'service_terms': sum(1 for word in words if word in service_terms),
        }
        features.append(list(feature_dict.values()))

    # reshape keeps the result 2-D even when no texts were supplied.
    return np.array(features, dtype=float).reshape(-1, n_features)


CUSTOM FEATURES


In [10]:
# Build the hand-crafted feature matrices for both partitions.
X_train_custom, X_test_custom = (
    extract_bot_features(X_train),
    extract_bot_features(X_test),
)

print(f"Custom features shape: {X_train_custom.shape}")

# XGBoost with library defaults (only the seed is fixed) on the custom features.
custom_model = XGBClassifier(random_state=42)
custom_model.fit(X_train_custom, y_train)

# Score on the held-out texts.
y_pred_custom = custom_model.predict(X_test_custom)
custom_accuracy = accuracy_score(y_test, y_pred_custom)
print("Custom Features Accuracy:", custom_accuracy)

custom_report = classification_report(
    y_test, y_pred_custom, target_names=label_map.keys()
)
print("Custom Features Classification Report:\n", custom_report)

Custom features shape: (200, 15)
Custom Features Accuracy: 0.58
Custom Features Classification Report:
                       precision    recall  f1-score   support

               human       1.00      1.00      1.00        10
         content_bot       0.46      0.60      0.52        10
        follower_bot       0.57      0.40      0.47        10
            spam_bot       0.56      0.50      0.53        10
customer_service_bot       0.36      0.40      0.38        10

            accuracy                           0.58        50
           macro avg       0.59      0.58      0.58        50
        weighted avg       0.59      0.58      0.58        50



In [11]:
print("\nSIMPLE WORD PATTERNS")

# Word uni- and bi-gram counts with English stop words removed. The custom
# token pattern keeps tokens of three or more characters drawn from letters
# and underscores, so underscore-joined terms stay as single tokens.
pattern_settings = dict(
    max_features=200,
    ngram_range=(1, 2),
    stop_words='english',
    token_pattern=r'(?u)\b[a-z_][a-z_]{2,}\b',
)
pattern_vectorizer = CountVectorizer(**pattern_settings)

# Vocabulary comes from the training texts only.
X_train_pattern = pattern_vectorizer.fit_transform(X_train)
X_test_pattern = pattern_vectorizer.transform(X_test)

print(f"Pattern features shape: {X_train_pattern.shape}")
print("Sample pattern features:", pattern_vectorizer.get_feature_names_out()[:20])


SIMPLE WORD PATTERNS
Pattern features shape: (200, 200)
Sample pattern features: ['account_assist' 'account_assist password_reset' 'account_transfer'
 'account_transfer content_backup' 'account_verification'
 'account_verification security_check' 'achievement_badges' 'act_fast'
 'act_fast last_chance' 'adventure' 'advertising_profits'
 'advertising_profits monetization_strategies' 'afternoon'
 'afternoon hiking' 'aggregation_services' 'amazing' 'auto_news_update'
 'auto_news_update tech_trends' 'auto_posting'
 'auto_posting content_scheduling']


In [12]:
# Random forest on the word-pattern counts.
pattern_model = RandomForestClassifier(n_estimators=100, random_state=42)
pattern_model.fit(X_train_pattern, y_train)
y_pred_pattern = pattern_model.predict(X_test_pattern)
print("Pattern-based Accuracy:", accuracy_score(y_test, y_pred_pattern))
# zero_division=0 reports 0.0 for classes the model never predicts - the same
# value the default produces - without emitting a warning for each metric.
print(
    "Pattern-based Classification Report:\n",
    classification_report(
        y_test,
        y_pred_pattern,
        target_names=label_map.keys(),
        zero_division=0,
    ),
)

Pattern-based Accuracy: 0.3
Pattern-based Classification Report:
                       precision    recall  f1-score   support

               human       1.00      0.50      0.67        10
         content_bot       0.00      0.00      0.00        10
        follower_bot       0.00      0.00      0.00        10
            spam_bot       0.22      1.00      0.36        10
customer_service_bot       0.00      0.00      0.00        10

            accuracy                           0.30        50
           macro avg       0.24      0.30      0.21        50
        weighted avg       0.24      0.30      0.21        50



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
# Choose the best approach and save
# Score each candidate once on the held-out set and keep the highest value.
acc_char = accuracy_score(y_test, y_pred_char)
acc_custom = accuracy_score(y_test, y_pred_custom)
acc_pattern = accuracy_score(y_test, y_pred_pattern)
best_accuracy = max(acc_char, acc_custom, acc_pattern)

# Ties are resolved in the order: character-level, custom features, pattern.
if acc_char == best_accuracy:
    print("\nUsing CHARACTER-LEVEL model")
    best_vectorizer = char_vectorizer
    joblib.dump(char_model, "best_model_xgb.pkl")
    joblib.dump(best_vectorizer, "vectorizer.pkl")
elif acc_custom == best_accuracy:
    print("\nUsing CUSTOM FEATURES model")
    best_vectorizer = None
    joblib.dump(custom_model, "best_model_xgb.pkl")
    # The custom approach has no fitted vectorizer; the extraction function
    # itself is stored instead.
    # NOTE(review): plain functions are pickled by reference (module + name),
    # so loading this file presumably needs extract_bot_features to be
    # defined under the same name in the loading process - confirm.
    with open("feature_extractor.pkl", "wb") as extractor_file:
        joblib.dump(extract_bot_features, extractor_file)
else:
    print("\nUsing PATTERN-BASED model")
    best_vectorizer = pattern_vectorizer
    joblib.dump(pattern_model, "best_model_xgb.pkl")
    joblib.dump(best_vectorizer, "vectorizer.pkl")

print(f"\nBest accuracy: {best_accuracy:.3f}")
print("Model saved successfully!")


Using CHARACTER-LEVEL model

Best accuracy: 0.840
Model saved successfully!
