In [11]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


In [12]:
# Read the raw dataset from disk, then report its dimensions and how many
# rows fall under each value of the `account_type` column.
df = pd.read_csv('twitter_human_bots_dataset.csv')

dataset_shape = df.shape
account_type_counts = df['account_type'].value_counts()
print(f"Shape: {dataset_shape}")
print(f"Account types: {account_type_counts}")


Shape: (37438, 20)
Account types: account_type
human    25013
bot      12425
Name: count, dtype: int64


In [13]:
# Keep a single row per account id; the first occurrence wins.
df = df.drop_duplicates(subset=['id'], keep='first')

# Count-style features: a missing value is replaced with zero.
numericals = ['favourites_count', 'followers_count', 'friends_count', 'statuses_count', 'average_tweets_per_day', 'account_age_days']
df[numericals] = df[numericals].fillna(0)

# Show how many text fields are missing before imputing them.
# (The earlier per-numeric null count was assigned to `nulls` and immediately
# overwritten without being read, so it has been removed.)
text_cols = ['description', 'location', 'screen_name']
nulls = df[text_cols].isnull().sum()
print(nulls)
df['description'] = df['description'].fillna("")
df['location'] = df['location'].fillna('unknown')

# Cast every flag column to an integer in one call instead of a Python loop.
bool_cols = ['geo_enabled', 'verified', 'default_profile', 'default_profile_image']
df = df.astype({flag_col: int for flag_col in bool_cols})


description    7257
location          4
screen_name       0
dtype: int64


In [14]:
# Remove columns that are not carried forward into the modelling steps.
drop_cols = [
    'profile_background_image_url',
    'profile_image_url',
    'created_at',
]
df = df.drop(drop_cols, axis='columns')


In [15]:
# Columns that are kept out of the numeric feature matrix.
exclude_cols = ['description', 'location', 'account_type', 'id', 'screen_name', 'lang']
feature_cols = [name for name in df.columns if name not in exclude_cols]

# Numeric view of the remaining columns, plus the raw string labels.
numeric_frame = df[feature_cols].select_dtypes(include=[np.number])
X_numerical = numeric_frame.values
y = df['account_type'].values

# Turn the class names into integer codes.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

# Stratified 80/20 hold-out. The whole DataFrame is split (not just the
# numeric matrix) so that text columns remain available per partition.
holdout_parts = train_test_split(
    df,
    y_encoded,
    stratify=y_encoded,
    test_size=0.2,
    random_state=42,
)
X_train_full, X_test_full, y_train_full, y_test = holdout_parts

print(f"Train shape: {X_train_full.shape}")
print(f"Test shape: {X_test_full.shape}")


Train shape: (29950, 17)
Test shape: (7488, 17)


In [16]:
# TruncatedSVD is already imported in the notebook's import cell, so the
# duplicate mid-notebook import has been dropped.

# TF-IDF over the profile description. The vocabulary is fitted on the
# training partition only and then applied unchanged to the test partition.
tfidf = TfidfVectorizer(max_features=500, stop_words='english')
tfidf_train_full = tfidf.fit_transform(X_train_full['description'])
tfidf_test = tfidf.transform(X_test_full['description'])

# Reduce the sparse TF-IDF matrix to `num_topic` dense components.
num_topic = 10
svd = TruncatedSVD(n_components=num_topic, random_state=42)
svd_train_full = svd.fit_transform(tfidf_train_full)
svd_test = svd.transform(tfidf_test)

exclude_cols = ['description', 'location', 'account_type', 'id', 'screen_name', 'lang']
feature_cols = [col for col in X_train_full.columns if col not in exclude_cols]

# Resolve the numeric column list once (from the training partition) and use
# that exact list for both partitions, so train and test are guaranteed to
# have the same columns in the same order.
# NOTE(review): the recorded output shows 21 features = 10 SVD components +
# 11 numeric columns, but only 10 numeric/boolean columns are named in the
# cleaning cell. Confirm that the extra column is a genuine feature and not
# an index-like column carried over from the CSV.
numeric_feature_cols = (
    X_train_full[feature_cols].select_dtypes(include=[np.number]).columns.tolist()
)
X_train_full_numerical = X_train_full[numeric_feature_cols].values
X_test_numerical = X_test_full[numeric_feature_cols].values

# Final design matrices: numeric columns followed by the SVD components.
X_train_full_combined = np.hstack([X_train_full_numerical, svd_train_full])
X_test_combined = np.hstack([X_test_numerical, svd_test])

print(f"Train features shape: {X_train_full_combined.shape}")
print(f"Test features shape: {X_test_combined.shape}")
print(f"Encoded labels: {label_encoder.classes_}")


Train features shape: (29950, 21)
Test features shape: (7488, 21)
Encoded labels: ['bot' 'human']


In [17]:
# Standardise every feature column. The scaler's statistics come from the
# training matrix and are then applied unchanged to the test matrix.
scaler = StandardScaler()
scaler.fit(X_train_full_combined)
X_train_full_scaled = scaler.transform(X_train_full_combined)
X_test_scaled = scaler.transform(X_test_combined)

# Hold back 10% of the training rows (stratified by label) for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full_scaled,
    y_train_full,
    stratify=y_train_full,
    test_size=0.1,
    random_state=42,
)

print(f"Train shape: {X_train.shape}")
print(f"Val shape: {X_val.shape}")
print(f"Test shape: {X_test_scaled.shape}")


Train shape: (26955, 21)
Val shape: (2995, 21)
Test shape: (7488, 21)


In [18]:
class BotPredictorDataset(Dataset):
    """In-memory dataset that pairs a feature matrix with integer labels.

    Features are stored as a float tensor in ``self.X`` and labels as a long
    tensor in ``self.y``; indexing returns the matching ``(features, label)``
    pair.
    """

    def __init__(self, X, y):
        """Convert ``X`` to a FloatTensor and ``y`` to a LongTensor."""
        features, labels = torch.FloatTensor(X), torch.LongTensor(y)
        self.X = features
        self.y = labels

    def __len__(self):
        """Number of samples, i.e. the size of the first feature dimension."""
        return self.X.size(0)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

class AdvancedNeuralNetwork(nn.Module):
    """Fully connected classifier built from repeated hidden stages.

    Each hidden stage is ``Linear -> BatchNorm1d -> ReLU -> Dropout``; a final
    ``Linear`` layer maps the last hidden width to ``num_classes`` logits.
    All layers live in ``self.network`` (an ``nn.Sequential``) in creation
    order, so ``state_dict`` keys are ``network.<index>.*``.

    Args:
        input_size: Number of input features per sample.
        hidden_sizes: Iterable of hidden-layer widths, applied in order.
            The default is an immutable tuple so it cannot be shared and
            mutated across instances.
        dropout_rate: Dropout probability used after every hidden stage.
        num_classes: Number of output logits (defaults to 2).
    """

    def __init__(self, input_size, hidden_sizes=(512, 256, 128, 64), dropout_rate=0.3, num_classes=2):
        super().__init__()
        layers = []
        prev_size = input_size

        for hidden_size in hidden_sizes:
            # Modules are created in the same order as before so parameter
            # initialisation and checkpoint keys are unchanged.
            layers.extend([
                nn.Linear(prev_size, hidden_size),
                nn.BatchNorm1d(hidden_size),
                nn.ReLU(),
                nn.Dropout(dropout_rate),
            ])
            prev_size = hidden_size

        # Output head: raw logits, one per class.
        layers.append(nn.Linear(prev_size, num_classes))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits produced by running ``x`` through the network."""
        return self.network(x)

# Seed torch's global RNG before anything random is created. Weight
# initialisation, dropout masks and the DataLoader's shuffle order all draw
# from it, so without a seed every run of the notebook gives different
# results. 42 matches the random_state used for the sklearn steps.
SEED = 42
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Mini-batches for optimisation; order is reshuffled every epoch.
train_dataset = BotPredictorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

model = AdvancedNeuralNetwork(input_size=X_train.shape[1], hidden_sizes=[512, 256, 128, 64], dropout_rate=0.3).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Halve the learning rate when the monitored loss has not improved for 5 epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

# Whole-split tensors on the target device, used for full-batch evaluation.
X_train_tensor = torch.FloatTensor(X_train).to(device)
X_val_tensor = torch.FloatTensor(X_val).to(device)
y_train_tensor = torch.LongTensor(y_train).to(device)
y_val_tensor = torch.LongTensor(y_val).to(device)
y_test_tensor = torch.LongTensor(y_test).to(device)

print("Model architecture:")
print(model)


Using device: cpu
Model architecture:
AdvancedNeuralNetwork(
  (network): Sequential(
    (0): Linear(in_features=21, out_features=512, bias=True)
    (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=512, out_features=256, bias=True)
    (5): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=256, out_features=128, bias=True)
    (9): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Dropout(p=0.3, inplace=False)
    (12): Linear(in_features=128, out_features=64, bias=True)
    (13): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (14): ReLU()
    (15): Dropout(p=0.3, inplace=False)
    (16): Linear(in_features=64, out_features=2, bias=True)
  )
)


In [19]:
num_epochs = 100
best_val_loss = float('inf')
patience = 10          # epochs without validation improvement before stopping
patience_counter = 0
checkpoint_path = 'best_model_advanced.pth'

for epoch in range(num_epochs):
    # --- optimisation pass over shuffled mini-batches ---
    # (The running sum of batch losses was never read, so it is no longer
    # accumulated; the reported train loss is the full-split value below.)
    model.train()
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

    # --- full-split evaluation in eval mode, without gradients ---
    model.eval()
    with torch.no_grad():
        train_outputs = model(X_train_tensor)
        train_loss_val = criterion(train_outputs, y_train_tensor).item()

        val_outputs = model(X_val_tensor)
        val_loss = criterion(val_outputs, y_val_tensor).item()
        val_pred = torch.argmax(val_outputs, dim=1)
        val_acc = accuracy_score(y_val_tensor.cpu().numpy(), val_pred.cpu().numpy())

    # The scheduler lowers the learning rate when validation loss plateaus.
    scheduler.step(val_loss)

    if (epoch + 1) % 10 == 0:
        current_lr = optimizer.param_groups[0]['lr']
        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss_val:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, LR: {current_lr:.6f}")

    # Early stopping: checkpoint on every improvement, otherwise count down.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), checkpoint_path)
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

# Restore the best weights. map_location makes the load work even when the
# checkpoint was written on a different device than the current one.
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
print("Training complete.")


Epoch [10/100], Train Loss: 0.4126, Val Loss: 0.4105, Val Acc: 0.8013, LR: 0.001000
Epoch [20/100], Train Loss: 0.4051, Val Loss: 0.4123, Val Acc: 0.7943, LR: 0.001000
Early stopping at epoch 26
Training complete.


In [20]:
# Move the scaled test matrix onto the same device as the model.
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)

# Forward passes in eval mode with gradient tracking switched off.
model.eval()
with torch.no_grad():
    train_outputs = model(X_train_tensor)
    test_outputs = model(X_test_tensor)

# Predicted class = index of the largest logit in each row.
y_train_pred = train_outputs.argmax(dim=1).cpu().numpy()
y_test_pred = test_outputs.argmax(dim=1).cpu().numpy()

train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

# Per-class metrics and the raw confusion counts on the held-out test set.
test_report = classification_report(y_test, y_test_pred, target_names=label_encoder.classes_)
test_confusion = confusion_matrix(y_test, y_test_pred)
print("\nClassification Report (Test):")
print(test_report)
print("\nConfusion Matrix (Test):")
print(test_confusion)


Train Accuracy: 0.8031
Test Accuracy: 0.8053

Classification Report (Test):
              precision    recall  f1-score   support

         bot       0.70      0.71      0.71      2485
       human       0.86      0.85      0.85      5003

    accuracy                           0.81      7488
   macro avg       0.78      0.78      0.78      7488
weighted avg       0.81      0.81      0.81      7488


Confusion Matrix (Test):
[[1774  711]
 [ 747 4256]]
