In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
# Load the account-level dataset and show its size and class balance.
df = pd.read_csv('twitter_human_bots_dataset.csv')
account_type_counts = df['account_type'].value_counts()

print(f"Shape: {df.shape}")
print(f"Account types: {account_type_counts}")


Shape: (37438, 20)
Account types: account_type
human    25013
bot      12425
Name: count, dtype: int64


In [3]:
# Keep a single row per account id; the first occurrence wins.
df = df.drop_duplicates(subset=['id'], keep='first')

# Count-style columns: a missing value is replaced by 0.
numericals = ['favourites_count', 'followers_count', 'friends_count', 'statuses_count', 'average_tweets_per_day', 'account_age_days']
df[numericals] = df[numericals].fillna(0)

# Report how many text fields are missing before filling them.
# (A null count for the numeric columns used to be computed here as well, but it
# was overwritten before being read, so it has been removed.)
text_cols = ['description', 'location', 'screen_name']
nulls = df[text_cols].isnull().sum()
print(nulls)
df['description'] = df['description'].fillna("")
df['location'] = df['location'].fillna('unknown')

# Cast the flag columns to integers in one step, so no loop variable is left
# behind in the notebook namespace.
bool_cols = ['geo_enabled', 'verified', 'default_profile', 'default_profile_image']
df = df.astype({flag: int for flag in bool_cols})


description    7257
location          4
screen_name       0
dtype: int64


In [4]:
# Remove the URL and creation-timestamp columns from the working frame.
drop_cols = ['profile_background_image_url', 'profile_image_url', 'created_at']
# errors='ignore' keeps this cell re-runnable: `df` is rebound here, so on a second
# execution the columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=drop_cols, errors='ignore')


In [5]:
# Columns that are never used as numeric inputs: free text, identifiers,
# the language code and the target itself.
exclude_cols = ['description', 'location', 'account_type', 'id', 'screen_name', 'lang']
feature_cols = [name for name in df.columns if name not in exclude_cols]

# Numeric feature matrix over the whole, unsplit frame.
# NOTE(review): nothing in the cells visible here reads X_numerical afterwards;
# confirm whether it is still needed.
X_numerical = df[feature_cols].select_dtypes(include=[np.number]).values

# Turn the string labels into integer class ids.
y = df['account_type'].values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Stratified 80/20 split of the full frame; the text columns stay attached so
# later cells can derive text features from each split.
split_options = dict(test_size=0.2, random_state=42, stratify=y_encoded)
X_train_full, X_test_full, y_train, y_test = train_test_split(df, y_encoded, **split_options)

print(f"Train shape: {X_train_full.shape}")
print(f"Test shape: {X_test_full.shape}")


Train shape: (29950, 17)
Test shape: (7488, 17)


In [6]:
# TruncatedSVD is already imported in the notebook's import cell, so it is not
# re-imported here.

# Text features: TF-IDF over the profile description. The vocabulary is fitted
# on the training split only and then applied unchanged to the test split.
tfidf = TfidfVectorizer(max_features=500, stop_words='english')
tfidf_train = tfidf.fit_transform(X_train_full['description'])
tfidf_test = tfidf.transform(X_test_full['description'])

# Reduce the TF-IDF matrix to `num_topic` components, again fitted on train only.
num_topic = 10
svd = TruncatedSVD(n_components=num_topic, random_state=42)
svd_train = svd.fit_transform(tfidf_train)
svd_test = svd.transform(tfidf_test)

exclude_cols = ['description', 'location', 'account_type', 'id', 'screen_name', 'lang']
feature_cols = [col for col in X_train_full.columns if col not in exclude_cols]

# Resolve the numeric column names once on the training split and reuse them for
# the test split, so both matrices have the same columns in the same order.
# NOTE(review): select_dtypes keeps every numeric column that is not in
# exclude_cols, which may include columns beyond the cleaned count/flag columns
# (for example a saved row-index column) - confirm each selected name is an
# intended feature.
numeric_cols = X_train_full[feature_cols].select_dtypes(include=[np.number]).columns
X_train_numerical = X_train_full[numeric_cols].values
X_test_numerical = X_test_full[numeric_cols].values

# Final design matrices: numeric columns followed by the SVD text components.
X_train = np.hstack([X_train_numerical, svd_train])
X_test = np.hstack([X_test_numerical, svd_test])
assert X_train.shape[1] == X_test.shape[1], "train/test feature width mismatch"

print(f"Train features shape: {X_train.shape}")
print(f"Test features shape: {X_test.shape}")
print(f"Encoded labels: {label_encoder.classes_}")


Train features shape: (29950, 21)
Test features shape: (7488, 21)
Encoded labels: ['bot' 'human']


In [7]:
# Standardise every feature column. The scaling statistics are learned from the
# training matrix only and then reused for the test matrix.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Train shape: {X_train_scaled.shape}")
print(f"Test shape: {X_test_scaled.shape}")


Train shape: (29950, 21)
Test shape: (7488, 21)


In [8]:
# Hyper-parameters for the multi-layer perceptron, gathered in one place.
mlp_params = {
    'hidden_layer_sizes': (128, 64, 32),  # three hidden layers
    'alpha': 0.01,
    'learning_rate_init': 0.001,
    'max_iter': 500,
    'early_stopping': True,
    'validation_fraction': 0.1,
    'n_iter_no_change': 10,
    'random_state': 42,
    'verbose': True,  # prints the per-iteration loss / validation score
}

nn = MLPClassifier(**mlp_params)
nn.fit(X_train_scaled, y_train)
print("Training complete.")


Iteration 1, loss = 0.51789696
Validation score: 0.789983
Iteration 2, loss = 0.44528470
Validation score: 0.788648
Iteration 3, loss = 0.43150823
Validation score: 0.793322
Iteration 4, loss = 0.42710561
Validation score: 0.794324
Iteration 5, loss = 0.42094181
Validation score: 0.790317
Iteration 6, loss = 0.42015931
Validation score: 0.783639
Iteration 7, loss = 0.41808636
Validation score: 0.793656
Iteration 8, loss = 0.41557892
Validation score: 0.793990
Iteration 9, loss = 0.41234132
Validation score: 0.798664
Iteration 10, loss = 0.41247504
Validation score: 0.794324
Iteration 11, loss = 0.40967053
Validation score: 0.796995
Iteration 12, loss = 0.40813781
Validation score: 0.785643
Iteration 13, loss = 0.40645203
Validation score: 0.796327
Iteration 14, loss = 0.40690565
Validation score: 0.801002
Iteration 15, loss = 0.40532690
Validation score: 0.800334
Iteration 16, loss = 0.40342363
Validation score: 0.797329
Iteration 17, loss = 0.40316445
Validation score: 0.800668
Iterat

In [9]:
# Score the fitted network on the training split.
y_train_pred = nn.predict(X_train_scaled)
train_acc = accuracy_score(y_train, y_train_pred)

# Score it on the held-out test split.
y_test_pred = nn.predict(X_test_scaled)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

# Per-class precision/recall, labelled with the original class names.
test_report = classification_report(y_test, y_test_pred, target_names=label_encoder.classes_)
print("\nClassification Report (Test):")
print(test_report)

# Confusion matrix as returned by scikit-learn for (y_test, y_test_pred).
test_confusion = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix (Test):")
print(test_confusion)


Train Accuracy: 0.8100
Test Accuracy: 0.8010

Classification Report (Test):
              precision    recall  f1-score   support

         bot       0.70      0.71      0.70      2485
       human       0.85      0.85      0.85      5003

    accuracy                           0.80      7488
   macro avg       0.78      0.78      0.78      7488
weighted avg       0.80      0.80      0.80      7488


Confusion Matrix (Test):
[[1761  724]
 [ 766 4237]]
