In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
df = pd.read_csv('twitter_human_bots_dataset.csv')
print(f"Shape: {df.shape}")
print(f"Account types: {df['account_type'].value_counts()}")


Shape: (37438, 20)
Account types: account_type
human    25013
bot      12425
Name: count, dtype: int64


In [3]:
df = df.drop_duplicates(subset=['id'], keep='first')

numericals = ['favourites_count', 'followers_count', 'friends_count', 'statuses_count', 'average_tweets_per_day', 'account_age_days']
nulls = df[numericals].isnull().sum()
df[numericals] = df[numericals].fillna(0)

text_cols = ['description', 'location', 'screen_name']
nulls = df[text_cols].isnull().sum()
print(nulls)
df['description'] = df['description'].fillna("")
df['location'] = df['location'].fillna('unknown')

bool_cols = ['geo_enabled', 'verified', 'default_profile', 'default_profile_image']
for col in bool_cols:
    df[col] = df[col].astype(int)


description    7257
location          4
screen_name       0
dtype: int64


In [4]:
drop_cols = ['profile_background_image_url', 'profile_image_url', 'created_at']
df = df.drop(columns=drop_cols)


In [6]:
exclude_cols = ['description', 'location', 'account_type', 'id', 'screen_name', 'lang']
feature_cols = [col for col in df.columns if col not in exclude_cols]

X = df
y = df['account_type']

print(f"Features shape: {X.shape}")
print(f"Target distribution: {pd.Series(y).value_counts()}")


Features shape: (37438, 17)
Target distribution: account_type
human    25013
bot      12425
Name: count, dtype: int64


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")


Train shape: (29950, 17)
Test shape: (7488, 17)


In [94]:
def predict(df):
    return np.where((df['average_tweets_per_day'] > 5) | (df['followers_count'] < 50), 'bot', 'human')

y_train_pred = predict(X_train)
y_test_pred = predict(X_test)


In [95]:
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print("\nClassification Report (Test):")
print(classification_report(y_test, y_test_pred))
print("\nConfusion Matrix (Test):")
print(confusion_matrix(y_test, y_test_pred))


Train Accuracy: 0.6263
Test Accuracy: 0.6259

Classification Report (Test):
              precision    recall  f1-score   support

         bot       0.46      0.76      0.57      2485
       human       0.82      0.56      0.67      5003

    accuracy                           0.63      7488
   macro avg       0.64      0.66      0.62      7488
weighted avg       0.70      0.63      0.64      7488


Confusion Matrix (Test):
[[1890  595]
 [2206 2797]]
