In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [46]:
df = pd.read_csv('twitter_human_bots_dataset.csv')
print(f"Shape: {df.shape}")
print(f"Account types: {df['account_type'].value_counts()}")


Shape: (37438, 20)
Account types: account_type
human    25013
bot      12425
Name: count, dtype: int64


In [47]:
df = df.drop_duplicates(subset=['id'], keep='first')

numericals = ['favourites_count', 'followers_count', 'friends_count', 'statuses_count', 'average_tweets_per_day', 'account_age_days']
nulls = df[numericals].isnull().sum()
df[numericals] = df[numericals].fillna(0)

text_cols = ['description', 'location', 'screen_name']
nulls = df[text_cols].isnull().sum()
print(nulls)
df['description'] = df['description'].fillna("")
df['location'] = df['location'].fillna('unknown')

bool_cols = ['geo_enabled', 'verified', 'default_profile', 'default_profile_image']
for col in bool_cols:
    df[col] = df[col].astype(int)


description    7257
location          4
screen_name       0
dtype: int64


In [48]:
drop_cols = ['profile_background_image_url', 'profile_image_url', 'created_at']
df = df.drop(columns=drop_cols)


In [49]:
from sklearn.decomposition import TruncatedSVD

tfidf = TfidfVectorizer(max_features=500, stop_words='english')
tfidf_mat = tfidf.fit_transform(df['description'])

num_topic = 10
svd = TruncatedSVD(n_components=num_topic, random_state=42)
svd_matrix = svd.fit_transform(tfidf_mat)

svd_df = pd.DataFrame(
    svd_matrix, 
    columns=[f"topic_{i}" for i in range(num_topic)]
)

df_final = pd.concat([df.reset_index(drop=True), svd_df.reset_index(drop=True)], axis=1)

print(f"Added {num_topic} text component features.")
print(df_final.shape)


Added 10 text component features.
(37438, 27)


In [50]:
exclude_cols = ['description', 'location', 'account_type', 'id', 'screen_name', 'lang']
feature_cols = [col for col in df_final.columns if col not in exclude_cols]

X = df_final[feature_cols].select_dtypes(include=[np.number]).values
y = df_final['account_type'].values

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f"Features shape: {X.shape}")
print(f"Target distribution: {pd.Series(y).value_counts()}")
print(f"Encoded labels: {label_encoder.classes_}")


Features shape: (37438, 21)
Target distribution: human    25013
bot      12425
Name: count, dtype: int64
Encoded labels: ['bot' 'human']


In [51]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")


Train shape: (29950, 21)
Test shape: (7488, 21)


In [52]:
rfc = RandomForestClassifier(n_estimators=100, random_state=42, criterion='entropy')


rfc.fit(X_train, y_train)
print("Training complete.")

Training complete.


In [53]:
# print names of features with importances, sorted by importance
sorted_indices = np.argsort(rfc.feature_importances_)[::-1]
for i in sorted_indices:
    print(f"{feature_cols[i]}: {rfc.feature_importances_[i]:.4f}")


followers_count: 0.1665
favourites_count: 0.1384
friends_count: 0.1051
statuses_count: 0.0885
average_tweets_per_day: 0.0826
account_age_days: 0.0791
Unnamed: 0: 0.0435
verified: 0.0393
topic_0: 0.0290
geo_enabled: 0.0241
topic_1: 0.0229
topic_8: 0.0228
topic_9: 0.0221
topic_2: 0.0218
topic_4: 0.0212
topic_7: 0.0205
topic_3: 0.0193
topic_5: 0.0191
topic_6: 0.0191
default_profile: 0.0131
default_profile_image: 0.0020


In [54]:
y_train_pred = rfc.predict(X_train)
y_test_pred = rfc.predict(X_test)

train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print("\nClassification Report (Test):")
print(classification_report(y_test, y_test_pred, target_names=label_encoder.classes_))
print("\nConfusion Matrix (Test):")
print(confusion_matrix(y_test, y_test_pred))


Train Accuracy: 1.0000
Test Accuracy: 0.8823

Classification Report (Test):
              precision    recall  f1-score   support

         bot       0.87      0.76      0.81      2485
       human       0.89      0.94      0.91      5003

    accuracy                           0.88      7488
   macro avg       0.88      0.85      0.86      7488
weighted avg       0.88      0.88      0.88      7488


Confusion Matrix (Test):
[[1891  594]
 [ 287 4716]]
