In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [25]:
df = pd.read_csv('twitter_human_bots_dataset.csv')
print(f"Shape: {df.shape}")
print(f"Account types: {df['account_type'].value_counts()}")


Shape: (37438, 20)
Account types: account_type
human    25013
bot      12425
Name: count, dtype: int64


In [None]:
df = df.drop_duplicates(subset=['id'], keep='first')

numericals = ['favourites_count', 'followers_count', 'friends_count', 'statuses_count', 'average_tweets_per_day', 'account_age_days']
nulls = df[numericals].isnull().sum()
df[numericals] = df[numericals].fillna(0)

text_cols = ['description', 'location', 'screen_name']
nulls = df[text_cols].isnull().sum()
print(nulls)
df['description'] = df['description'].fillna("")
df['location'] = df['location'].fillna('unknown')

bool_cols = ['geo_enabled', 'verified', 'default_profile', 'default_profile_image']
for col in bool_cols:
    df[col] = df[col].astype(int)


Shape after preprocessing: (37438, 14)


In [None]:
drop_cols = ['profile_background_image_url', 'profile_image_url', 'created_at']
df = df.drop(columns=drop_cols)


Final shape: (37438, 24)


In [None]:
from sklearn.decomposition import TruncatedSVD

tfidf = TfidfVectorizer(max_features=500, stop_words='english')
tfidf_mat = tfidf.fit_transform(df['description'])

num_topic = 10
svd = TruncatedSVD(n_components=num_topic, random_state=42)
svd_matrix = svd.fit_transform(tfidf_mat)

svd_df = pd.DataFrame(
    svd_matrix, 
    columns=[f"topic_{i}" for i in range(num_topic)]
)

df_final = pd.concat([df.reset_index(drop=True), svd_df.reset_index(drop=True)], axis=1)

print(f"Added {num_topic} text component features.")
print(df_final.shape)


In [None]:
exclude_cols = ['description', 'location', 'account_type', 'id', 'screen_name', 'lang']
feature_cols = [col for col in df_final.columns if col not in exclude_cols]

X = df_final[feature_cols].select_dtypes(include=[np.number]).values
y = df_final['account_type'].values

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f"Features shape: {X.shape}")
print(f"Target distribution: {pd.Series(y).value_counts()}")
print(f"Encoded labels: {label_encoder.classes_}")


Features shape: (37438, 21)
Target distribution: human    25013
bot      12425
Name: count, dtype: int64
Encoded labels: ['bot' 'human']


In [29]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")


Train shape: (29950, 21)
Test shape: (7488, 21)


In [30]:
nn = MLPClassifier(
    hidden_layer_sizes=(128, 64, 32),
    max_iter=500,
    random_state=42,
    alpha=0.01,
    learning_rate_init=0.001,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=10,
    verbose=True
)

nn.fit(X_train, y_train)
print("Training complete.")


Iteration 1, loss = 0.51739615
Validation score: 0.790985
Iteration 2, loss = 0.44575187
Validation score: 0.791653
Iteration 3, loss = 0.43201902
Validation score: 0.794992
Iteration 4, loss = 0.42718432
Validation score: 0.790317
Iteration 5, loss = 0.42113917
Validation score: 0.792321
Iteration 6, loss = 0.41976982
Validation score: 0.783639
Iteration 7, loss = 0.41756921
Validation score: 0.790985
Iteration 8, loss = 0.41485565
Validation score: 0.792321
Iteration 9, loss = 0.41173333
Validation score: 0.799666
Iteration 10, loss = 0.41178816
Validation score: 0.793322
Iteration 11, loss = 0.40909501
Validation score: 0.796995
Iteration 12, loss = 0.40762223
Validation score: 0.788314
Iteration 13, loss = 0.40535187
Validation score: 0.797329
Iteration 14, loss = 0.40605819
Validation score: 0.800000
Iteration 15, loss = 0.40449964
Validation score: 0.799666
Iteration 16, loss = 0.40212353
Validation score: 0.798331
Iteration 17, loss = 0.40162846
Validation score: 0.801669
Iterat

In [31]:
y_train_pred = nn.predict(X_train)
y_test_pred = nn.predict(X_test)

train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print("\nClassification Report (Test):")
print(classification_report(y_test, y_test_pred, target_names=label_encoder.classes_))
print("\nConfusion Matrix (Test):")
print(confusion_matrix(y_test, y_test_pred))


Train Accuracy: 0.8141
Test Accuracy: 0.8030

Classification Report (Test):
              precision    recall  f1-score   support

         bot       0.70      0.72      0.71      2485
       human       0.86      0.84      0.85      5003

    accuracy                           0.80      7488
   macro avg       0.78      0.78      0.78      7488
weighted avg       0.80      0.80      0.80      7488


Confusion Matrix (Test):
[[1787  698]
 [ 777 4226]]
