In [1]:
import pandas as pd
import numpy as np
import math

from keras.layers import Embedding, Dense, LSTM, Dense, Input, concatenate
from keras.models import Model
from keras.utils import plot_model
from sklearn.metrics import precision_score, roc_auc_score, recall_score, f1_score
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
import pandas as pd

# Load the three social-spambot user tables and stack them into one frame with a
# fresh 0..n-1 index.
# NOTE(review): the original second entry started with a stray quote (a
# SyntaxError) and repeated the social_spambots_1 path, which would have loaded
# the same accounts twice. social_spambots_2 is assumed to be the intended file
# -- confirm against the dataset layout.
bot_accounts = pd.concat([pd.read_csv('/input/cresci-2017.csv/social_spambots_1/users.csv'),
                          pd.read_csv('/input/cresci-2017.csv/social_spambots_2/users.csv'),
                          pd.read_csv('/content/gdrive/MyDrive/data/social_spambots_3.csv')]).reset_index(drop=True)
clean_accounts = pd.read_csv('/content/gdrive/MyDrive/data/geniune_accounts.csv')

# Raw columns needed to derive the model features; everything else is discarded.
requiredColumns = ['screen_name', 'created_at', 'updated', 'location', 'verified', 'statuses_count', 'friends_count',
                   'followers_count', 'favourites_count', 'default_profile_image', 'profile_use_background_image',
                   'protected', 'default_profile']

# Take explicit copies so that later column assignments operate on independent
# frames rather than on selections of the raw tables.
bot_accounts = bot_accounts[requiredColumns].copy()
clean_accounts = clean_accounts[requiredColumns].copy()

def clean_df(df):
    """Build the model's feature table from a raw user-profile frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``screen_name``, ``created_at``, ``updated``,
        ``location``, ``verified``, ``statuses_count``, ``friends_count``,
        ``followers_count``, ``favourites_count``, ``default_profile_image``,
        ``profile_use_background_image``, ``protected`` and ``default_profile``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``username``, ``age``, ``has_location``,
        ``is_verified``, ``total_tweets``, ``total_following``,
        ``total_followers``, ``total_likes``, ``has_avatar``,
        ``has_background``, ``is_protected`` and ``profile_modified``.

    Notes
    -----
    The input frame is not modified: all derived columns are written to a copy.
    """
    # Work on a copy so the caller's frame keeps its raw columns and so that the
    # assignments below never write into a slice of another DataFrame.
    df = df.copy()

    # Drop any timezone information and then label both timestamps as UTC so
    # they can be subtracted from each other.
    for column in ('created_at', 'updated'):
        df[column] = pd.to_datetime(df[column]).dt.tz_localize(None).dt.tz_localize('UTC')

    # Account age: whole days between creation and the `updated` timestamp.
    df['age'] = (df['updated'] - df['created_at']).dt.days

    # Each flag is 1 when the source value is non-null and 0 when it is missing.
    # NOTE(review): this only tests for presence, so an explicit False/0 in the
    # source column would also map to 1 -- confirm the raw encoding is
    # "value or NaN". Also confirm the intended polarity of `has_avatar`, which
    # is derived from `default_profile_image`.
    df['has_location'] = df['location'].notnull().astype(int)
    df['has_avatar'] = df['default_profile_image'].notnull().astype(int)
    df['has_background'] = df['profile_use_background_image'].notnull().astype(int)
    df['is_verified'] = df['verified'].notnull().astype(int)
    df['is_protected'] = df['protected'].notnull().astype(int)
    df['profile_modified'] = df['default_profile'].notnull().astype(int)

    df = df.rename(columns={"screen_name": "username", "statuses_count": "total_tweets", "friends_count": "total_following",
                            "followers_count": "total_followers", "favourites_count": "total_likes"})
    return df[['username', 'age', 'has_location', 'is_verified', 'total_tweets', 'total_following', 'total_followers',
               'total_likes', 'has_avatar', 'has_background', 'is_protected', 'profile_modified']]

# Convert both cohorts (bots first, then genuine accounts) to the shared feature schema.
bot_accounts, clean_accounts = clean_df(bot_accounts), clean_df(clean_accounts)


FileNotFoundError: [Errno 2] No such file or directory: '/content/gdrive/MyDrive/data/social_spambots_1.csv'

In [None]:
# Label the two cohorts: 1 = bot, 0 = genuine account.
bot_accounts['BotOrNot'] = 1
clean_accounts['BotOrNot'] = 0

combined_df = pd.concat([bot_accounts, clean_accounts])

# Shuffle all rows so the positional train/test split downstream mixes both
# classes. The seed is fixed so that re-running the notebook yields the same
# ordering (and therefore the same split and comparable metrics).
new_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# `username` is dropped before modelling; the first 80% of the shuffled rows
# become the training set and the remainder the test set.
features_df = new_df.drop('username', axis=1)
split_at = int(new_df.shape[0] * 0.8)

# Explicit copies: the standardisation below assigns into these frames, which
# must not be views/slices of `features_df`.
training_df = features_df.iloc[:split_at].copy()
test_df = features_df.iloc[split_at:].copy()

# Only these count/age columns are standardised; the remaining columns are
# left unchanged.
columns_to_standardize = ['age', 'total_tweets', 'total_following', 'total_followers', 'total_likes']

# Statistics come from the training set only and are re-used for the test set,
# so no information from the held-out rows leaks into the scaling.
training_df_mean = training_df[columns_to_standardize].mean()
# A column with zero spread would otherwise divide by zero and fill the feature
# with NaN/inf; dividing by 1 leaves such a column at 0 after centring.
training_df_std = training_df[columns_to_standardize].std().replace(0, 1)

training_df[columns_to_standardize] = (training_df[columns_to_standardize] - training_df_mean)/training_df_std
test_df[columns_to_standardize] = (test_df[columns_to_standardize] - training_df_mean)/training_df_std

In [None]:
# Feature matrices: every column except the label (`BotOrNot`) and `is_protected`.
X_train, X_test = (
    frame.drop(columns=['BotOrNot', 'is_protected']).to_numpy()
    for frame in (training_df, test_df)
)

# Targets as column vectors of shape (n_samples, 1).
y_train, y_test = (
    frame['BotOrNot'].to_numpy().reshape(-1, 1)
    for frame in (training_df, test_df)
)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training set with SMOTE. The seed is fixed so the synthetic
# samples (and therefore the trained model) are reproducible between runs.
# Only the training data is resampled; the test set is left as is.
s = SMOTE(random_state=42)
# fit_resample expects a 1-D target, hence the reshape of the column vector.
smote_X, smote_y = s.fit_resample(X_train, y_train.reshape(-1))

In [None]:
# Network input: 10 features per account (the width of the feature matrices
# built from the cleaned frame after dropping the label and `is_protected`).
inp = Input(shape=(10,))

# Two ReLU hidden layers followed by a single sigmoid unit that outputs the
# predicted probability of the positive class (BotOrNot == 1).
another = Dense(units=500, activation='relu')(inp)
another = Dense(units=200, activation='relu')(another)
another = Dense(units=1, activation='sigmoid')(another)

mod = Model(inputs=inp, outputs=another)
mod.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Fit on the SMOTE-balanced training data; per-epoch validation metrics are
# computed on the held-out test split.
training = mod.fit(
    x=smote_X,
    y=smote_y,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=64,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Score the trained model on the held-out test split.
y_pred_prob = mod.predict(X_test)
# Flattened probabilities are used for the ranking metric (ROC AUC) ...
y_scores = y_pred_prob.ravel()
# ... while hard labels use a 0.5 decision threshold.
y_pred = (y_pred_prob >= 0.5).astype(int)

roc_auc = roc_auc_score(y_true=y_test, y_score=y_scores)
f1 = f1_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)

print("Recall Score:", recall)
print("F1 Score:", f1)
print("ROC/AUC Score:", roc_auc)

Recall Score: 0.9714889123548046
F1 Score: 0.983957219251337
ROC/AUC Score: 0.9948444002733088


In [None]:
# Persist the trained model to `model.h5` in the current working directory.
mod.save(filepath='model.h5')