In [1]:
import pandas as pd
import numpy as np
import math

from keras.layers import Embedding, Dense, LSTM, Dense, Input, concatenate
from keras.models import Model
from keras.utils import plot_model
from sklearn.metrics import precision_score, roc_auc_score, recall_score, f1_score
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
import pandas as pd

bot_accounts = pd.concat([pd.read_csv('D:/DNNforBotDetection/DNNforBotDetection/input/cresci-2017.csv/social_spambots_1/users.csv'),
                         pd.read_csv('D:/DNNforBotDetection/DNNforBotDetection/input/cresci-2017.csv/social_spambots_2/users.csv'),
                         pd.read_csv('D:/DNNforBotDetection/DNNforBotDetection/input/cresci-2017.csv/social_spambots_3/users.csv')]).reset_index(drop=True)
clean_accounts = pd.read_csv('D:/DNNforBotDetection/DNNforBotDetection/input/cresci-2017.csv/genuine_accounts/users.csv')

requiredColumns = ['screen_name', 'created_at', 'updated', 'location', 'statuses_count', 'friends_count',
                   'followers_count', 'favourites_count', 'default_profile_image', 'profile_use_background_image',
                   'protected', 'default_profile']
bot_accounts = bot_accounts[requiredColumns]
clean_accounts = clean_accounts[requiredColumns]

def clean_df(df):
    df['created_at'] = pd.to_datetime(df['created_at']).dt.tz_localize(None).dt.tz_localize('UTC')  # Convert to UTC timezone
    df['updated'] = pd.to_datetime(df['updated']).dt.tz_localize(None).dt.tz_localize('UTC')  # Convert to UTC timezone
    df['age'] = (df['updated'] - df['created_at']).dt.days  # Use dt.days to get the number of days
    df['has_location'] = df['location'].notnull().astype(int)  # Convert boolean to integer
    df['has_avatar'] = df['default_profile_image'].notnull().astype(int)
    df['has_background'] = df['profile_use_background_image'].notnull().astype(int)
    df['is_protected'] = df['protected'].notnull().astype(int)
    df['profile_modified'] = df['default_profile'].notnull().astype(int)
    df = df.rename(columns={"screen_name": "username", "statuses_count": "total_tweets", "friends_count": "total_following",
                            "followers_count": "total_followers", "favourites_count": "total_likes"})
    return df[['username', 'age', 'has_location', 'total_tweets', 'total_following', 'total_followers',
               'total_likes', 'has_avatar', 'has_background', 'is_protected', 'profile_modified']]

bot_accounts = clean_df(bot_accounts)
clean_accounts = clean_df(clean_accounts)


In [3]:
bot_accounts['BotOrNot'] = 1
clean_accounts['BotOrNot'] = 0

combined_df = pd.concat([bot_accounts, clean_accounts])

new_df = combined_df.sample(frac=1).reset_index(drop=True)

In [4]:
bot_accounts

Unnamed: 0,username,age,has_location,total_tweets,total_following,total_followers,total_likes,has_avatar,has_background,is_protected,profile_modified,BotOrNot
0,davideb66,2555,0,1299,40,22,1,1,1,0,1,1
1,ElisaDospina,2521,1,18665,3442,12561,16358,0,1,0,0,1
2,Vladimir65,2497,1,22987,755,600,14,0,1,0,0,1
3,RafielaMorales,2435,1,7975,350,398,11,0,1,0,0,1
4,FabrizioC_c,2413,1,20218,405,413,162,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4907,AldridgeBizOpp,728,1,106,23,4,0,0,1,0,0,1
4908,DLBusinessOpp,728,1,173,20,2,0,0,1,0,0,1
4909,LFCareerOptions,709,1,137,124,29,0,0,1,0,0,1
4910,RickChou_TD,709,1,170,353,115,38,0,1,0,0,1


In [5]:
training_df = new_df.drop('username', axis=1)[:int(combined_df.shape[0] * 0.8)]
test_df = new_df.drop('username', axis=1)[int(combined_df.shape[0] * 0.8):]

columns_to_standardize = ['age', 'total_tweets', 'total_following', 'total_followers', 'total_likes']

training_df_mean = training_df[columns_to_standardize].mean()
training_df_std = training_df[columns_to_standardize].std()

training_df[columns_to_standardize] = (training_df[columns_to_standardize] - training_df_mean)/training_df_std
test_df[columns_to_standardize] = (test_df[columns_to_standardize] - training_df_mean)/training_df_std

# training_df_mean = training_df.mean()
# training_df_std = training_df.std()

# training_df = (training_df - training_df_mean)/training_df_std
# test_df = (test_df - training_df_mean)/training_df_std

# max_vals = training_df.max()

# training_df = training_df/max_vals
# test_df = test_df/max_vals

In [6]:
test_df

Unnamed: 0,age,has_location,total_tweets,total_following,total_followers,total_likes,has_avatar,has_background,is_protected,profile_modified,BotOrNot
6708,-0.768271,0,-0.367037,-0.277217,-0.072157,-0.251898,0,1,0,0,1
6709,2.194420,1,0.052771,0.271688,-0.041939,-0.224139,0,1,0,0,0
6710,-0.764959,0,-0.367410,-0.277217,-0.071720,-0.251898,0,1,0,0,1
6711,-0.750054,0,-0.365828,-0.276725,-0.072157,-0.251898,0,1,0,0,1
6712,-0.755022,0,-0.366805,-0.277709,-0.071793,-0.251898,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
8381,-0.758335,0,-0.366572,-0.280168,-0.071866,-0.251898,0,1,0,0,1
8382,-0.753366,0,-0.366572,-0.277217,-0.072084,-0.251898,0,1,0,0,1
8383,-0.523174,1,-0.356477,-0.258527,-0.070337,-0.203351,0,1,0,1,0
8384,-0.771583,0,-0.367037,-0.280168,-0.071938,-0.251898,0,1,0,0,1


In [7]:
X_train = training_df.drop(['BotOrNot', 'is_protected'], axis=1).values
y_train = training_df['BotOrNot'].values.reshape(-1,1)

X_test = test_df.drop(['BotOrNot', 'is_protected'], axis=1).values
y_test = test_df['BotOrNot'].values.reshape(-1,1)

In [8]:
X_test[0]

array([-0.76827094,  0.        , -0.36703735, -0.27721707, -0.07215691,
       -0.25189841,  0.        ,  1.        ,  0.        ])

In [9]:
from imblearn.over_sampling import SMOTE

s = SMOTE()
smote_X, smote_y = s.fit_resample(X_train, y_train.reshape(-1))

# e = EditedNearestNeighbours()
# r_X, r_y = e.fit_resample(smote_X, smote_y)

In [10]:
inp = Input(shape=[9])

another = Dense(500, activation='relu')(inp)
another = Dense(200, activation='relu')(another)
another = Dense(1, activation='sigmoid')(another)

mod = Model(inp, another)
mod.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [13]:
training = mod.fit(x=smote_X, y=smote_y, batch_size=64, epochs=100, validation_data=(X_test, y_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [57]:
X_test[100]

array([ 0.62613125,  1.        , -0.14522387, -0.17687954, -0.06392884,
       -0.17197617,  0.        ,  1.        ,  0.        ])

In [14]:
# Evaluating the model
y_pred_prob = mod.predict(X_test)
y_pred = (y_pred_prob >= 0.5).astype(int)
y_scores = y_pred_prob.ravel()

recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_scores)

print("Recall Score:", recall)
print("F1 Score:", f1)
print("ROC/AUC Score:", roc_auc)

Recall Score: 0.9756345177664975
F1 Score: 0.9821154828819622
ROC/AUC Score: 0.9962218266786795


In [None]:
mod.save('D:/DNNforBotDetection/DNNforBotDetection/models/model.h5')

In [58]:
def clean(df):
    
    df = df.drop('username', axis=1)
    
    df[columns_to_standardize] = (df[columns_to_standardize] - training_df_mean)/training_df_std
    df = df.drop('is_protected', axis=1).values
    return df

In [59]:
first = bot_accounts.head(1)
first
first[columns_to_standardize]

Unnamed: 0,age,total_tweets,total_following,total_followers,total_likes
0,2555,1299,40,22,1


In [60]:
clean(first)

array([[ 2.23085301,  0.        , -0.30921139, -0.27869262, -0.07084624,
        -0.25176929,  1.        ,  1.        ,  1.        ,  1.        ]])