In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from tqdm import tqdm
import pandas as pd
from datetime import datetime

In [4]:
def baseline_processing(path):
    """Load a users CSV and derive pairwise ratio features from its counts.

    Parameters
    ----------
    path : str
        Location of a CSV file that provides at least the columns
        ``statuses_count``, ``followers_count``, ``friends_count``,
        ``favourites_count`` and ``listed_count``.

    Returns
    -------
    pandas.DataFrame
        The five count columns followed by ten ``<a>_over_<b>`` columns, one
        per pair of counts where ``a`` precedes ``b`` in the order above.
        Divisions are not guarded, so a zero denominator yields ``inf`` (or
        ``NaN`` for 0/0); callers are expected to clean those values.
    """
    count_cols = [
        'statuses_count',
        'followers_count',
        'friends_count',
        'favourites_count',
        'listed_count',
    ]
    df = pd.read_csv(path)[count_cols].copy()

    # Walk every ordered pair (earlier column / later column) so the derived
    # columns are appended in a fixed, predictable order.
    for pos, numerator in enumerate(count_cols):
        for denominator in count_cols[pos + 1:]:
            ratio_name = '{}_over_{}'.format(
                numerator.replace('_count', ''),
                denominator.replace('_count', ''),
            )
            df[ratio_name] = df[numerator] / df[denominator]
    return df

In [5]:
# Genuine accounts: load the counts plus derived ratio features, then preview
# the first five rows.
real = baseline_processing(
    '../data/cresci-2017/genuine_accounts/users.csv'
)
real.head(n=5)

Unnamed: 0,statuses_count,followers_count,friends_count,favourites_count,listed_count,statuses_over_followers,statuses_over_friends,statuses_over_favourites,statuses_over_listed,followers_over_friends,followers_over_favourites,followers_over_listed,friends_over_favourites,friends_over_listed,favourites_over_listed
0,2177,208,332,265,1,10.466346,6.557229,8.215094,2177.0,0.626506,0.784906,208.0,1.25283,332.0,265.0
1,2660,330,485,3972,5,8.060606,5.484536,0.669688,532.0,0.680412,0.083082,66.0,0.122105,97.0,794.4
2,1254,166,177,1185,0,7.554217,7.084746,1.058228,inf,0.937853,0.140084,inf,0.149367,inf,inf
3,202968,2248,981,60304,101,90.288256,206.899083,3.365747,2009.584158,2.291539,0.037278,22.257426,0.016268,9.712871,597.069307
4,82,21,79,5,0,3.904762,1.037975,16.4,inf,0.265823,4.2,inf,15.8,inf,inf


In [6]:
# Each bot cohort goes through the same feature pipeline as the genuine
# accounts. The files are loaded in the order listed and unpacked into one
# variable per cohort.
ss1, ss2, ss3 = map(baseline_processing, (
    '../data/cresci-2017/social_spambots_1/users.csv',
    '../data/cresci-2017/social_spambots_2/users.csv',
    '../data/cresci-2017/social_spambots_3/users.csv',
))

ts1, ts2, ts3, ts4 = map(baseline_processing, (
    '../data/cresci-2017/traditional_spambots_1/users.csv',
    '../data/cresci-2017/traditional_spambots_2/users.csv',
    '../data/cresci-2017/traditional_spambots_3/users.csv',
    '../data/cresci-2017/traditional_spambots_4/users.csv',
))

ff = baseline_processing('../data/cresci-2017/fake_followers/users.csv')

In [7]:
# Stack all bot cohorts row-wise into a single frame (row labels from the
# individual files are kept as-is), then preview the first five rows.
bot = pd.concat([
    ss1, ss2, ss3,
    ts1, ts2, ts3, ts4,
    ff,
])
bot.head(n=5)

Unnamed: 0,statuses_count,followers_count,friends_count,favourites_count,listed_count,statuses_over_followers,statuses_over_friends,statuses_over_favourites,statuses_over_listed,followers_over_friends,followers_over_favourites,followers_over_listed,friends_over_favourites,friends_over_listed,favourites_over_listed
0,1299,22,40,1,0,59.045455,32.475,1299.0,inf,0.55,22.0,inf,40.0,inf,inf
1,18665,12561,3442,16358,110,1.485949,5.422719,1.141032,169.681818,3.649332,0.767881,114.190909,0.210417,31.290909,148.709091
2,22987,600,755,14,6,38.311667,30.446358,1641.928571,3831.166667,0.794702,42.857143,100.0,53.928571,125.833333,2.333333
3,7975,398,350,11,2,20.037688,22.785714,725.0,3987.5,1.137143,36.181818,199.0,31.818182,175.0,5.5
4,20218,413,405,162,8,48.953995,49.920988,124.802469,2527.25,1.019753,2.549383,51.625,2.5,50.625,20.25


In [16]:
# Ratios with a zero denominator come out as +/-inf (or NaN for 0/0). Turn the
# infinities into NaN first, then zero-fill every NaN in both frames.
real = real.replace([np.inf, -np.inf], np.nan).fillna(0)
bot = bot.replace([np.inf, -np.inf], np.nan).fillna(0)
print(real.shape, bot.shape)

(3474, 15) (10894, 15)


In [17]:
# Feature matrix as a plain NumPy array: genuine rows first, bot rows after.
data = pd.concat([real, bot]).to_numpy()
print(data.shape)

(14368, 15)


In [18]:
# Labels aligned with the row order of `real` followed by `bot`:
# 0 for every genuine row, then 1 for every bot row.
target = np.repeat([0, 1], [len(real), len(bot)])
target.shape

(14368,)

In [19]:
# Spot-check that features and labels line up. The 3474 genuine rows are
# stacked first, so index 3500 falls in the bot portion (label 1) and
# index 3200 falls in the genuine portion (label 0).
print(data[3500], target[3500])
print(data[3200], target[3200])

[9.05300000e+03 2.43000000e+02 9.65000000e+02 2.72000000e+02
 0.00000000e+00 3.72551440e+01 9.38134715e+00 3.32830882e+01
 0.00000000e+00 2.51813472e-01 8.93382353e-01 0.00000000e+00
 3.54779412e+00 0.00000000e+00 0.00000000e+00] 1
[6.82000000e+02 1.75000000e+02 1.54000000e+02 4.71000000e+02
 0.00000000e+00 3.89714286e+00 4.42857143e+00 1.44798301e+00
 0.00000000e+00 1.13636364e+00 3.71549894e-01 0.00000000e+00
 3.26963907e-01 0.00000000e+00 0.00000000e+00] 0


In [23]:
# Hold out 30% of the rows for evaluation. The split is seeded (like the
# forest below) so the reported score is reproducible across kernel restarts
# instead of changing on every run.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    data, target, test_size=0.3, random_state=0
)

# Baseline model: a 100-tree random forest trained on the raw counts and
# the derived ratio features.
rfc = RandomForestClassifier(n_estimators=100, random_state=0)
rfc = rfc.fit(Xtrain, Ytrain)

# Mean accuracy on the held-out rows.
score = rfc.score(Xtest, Ytest)

print(score)

0.9849222918116446


In [27]:
# Predicted class probabilities for the first 20 held-out rows, printed next
# to their true labels for a quick visual comparison.
test_result = rfc.predict_proba(Xtest[:20])
print(test_result)
print(Ytest[:20])

[[0.   1.  ]
 [1.   0.  ]
 [0.   1.  ]
 [0.   1.  ]
 [0.   1.  ]
 [0.   1.  ]
 [0.02 0.98]
 [1.   0.  ]
 [0.   1.  ]
 [1.   0.  ]
 [0.   1.  ]
 [0.   1.  ]
 [0.   1.  ]
 [0.   1.  ]
 [0.83 0.17]
 [0.   1.  ]
 [0.06 0.94]
 [0.   1.  ]
 [0.   1.  ]
 [0.   1.  ]]
[1 0 1 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1]


In [29]:
import pickle

# Persist the two feature frames for reuse elsewhere. Each file is opened in
# a `with` block so it is flushed and closed even if pickling raises; the
# original left both handles open.
with open('./real-pickled', 'wb') as freal:
    pickle.dump(real, freal, protocol=1)
with open('./bot-pickled', 'wb') as fbot:
    pickle.dump(bot, fbot, protocol=1)