In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
import pandas as pd
realDF = pd.read_csv('../data-processing/genuine_accounts_processed.csv')
ss1DF = pd.read_csv('../data-processing/social_spambots_1_processed.csv')
ss2DF = pd.read_csv('../data-processing/social_spambots_2_processed.csv')
ss3DF = pd.read_csv('../data-processing/social_spambots_3_processed.csv')
ffDF = pd.read_csv('../data-processing/fake_followers_processed.csv')
ts1DF = pd.read_csv('../data-processing/traditional_spambots_1_processed.csv')


In [3]:
def data_split (df, split=0.7):
    train_num = int(len(df.index)*split)
    return df.iloc[:train_num,:], df.iloc[train_num+1:,:]

realTrain, realTest = data_split(realDF)
ss1Train, ss1Test = data_split(ss1DF)
ss2Train, ss2Test = data_split(ss2DF)
ss3Train, ss3Test = data_split(ss3DF)
ts1Train, ts1Test = data_split(ts1DF)
ffTrain, ffTest = data_split(ffDF)

In [4]:
def trainRF (reals, bots, n_estimators=100):

    data = pd.concat(reals+bots).fillna(0).drop('Unnamed: 0', 1).to_numpy()
    target = np.array([0] * sum(len(real.index) for real in reals) + [1] * sum(len(bot.index) for bot in bots))
    rfc = RandomForestClassifier(n_estimators=n_estimators)
    rfc = rfc.fit(data, target)
    return rfc

In [5]:
def testRF(reals, bots, RF):
    data = pd.concat(reals+bots).fillna(0).drop('Unnamed: 0', 1).to_numpy()
    target = np.array([0] * sum(len(real.index) for real in reals) + [1] * sum(len(bot.index) for bot in bots))
    return RF.score(data, target)

In [16]:
def testRF_ESC(reals, bots, RFs):
    data = pd.concat(reals+bots).fillna(0).drop('Unnamed: 0', 1).to_numpy()
    target = np.array([0] * sum(len(real.index) for real in reals) + [1] * sum(len(bot.index) for bot in bots))
    totalAccConf = 0.0
    totalAccNum = 0
    for i in range(data.shape[0]):
        botMax = max(float(RF.predict_proba([data[i]])[0][1]) for RF in RFs)
        genuineMax = min(float(RF.predict_proba([data[i]])[0][1]) for RF in RFs)
        if botMax >= 0.5:
            totalAccConf += abs(botMax - float(target[i]))
            if abs(botMax - float(target[i])) >= 0.5:
                print("Error Pred 1, ", i, data[i], target[i], [RF.predict_proba([data[i]])[0] for RF in RFs], botMax)
            else:
                totalAccNum += 1
        elif genuineMax <= 0.5:
            totalAccConf += abs(float(target[i]) - genuineMax)
            if abs(float(target[i]) - genuineMax) >= 0.5:
                print("Error Pred 2, ", i, data[i], target[i], [RF.predict_proba([data[i]])[0] for RF in RFs], genuineMax)
            else:
                totalAccNum += 1
    return float(totalAccNum)/data.shape[0], totalAccConf/data.shape[0]

In [7]:
RF = []
RF.append(trainRF([realTrain], [ss1Train, ss2Train, ss3Train, ffTrain, ts1Train]))
RF.append(trainRF([realTrain], [ss1Train, ss2Train, ss3Train]))
RF.append(trainRF([realTrain], [ts1Train]))
RF.append(trainRF([realTrain], [ffTrain]))

  data = pd.concat(reals+bots).fillna(0).drop('Unnamed: 0', 1).to_numpy()
  data = pd.concat(reals+bots).fillna(0).drop('Unnamed: 0', 1).to_numpy()
  data = pd.concat(reals+bots).fillna(0).drop('Unnamed: 0', 1).to_numpy()
  data = pd.concat(reals+bots).fillna(0).drop('Unnamed: 0', 1).to_numpy()


In [8]:
print(testRF([realTest], [ss1Test, ss2Test, ss3Test, ffTest, ts1Test], RF[0]))

0.9897879025923016


  data = pd.concat(reals+bots).fillna(0).drop('Unnamed: 0', 1).to_numpy()


In [18]:
correctRate, correctConf = testRF_ESC([realTest], [ss1Test, ss2Test, ss3Test, ts1Test, ffTest], RF)

  data = pd.concat(reals+bots).fillna(0).drop('Unnamed: 0', 1).to_numpy()


Error Pred 1,  49 [  4.    4.   40.    0.    0.    0.1 -36.    0.    0.1   0.    0.    0.
   0.    0.    1.    0.    0.    0.    0.    0.    0.    0.    0.    0.
   0.    0.    0.    2.    1.    0.    0.4   0.    0.    0.    0.    0.
   0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.
   0.    0. ] 0 [array([0.34, 0.66]), array([0.96, 0.04]), array([0.78, 0.22]), array([0.57, 0.43])] 0.66
Error Pred 1,  114 [ 6.70000000e+01  1.80000000e+01  2.16000000e+02  2.00000000e+00
  0.00000000e+00  8.33333333e-02 -1.98000000e+02  9.25925926e-03
  3.10185185e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  1.00000000e+00  1.00000000e+00  1.00000000e+00
  0.00000000e+00  0.00000000e+00  1.00000000e+00  6.30000000e+01
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  2.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  6.66666667e-01  2.40597015e+01
  1.94832942e+02  0.00000000e+00  0.00000000e+00 

In [19]:
print(correctRate, correctConf)

0.9895260539408222 0.0207017543859649
