In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [6]:
import pandas as pd
realDF = pd.read_csv('../data-processing/cresci-2017/genuine_accounts_processed.csv')
ss1DF = pd.read_csv('../data-processing/cresci-2017/social_spambots_1_processed.csv')
ss2DF = pd.read_csv('../data-processing/cresci-2017/social_spambots_2_processed.csv')
ss3DF = pd.read_csv('../data-processing/cresci-2017/social_spambots_3_processed.csv')
ffDF = pd.read_csv('../data-processing/cresci-2017/fake_followers_processed.csv')
ts1DF = pd.read_csv('../data-processing/cresci-2017/traditional_spambots_1_processed.csv')
coaDF = pd.read_csv('../data-processing/college-official-accounts/coa_processed.csv')
v2019DF = pd.read_csv('../data-processing/verified-2019/v2019_processed.csv')


In [None]:
v2019DF.info()

In [7]:
# removing some entries from data
del coaDF['url_youtube'], v2019DF['url_youtube']
del coaDF['favorite'], realDF['favorite'], ss1DF['favorite'], ss2DF['favorite'], ss3DF['favorite'], ffDF['favorite'], ts1DF['favorite'], v2019DF['favorite']
del coaDF['Unnamed: 0'], realDF['Unnamed: 0'], ss1DF['Unnamed: 0'], ss2DF['Unnamed: 0'], ss3DF['Unnamed: 0'], ffDF['Unnamed: 0'], ts1DF['Unnamed: 0'], v2019DF['Unnamed: 0']
del coaDF['favorite_over_friend'], realDF['favorite_over_friend'], ss1DF['favorite_over_friend'], ss2DF['favorite_over_friend'], ss3DF['favorite_over_friend'], ffDF['favorite_over_friend'], ts1DF['favorite_over_friend'], v2019DF['favorite_over_friend']

In [8]:
def data_split (df, split=0.7):
    train_num = int(len(df.index)*split)
    return df.iloc[:train_num,:], df.iloc[train_num+1:,:]

realTrain, realTest = data_split(realDF)
ss1Train, ss1Test = data_split(ss1DF)
ss2Train, ss2Test = data_split(ss2DF)
ss3Train, ss3Test = data_split(ss3DF)
ts1Train, ts1Test = data_split(ts1DF)
ffTrain, ffTest = data_split(ffDF)
coaTrain, coaTest = data_split(coaDF)
v2019Train, v2019Test = data_split(v2019DF)

In [9]:
print(coaTrain.shape, v2019Train.shape)

(2499, 48) (1367, 48)


In [13]:
def trainRF (reals, bots, n_estimators=100):

    data = pd.concat(reals+bots).fillna(0).to_numpy()
    target = np.array([0] * sum(len(real.index) for real in reals) + [1] * sum(len(bot.index) for bot in bots))
    rfc = RandomForestClassifier(n_estimators=n_estimators)
    rfc = rfc.fit(data, target)
    return rfc

In [14]:
def testRF(reals, bots, RF):
    data = pd.concat(reals+bots).fillna(0).to_numpy()
    target = np.array([0] * sum(len(real.index) for real in reals) + [1] * sum(len(bot.index) for bot in bots))
    return RF.score(data, target)

In [20]:
def testRF_ESC(reals, bots, RFs):
    data = pd.concat(reals+bots).fillna(0).to_numpy()
    target = np.array([0] * sum(len(real.index) for real in reals) + [1] * sum(len(bot.index) for bot in bots))
    totalAccNum = 0
    for i in range(data.shape[0]):
        catScore = [float(RFs[0].predict_proba([data[i]])[0][0])] + [float(RF.predict_proba([data[i]])[0][1]) for RF in RFs[1:]]
        if target[i] == 0 and catScore[0] == max(catScore) or target[i] == 1 and catScore[0] != max(catScore):
            totalAccNum += 1
        else:    
            print("Pred", i, target[i], catScore)
    return float(totalAccNum)/data.shape[0]

In [16]:
real_detect_RF = []
real_detect_RF.append(trainRF([realTrain], [ss1Train, ss2Train, ss3Train, ffTrain, ts1Train, coaTrain, v2019Train]))
real_detect_RF.append(trainRF([realTrain], [ss1Train, ss2Train, ss3Train]))
real_detect_RF.append(trainRF([realTrain], [ts1Train]))
real_detect_RF.append(trainRF([realTrain], [coaTrain]))
real_detect_RF.append(trainRF([realTrain], [ffTrain]))
real_detect_RF.append(trainRF([realTrain], [v2019Train]))

In [19]:
print(testRF([realTest], [ss1Test, ss2Test, ss3Test, ffTest, ts1Test, coaTest, v2019Test], real_detect_RF[0]))

0.9928767123287672


In [21]:
correctRate = testRF_ESC([realTest][0:10], [ss1Test, ss2Test, ss3Test, ffTest, ts1Test, coaTest, v2019Test][0:10], real_detect_RF)

Pred 0 0 [0.01, 0.0, 0.0, 0.0, 0.0, 0.0]
Pred 1 0 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Pred 2 0 [0.02, 0.0, 0.01, 0.0, 0.0, 0.0]
Pred 3 0 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Pred 4 0 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Pred 5 0 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Pred 6 0 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Pred 7 0 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Pred 8 0 [0.04, 0.01, 0.01, 0.0, 0.0, 0.0]
Pred 9 0 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Pred 10 0 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Pred 11 0 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Pred 12 0 [0.05, 0.0, 0.0, 0.0, 0.0, 0.0]
Pred 13 0 [0.1, 0.03, 0.0, 0.0, 0.0, 0.0]
Pred 14 0 [0.02, 0.0, 0.0, 0.0, 0.0, 0.0]
Pred 15 0 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Pred 16 0 [0.02, 0.02, 0.0, 0.0, 0.0, 0.01]
Pred 17 0 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Pred 18 0 [0.17, 0.09, 0.01, 0.1, 0.0, 0.07]
Pred 19 0 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Pred 20 0 [0.01, 0.0, 0.0, 0.01, 0.0, 0.0]
Pred 21 0 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Pred 22 0 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Pred 23 0 [0.01, 0.0, 0.03, 0.0, 0.03, 0.0]
Pred 

KeyboardInterrupt: 

In [None]:
v2019DF.to_pickle('v2019-pickled')

In [None]:
print(correctRate, correctConf)

0.9924278277330809 0.006578324656885935


In [10]:
RF = []
RF.append(trainRF([v2019Train], [ss1Train, ss2Train, ss3Train, ffTrain, ts1Train, coaTrain]))
RF.append(trainRF([v2019Train], [ss1Train, ss2Train, ss3Train]))
RF.append(trainRF([v2019Train], [ts1Train]))
RF.append(trainRF([v2019Train], [coaTrain]))
RF.append(trainRF([v2019Train], [ffTrain]))

  data = pd.concat(reals+bots).fillna(0).drop('Unnamed: 0', 1).to_numpy()
  data = pd.concat(reals+bots).fillna(0).drop('Unnamed: 0', 1).to_numpy()
  data = pd.concat(reals+bots).fillna(0).drop('Unnamed: 0', 1).to_numpy()
  data = pd.concat(reals+bots).fillna(0).drop('Unnamed: 0', 1).to_numpy()
  data = pd.concat(reals+bots).fillna(0).drop('Unnamed: 0', 1).to_numpy()


In [11]:
print(testRF([v2019Test], [ss1Test, ss2Test, ss3Test, ffTest, ts1Test, coaTrain], RF[0]))

0.9902746971506569


  data = pd.concat(reals+bots).fillna(0).drop('Unnamed: 0', 1).to_numpy()


In [13]:
singleRF = trainRF([coaTrain], [v2019Train])
print(testRF([coaTest], [v2019Train], singleRF))

  data = pd.concat(reals+bots).fillna(0).drop('Unnamed: 0', 1).to_numpy()


0.9696472518457753


  data = pd.concat(reals+bots).fillna(0).drop('Unnamed: 0', 1).to_numpy()


In [16]:
real_versus_inst_veri_RF = trainRF([realTrain], [v2019Train, coaTrain])
print(testRF([realTest], [v2019Test, coaTest], real_versus_inst_veri_RF))

  data = pd.concat(reals+bots).fillna(0).drop('Unnamed: 0', 1).to_numpy()


1.0


  data = pd.concat(reals+bots).fillna(0).drop('Unnamed: 0', 1).to_numpy()
