In [2]:
import torch
import pandas as pd

from torch import nn, optim
from torchtext import data
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch.autograd import Variable

In [3]:
# Locations of the four JSON account dumps, addressed by position below:
#   0/1 -> automated vs. non-automated accounts
#   2/3 -> fake vs. real accounts
datapaths = [
    'data/automated-v1.0/automatedAccountData.json',
    'data/automated-v1.0/nonautomatedAccountData.json',
    'data/fake-v1.0/fakeAccountData.json',
    'data/fake-v1.0/realAccountData.json',
]

In [4]:
# Load the fake-account and real-account dumps and stack them into one frame.
# NOTE: this rebinds the name `data`, shadowing `from torchtext import data`
# from the import cell; nothing in the cells shown here uses the torchtext module.
data_fake = pd.read_json(datapaths[2])
data_ = pd.read_json(datapaths[3])  # real accounts; name kept for any later cell that reads it

# DataFrame.append was removed in pandas 2.0, so use pd.concat instead.
# ignore_index=True gives every row a unique label instead of repeating
# each file's own 0..n-1 index.
data = pd.concat([data_fake, data_], ignore_index=True)

In [5]:
# Preview the first ten rows of the combined frame.
data.head(10)

Unnamed: 0,userFollowerCount,userFollowingCount,userBiographyLength,userMediaCount,userHasProfilPic,userIsPrivate,usernameDigitCount,usernameLength,isFake
0,25,1937,0,0,1,1,0,10,1
1,324,4122,0,0,1,0,4,15,1
2,15,399,0,0,0,0,3,12,1
3,14,107,0,1,1,0,1,10,1
4,264,4651,0,0,1,0,0,14,1
5,33,1470,0,2,1,1,4,13,1
6,420,4883,30,8,1,0,0,12,1
7,442,6662,0,396,1,0,0,11,1
8,816,7497,0,85,1,0,3,15,1
9,150,6631,1,0,1,1,3,8,1


In [6]:
# Show the column labels of the combined frame; 'isFake' is the column the
# next cell splits off as the prediction target.
data.columns

Index(['userFollowerCount', 'userFollowingCount', 'userBiographyLength',
       'userMediaCount', 'userHasProfilPic', 'userIsPrivate',
       'usernameDigitCount', 'usernameLength', 'isFake'],
      dtype='object')

In [7]:
# Separate the label column ('isFake') from the remaining feature columns.
features = data.drop('isFake', axis=1).values
target = data['isFake'].values

# Hold out 30% of the rows for evaluation. A fixed random_state makes the split
# (and the accuracy reported at the end) reproducible across runs, and
# stratify keeps the class ratio the same in both partitions.
train_X, test_X, train_y, test_y = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
    stratify=target,
)

# Build tensors with explicit dtypes: float32 inputs for nn.Linear and int64
# class indices for nn.CrossEntropyLoss. `Variable` has been a no-op wrapper
# since PyTorch 0.4, and torch.Tensor(...).long() would route the integer
# labels through float32 first, so both are avoided.
train_X = torch.tensor(train_X, dtype=torch.float32)
test_X = torch.tensor(test_X, dtype=torch.float32)
train_y = torch.tensor(train_y, dtype=torch.long)
test_y = torch.tensor(test_y, dtype=torch.long)

In [8]:
# Fully connected classifier: 8 input features -> 2 class logits, with a ReLU
# after every hidden layer. Widths are listed once and the layers are built
# in the same order as they appear in the network.
layer_widths = [8, 128, 512, 1024, 256]
n_classes = 2

stack = []
for fan_in, fan_out in zip(layer_widths[:-1], layer_widths[1:]):
    stack.append(nn.Linear(fan_in, fan_out))
    stack.append(nn.ReLU())
stack.append(nn.Linear(layer_widths[-1], n_classes))

model = nn.Sequential(*stack)

# Cross-entropy over the two raw logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

In [9]:
# Full-batch training: every iteration runs the whole training set through the
# network once and takes a single Adam step.
epochs = 1000
loss_arr = []  # one Python float per iteration
for i in range(epochs):
    optimizer.zero_grad()

    # Call the module itself rather than .forward() so registered hooks run.
    y_hat = model(train_X)
    loss = criterion(y_hat, train_y)

    loss.backward()
    optimizer.step()

    # Store a plain float: appending the tensor itself would keep each
    # iteration's autograd graph alive for the lifetime of the list.
    loss_value = loss.item()
    loss_arr.append(loss_value)

    if i % 100 == 0:
        print(f'Epoch: {i} Loss: {loss_value}')

Epoch: 0 Loss: 1.6651965379714966
Epoch: 100 Loss: 0.07695873826742172
Epoch: 200 Loss: 0.06260649859905243
Epoch: 300 Loss: 0.0482085645198822
Epoch: 400 Loss: 0.07293613255023956
Epoch: 500 Loss: 0.03799403831362724
Epoch: 600 Loss: 0.0314452089369297
Epoch: 700 Loss: 0.02800256945192814
Epoch: 800 Loss: 0.07478439807891846
Epoch: 900 Loss: 0.033634211868047714


In [10]:
# Predict a class for every held-out row in one batched forward pass instead
# of looping over rows in Python. Gradients are not needed for inference.
with torch.no_grad():
    y_hat = model(test_X)
    # Index of the larger logit per row, as a plain list of Python ints.
    preds = y_hat.argmax(dim=1).tolist()

In [11]:
# Put true and predicted labels side by side. The label tensor is converted to
# a NumPy array explicitly so the 'Y' column holds plain integers rather than
# relying on pandas' implicit handling of torch tensors.
y_true = test_y.detach().cpu().numpy()
df = pd.DataFrame({'Y': y_true, 'YHat': preds})

# 1 where the prediction matches the label, 0 otherwise (column-wise compare).
df['Correct'] = (df['Y'] == df['YHat']).astype(int)

In [12]:
# Accuracy = rows flagged correct / all evaluated rows.
n_correct = int((df["Correct"] == 1).sum())
n_total = len(df)
accu = n_correct / n_total
print("accuracy : {:.2f} %".format(accu*100))

accuracy : 96.38 %
