In [263]:
# import stuff
%load_ext autoreload
%autoreload 2
%matplotlib inline

from random import randint
import h5py
from tqdm import tqdm_notebook as tqdm

import numpy as np
import torch
import json
import pandas as pd
import glob
import os

import tweepy
import sys
import jsonpickle
from sklearn.model_selection import train_test_split

# Pytorch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import SGD
from torch.autograd import Variable
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [237]:
# InferSent setup
from models import InferSent
model_version = 1
MODEL_PATH = "../encoder/infersent%s.pkl" % model_version
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))

# Keep it on CPU or put it on GPU
use_cuda = False
model = model.cuda() if use_cuda else model

# If infersent1 -> use GloVe embeddings. If infersent2 -> use InferSent embeddings.
W2V_PATH = '../dataset/GloVe/glove.840B.300d.txt' if model_version == 1 else '../dataset/fastText/crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)

# Load embeddings of K most frequent words
model.build_vocab_k_words(K=100000)

Vocab size : 100000


In [139]:
def GetEmbeddings(sentences, batch_size=128, verbose=False):
    embeddings = model.encode(sentences, bsize=batch_size, tokenize=False, verbose=verbose)
    if verbose: print('nb sentences encoded : {0}'.format(len(embeddings)))
    return embeddings

def GetBatch(df, X, y, batch_size=128):
    df_size = len(df)
    for counter in range(int(len(df)/batch_size)+1):
        yield df[X].iloc[counter*batch_size:min((counter + 1)*batch_size, len(df))], \
        df[y].iloc[counter*batch_size:min((counter + 1)*batch_size, len(df))]

# Prepare Non-Troll Data

In [2]:
non_troll_file = './data/non_trolls.csv'
troll_file = './data/trolls.csv'

In [3]:
non_troll_df = pd.read_csv(non_troll_file)

  interactivity=interactivity, compiler=compiler, result=result)


In [13]:
print('Top 10 languages used by non-trolls:')
non_troll_df.groupby('lang').agg({'id' : 'count'}).sort_values(by='id', ascending=False).rename(columns={'id' : 'Count'}).head(10)

Top 10 languages used by non-trolls:


Unnamed: 0_level_0,Count
lang,Unnamed: 1_level_1
en,1044537
und,39316
es,27108
fr,6544
pt,2868
it,2427
de,2198
in,1332
tr,1015
nl,971


In [14]:
non_troll_en_df = non_troll_df.loc[non_troll_df.lang == 'en']
len(non_troll_en_df)

1044537

In [298]:
num_samples = 20000
non_troll_sample_df = non_troll_en_df.sample(num_samples)
non_troll_sample_df['is_troll'] = 0

In [300]:
non_troll_text_df = non_troll_sample_df[['text', 'is_troll']]

# Prepare Troll Data

In [None]:
troll_df = pd.read_csv('./russian-troll-tweets/trolls_2016_en.csv')

In [301]:
troll_sample_df = troll_df.sample(num_samples)
troll_sample_df['is_troll'] = 1
troll_sample_df.rename(columns={'content' : 'text'}, inplace=True)
troll_text_df = troll_sample_df[['text', 'is_troll']]

# Combine Data and Write to h5py

In [302]:
text_df = pd.concat([non_troll_text_df, troll_text_df], axis=0)

In [303]:
# Write embeddings to h5py file
embed_file = './data/embeddings.h5' 
num_tweets = len(text_df)
batch_size = 128
pbar = tqdm(total=num_tweets)
with h5py.File(embed_file, "a") as f:
    embed_dset = f.create_dataset('embeddings', (num_tweets, 4096))
    response_dset = f.create_dataset('is_troll', (num_tweets,))
    counter = 0
    for batch in GetBatch(text_df, 'text', 'is_troll', batch_size=batch_size):
        embeddings_  = GetEmbeddings(batch[0].values, batch_size=batch_size)
        is_troll_ = batch[1].values
        num_tweets = batch[0].shape[0]
        embed_dset[counter:(counter + num_tweets)] = embeddings_
        response_dset[counter:(counter + num_tweets)] = is_troll_
        counter += num_tweets
        pbar.update(num_tweets)

HBox(children=(IntProgress(value=0, max=40000), HTML(value='')))

In [304]:
# Read h5py file
with h5py.File(embed_file, "r") as f:
    keys = list(f.keys())
    X = np.array(f[keys[0]])
    y = np.array(f[keys[1]])

In [339]:
test_size = 0.1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

In [317]:
def ShuffleArrays(X, y):
    arr = np.arange(X.shape[0])
    np.random.shuffle(arr)
    X = X[arr,:]
    y = y[arr]
    return X, y
def GetTrainBatches(X, y, batch_size=256):
    data_len = X.shape[0]
    for counter in range(int(data_len/batch_size)+1):
        yield X[counter*batch_size:min((counter + 1)*batch_size, data_len), :], \
        y[counter*batch_size:min((counter + 1)*batch_size, data_len)]

In [318]:
class TwitterNet(nn.Module):
    def __init__(self, num_classes):
        super(TwitterNet, self).__init__()
        self.fc1 = nn.Linear(4096,1024)
        self.drop1 = nn.Dropout()
        self.fc2 = nn.Linear(1024,512)
        self.fc3 = nn.Linear(512,128)
        self.fc4 = nn.Linear(128,num_classes)
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.drop1(x)
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return x

In [None]:
X_train = Variable(torch.FloatTensor(X_train))
y_train = Variable(torch.FloatTensor(y_train))
X_test = Variable(torch.FloatTensor(X_test))
y_test = Variable(torch.FloatTensor(y_test))

In [329]:
criterion = nn.CrossEntropyLoss()
model_net = TwitterNet(2).to(device)
optimizer = SGD(model_net.parameters(), lr = 0.1, momentum=0.9)
num_epochs = 15
batch_size = 256

In [330]:
# Check accuracy on untrained network (with randomly initialized weights)
torch.set_grad_enabled(False)
running_corrects = 0
for inputs, labels in GetTrainBatches(X_test, y_test, batch_size):
    inputs = inputs.to(device)
    labels = labels.to(device)
    outputs = model_net(inputs)
    _, preds = torch.max(outputs, 1)
    running_corrects += torch.sum(preds == labels.long().data)
val_acc = running_corrects.double()/X_test.shape[0]
torch.set_grad_enabled(True)
print('Validation Accuracy on untrained net is {:.2%}'.format(val_acc))

Validation Accuracy on untrained net is 49.88%


In [331]:
torch.set_grad_enabled(True)
for epoch in range(num_epochs):
    X_train, y_train = ShuffleArrays(X_train, y_train)
    running_loss = 0.0
    train_corrects = 0
    for inputs, labels in GetTrainBatches(X_train, y_train, batch_size):
        inputs = inputs.to(device)
        labels = labels.to(device)
        # zero the parameter gradients
        optimizer.zero_grad()
        
        # forward pass
        outputs = model_net(inputs)
        loss = criterion(outputs, labels.long())
        _, preds = torch.max(outputs, 1)
        train_corrects += torch.sum(preds == labels.long().data)
        
        # backward pass
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    # Validation set accuracy
    train_acc = train_corrects.double()/X_train.shape[0]
    torch.set_grad_enabled(False)
    running_corrects = 0
    for inputs, labels in GetTrainBatches(X_test, y_test, batch_size):
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model_net(inputs)
        _, preds = torch.max(outputs, 1)
        running_corrects += torch.sum(preds == labels.long().data)
    val_acc = running_corrects.double()/X_test.shape[0]
    torch.set_grad_enabled(True)
    print('Loss after epoch {} is {:.3f}. Train Acc. is {:.2%} and Validation Acc. is {:.2%}'.\
          format(epoch+1, running_loss, train_acc, val_acc))

Loss after epoch 1 is 68.002. Train Acc. is 74.58% and Validation Acc. is 84.15%
Loss after epoch 2 is 45.727. Train Acc. is 85.19% and Validation Acc. is 85.40%
Loss after epoch 3 is 42.936. Train Acc. is 86.12% and Validation Acc. is 86.17%
Loss after epoch 4 is 41.585. Train Acc. is 86.52% and Validation Acc. is 85.47%
Loss after epoch 5 is 41.159. Train Acc. is 86.63% and Validation Acc. is 85.67%
Loss after epoch 6 is 40.199. Train Acc. is 87.08% and Validation Acc. is 85.42%
Loss after epoch 7 is 40.141. Train Acc. is 86.85% and Validation Acc. is 85.88%
Loss after epoch 8 is 40.263. Train Acc. is 86.61% and Validation Acc. is 85.95%
Loss after epoch 9 is 39.558. Train Acc. is 87.10% and Validation Acc. is 84.95%
Loss after epoch 10 is 38.945. Train Acc. is 87.26% and Validation Acc. is 86.08%
Loss after epoch 11 is 38.207. Train Acc. is 87.34% and Validation Acc. is 86.17%
Loss after epoch 12 is 38.101. Train Acc. is 87.42% and Validation Acc. is 86.05%
Loss after epoch 13 is 38

In [332]:
# Check accuracy on trained network
torch.set_grad_enabled(False)
running_corrects = 0
for inputs, labels in GetTrainBatches(X_test, y_test, batch_size):
    inputs = inputs.to(device)
    labels = labels.to(device)
    outputs = model_net(inputs)
    _, preds = torch.max(outputs, 1)
    running_corrects += torch.sum(preds == labels.long().data)
val_acc = running_corrects.double()/X_test.shape[0]
torch.set_grad_enabled(True)
print('Validation Accuracy on untrained net is {:.2%}'.format(val_acc))

Validation Accuracy on untrained net is 85.05%


In [341]:
np.sum(y_test[0:100]

1995.0