## Preliminaries

In [25]:
import tensorflow as tf

# Ask TensorFlow for the name of the GPU it can see and stop the notebook
# early when the expected first GPU is not reported.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


'GeForce GTX 1080 Ti'

In [1]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
UsageError: Line magic function `%` not found.


In [None]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Show which device is used. get_device_name(0) raises when there is no CUDA
# device, so only query it when CUDA is actually available.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU (no CUDA device available)"

## Data Import

In [None]:
# Show full comment text when displaying frames.
# NOTE(review): -1 and error_bad_lines are the spellings of the older pandas
# this notebook targets; newer pandas uses None and on_bad_lines="skip".
pd.set_option('display.max_colwidth', -1)

# Non-troll comment data (labelled 0 later on); malformed CSV rows are skipped
Non_Russian = pd.read_csv("data/csv-zusammenfuehren.de_3mjg6fs7.csv", error_bad_lines=False)

# Troll comment data (labelled 1 later on)
Russian = pd.read_csv("data/comments.csv")

# Keep only the comment text and drop rows where it is missing. Chaining
# returns fresh frames instead of mutating a column slice in place, which
# avoids pandas' SettingWithCopyWarning.
Non_Russian = Non_Russian[["body"]].dropna()
Russian = Russian[["body"]].dropna()

# Balance the classes by down-sampling the non-troll comments to the number
# of troll comments. The fixed seed makes the sample reproducible after a
# kernel restart (same seed as the train/validation split).
num_samples = Russian.shape[0]
Non_Russian = Non_Russian.sample(num_samples, random_state=2018)

In [38]:
# Preview the text ("body" column) of the first ten rows of the Russian set.
Russian[['body']].head(10)

Unnamed: 0,body
0,"A hard look at training and tactics"" = They will be sent more $$$ for ""training"""
1,They deserve all of the hate
2,"I guess that's what they mean when say ""I don't see color"""
3,"It's never too late for them, It's never too cruel or brutal for them. He will still probably get away with this"
4,https://petitions.whitehouse.gov//petition/petition-create-nationwide-elected-and-publicly-reviewed-police-oversight-agency
5,"By submitting to an independent, non-profit community, the authors volunteered on a Good Samaritan basis to spread wokeness"
6,"Sorry, but if you bothered to read the article below the video you could've gotten the point"
7,That is not a personal blog
8,"Only 125? Why not 10?\r\nIslamic state has captured half Syria, they are close to Baghdad and keep advancing while all western leaders do is simply taking some minor measures. Smh"
9,What? Any other wild guesses about my private life?


In [40]:
# Preview ten rows of the sampled Non_Russian set; rows keep the index they
# had before .sample(), so the index is not consecutive.
Non_Russian.head(10)

Unnamed: 0,body
32317,Fine the way it is.
23630,With Vsync 45fps is the same as 30fps AFAIK.
10610,Having other good players is great but Chubb is our workhorse. He has the build to take the hits that come from 20 carries a game.
28958,"They changed it to say ""right wing"" authoritarianism."
34131,"John Wall averages 2008 Chris Paul numbers as he leads the Wizards to a top 2 seed and the Finals as MVP, sweeping LeBron and the Cavs in the conference finals but not before LeBron, at the line with a chance to win game 4 in the series and prevent a sweep, is iced by Wall and subsequently misses his free throws."
35811,Manen to the rescue!
234,&gt;Randall Cobb\n&gt;24\n&gt;old age\nkek
34179,"It's a rodent that died tried to get into the shake. RIP, Rat."
13946,"Ozai, as in this guy: http://static.comicvine.com/uploads/original/11114/111147344/3976312-4231998611-69941.gif\nI tried to make a hunter that was the complete opposite of me, so I made a huge strong guy who uses lances (not anymore though) and has a beard and stuff."
44477,"Its performance is unmatched and the camera is, without exaggeration, amazing. Also it's Google's new main focus now the nexus project has been shelved so expect it to be supported to all hell"


## Data Preprocessing

In [5]:
# Build the sentence lists, wrapping every comment in BERT's special
# start ([CLS]) and end-of-sentence ([SEP]) markers.
Russian_sentences = ["[CLS] " + text + " [SEP]" for text in Russian["body"].values]
Non_Russian_sentences = ["[CLS] " + text + " [SEP]" for text in Non_Russian["body"].values]

In [7]:
# Tokenizer for the lower-cased BERT base vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize each comment and keep only those shorter than 128 tokens
Russian_tokenized_texts = [
    tokens
    for tokens in map(tokenizer.tokenize, Russian_sentences)
    if len(tokens) < 128
]
Non_Russian_tokenized_texts = [
    tokens
    for tokens in map(tokenizer.tokenize, Non_Russian_sentences)
    if len(tokens) < 128
]

# Non-Russian examples come first; the label list is built in the same order
tokenized_texts = Non_Russian_tokenized_texts + Russian_tokenized_texts

print ("Tokenize the first sentence:")
print (tokenized_texts[0])

Tokenize the first sentence:
['[CLS]', 'a', 'hard', 'look', 'at', 'training', 'and', 'tactics', '"', '=', 'they', 'will', 'be', 'sent', 'more', '$', '$', '$', 'for', '"', 'training', '"', '[SEP]']


In [10]:
# Adding labels: 0 for every Non_Russian example, 1 for every Russian example,
# concatenated in the same order as tokenized_texts.
Non_Russian_labels = [0] * len(Non_Russian_tokenized_texts)
Russian_labels = [1] * len(Russian_tokenized_texts)

labels = Non_Russian_labels + Russian_labels

In [12]:
# Max sequence length.
MAX_LEN = 128
# Map every token to its index in the BERT vocabulary, then pad (or cut) each
# sequence at the end so that all rows have exactly MAX_LEN entries.
input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts],
    maxlen=MAX_LEN,
    dtype="long",
    truncating="post",
    padding="post",
)

In [16]:
# Create attention masks: 1.0 where the token id is positive, 0.0 elsewhere
# (the zero ids are the positions filled in by padding).
attention_masks = [
    [float(token_id > 0) for token_id in sequence]
    for sequence in input_ids
]

In [17]:
# Split inputs, labels and attention masks into train (90%) and validation
# (10%) sets. Passing all three arrays to one call applies the same shuffled
# indices to each of them.
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks) = train_test_split(
    input_ids, labels, attention_masks, random_state=2018, test_size=0.1
)

# Convert all of our data into torch tensors, the required datatype for our model
train_inputs, validation_inputs = torch.tensor(train_inputs), torch.tensor(validation_inputs)
train_labels, validation_labels = torch.tensor(train_labels), torch.tensor(validation_labels)
train_masks, validation_masks = torch.tensor(train_masks), torch.tensor(validation_masks)

In [19]:
# Batching configuration shared by both loaders, and the number of epochs
# used by the training loop.
batch_size = 32
epochs = 4

# Training set: batches are drawn through a RandomSampler
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)

# Validation set: batches are read in their stored order
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(dataset=validation_data, batch_size=batch_size, sampler=validation_sampler)

## Load BERT model

In [None]:
# Load pretrained BERT model with a 2-class classification head
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# Move the model to the device chosen earlier (the batches are moved to the
# same device in the training loop), instead of unconditionally calling .cuda()
model.to(device)

param_optimizer = list(model.named_parameters())
# Parameters whose name contains one of these fragments get no weight decay.
# 'gamma'/'beta' are the LayerNorm names of early library versions,
# 'LayerNorm.weight' covers the current naming ('LayerNorm.bias' is already
# matched by 'bias').
no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
# BertAdam reads the per-group key 'weight_decay'; any other key is ignored
# and the optimizer default would be applied to both groups.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

# Hyperparameter information
# The warmup schedule is only applied when the total number of optimisation
# steps is known, so pass it explicitly (one step per training batch).
num_train_steps = len(train_dataloader) * epochs
optimizer = BertAdam(optimizer_grouped_parameters, lr=2e-5, warmup=.1, t_total=num_train_steps)

In [23]:
# Calculate the accuracy of predictions
def flat_accuracy(preds, labels):
    """Return the fraction of examples whose highest-scoring class equals the label.

    preds: 2-D array of per-class scores, one row per example.
    labels: array of integer class ids; it is flattened before comparison.
    """
    truth = labels.flatten()
    predicted = np.argmax(preds, axis=1).flatten()
    n_correct = np.sum(predicted == truth)
    return n_correct / len(truth)

## Training

In [None]:
# Training
# Per-batch training loss over all epochs; plotted in the Results section.
train_loss_set = []

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

    # ---- Training pass ----
    model.train()
    tr_loss = 0
    nb_tr_steps = 0

    for batch in train_dataloader:
        # Move the (input ids, attention mask, labels) tensors to the device
        b_input_ids, b_input_mask, b_labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        # With labels supplied, the model call returns the loss
        loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss.backward()
        optimizer.step()
        # Read the scalar once and reuse it for both the history and the mean
        batch_loss = loss.item()
        train_loss_set.append(batch_loss)
        tr_loss += batch_loss
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))

    # ---- Validation pass ----
    model.eval()
    eval_accuracy = 0
    nb_eval_examples = 0

    for batch in validation_dataloader:
        b_input_ids, b_input_mask, b_labels = (tensor.to(device) for tensor in batch)
        # No gradients are needed for evaluation; without labels the model
        # call returns the logits
        with torch.no_grad():
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Weight each batch accuracy by its size so that a smaller final
        # batch does not distort the overall accuracy
        nb_batch_examples = label_ids.shape[0]
        eval_accuracy += flat_accuracy(logits, label_ids) * nb_batch_examples
        nb_eval_examples += nb_batch_examples

    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_examples))

## Results

In [None]:
# Plot the per-batch training loss collected by the training loop
fig, ax = plt.subplots(figsize=(15, 8))
ax.set_title("Training loss")
ax.set_xlabel("Batch")
ax.set_ylabel("Loss")
ax.plot(train_loss_set)
plt.show()