In [43]:
import pandas as pd 
import numpy as np 
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel
import torch
import torch.utils.data
from sklearn.model_selection import train_test_split
# from torchnlp.datasets import imdb_dataset
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from collections import Counter 
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder

In [44]:
# Load positive examples from a comma-separated file. Field index 2 is taken
# as the tweet text and field index 4 as the language tag; only rows tagged
# "English" are kept.
# NOTE(review): a plain split(',') mis-parses rows whose text itself contains
# commas or embedded newlines -- confirm the file format, and switch to the
# csv module if fields are quoted.
with open('Dataset/Positive_tweets(10000).csv') as pos_file:
    pos = pos_file.read()
npos = 0
labels, texts = [], []
for line in pos.split("\n"):
    content = line.split(',')
    # content[4] is read below, so a usable row needs at least five fields.
    if len(content) < 5:
        continue
    if content[4] != "English":
        continue
    labels.append(1)
    texts.append(content[2])
    npos += 1

# Load negative examples (random tweets), one tweet per line.
with open('Dataset/Negative_tweets(10000).txt') as neg_file:
    neg = neg_file.read()
nneg = 0
for line in neg.split("\n"):
    # Blank lines (e.g. the empty string left after a trailing newline) would
    # otherwise become empty "tweets" labelled 0.
    if not line.strip():
        continue
    labels.append(0)
    texts.append(line)
    nneg += 1

# Fixed seed so that Restart & Run All reproduces the same ordering.
texts, labels = shuffle(texts, labels, random_state=42)

print('Total number of datapoints: ', len(labels))
print('Positive labels: ', npos)
print('Negative labels: ', nneg)

df = pd.DataFrame()
df['text'] = texts
df['label'] = labels

df.head()

Total number of datapoints:  12514
Positive labels:  4541
Negative labels:  7973


Unnamed: 0,text,label
0,I so much want this election cycle to be over ...,0
1,@HillaryClinton One small measurement on our e...,0
2,Wahrheit ist die Krücke der Verlierer https://...,0
3,Clinton: I didn't think pneumonia would be a b...,0
4,a #SaturdayMorning poem:,0


In [46]:
# Encode the labels and hold out 20% of the rows for testing.
# The labels are read from df['label'] rather than the module-level `labels`
# list: later cells rebind `labels` to a batch tensor, which would break a
# re-run of this cell after training.
enc = LabelEncoder()
y = enc.fit_transform(df['label']).reshape(-1, 1)
# Fixed seed so the train/test partition is reproducible across runs.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], y, test_size=0.20, random_state=42
)

In [5]:
# if npos < nneg :
#     df_bot = df[df['label'] == 1] 
#     df_normal = df[df['label'] == 0] 
#     df_normal = df_normal.sample(n=len(df_bot))
#     df = df_bot.append(df_normal)
#     df = df.sample(frac=1, random_state = 24).reset_index(drop=True)
# else :
#     df_bot = df[df['label'] == 1] 
#     df_normal = df[df['label'] == 0] 
#     df_bot = df_bot.sample(n=len(df_normal))
#     df = df_normal.append(df_bot)
#     df = df.sample(frac=1, random_state = 24).reset_index(drop=True)

In [47]:
# Sanity check: number of training texts produced by the train/test split.
print(len(train_texts))

10011


In [7]:
# train_data = df.head(len(df)*0.8)
# test_data = df.tail(len(df) - len(train_data))

In [13]:
# train_data = [{'text': text, 'label': type_data } for text in list(train_data['text']) for type_data in list(train_data['label'])]
# test_data = [{'text': text, 'label': type_data } for text in list(test_data['text']) for type_data in list(test_data['label'])]

In [14]:
# train_texts, train_labels = list(zip(*map(lambda d: (d['text'], d['label']), train_data)))
# test_texts, test_labels = list(zip(*map(lambda d: (d['text'], d['label']), test_data)))

Notice we truncate each input to 128 tokens (the `[CLS]` marker plus at most 127 word-piece tokens), not 128 characters. The maximum sequence length BERT can handle is 512 tokens, but in the interest of computational time we will work with 128.

In [48]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


def _cls_prefixed_tokens(text):
    """Tokenize `text`, keep at most the first 127 tokens, and prepend '[CLS]'."""
    return ['[CLS]'] + tokenizer.tokenize(text)[:127]


# Token strings for every training / test text (at most 128 entries each).
train_tokens = [_cls_prefixed_tokens(text) for text in train_texts]
test_tokens = [_cls_prefixed_tokens(text) for text in test_texts]

# Map the token strings to their vocabulary ids.
train_tokens_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in train_tokens]
test_tokens_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in test_tokens]

In [49]:
# Bring every id sequence to exactly 128 entries: longer ones are cut at the
# end, shorter ones are padded at the end.
_pad_options = dict(maxlen=128, truncating="post", padding="post", dtype="int")
train_tokens_ids = pad_sequences(train_tokens_ids, **_pad_options)
test_tokens_ids = pad_sequences(test_tokens_ids, **_pad_options)

In [50]:
# Integer 0/1 targets: 1 where the encoded label equals 1, 0 otherwise.
train_y = (np.array(train_labels) == 1).astype(int)
test_y = (np.array(test_labels) == 1).astype(int)

In [55]:
class BertBinaryClassifier(nn.Module):
    """Pretrained 'bert-base-uncased' encoder followed by a one-unit sigmoid head.

    The pooled BERT output goes through dropout, a 768 -> 1 linear layer and a
    sigmoid, so `forward` returns one value in (0, 1) per input row.
    """

    def __init__(self, dropout=0.1):
        """Build the encoder and the classification head.

        Args:
            dropout: drop probability applied to the pooled BERT output.
        """
        super().__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Head applied to the pooled output: dropout -> linear -> sigmoid.
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens, masks=None):
        """Return the sigmoid output of the head for a batch of token ids.

        Args:
            tokens: token-id tensor passed to BERT as its first argument.
            masks: optional tensor forwarded to BERT as `attention_mask`.
        """
        # Only the second value returned by BERT (the pooled output) is used.
        _, pooled = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        return self.sigmoid(self.linear(self.dropout(pooled)))

# Mini-batch size passed to both the train and test DataLoaders.
BATCH_SIZE = 1
# Number of passes the training loop makes over the training set.
EPOCHS = 1

In [56]:
# Attention masks: 1.0 where the token id is positive, 0.0 elsewhere (the
# positions pad_sequences filled with its default pad value 0).
# Computed with one vectorized comparison instead of a Python-level loop over
# every element; float32 matches the dtype torch.tensor gave the original
# nested lists of Python floats.
train_masks = (np.asarray(train_tokens_ids) > 0).astype(np.float32)
test_masks = (np.asarray(test_tokens_ids) > 0).astype(np.float32)
train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

In [57]:
# Training tensors: token ids, attention masks and float targets.
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = torch.tensor(train_y.reshape(-1, 1), dtype=torch.float)

# Batches are drawn in random order through a RandomSampler.
train_dataset = torch.utils.data.TensorDataset(
    train_tokens_tensor,
    train_masks_tensor,
    train_y_tensor,
)
train_sampler = torch.utils.data.RandomSampler(data_source=train_dataset)
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    sampler=train_sampler,
)


In [58]:
# Test tensors: token ids, attention masks and float targets.
test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1), dtype=torch.float)

# A SequentialSampler keeps the batches in dataset order.
test_dataset = torch.utils.data.TensorDataset(
    test_tokens_tensor,
    test_masks_tensor,
    test_y_tensor,
)
test_sampler = torch.utils.data.SequentialSampler(data_source=test_dataset)
test_dataloader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    sampler=test_sampler,
)

In [60]:
# Fine-tune the classifier with Adam on binary cross-entropy.
bert_clf = BertBinaryClassifier()
optimizer = torch.optim.Adam(bert_clf.parameters(), lr=3e-6)
# The loss module holds no per-batch state, so build it once outside the loop.
loss_func = nn.BCELoss()
# Number of batches per epoch, taken from the loader so the progress line is
# an exact integer even when BATCH_SIZE does not divide the dataset size.
num_batches = len(train_dataloader)

for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0  # running sum of batch losses within this epoch
    for step_num, batch_data in enumerate(train_dataloader):
        # Named `batch_labels` so the module-level `labels` list built by the
        # data-loading cell is not overwritten with a tensor.
        token_ids, masks, batch_labels = batch_data
        probas = bert_clf(token_ids, masks)
        batch_loss = loss_func(probas, batch_labels)
        train_loss += batch_loss.item()
        bert_clf.zero_grad()
        batch_loss.backward()
        optimizer.step()
        # Report the running mean loss every 100 steps.
        if step_num % 100 == 0:
            print('Epoch: ', epoch_num + 1)
            print("\r" + "{0}/{1} loss: {2} ".format(step_num, num_batches, train_loss / (step_num + 1)))


Epoch:  1
0/10011.0 loss: 0.6819517016410828 
Epoch:  1
100/10011.0 loss: 0.6793890368230272 
Epoch:  1
200/10011.0 loss: 0.6601348431104451 
Epoch:  1
300/10011.0 loss: 0.6223950892488822 
Epoch:  1
400/10011.0 loss: 0.5623385044664814 
Epoch:  1
500/10011.0 loss: 0.5087928249391015 
Epoch:  1
600/10011.0 loss: 0.4671702995251697 
Epoch:  1
700/10011.0 loss: 0.4403937614061863 
Epoch:  1
800/10011.0 loss: 0.41318554476777863 
Epoch:  1
900/10011.0 loss: 0.39412806693311536 
Epoch:  1
1000/10011.0 loss: 0.3776413478072617 
Epoch:  1
1100/10011.0 loss: 0.36157135857940803 
Epoch:  1
1200/10011.0 loss: 0.34727883566912265 
Epoch:  1
1300/10011.0 loss: 0.3363540981274147 
Epoch:  1
1400/10011.0 loss: 0.32573169580714606 
Epoch:  1
1500/10011.0 loss: 0.3132142810303616 
Epoch:  1
1600/10011.0 loss: 0.3042880925632022 
Epoch:  1
1700/10011.0 loss: 0.2964948245878827 
Epoch:  1
1800/10011.0 loss: 0.28933278705329035 
Epoch:  1
1900/10011.0 loss: 0.2796076552438766 
Epoch:  1
2000/10011.0 los

In [61]:
# Run the fine-tuned model over the test set and collect its predictions.
bert_clf.eval()
bert_predicted = []  # boolean class decisions (probability > 0.5)
# Despite the name, this holds the model's sigmoid outputs (probabilities);
# the name is kept in case other cells refer to it.
all_logits = []
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):
        # The batch targets are not needed here: the original computed a BCE
        # loss from them that was never read, so that work is dropped. The
        # third element is bound to a throwaway name so the module-level
        # `labels` list is not overwritten with a tensor.
        token_ids, masks, _batch_labels = batch_data

        probas = bert_clf(token_ids, masks)
        numpy_probas = probas.cpu().detach().numpy()

        bert_predicted += list(numpy_probas[:, 0] > 0.5)
        all_logits += list(numpy_probas[:, 0])

In [62]:
# Per-class precision / recall / F1 of the thresholded predictions against the test labels.
print(classification_report(test_y, bert_predicted))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97      1607
           1       0.97      0.93      0.95       896

   micro avg       0.97      0.97      0.97      2503
   macro avg       0.97      0.96      0.96      2503
weighted avg       0.97      0.97      0.97      2503



In [63]:
from sklearn.metrics import accuracy_score

In [64]:
# Overall accuracy of the thresholded predictions against the test labels.
print(accuracy_score(test_y, bert_predicted))

0.9664402716739912
