In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import re
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader
from pytorch_pretrained_bert import BertModel, BertTokenizer
from keras.preprocessing.sequence import pad_sequences


Using TensorFlow backend.


# Model and tokenizer

In [2]:
# class BertBinary(nn.Module):
#     def __init__(self, dropout=0.1):
#         super(BertBinary, self).__init__()

#         self.bert = BertModel.from_pretrained('bert-base-uncased')

#         self.dropout = nn.Dropout(dropout)
#         self.linear = nn.Linear(768, 2)
    
#     def forward(self, tokens, masks=None):
#         _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
#         dropout_output = self.dropout(pooled_output)
#         linear_output = self.linear(dropout_output)

#         return linear_output

# model = BertBinary()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Load and format data

In [3]:
# Load data 
data = pd.read_csv(r'Data\News_DJIA.csv')

# combine titles 
title_cols = list(data.columns[2:24]) # using Top 22 titles due to BERTs max sequence length of 512
data['News'] = data[title_cols].agg(' '.join, axis = 1)

# remove, quotes and b, from news title cols 
def clean_titles(titles):
    titles = re.sub('b[(\')]','',titles)
    titles = re.sub('b[(\")]','',titles)
    titles = re.sub("\'",'',titles)
    return titles

data['News'] = data.apply(lambda x: clean_titles(x['News']), axis = 1)

# drop un-used cols
data = data.drop(data.columns[2:27], axis = 1)

# Date to datetime object
data['Date'] = pd.to_datetime(data['Date'])

In [4]:
# split data to train and test
split = dt.datetime(2015,1,1,0,0,0)
train = data[data.Date <= split]
test = data[data.Date > split]
X_train, y_train = np.array(train['News']),np.array(train['Label'])
X_test, y_test = np.array(test['News']),np.array(test['Label'])

# Tokenize, ID, mask, and pad text data

In [5]:
max_len = 512 # hard limit for BERT

# tokenize text
def tokenize_ID(text):
        
    # tokenize and add clasification and seperation tokens
    text_tokenized = tokenizer.tokenize(text)
    text_tokenized.insert(0,'[CLS]')
    if len(text_tokenized) >= max_len:
        text_tokenized.insert(max_len -1,'[SEP]')
    else:
        text_tokenized.append('[SEP]')
    
    # convert tokens to IDs
    IDs = tokenizer.convert_tokens_to_ids(text_tokenized)

    return IDs

X_train = [tokenize_ID(text) for text in X_train]
X_test = [tokenize_ID(text) for text in X_test]

sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (514 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (515 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (529 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (586 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is longer than the specified maximum  sequence length for this BERT model (616 > 512). Running this sequence through BERT will result in indexing errors
Token indices sequence length is

In [7]:
# Pad/trim IDs
X_train = pad_sequences(X_train, maxlen=max_len, truncating="post", padding="post")
X_test = pad_sequences(X_test, maxlen=max_len, truncating="post", padding="post")

In [10]:
# creat masks
train_masks = [[float(i > 0) for i in ii] for ii in X_train]
test_masks = [[float(i > 0) for i in ii] for ii in X_test]