Idee: BERT für Feature Extraction, bidirektionale LSTM zur Klassifikation

In [1]:
import torch
import torch.nn as nn
from transformers import *

---

In [2]:
pretrained_weights = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
model = BertModel.from_pretrained(pretrained_weights,
                                        output_hidden_states=False,
                                        output_attentions=False)

In [3]:
tokenizer.encode("First sentence", "Second sentence")

[2034, 6251, 2117, 6251]

In [4]:
embs = tokenizer.encode('This is a test', 'bank account')
input_ids = torch.tensor([embs])
input_ids.shape

torch.Size([1, 6])

In [5]:
input_ids

tensor([[2023, 2003, 1037, 3231, 2924, 4070]])

In [28]:
class BERTFeatureExtractor():
    
    def __init__(self):
        pretrained_weights = 'bert-base-uncased'
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
        self.model = BertModel.from_pretrained(pretrained_weights,
                                                output_hidden_states=False,
                                                output_attentions=False)
    
    def extract(self, sentences):
        embs = [self.tokenizer.encode(sentence) for sentence in sentences]
        input_ids = torch.tensor([embs])
        return self.model(input_ids)[0]
    
fe = BERTFeatureExtractor()

In [32]:
pretrained_weights = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
model = BertModel.from_pretrained(pretrained_weights,
                                        output_hidden_states=False,
                                        output_attentions=False)

In [42]:
sentences = ["Today was a good day!", "Not that good .."]
embs = [tokenizer.encode(sentence) for sentence in sentences]
print(embs)
input_ids = torch.tensor([embs])
model(input_ids)[0]

[[2651, 2001, 1037, 2204, 2154, 999], [2025, 2008, 2204, 1012, 1012]]


ValueError: expected sequence of length 6 at dim 2 (got 5)

In [29]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(LSTMClassifier, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim

        # Number of hidden layers
        self.layer_dim = layer_dim

        # Building your LSTM
        # batch_first=True causes input/output tensors to be of shape
        # (batch_dim, seq_dim, feature_dim)
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)

        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()

        # Initialize cell state
        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()

        # 28 time steps
        # We need to detach as we are doing truncated backpropagation through time (BPTT)
        # If we don't, we'll backprop all the way to the start even after going through another batch
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))

        # Index hidden state of last time step
        # out.size() --> 100, 28, 100
        # out[:, -1, :] --> 100, 100 --> just want last time step hidden states! 
        out = self.fc(out[:, -1, :]) 
        # out.size() --> 100, 10
        return out

In [30]:
input_dim = 768
hidden_dim = 128
layer_dim = 3
output_dim = 2
lstm = LSTMClassifier(input_dim, hidden_dim, layer_dim, output_dim)

In [31]:
lstm_input = fe.extract([["Today was a good day!", "Awesome!", "Bad"],["Bad"]])
print(lstm_input.shape)
lstm(lstm_input)

ValueError: expected sequence of length 3 at dim 2 (got 1)

---

## Test auf IMDB 

In [10]:
import pandas as pd
import os

In [11]:
data_dir = os.path.join('mix20_rand700_tokens_cleaned','tokens')
pos_files = os.path.join(data_dir, 'pos')
pos_sentences = []
for file in os.listdir(pos_files):
    if file.endswith('.txt'):
        with open(os.path.join(pos_files, file)) as f:
            pos_sentences.append({'sentence': f.read()[:500], 'category': 1})

neg_files = os.path.join(data_dir, 'pos')
neg_sentences = []
for file in os.listdir(pos_files):
    if file.endswith('.txt'):
        with open(os.path.join(neg_files, file)) as f:
            neg_sentences.append({'sentence': f.read()[:500], 'category': 1})
            
sentences = pos_sentences + neg_sentences

In [12]:
df = pd.DataFrame(sentences)
df.head()

Unnamed: 0,sentence,category
0,"lancelot du lac ( lancelot du lac ) ( france ,...",1
1,director : brian de palma writer : david koepp...,1
2,"six days , seven nights reviewed by jamie peck...",1
3,"cast : mel gibson ( jerry fletcher ) , julia r...",1
4,"all great things come to an end , and the dot-...",1


In [13]:
df['sentence_toks'] = df['sentence'].apply(lambda x: ['[CLS]'] + tokenizer.encode(x)[:511])

In [14]:
df.head()

Unnamed: 0,sentence,category,sentence_toks
0,"lancelot du lac ( lancelot du lac ) ( france ,...",1,"[[CLS], 9993, 10994, 4241, 18749, 1006, 9993, ..."
1,director : brian de palma writer : david koepp...,1,"[[CLS], 2472, 1024, 4422, 2139, 23985, 3213, 1..."
2,"six days , seven nights reviewed by jamie peck...",1,"[[CLS], 2416, 2420, 1010, 2698, 6385, 8182, 20..."
3,"cast : mel gibson ( jerry fletcher ) , julia r...",1,"[[CLS], 3459, 1024, 11463, 9406, 1006, 6128, 1..."
4,"all great things come to an end , and the dot-...",1,"[[CLS], 2035, 2307, 2477, 2272, 2000, 2019, 22..."


In [17]:
X, Y = df['sentence_toks'].values, df['category'].values

In [20]:
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in sss.split(X, Y):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]