Idee: BERT für Feature Extraction, bidirektionale LSTM zur Klassifikation

In [3]:
import torch
import torch.nn as nn
from transformers import *

To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html


---

In [15]:
pretrained_weights = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
model = BertModel.from_pretrained(pretrained_weights,
                                        output_hidden_states=False,
                                        output_attentions=False)

In [12]:
tokenizer.encode("First sentence", "Second sentence")

[2034, 6251, 2117, 6251]

In [None]:
embs = tokenizer.encode('This is a test', 'bank account')
input_ids = torch.tensor([embs])
input_ids.shape

In [None]:
input_ids

In [None]:
class BERTFeatureExtractor():
    
    def __init__(self):
        pretrained_weights = 'bert-base-uncased'
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
        self.model = BertModel.from_pretrained(pretrained_weights,
                                                output_hidden_states=False,
                                                output_attentions=False)
    
    def extract(self, sentence):
        embs = self.tokenizer.encode(sentence)
        input_ids = torch.tensor([embs])
        return self.model(input_ids)[0]
    
fe = BERTFeatureExtractor()

In [None]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(LSTMClassifier, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim

        # Number of hidden layers
        self.layer_dim = layer_dim

        # Building your LSTM
        # batch_first=True causes input/output tensors to be of shape
        # (batch_dim, seq_dim, feature_dim)
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)

        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()

        # Initialize cell state
        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()

        # 28 time steps
        # We need to detach as we are doing truncated backpropagation through time (BPTT)
        # If we don't, we'll backprop all the way to the start even after going through another batch
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))

        # Index hidden state of last time step
        # out.size() --> 100, 28, 100
        # out[:, -1, :] --> 100, 100 --> just want last time step hidden states! 
        out = self.fc(out[:, -1, :]) 
        # out.size() --> 100, 10
        return out

In [None]:
input_dim = 768
hidden_dim = 128
layer_dim = 3
output_dim = 2
lstm = LSTMClassifier(input_dim, hidden_dim, layer_dim, output_dim)

In [None]:
lstm_input = fe.extract(["Today was a good day!"])
print(lstm_input)
#lstm(lstm_input)

---

## Test auf IMDB 

In [19]:
import pandas as pd
import os

In [20]:
data_dir = os.path.join('mix20_rand700_tokens_cleaned','tokens')
pos_files = os.path.join(data_dir, 'pos')
pos_sentences = []
for file in os.listdir(pos_files):
    if file.endswith('.txt'):
        with open(os.path.join(pos_files, file)) as f:
            pos_sentences.append({'sentence': f.read()[:500], 'category': 1})

neg_files = os.path.join(data_dir, 'pos')
neg_sentences = []
for file in os.listdir(pos_files):
    if file.endswith('.txt'):
        with open(os.path.join(neg_files, file)) as f:
            neg_sentences.append({'sentence': f.read()[:500], 'category': 1})
            
sentences = pos_sentences + neg_sentences

In [24]:
df = pd.DataFrame(sentences)
df.head()

Unnamed: 0,sentence,category
0,"lancelot du lac ( lancelot du lac ) ( france ,...",1
1,director : brian de palma writer : david koepp...,1
2,"six days , seven nights reviewed by jamie peck...",1
3,"cast : mel gibson ( jerry fletcher ) , julia r...",1
4,"all great things come to an end , and the dot-...",1


In [30]:
df['sentence_toks'] = df['sentence'].apply(lambda x: ['[CLS]'] + tokenizer.encode(x)[:511])

In [31]:
df.head()

Unnamed: 0,sentence,category,sentence_toks
0,"lancelot du lac ( lancelot du lac ) ( france ,...",1,"[[CLS], 9993, 10994, 4241, 18749, 1006, 9993, ..."
1,director : brian de palma writer : david koepp...,1,"[[CLS], 2472, 1024, 4422, 2139, 23985, 3213, 1..."
2,"six days , seven nights reviewed by jamie peck...",1,"[[CLS], 2416, 2420, 1010, 2698, 6385, 8182, 20..."
3,"cast : mel gibson ( jerry fletcher ) , julia r...",1,"[[CLS], 3459, 1024, 11463, 9406, 1006, 6128, 1..."
4,"all great things come to an end , and the dot-...",1,"[[CLS], 2035, 2307, 2477, 2272, 2000, 2019, 22..."


In [34]:
train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], ['test 1', 'test 2']))

In [35]:
train_tokens

[['[CLS]', 'test', '1'], ['[CLS]', 'test', '2']]