In [89]:
import torch
import random
import numpy
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader

In [90]:
def set_seed(seed: int, n_gpu: int):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed to every generator.
        n_gpu: Number of GPUs in use; the CUDA generators are seeded only
            when this is greater than zero.
    """
    # The CPU-side generators are always seeded.
    for seeder in (random.seed, numpy.random.seed, torch.manual_seed):
        seeder(seed)

    # Nothing more to do unless at least one GPU is in use.
    if not n_gpu > 0:
        return
    torch.cuda.manual_seed_all(seed)
        
# Seed every generator before any data or model work; n_gpu=0 skips the CUDA seeding branch.
set_seed(42, 0)

In [91]:
# Locations of the one-review-per-line dumps of the train and test splits.
TRAIN_REVIEWS_PATH = "/Users/macbook/aclImdb/movie_data/full_train.txt"
TEST_REVIEWS_PATH = "/Users/macbook/aclImdb/movie_data/full_test.txt"


def load_reviews(path):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        path: Location of a text file holding one review per line.

    Returns:
        A list with one stripped string per line, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager guarantees the handle is closed, and the explicit
    # encoding keeps decoding independent of the platform's default locale.
    with open(path, "r", encoding="utf-8") as review_file:
        return [line.strip() for line in review_file]


reviews_train = load_reviews(TRAIN_REVIEWS_PATH)
reviews_test = load_reviews(TEST_REVIEWS_PATH)

In [92]:
# Each split gets 25000 labels: the first 12500 entries are 1, the remaining 12500 are 0.
# NOTE(review): presumably this mirrors the order of the reviews in the files — confirm.
train_target = [1] * 12500 + [0] * 12500
test_target = [1] * 12500 + [0] * 12500

In [93]:
# Checkpoint identifier shared by the tokenizer and the encoder.
model_name = 'bert-base-cased'

# The fast tokenizer implementation is requested explicitly.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModel.from_pretrained(model_name)


In [94]:
# Pair every training review with its label, then order the samples so the
# longest texts (by character count) come first.
prepared_data = sorted(
    (
        {'label': target, 'text': review}
        for target, review in zip(train_target, reviews_train)
    ),
    key=lambda sample: len(sample['text']),
    reverse=True,
)

In [95]:
# Pair every test review with its label and order the samples longest text first.
# The samples go into their own list, kept apart from prepared_data, so that
# test reviews never end up in the list the training DataLoader iterates over.
prepared_test_data = sorted(
    (
        {'label': label, 'text': text}
        for label, text in zip(test_target, reviews_test)
    ),
    key=lambda sample: len(sample['text']),
    reverse=True,
)

In [96]:
# Encode only the first sample to inspect the tokenizer's output format.
# encoded_text stays defined at module level because a later cell reads it.
encoded_text = tokenizer.encode_plus(prepared_data[0]['text'])
print(encoded_text)

{'input_ids': [101, 11492, 122, 131, 9697, 2649, 11389, 11492, 21471, 4150, 1105, 12901, 12840, 5016, 5040, 18079, 1105, 2929, 3096, 23508, 21471, 4150, 1105, 12901, 12840, 1408, 1614, 1228, 1114, 170, 9697, 2649, 11389, 11492, 1222, 5040, 18079, 1105, 2929, 3096, 23508, 119, 1792, 1106, 1103, 2995, 1104, 1103, 1801, 117, 1241, 7741, 1138, 1106, 1301, 1194, 7072, 1107, 1546, 1106, 1243, 1103, 1782, 119, 3096, 23508, 1105, 18079, 9511, 1146, 1346, 1113, 1118, 1781, 3587, 12582, 1158, 1148, 12901, 1105, 1173, 21471, 4150, 119, 138, 1528, 28117, 21790, 1118, 3096, 23508, 1106, 21471, 1261, 1103, 3223, 1149, 1104, 1103, 12840, 1711, 119, 12901, 1793, 1106, 1494, 1117, 1711, 117, 1133, 1103, 11601, 21775, 1140, 1229, 3096, 23508, 1105, 18079, 6939, 1174, 1146, 1113, 1140, 1107, 1103, 2655, 119, 1556, 3096, 23508, 188, 18778, 2624, 1283, 1113, 21471, 117, 18079, 1383, 1146, 170, 1952, 1796, 119, 12901, 19647, 1154, 1103, 3170, 1105, 1199, 24129, 7067, 1174, 1166, 1103, 1499, 8090, 2135, 1807

In [104]:
# Show the first sample, i.e. the one with the longest text after sorting.
prepared_data[0]


{'label': 1,
 'text': 'Match 1: Tag Team Table Match Bubba Ray and Spike Dudley vs Eddie Guerrero and Chris Benoit Bubba Ray and Spike Dudley started things off with a Tag Team Table Match against Eddie Guerrero and Chris Benoit. According to the rules of the match, both opponents have to go through tables in order to get the win. Benoit and Guerrero heated up early on by taking turns hammering first Spike and then Bubba Ray. A German suplex by Benoit to Bubba took the wind out of the Dudley brother. Spike tried to help his brother, but the referee restrained him while Benoit and Guerrero ganged up on him in the corner. With Benoit stomping away on Bubba, Guerrero set up a table outside. Spike dashed into the ring and somersaulted over the top rope onto Guerrero on the outside! After recovering and taking care of Spike, Guerrero slipped a table into the ring and helped the Wolverine set it up. The tandem then set up for a double superplex from the middle rope which would have put Bubba

In [97]:
# List the fields returned by encode_plus for the sample encoded above.
encoded_text.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [98]:
# Number of samples the DataLoader groups into one batch.
BATCH_SIZE = 64
# Default cap on the tokens per sample used by generate_batch.
# NOTE(review): 512 presumably matches the input limit of the loaded model — confirm.
MAX_INPUT_LENGTH = 512

In [99]:
def generate_batch(input_data, max_length=MAX_INPUT_LENGTH):
    """Collate ``{'label': ..., 'text': ...}`` samples into padded model inputs.

    The samples are ordered by descending character length of their text,
    tokenized together, truncated to at most ``max_length`` tokens and padded
    to the longest encoded sequence in the batch.

    Args:
        input_data: Non-empty list of dicts with a ``'text'`` string and an
            integer ``'label'``. The list itself is left unmodified.
        max_length: Upper bound on the number of tokens per sample.

    Returns:
        ``((input_ids, token_type_ids, attention_mask), labels)`` where the
        first three are ``torch.long`` tensors with one row per sample and
        ``labels`` is a 1-D ``torch.long`` tensor in the same row order.

    Raises:
        ValueError: If ``input_data`` is empty.
    """
    if not input_data:
        raise ValueError("generate_batch() requires at least one sample")

    # sorted() keeps the longest-first row order without reordering the
    # caller's list as a side effect.
    ordered = sorted(input_data, key=lambda sample: len(sample['text']), reverse=True)

    # One batched call replaces the per-sample loop. Padding is sized from the
    # real token counts rather than from the character-longest text, and
    # truncation is requested explicitly instead of being implied by max_length.
    encoded = tokenizer(
        [sample['text'] for sample in ordered],
        padding='longest',
        truncation=True,
        max_length=max_length,
        return_token_type_ids=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    labels = torch.tensor([sample['label'] for sample in ordered], dtype=torch.long)
    return (encoded['input_ids'], encoded['token_type_ids'], encoded['attention_mask']), labels

In [100]:
# Wrap the prepared samples in a DataLoader: batches of BATCH_SIZE samples are
# collated by generate_batch. No shuffle argument is passed.
data = DataLoader(prepared_data, batch_size=BATCH_SIZE, collate_fn=generate_batch)

In [101]:
# Pull the first collated batch for inspection.
data1 = next(iter(data))

In [102]:
# data1 is ((input_ids, token_type_ids, attention_mask), labels), so [0][0] is input_ids.
data1[0][0].shape

torch.Size([64, 512])