### Initialization

In [None]:
import numpy as np
import pandas as pd

import torch
import transformers

from tqdm.auto import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

### Global Variables 

In [None]:
# Maximum number of leading rows of the reviews table that are tokenized,
# embedded and used as modeling targets further down.
max_sample_size = 200

### Load Data

In [None]:
# The dataset is a .tsv file, so fields are separated by tabs. A space
# separator would split the free-text reviews themselves into columns.
# Later cells read the 'review' (text) and 'pos' (target) columns.
df_reviews = pd.read_csv('/datasets/imdb_reviews_200.tsv', sep='\t')

### Preprocessing for BERT

In [None]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

# Every review is encoded to exactly `max_length` ids: longer inputs are
# truncated by the tokenizer, shorter ones are right-padded with zeros.
max_length = 512

ids_list = []
attention_mask_list = []

for review_text in df_reviews['review'].iloc[:max_sample_size]:
    token_ids = tokenizer.encode(
        review_text.lower(),
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
    )
    pad_width = max_length - len(token_ids)
    padded = np.array(token_ids + [0] * pad_width)
    ids_list.append(padded)
    # The mask is 1 wherever the id is non-zero and 0 on the zero padding.
    attention_mask_list.append(np.where(padded != 0, 1, 0))

### Get Embeddings

In [None]:
# load the pretrained configuration and model from the same checkpoint

bert_checkpoint = 'bert-base-uncased'
config = transformers.BertConfig.from_pretrained(bert_checkpoint)
model = transformers.BertModel.from_pretrained(bert_checkpoint)

In [None]:
# typically the batch size is equal to 100 but we can set it to lower values to lower the memory requirements

batch_size = 25
embeddings = []

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using the {device} device.')
model.to(device)
# Inference only: switch to evaluation mode once, before the loop, instead of
# re-invoking it on every batch.
model.eval()

# Iterate over batch start offsets so that a final, shorter batch is still
# processed. Integer-dividing the length by batch_size would silently drop the
# remainder and leave fewer embeddings than input rows.
for start in tqdm(range(0, len(ids_list), batch_size)):
    stop = start + batch_size

    # Stack the per-review arrays into one 2-D array before handing it to
    # torch; building a tensor from a list of ndarrays is much slower.
    ids_batch = torch.as_tensor(
        np.stack(ids_list[start:stop]), dtype=torch.long
    ).to(device)
    attention_mask_batch = torch.as_tensor(
        np.stack(attention_mask_list[start:stop]), dtype=torch.long
    ).to(device)

    with torch.no_grad():
        batch_embeddings = model(ids_batch, attention_mask=attention_mask_batch)

    # Keep the vector at sequence position 0 of the first model output for
    # every item in the batch (the [CLS] position, since special tokens were
    # added during encoding) and move it back to the CPU as a NumPy array.
    embeddings.append(batch_embeddings[0][:, 0, :].detach().cpu().numpy())

### Modeling

In [None]:
# Stack the per-batch embedding arrays into one feature matrix and take the
# matching leading slice of the 'pos' column as the target.
target = df_reviews['pos'].iloc[:max_sample_size]
features = np.concatenate(embeddings, axis=0)

# Show both shapes, one per line.
print(features.shape, target.shape, sep='\n')

In [None]:
# hold out 20% of the rows for testing

# Put the features and the 'pos' column side by side so that a single split
# keeps each row's label attached to its embedding.
dataset = pd.concat([pd.DataFrame(features), target], axis=1)
train, test = train_test_split(dataset, test_size=0.2)

x_train = train.drop(columns='pos')
y_train = train['pos']
x_test = test.drop(columns='pos')
y_test = test['pos']

In [None]:
# cross-validate a logistic regression on the training split

lg = LogisticRegression()
cv_scores = cross_val_score(lg, x_train, y_train)
# Left as the final expression so the notebook displays the mean score.
cv_scores.mean()