## Using BERT models from Google to embed the review text

In [None]:
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
import time
import pdb
import glob

In [None]:
# Get all chunk files from the chunks folder
chunks = glob.glob('chunks/chunk_*.parquet')
if not chunks:
    # Fail early with a clear message when the glob matches nothing.
    raise FileNotFoundError("No parquet chunks found matching 'chunks/chunk_*.parquet'")

datafiles = chunks
dataset = load_dataset("parquet", data_files=datafiles)

# train_test_split returns a new object instead of splitting in place, so
# keep a reference to it. `dataset` itself is left untouched; the trailing
# bare expression keeps the notebook display of the split.
splits = dataset['train'].train_test_split(test_size=0.1)
splits

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModel

import torch

model_name = "bert-base-uncased"  # Choose a pre-trained model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the first GPU when CUDA is available; otherwise fall back to the CPU
# (device=-1) so this cell also runs on machines without a GPU.
pipeline_device = 0 if torch.cuda.is_available() else -1

extractor = pipeline(
    'feature-extraction', # We're just interested in the embeddings
    model=model_name, # Use BERT
    device=pipeline_device,
    framework="pt"
)

def embed(datum):
    """Embed the text of a single example with the module-level `extractor`.

    Args:
        datum: Mapping with a 'text' entry. This function is the unbatched
            counterpart of `batch_embed`.

    Returns:
        A dict with one key, 'embedding', holding element `[0][0]` of the
        pipeline output.
    """
    token_vectors = extractor(datum['text'], padding="longest", truncation=True)

    # With a BERT model the first token is the special [CLS] token; its
    # vector is commonly used as the embedding of the whole phrase.
    cls_vector = token_vectors[0][0]
    return {'embedding': cls_vector}

# Batched feature extraction, intended for use with Hugging Face datasets' batched map.
def batch_embed(datum):
    """Embed a batch of texts with the module-level `extractor`.

    Unlike the earlier version, a dict is returned for every batch size.
    Previously a batch holding exactly one text (e.g. a trailing partial
    batch) fell through the `batch_size > 1` check and returned None.

    Args:
        datum: Mapping whose 'text' entry is a list of strings (one batch).

    Returns:
        A dict with one key, 'embedding', holding one vector per input text,
        in input order. An empty batch yields an empty list.
    """
    texts = datum['text']
    if len(texts) == 0:
        # Nothing to embed; skip the model call entirely.
        return {'embedding': []}

    features = extractor(texts, padding="longest", truncation=True)

    # Since we are using a BERT model, we can just use the first embedding,
    # which is for the special token CLS. CLS is known as the "phrase
    # embedding". Each per-text entry is indexed [0][0] exactly as before.
    return {'embedding': [per_text[0][0] for per_text in features]}

def tokenize(example):
    """Tokenize `example['text']` with the module-level `tokenizer`.

    The text is padded to the longest sequence and truncated to at most
    512 tokens; the tokenizer's output is returned unchanged.
    """
    text = example['text']
    encoded = tokenizer(text, padding="longest", truncation=True, max_length=512)
    return encoded


In [None]:
import random

import os

# Visit the chunks in random order. Note that this shuffles `chunks` in place.
random.shuffle(chunks)

batch_size=8

for c in chunks:
    # Load the parquet chunk
    data_chunk = load_dataset("parquet", data_files=c, split='train')

    # Take the chunk id from the file name only ("chunk_<id>.parquet"), so
    # underscores in a directory name or in the id itself cannot corrupt it.
    chunk_stem = os.path.splitext(os.path.basename(c))[0]
    chunk_number = chunk_stem.split("_", 1)[1]
    print(f"Processing chunk {chunk_number} using batch size {batch_size}")

    # Batched mapping sends lists of texts to batch_embed; otherwise each
    # example is embedded on its own.
    if batch_size > 1:
        data_chunk = data_chunk.map(batch_embed, batched=True, batch_size=batch_size)
    else:
        data_chunk = data_chunk.map(embed, batched=False)

    # Save featurized chunk
    output_file = f"chunks/featurized_chunks_{chunk_number}"
    print(f"Saving featurized chunk {output_file} to disk")
    data_chunk.save_to_disk(output_file)

print("All done!")


In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

from datasets import concatenate_datasets, load_from_disk

# The embeddings are stored in the featurized chunks that the embedding loop
# saved to disk; the raw parquet `dataset` has no 'embedding' column.
# Sorted so the row order is reproducible between runs.
featurized_dirs = sorted(glob.glob('chunks/featurized_chunks_*'))
if not featurized_dirs:
    raise FileNotFoundError(
        "No featurized chunks found matching 'chunks/featurized_chunks_*'; "
        "run the embedding cell first"
    )
featurized = concatenate_datasets([load_from_disk(d) for d in featurized_dirs])

# Total size of data
N = len(featurized)

# Prepend a constant 1 to every feature vector to act as the bias term,
# which is why the model below is fitted with fit_intercept=False.
X = [[1] + list(fv) for fv in featurized['embedding']]
# NOTE(review): assumes the chunks carry a 'rating' column — confirm.
y = list(featurized['rating'])

# 80/10/10 split taken in row order (no shuffling here).
Ntrain,Nvalid,Ntest = int(N*0.8), int(N*0.1), int(N*0.1)
X_train,X_valid,X_test = X[:Ntrain],X[Ntrain:Ntrain+Nvalid],X[Ntrain+Nvalid:]
y_train,y_valid,y_test = y[:Ntrain],y[Ntrain:Ntrain+Nvalid],y[Ntrain+Nvalid:]

# Only `linear_model` is imported from sklearn, so use that name directly;
# the bare name `sklearn` is not bound in this notebook.
model_lr = linear_model.Ridge(alpha=1, fit_intercept=False)
model_lr.fit(X_train, y_train)

y_pred = model_lr.predict(X_valid)
mse = mean_squared_error(y_valid, y_pred)
print(f"Mean Squared Error: {mse}")