## Using BERT models from Google to embed the review text

In [None]:
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
import time
import pdb
import glob

In [None]:
# Collect every raw parquet chunk from the chunks folder. glob returns matches
# in arbitrary order, so sort them to make the load order (and therefore the
# row order of the combined dataset) reproducible between runs.
chunks = sorted(glob.glob('chunks/chunk_*.parquet'))

datafiles = chunks
# Everything is loaded under the 'train' key, which is indexed just below.
dataset = load_dataset("parquet", data_files=datafiles)

# train_test_split returns a new DatasetDict instead of modifying the dataset
# in place, so keep the result rather than discarding it.
split_dataset = dataset['train'].train_test_split(test_size=0.1)

# Leave the split as the cell's last expression so the notebook still shows it.
split_dataset

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModel

import torch

# Pre-trained checkpoint shared by the tokenizer and the extraction pipeline.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the first CUDA device when one is available; otherwise fall back to the
# CPU (-1) instead of failing when the pipeline is constructed.
device = 0 if torch.cuda.is_available() else -1

extractor = pipeline(
    'feature-extraction',  # We're just interested in the embeddings
    model=model_name,  # Use BERT
    device=device,
    framework="pt",
)

def embed(datum):
    """Embed a single example and return its CLS vector.

    Runs the feature-extraction pipeline on ``datum['text']`` and keeps only
    the first token vector of the first sequence in the output.

    Returns:
        A dict with a single key, ``'embedding'``.
    """
    token_vectors = extractor(datum['text'], padding="longest", truncation=True)

    # With a BERT model the first position holds the special CLS token, whose
    # vector is used here as the embedding of the whole phrase.
    first_sequence = token_vectors[0]
    return {'embedding': first_sequence[0]}

# Fastest feature extraction possible, using Huggingface datasets.
def batch_embed(datum):
    """Embed a batch of examples, returning one CLS vector per text.

    Meant to be used with ``Dataset.map(..., batched=True)``, where
    ``datum['text']`` is a list of strings.

    A dict is returned for every batch size. In particular a batch of one
    (for example the final partial batch when the dataset size is not a
    multiple of the batch size) is handled like any other batch instead of
    falling through and returning None.

    Returns:
        A dict with a single key, ``'embedding'``, holding one vector per
        input text, in input order.
    """
    texts = datum['text']
    if not texts:
        # Nothing to embed; skip the model call entirely.
        return {'embedding': []}

    features = extractor(texts, padding="longest", truncation=True)

    # Each per-text result is indexed [sequence][token]. Since we are using a
    # BERT model, token 0 of sequence 0 is the special CLS token, which serves
    # as the "phrase embedding".
    return {'embedding': [per_text[0][0] for per_text in features]}

def tokenize(example):
    """Tokenize ``example['text']`` with the module-level tokenizer.

    Pads to the longest sequence in the call and truncates anything beyond
    512 tokens. Returns whatever the tokenizer call produces.
    """
    text = example['text']
    encoded = tokenizer(text, padding="longest", truncation=True, max_length=512)
    return encoded


In [None]:
import random

from pathlib import Path

# Visit the chunks in random order.
random.shuffle(chunks)

batch_size = 8

for c in chunks:
    # Load the parquet chunk
    data_chunk = load_dataset("parquet", data_files=c, split='train')

    # Parse the id from the file name only ("chunk_759.parquet" -> "759"), so
    # an underscore or dot in a parent directory cannot corrupt it.
    chunk_number = Path(c).name.split("_")[1].split(".")[0]
    print(f"Processing chunk {chunk_number} using batch size {batch_size}")

    if batch_size > 1:
        # Batched path: batch_embed receives lists of up to batch_size texts.
        data_chunk = data_chunk.map(batch_embed, batched=True, batch_size=batch_size)
    else:
        # Unbatched path: embed receives one example at a time.
        data_chunk = data_chunk.map(embed, batched=False)

    # Save featurized chunk
    output_file = f"chunks/featurized_chunks_{chunk_number}"
    print(f"Saving featurized chunk {output_file} to disk")
    data_chunk.save_to_disk(output_file)

print("All done!")


Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 759 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_759 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 595 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_595 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 426 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_426 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 732 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_732 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 8 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_8 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 840 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_840 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 372 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_372 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 409 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_409 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 594 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_594 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 851 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_851 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 455 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_455 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 574 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_574 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 220 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_220 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 333 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_333 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 315 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_315 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 688 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_688 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 500 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_500 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 106 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_106 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 94 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_94 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 75 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_75 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 722 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_722 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 579 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_579 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 345 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_345 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 244 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_244 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 415 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_415 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 287 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_287 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 95 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_95 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 294 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_294 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 305 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_305 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 539 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_539 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 184 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_184 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 280 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_280 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 691 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_691 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 193 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_193 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 753 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_753 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 263 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_263 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 510 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_510 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 874 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_874 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 793 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_793 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 262 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_262 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 564 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_564 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 268 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_268 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 10 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_10 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 41 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_41 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 603 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving featurized chunk chunks/featurized_chunks_603 to disk


Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Processing chunk 258 using batch size 8


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

# NOTE(review): this cell indexes `dataset` by column ('embedding', 'rating'),
# so it presumably expects a single featurized Dataset (e.g. one loaded back
# from the saved featurized chunks) rather than the DatasetDict produced by
# load_dataset earlier in the notebook - confirm what `dataset` refers to here.

# Total size of data
N = len(dataset)

# Prepend a constant 1 to every embedding; that column acts as the bias term,
# which is why the model below is fit with fit_intercept=False.
X = [[1] + fv for fv in dataset['embedding']]
y = dataset['rating']

# Contiguous 80/10/10 split in dataset order; the test slice takes whatever
# remains after the training and validation rows.
Ntrain,Nvalid,Ntest = int(N*0.8), int(N*0.1), int(N*0.1)
X_train,X_valid,X_test = X[:Ntrain],X[Ntrain:Ntrain+Nvalid],X[Ntrain+Nvalid:]
y_train,y_valid,y_test = y[:Ntrain],y[Ntrain:Ntrain+Nvalid],y[Ntrain+Nvalid:]

# Only `linear_model` is imported (not the top-level `sklearn` package), so the
# estimator has to be referenced through it to avoid a NameError.
model_lr = linear_model.Ridge(alpha=1, fit_intercept=False)
model_lr.fit(X_train, y_train)

# Evaluate on the validation slice.
y_pred = model_lr.predict(X_valid)
mse = mean_squared_error(y_valid, y_pred)
print(f"Mean Squared Error: {mse}")