In [None]:
from transformers import AutoTokenizer, AutoModel

# Checkpoint name shared by the tokenizer and the model loaded below.
model_name = "bert-base-uncased"  # Choose a pre-trained model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Minimal end-to-end example: tokenize one sentence into PyTorch tensors
# (return_tensors="pt") and pass the tokenizer's outputs to the model as
# keyword arguments.
text = "This is an example sentence."
inputs = tokenizer(text, return_tensors="pt") 
outputs = model(**inputs)
# NOTE(review): last_hidden_state is presumably one vector per input token
# rather than one vector per sentence -- confirm before treating it as a
# sentence embedding.
embeddings = outputs.last_hidden_state  # Access the embeddings

## Using BERT models from Google to embed the review text

In [1]:
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset

In [44]:
# Load a single parquet chunk for now.
# NOTE(review): an earlier `datafiles = chunks` variant was commented out here;
# `chunks` is not defined in any visible cell.
datafiles = "chunks/chunk_1.parquet"
# NOTE(review): split='train' presumably makes load_dataset return a single
# dataset object rather than a dict of splits -- confirm.
dataset = load_dataset("parquet", data_files=datafiles,split='train')


In [45]:
# NOTE(review): `small_dataset` is not defined in any visible cell. Judging by
# the output below it is presumably a 10-row subset of `dataset` with an added
# 'embedding' column -- restore its defining cell so Restart & Run All works.
small_dataset

Dataset({
    features: ['user_id', 'rating', 'helpful_vote', 'timestamp', 'asin', 'text', 'title', 'parent_asin', 'verified_purchase', 'embedding'],
    num_rows: 10
})

In [48]:
from transformers import pipeline, AutoTokenizer, AutoModel

import torch

model_name = "bert-base-uncased"  # Choose a pre-trained model

# Stand-alone tokenizer, used by tokenize() to look at the raw token ids.
# The pipeline below is built from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the first GPU when CUDA is available; otherwise fall back to the CPU
# (device=-1) so the notebook still runs on machines without a GPU.
device = 0 if torch.cuda.is_available() else -1

extractor = pipeline(
    'feature-extraction', # We're just interested in the embeddings
    model=model_name, # Use BERT
    device=device, # GPU 0 if present, else CPU
)

def embed(datum):
    """Embed a single piece of text with the feature-extraction pipeline.

    Args:
        datum: Either a mapping with a 'text' entry (the form used when this
            function is mapped over a dataset one row at a time) or the raw
            text string itself.

    Returns:
        dict: ``{'embedding': vector}`` where ``vector`` is the feature
        vector of the first token of the text.
    """
    # Accept a bare string as well as a row mapping so the function can be
    # called directly on text (e.g. from a pandas ``apply``).
    text = datum if isinstance(datum, str) else datum['text']

    # NOTE(review): for one string the pipeline output is presumably indexed
    # [sequence][token][feature] -- confirm against the installed version.
    features = extractor(text, padding="longest", truncation=True)

    # Since we are using a BERT model, we can just use the first embedding,
    # which is for the special token CLS. CLS is known as the "phrase embedding".
    return {'embedding': features[0][0]}

# Batched feature extraction, intended for Huggingface `Dataset.map(..., batched=True)`.
def batch_embed(datum):
    """Embed a batch of texts and return one CLS vector per text.

    Args:
        datum: Either a batch mapping whose 'text' entry is a list of strings,
            or a plain list/tuple of strings.

    Returns:
        dict: ``{'embedding': vectors}`` with one vector per input text, in
        input order. A dict is returned for every batch size, including
        batches of one text or no texts, so the final short batch of a
        batched map is handled like any other.
    """
    texts = list(datum) if isinstance(datum, (list, tuple)) else datum['text']

    # Nothing to embed; skip the pipeline call entirely.
    if len(texts) == 0:
        return {'embedding': []}

    # NOTE(review): the pipeline is called without its own batch_size argument,
    # so it may still run the texts through the model one at a time -- confirm
    # before relying on this for throughput.
    features = extractor(texts, padding="longest", truncation=True)

    # Since we are using a BERT model, we can just use the first embedding,
    # which is for the special token CLS. CLS is known as the "phrase embedding".
    # NOTE(review): assumes each per-text result is indexed
    # [sequence][token][feature] -- TODO confirm for the installed version.
    return {'embedding': [per_text[0][0] for per_text in features]}

def tokenize(example):
    """Run the module-level tokenizer on ``example['text']``.

    The text is padded to, and truncated at, a fixed length of 512 tokens.
    Returns whatever the tokenizer returns for that text.
    """
    encode_options = dict(padding="max_length", truncation=True, max_length=512)
    encoded = tokenizer(example['text'], **encode_options)
    return encoded


In [None]:
# Display the stand-alone tokenizer object to inspect its configuration.
tokenizer

In [None]:
# Show the maximum sequence length configured on the pipeline's own tokenizer.
# NOTE(review): presumably the limit that truncation=True cuts inputs to -- confirm.
extractor.tokenizer.model_max_length

In [None]:
# NOTE(review): this cell displays `tokenizer` again, exactly like an earlier
# cell; one of the two can probably be deleted.
tokenizer

In [None]:
# This is how you embed one piece of text at a time. embed() reads the text
# from the 'text' entry of its argument, so the string is wrapped in a dict.
embed({'text': "I hate this product with a passion! It sucks balls! I would never ever buy it"})

In [None]:
# This is how you would batch the embedding of the title and the review simultaneously.
# batch_embed() is the batched helper defined in this notebook (no `fastembed`
# is defined anywhere visible); it reads the list of texts from the 'text'
# entry of its argument.
batch_embed({'text': ["Don't buy this!", "I hate this product with a passion! It sucks balls! I would never ever buy it"]})

In [None]:
# Apply tokenize() across the dataset and keep the result under a new name, so
# `dataset` itself is left untouched. With batched=True, tokenize() presumably
# receives a batch of rows per call (example['text'] is then a list) -- confirm.
tokenized_dataset = dataset.map(tokenize,batched=True)

In [None]:
# Peek at the token ids of the first 10 rows. Slice the rows first and then
# pick the column, so only those 10 rows are materialized instead of the
# whole 'input_ids' column.
tokenized_dataset[:10]['input_ids']

In [6]:
import time
import pdb

In [None]:
# Time one embedding pass over `dataset`.
start_time = time.perf_counter()

# batch_size selects the mapping mode: values above 1 send batches of rows to
# batch_embed, otherwise embed is called once per row.
# NOTE(review): this rebinds `dataset` to the mapped result, so later cells
# see the version returned by map().
batch_size = 1
if batch_size > 1:
    dataset = dataset.map(batch_embed, batched=True, batch_size=batch_size)
else:
    dataset = dataset.map(embed, batched=False)
    
end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [42]:
# Sanity check: number of entries in the 'embedding' column.
# NOTE(review): `small_dataset` is not defined in any visible cell -- restore
# its defining cell so this runs on a fresh kernel.
len(small_dataset['embedding'])

10

In [None]:
# Embed the review text and the review title, one row at a time.
# NOTE(review): `dfs` is not defined in any visible cell -- presumably a pandas
# DataFrame of the reviews; restore its defining cell.
start_time = time.perf_counter()


def _embed_text(text):
    """Return just the embedding vector for one string.

    embed() reads its input from a 'text' entry and returns
    {'embedding': vector}; wrap the string and unwrap the vector so the
    DataFrame columns hold plain vectors.
    """
    return embed({'text': text})['embedding']


dfs['embedding_review'] = dfs['text'].apply(_embed_text)
dfs['embedding_title'] = dfs['title'].apply(_embed_text)

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")


In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

# Total size of data
N = len(dfs)

# One feature row per review: a constant 1 (bias term) followed by the review
# embedding and then the title embedding. Unpacking concatenates the two
# vectors whether they are stored as lists or as arrays (`+` on arrays would
# add element-wise instead of concatenating).
X = [
    [1, *review_vec, *title_vec]
    for review_vec, title_vec in zip(dfs['embedding_review'], dfs['embedding_title'])
]
y = dfs['rating']

# 80/10/10 split in row order (no shuffling); the test set takes whatever
# remains after the train and validation slices.
Ntrain, Nvalid = int(N*0.8), int(N*0.1)
Ntest = N - Ntrain - Nvalid
X_train,X_valid,X_test = X[:Ntrain],X[Ntrain:Ntrain+Nvalid],X[Ntrain+Nvalid:]
y_train,y_valid,y_test = y[:Ntrain],y[Ntrain:Ntrain+Nvalid],y[Ntrain+Nvalid:]

# Only `linear_model` is imported (not the top-level `sklearn` name), so refer
# to Ridge through it. The bias is the leading 1 in every row, hence
# fit_intercept=False.
model_lr = linear_model.Ridge(alpha=1, fit_intercept=False)
model_lr.fit(X_train, y_train)

# Report the error on the validation split.
y_pred = model_lr.predict(X_valid)
mse = mean_squared_error(y_valid, y_pred)
print(f"Mean Squared Error: {mse}")