# Instructions



# Setup


In [None]:
# SET THESE BEFORE EXECUTION

# When False, the prepared CSV is loaded from the project folder on Drive instead.
save_dataset_to_drive = False  # saves dataframe after cleaning/tokenizing
# When False, the pickled model and count vectorizer are loaded from Drive instead of being fit.
save_model_to_drive = False  # saves vectorizer and model after training

# dataset to import (specify this even when importing from drive)
# The part after '/' is also used to build the prepared-dataset file name.
kaggle_dataset = 'snap/amazon-fine-food-reviews'
#kaggle_dataset = 'eswarchandt/amazon-music-reviews'

# type of logistic regression model to use (based on which dataset it was trained on)
# Used as the file-name prefix for '<lr_model>-lr-model.pkl' and '<lr_model>-count-vectorizer.pkl'.
lr_model = 'amazon-fine-food-reviews'
#lr_model = 'amazon-music-reviews'

In [None]:
# Each supported dataset names its review-text and rating columns differently;
# pick the pair that matches the configured dataset.
if kaggle_dataset == 'snap/amazon-fine-food-reviews':
  review_text = "Text"
  review_rating = "Score"
elif kaggle_dataset == 'eswarchandt/amazon-music-reviews':
  review_text = "reviewText"
  review_rating = "overall"
else:
  # Fail here with a clear message instead of a NameError on review_text much later.
  raise ValueError(
      "Unsupported kaggle_dataset %r; expected 'snap/amazon-fine-food-reviews' "
      "or 'eswarchandt/amazon-music-reviews'" % (kaggle_dataset,))

In [None]:
import torch

# Ask for CUDA availability once and reuse the answer for both the device
# string and the diagnostic print.
cuda_available = torch.cuda.is_available()
device = "cuda" if cuda_available else "cpu"
print(cuda_available)

True


## Imports

In [None]:
import glob
import os
import pickle
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from google.colab import drive

In [None]:
# Make sure you added the shared project folder to your drive
# project_folder is the base path for the saved dataset, the pickled model/vectorizer,
# the kaggle.json key and the prediction CSVs used throughout this notebook.
drive.mount('/content/drive')
project_folder = '/content/drive/MyDrive/CIS 519 Project'

Mounted at /content/drive


In [None]:
# Clear files from any past session
# Pure-Python removal instead of `! rm '{f}'`: no shell quoting problems for
# file names containing quotes or spaces, and no error on matching directories.
files = glob.glob("*.*")
for f in files:
    # "*.*" can also match directories with a dot in their name; only delete
    # regular files and symlinks, which is what a plain `rm` would remove.
    if os.path.isfile(f) or os.path.islink(f):
        os.remove(f)

In [None]:
# Load the already-cleaned/tokenized dataset from Drive when it is not being
# regenerated in this session (then skip ahead to the bag of words section).
if not save_dataset_to_drive:
  dataset_name = kaggle_dataset.split('/')[1]
  path = f"{project_folder}/{dataset_name}-prepared-dataset.csv"

  # The CSV carries its old index as an 'Unnamed: 0' column; discard it.
  prepared_dataset = pd.read_csv(path).drop(columns=['Unnamed: 0'])
  print(prepared_dataset)

                UserId   ProductId  Score  HelpfulnessNumerator  \
0       A3SGXH7AUHU8GW  B001E4KFG0      5                     1   
1       A1D87F6ZCVE5NK  B00813GRG4      1                     0   
2        ABXLMWJIXXAIN  B000LQOCH0      4                     1   
3       A395BORC6FGVXV  B000UA0QIQ      2                     3   
4       A1UQRSCLF8GW1T  B006K2ZZ7K      5                     0   
...                ...         ...    ...                   ...   
568422  A28KG5XORO54AY  B001EO7N10      5                     0   
568423  A3I8AFVPEE8KI5  B003S1WTCU      2                     0   
568424  A121AA1GQV751Z  B004I613EE      5                     2   
568425   A3IBEVCTXKNOH  B004I613EE      5                     1   
568426  A3LGQPJCZVL9UC  B001LR2CU2      5                     0   

        HelpfulnessDenominator                             Summary  \
0                            1               Good Quality Dog Food   
1                            0                   Not as

In [None]:
# Load the previously trained logistic regression model from Drive when it is
# not being retrained in this session (then skip ahead to the bag of words section).
if not save_model_to_drive:
  path = f"{project_folder}/{lr_model}-lr-model.pkl"

  # NOTE(review): pickle.load runs arbitrary code from the file; only load
  # pickles that were written by this project.
  with open(path, 'rb') as model_file:
    lr_bow = pickle.load(model_file)

  print(lr_bow)

LogisticRegression(max_iter=500, random_state=0, solver='sag')


## Download/Read Data

In [None]:
# use Avi's kaggle api key to permit downloading the data
! mkdir ~/.kaggle
! cp '{project_folder}/kaggle.json' ~/.kaggle
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# download the dataset
# The kaggle CLI is installed on demand; {kaggle_dataset} is interpolated by IPython
# from the configuration cell, and --unzip extracts the archive after the download.
! pip install kaggle
! kaggle datasets download --unzip {kaggle_dataset}

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Downloading amazon-fine-food-reviews.zip to /content
 93% 225M/242M [00:01<00:00, 139MB/s]
100% 242M/242M [00:01<00:00, 166MB/s]


# Bag of Words Featurization

## Split dataset

In [None]:
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression

In [None]:
# Hold out 30% of the reviews for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    prepared_dataset[review_text],
    prepared_dataset[review_rating],
    test_size=0.3,
    random_state=0,
)

## Featurize

In [None]:
path = project_folder + '/' + lr_model + '-count-vectorizer.pkl'

if save_model_to_drive:
  # Fit a count vectorizer over uni-, bi- and tri-grams of the training reviews.
  cv=CountVectorizer(binary=False,ngram_range=(1,3))
  cv.fit(x_train)

  # save
  with open(path, 'wb') as f:
    pickle.dump(cv, f)

  bow_train_reviews=cv.transform(x_train) #transformed train reviews
  print('BOW_cv_train:', bow_train_reviews.shape)

else:
  # Reuse the vectorizer fitted in an earlier session.
  with open(path, 'rb') as f:
      cv = pickle.load(f)

bow_test_reviews=cv.transform(x_test) #transformed test reviews
print('BOW_cv_test:', bow_test_reviews.shape)

# Check how many terms appear <= 10 times.
# cv.vocabulary_ maps each term to its COLUMN INDEX, not to a frequency, so
# comparing its values with 10 only counted the first eleven columns. Real
# occurrence counts are the column sums of the count matrix. The test matrix is
# used because it is the only one built on both branches above.
term_counts = np.asarray(bow_test_reviews.sum(axis=0)).ravel()
print(int((term_counts <= 10).sum()))

BOW_cv_test: (170529, 11454254)
11


# Evaluation Metric

## Mean Squared Error

In [None]:
def get_mse(labels, pred):
  """Print and return the mean squared error between true labels and predictions.

  Args:
    labels: true rating values.
    pred: predicted rating values, same length as ``labels``.

  Returns:
    The mean squared error, so callers that assign the result get the number
    instead of ``None``.
  """
  score = mean_squared_error(labels, pred)
  print("error :", score)
  return score

# Models

## Majority Class Baseline

In [None]:
# The baseline may only look at training labels: taking the mode of y_test
# would leak test information into the "prediction".
# .mode() returns a Series (several entries on ties); take the first value so
# a single scalar is used as the fill value.
maj_class_train = y_train.mode().iloc[0]
maj_class_test = maj_class_train

# Predict the training majority class for every example in each split.
maj_class_train_preds = np.full_like(y_train, maj_class_train)
maj_class_test_preds = np.full_like(y_test, maj_class_test)

In [None]:
# get_mse is declared as get_mse(labels, pred): pass the true ratings first and
# the baseline predictions second (the MSE value itself is unaffected by the order).
maj_class_train_err = get_mse(y_train, maj_class_train_preds)
maj_class_test_err = get_mse(y_test, maj_class_test_preds)

error : 2.3808312683149953
error : 2.391751549589806


## Transformers

In [None]:
# Separate handles for the transformer pipeline; [:] takes a full-range slice of each split.
x_train_tr, y_train_tr = x_train[:], y_train[:]
x_test_tr, y_test_tr = x_test[:], y_test[:]

In [None]:
# Huggingface transformer dataset
!pip install -q transformers datasets
from datasets import Dataset, load_dataset
import datasets
from transformers import AutoTokenizer

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m49.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Convert to Huggingface dataset
# Each split becomes a two-column frame ("text", "labels") before conversion.
train_ds = pd.DataFrame(dict(text=x_train_tr, labels=y_train_tr))
test_ds = pd.DataFrame(dict(text=x_test_tr, labels=y_test_tr))

dataset_train, dataset_test = (
    Dataset.from_pandas(frame) for frame in (train_ds, test_ds)
)
dataset = datasets.DatasetDict({
    'train': dataset_train,
    'test': dataset_test,
})
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', '__index_level_0__'],
        num_rows: 397898
    })
    test: Dataset({
        features: ['text', 'labels', '__index_level_0__'],
        num_rows: 170529
    })
})

In [None]:
# Peek at the first training example (text, rating and the carried-over pandas index).
dataset['train'][0]

{'text': "much satisfied box chocolate arrive time party everyone enjoy much put another order new years. href= '' http //www.amazon.com/gp/product/b000kjrf40 '' special liquor fill chocolate 48 count christmas holiday gift box chocolate gift pack /a",
 'labels': 5,
 '__index_level_0__': 9862}

In [None]:
# A single target column; the two dicts translate between its position and its name.
labels = ['labels']
id2label = dict(enumerate(labels))
label2id = {name: position for position, name in id2label.items()}

In [None]:
# Tokenizer matching the bert-base-uncased checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
  """Tokenize a batch of reviews and attach their targets as float rows.

  Args:
    examples: batch mapping with a "text" entry and one entry per name in the
      module-level ``labels`` list.

  Returns:
    The tokenizer output for the batch (padded/truncated to 64 tokens) with an
    extra "labels" entry: one list of floats per example, one float per label name.
  """
  texts = examples["text"]
  encoding = tokenizer(texts, padding="max_length", truncation=True, max_length=64)

  # (batch_size, num_labels) matrix; a numpy float array, so integer ratings become floats.
  targets = np.zeros((len(texts), len(labels)))
  for column, name in enumerate(labels):
    targets[:, column] = examples[name]

  encoding["labels"] = targets.tolist()
  return encoding

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Tokenize both splits in batches; the original columns are removed so only the
# tokenizer fields and the float "labels" produced by preprocess_data remain.
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/397898 [00:00<?, ? examples/s]

Map:   0%|          | 0/170529 [00:00<?, ? examples/s]

In [None]:
# Sanity check: list the fields of the first encoded training example.
example = encoded_dataset['train'][0]
print(example.keys())

dict_keys(['labels', 'input_ids', 'token_type_ids', 'attention_mask'])


In [None]:
# Decode the token ids back to text to verify tokenization, special tokens and padding.
tokenizer.decode(example['input_ids'])

"[CLS] much satisfied box chocolate arrive time party everyone enjoy much put another order new years. href ='' http / / www. amazon. com / gp / product / b000kjrf40'' special liquor fill chocolate 48 count christmas holiday gift box chocolate gift pack / a [SEP] [PAD] [PAD] [PAD] [PAD]"

In [None]:
# The target is stored as a one-element list holding the rating as a float.
example['labels']

[5.0]

In [None]:
# Maps each position of the label vector to its name; the label value itself is not
# used, so with the single 'labels' target this always yields ['labels'].
[id2label[idx] for idx, label in enumerate(example['labels'])]

['labels']

In [None]:
# Make the dataset return torch tensors when indexed.
encoded_dataset.set_format("torch")

In [None]:
from transformers import AutoModelForSequenceClassification

# num_labels=1 gives the classification head a single output; the forward-pass
# check further down reports an MSE loss, i.e. the model is trained as a regressor
# on the rating value.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1).to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Per-device batch size and the metric used to pick the best checkpoint.
batch_size, metric_name = 32, "f1"

In [None]:
from transformers import TrainingArguments, Trainer

# Evaluate and checkpoint once per epoch, then reload the checkpoint with the
# best value of `metric_name` when training finishes.
args = TrainingArguments(
    output_dir="bert-finetuned-sem_eval-english",
    # schedule
    num_train_epochs=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # optimisation
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # model selection
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [None]:
from transformers import EvalPrediction
from sklearn.metrics import f1_score, accuracy_score
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels):
    """Round raw model outputs to ratings 1-5 and score them against the labels.

    Args:
        predictions: numpy array of raw model outputs.
        labels: true ratings, in a shape accepted by the sklearn metrics
            together with the rounded predictions.

    Returns:
        dict with the weighted-average F1 ('f1') and the accuracy ('accuracy').
    """
    # Conditions are tried in order and the first match wins, so each `<` test
    # only needs the upper edge of its bucket. A value matching no condition
    # (NaN) keeps the default of 0.
    buckets = [
        predictions < 1.5,
        predictions < 2.5,
        predictions < 3.5,
        predictions < 4.5,
        predictions >= 4.5,
    ]
    y_pred = np.select(buckets, [1.0, 2.0, 3.0, 4.0, 5.0], default=0.0)

    return {
        'f1': f1_score(y_true=labels, y_pred=y_pred, average='weighted'),
        'accuracy': accuracy_score(labels, y_pred),
    }

def compute_metrics(p: EvalPrediction):
    """Compute evaluation metrics for the Trainer.

    Takes the first element when the predictions arrive as a tuple, then
    delegates to multi_label_metrics with the label ids.
    """
    raw_outputs = p.predictions
    if isinstance(raw_outputs, tuple):
        raw_outputs = raw_outputs[0]
    return multi_label_metrics(predictions=raw_outputs, labels=p.label_ids)

In [None]:
# Confirm that the label tensors are float tensors.
encoded_dataset['train'][0]['labels'].type()

'torch.FloatTensor'

In [None]:
# Index the row first: ['input_ids'][0] would load the entire input_ids column
# just to show its first entry.
encoded_dataset['train'][0]['input_ids']

tensor([  101,  2172,  8510,  3482,  7967,  7180,  2051,  2283,  3071,  5959,
         2172,  2404,  2178,  2344,  2047,  2086,  1012, 17850, 12879,  1027,
         1005,  1005,  8299,  1013,  1013,  7479,  1012,  9733,  1012,  4012,
         1013, 14246,  1013,  4031,  1013,  1038,  8889,  2692,  2243,  3501,
        12881, 12740,  1005,  1005,  2569, 13207,  6039,  7967,  4466,  4175,
         4234,  6209,  5592,  3482,  7967,  5592,  5308,  1013,  1037,   102,
            0,     0,     0,     0])

In [None]:
#forward pass
# Fetch the first example once (row-first indexing avoids materialising the whole
# input_ids column) and add a batch dimension to each tensor.
sample = encoded_dataset['train'][0]
outputs = model(input_ids=sample['input_ids'].unsqueeze(0).to(device), labels=sample['labels'].unsqueeze(0).to(device))
outputs

SequenceClassifierOutput(loss=tensor(0.0032, device='cuda:0', grad_fn=<MseLossBackward0>), logits=tensor([[5.0566]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
# Wire the model, training arguments, encoded splits and metric callback together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune for the epochs configured in `args`, evaluating and checkpointing each epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.5471,0.495309,0.718132,0.712524
2,0.3758,0.471109,0.743613,0.751925


TrainOutput(global_step=12436, training_loss=0.5183702458442695, metrics={'train_runtime': 8914.7354, 'train_samples_per_second': 89.267, 'train_steps_per_second': 1.395, 'total_flos': 2.6172605681032704e+16, 'train_loss': 0.5183702458442695, 'epoch': 2.0})

In [None]:
# NOTE(review): the next cell rebuilds train_ds_full and dataset_train_full with the
# same statements, so this cell looks redundant.
train_ds_full = pd.DataFrame({"text": x_train, "labels": y_train})
dataset_train_full = Dataset.from_pandas(train_ds_full)

In [None]:
# Build a train/test DatasetDict directly from the original splits.
train_ds_full = pd.DataFrame(dict(text=x_train, labels=y_train))
test_ds_full = pd.DataFrame(dict(text=x_test, labels=y_test))
dataset_train_full, dataset_test_full = (
    Dataset.from_pandas(frame) for frame in (train_ds_full, test_ds_full)
)
dataset_full = datasets.DatasetDict({
    'train': dataset_train_full,
    'test': dataset_test_full,
})

In [None]:
# Tokenize the full splits. The columns to drop are taken from dataset_full itself
# rather than from the unrelated `dataset` object, so this cell no longer depends on
# the two datasets happening to share the same column names.
encoded_dataset_full = dataset_full.map(preprocess_data, batched=True, remove_columns=dataset_full['train'].column_names)

Map:   0%|          | 0/397898 [00:00<?, ? examples/s]

Map:   0%|          | 0/170529 [00:00<?, ? examples/s]

In [None]:
!pip install swifter
import swifter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting swifter
  Downloading swifter-1.3.4.tar.gz (830 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m830.9/830.9 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jedi>=0.16
  Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m62.0 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: swifter
  Building wheel for swifter (setup.py) ... [?25l[?25hdone
  Created wheel for swifter: filename=swifter-1.3.4-py3-none-any.whl size=16321 sha256=cb4a7f0807d2ccb428848ec3f9b2cfb6c4d99a68421a5f0fac4717ff0b077fba
  Stored in directory: /root/.cache/pip/wheels/2b/5e/f2/3931524f702ffd03309e96d35ee2fbf9c61c27377511ee8d4c
Successfully built swifter
Installing collected packages: jedi, swifter
Successfully installe

In [None]:
def print_pred_to_file(filename, preds, dataset):
    """Round raw model outputs to ratings, save them next to the true labels, and print.

    Args:
        filename: path of the CSV file to write.
        preds: prediction result whose element 0 is the array of raw model outputs.
        dataset: mapping/dataset whose 'labels' entry holds the true ratings.

    The CSV (and the printed frame) has a 'predicted' column with ratings 1-5 and a
    'labels' column with the true values.
    """
    df = pd.DataFrame()
    raw_scores = pd.Series(preds[0].flatten()) # Get predictions
    # Convert the regression output to a rating: cut points at 1.5/2.5/3.5/4.5.
    # np.digitize returns 0..4 for these four edges, so +1 yields 1..5. This is
    # the same mapping as the former row-wise lambda, but vectorised.
    df['predicted'] = pd.Series(np.digitize(raw_scores.to_numpy(), [1.5, 2.5, 3.5, 4.5]) + 1)
    df['labels'] = pd.Series(np.array(dataset['labels']).flatten()) # Get true labels
    df.to_csv(filename)
    print(df)

In [None]:
# Run inference on both full splits; print_pred_to_file reads element 0 of each
# result as the raw model outputs.
train_preds = trainer.predict(encoded_dataset_full["train"])
test_preds = trainer.predict(encoded_dataset_full["test"])

In [None]:
# Write one prediction CSV per split into the project folder (train first, then test).
for split_name, split_preds in (("train", train_preds), ("test", test_preds)):
    print_pred_to_file(
        f"{project_folder}/bert_output_{split_name}_full.csv",
        split_preds,
        encoded_dataset_full[split_name],
    )

Pandas Apply:   0%|          | 0/397898 [00:00<?, ?it/s]

        predicted  labels
0               5     5.0
1               5     5.0
2               5     5.0
3               4     5.0
4               2     3.0
...           ...     ...
397893          1     1.0
397894          5     5.0
397895          5     3.0
397896          5     5.0
397897          5     4.0

[397898 rows x 2 columns]


Pandas Apply:   0%|          | 0/170529 [00:00<?, ?it/s]

        predicted  labels
0               5     5.0
1               5     5.0
2               5     5.0
3               5     5.0
4               5     5.0
...           ...     ...
170524          3     3.0
170525          3     4.0
170526          5     5.0
170527          5     5.0
170528          5     2.0

[170529 rows x 2 columns]


In [None]:
from sklearn.metrics import f1_score

def evaluate(filename):
    """Print and return the weighted F1 score of a saved prediction file.

    Args:
        filename: CSV with a 'labels' column (true ratings) and a 'predicted'
            column (predicted ratings).

    Returns:
        The weighted-average F1 score, so callers can store or compare it
        instead of only seeing the printed value.
    """
    df = pd.read_csv(filename)
    f1 = f1_score(y_true=df['labels'], y_pred=df['predicted'], average ='weighted')
    print(f1)
    return f1

In [None]:
# Score the saved predictions of each split (train first, then test).
for split_name in ("train", "test"):
    evaluate(f"{project_folder}/bert_output_{split_name}_full.csv")

0.7831711000152991
0.7436133673023752
