[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/13muBlbis8SlltnzisMY8Hpg1TZpO0djA?usp=sharing)


# Install and import packages 



In [None]:
!pip install -r requirements.txt

# Include numerical/categorical data with a language model - Multimodal_transformers Demo for Review prediction 

We will use the `multimodal_transformers` package together with the Hugging Face libraries to classify reviews based on numerical/categorical data and text.

In [2]:
from datasets import load_dataset, Dataset
import transformers
import pandas as pd
import numpy as np

# Data Processing

For demonstration purposes we will use this Amazon Reviews dataset. We will shrink it down to only 10,000 examples and remove irrelevant columns.

In [3]:
# Fetch the demo dataset: the 'Personal_Care_Appliances_v1_00' configuration
# of 'amazon_us_reviews'.
amz_reviews = load_dataset(
    path='amazon_us_reviews',
    name='Personal_Care_Appliances_v1_00',
)
# Keep a copy of the loaded object under a second name as a backup.
am_reviews_backup = amz_reviews.copy()

Downloading builder script:   0%|          | 0.00/7.45k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/195k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/18.3k [00:00<?, ?B/s]

Downloading and preparing dataset amazon_us_reviews/Personal_Care_Appliances_v1_00 (download: 16.82 MiB, generated: 46.76 MiB, post-processed: Unknown size, total: 63.58 MiB) to /root/.cache/huggingface/datasets/amazon_us_reviews/Personal_Care_Appliances_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563...


Downloading data:   0%|          | 0.00/17.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/85981 [00:00<?, ? examples/s]

Dataset amazon_us_reviews downloaded and prepared to /root/.cache/huggingface/datasets/amazon_us_reviews/Personal_Care_Appliances_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Draw a seeded random sample of 10,000 training reviews and drop the
# identifier/metadata columns that this demo does not use.
small_amz_reviews = (
    amz_reviews['train']
    .shuffle(seed=42)
    .select(range(10000))
    .remove_columns([
        'customer_id',
        'review_id',
        'product_id',
        'product_parent',
        'vine',
        'review_date',
    ])
)

# This DataFrame is more or less what your own input CSV should look like.
data_df = small_amz_reviews.to_pandas()
# Shift the ratings down by one so that the label values start at 0.
data_df['star_rating'] = data_df['star_rating'].sub(1)
data_df

Unnamed: 0,marketplace,product_title,product_category,star_rating,helpful_votes,total_votes,verified_purchase,review_headline,review_body
0,US,MedMobile® BATHTUB TRANSFER BENCH / BATH CHAIR...,Personal_Care_Appliances,4,0,0,1,Highl recommand!,This is a true lifesaver! I just had hip repl...
1,US,Stainless Steel Toe Nail Clipper,Personal_Care_Appliances,3,0,0,1,Best clipper I've ever seen.,"Works well, looks good, nice weight. What els..."
2,US,Exergen Temporal Artery Thermometer MODEL# TAT...,Personal_Care_Appliances,3,0,0,1,Good for Baby and need to warm it before use i...,This is the 3rd Thermometer of our family. The...
3,US,Homedics AG-2001TL3C Inversion Massage Recline...,Personal_Care_Appliances,2,8,9,1,Tony Little's Inversion Massage Recliner w/heat,"I have the older version of this chair, when m..."
4,US,Mens Quartz Watch - Style JEMWCHTS,Personal_Care_Appliances,3,1,1,1,Does the job,The watch is attractive and of a good weight. ...
...,...,...,...,...,...,...,...,...,...
9995,US,The Giant Print Address Book II,Personal_Care_Appliances,4,1,1,1,LOVE IT!!!!!!!!!!!!,THIS WAS FOR MY MOTHER WHO REALLY NEED A NEW ...
9996,US,Human Touch HT-1470 Back Massage Pad - Quad Ro...,Personal_Care_Appliances,2,19,19,0,Human Touch HT-1470 Back Massage Pad is okay f...,The HT-1470 is well-made but like any back mas...
9997,US,Basic Essentials Stretchy Beaded Blue Massage ...,Personal_Care_Appliances,0,1,1,1,Horrible!!!,"It's unconfortable, not easily cleaning, hard ..."
9998,US,Body Back Company’s Body Back Buddy Trigger Po...,Personal_Care_Appliances,4,0,0,1,Five Stars,Got this for my husband-a massage therapist- a...


# Load real product reviews
Replace `data_df` with your own product reviews.

In [5]:
# To use your own data, uncomment the lines below and point csv_path at your
# CSV file; the loaded frame replaces the demo `data_df` built above.
#csv_path = '/your/csv/location.csv'
#data_df = pd.read_csv(csv_path)

Identify the columns in the dataset by type of data

In [6]:
# Target column. It is expected to contain integers from 0 to N_classes - 1.
label_col = 'star_rating'

# Feature columns, grouped by how they are fed to the model.
text_cols = [
    'product_title',
    'review_headline',
    'review_body',
]
categorical_cols = [
    'product_category',
    'verified_purchase',
    'marketplace',
]
numerical_cols = [
    'helpful_votes',
    'total_votes',
]

# Sorted distinct label values: what each label class represents.
label_list = [*np.unique(data_df[label_col])]

Let's ensure that the label column only has ints from 0 to N_classes - 1

In [7]:
# Re-index the labels so they are consecutive integers 0..N_classes-1:
# every label value is replaced by its position in `label_list`.
# `label_col` is used (instead of a hard-coded column name) so this cell keeps
# working when the notebook is pointed at a dataset with a different target.
label_to_index = {label: position for position, label in enumerate(label_list)}
label_adj = data_df[label_col].map(label_to_index)

# A label that is missing from `label_list` maps to NaN; fail loudly rather
# than silently writing NaN labels into the training data.
if label_adj.isna().any():
    raise ValueError(f"Column {label_col!r} contains values that are not in label_list")

data_df[label_col] = label_adj.astype('int64')

In [8]:
!mkdir ./data

In [9]:
# Shuffle with a fixed seed so that the train/val/test membership is the same
# on every run of the notebook.
shuffled_df = data_df.sample(frac=1, random_state=42)

# The datasets are later built from the CSV files written below, not from
# `data_df`, so missing categorical values must be filled before the export.
# Each categorical value is stored as a string, with NaN replaced by the
# sentinel "-9999999".
shuffled_df = shuffled_df.assign(**{
    col: shuffled_df[col].map(lambda v: "-9999999" if pd.isna(v) else str(v))
    for col in categorical_cols
})

# 80% / 10% / 10% split by position in the shuffled frame. Plain slicing is
# used instead of np.split, which emits deprecation warnings when applied to
# a DataFrame on recent pandas releases.
train_end = int(.8 * len(shuffled_df))
val_end = int(.9 * len(shuffled_df))
train_df = shuffled_df.iloc[:train_end]
val_df = shuffled_df.iloc[train_end:val_end]
test_df = shuffled_df.iloc[val_end:]

print('Num examples train-val-test')
print(len(train_df), len(val_df), len(test_df))

# The index is written as the first CSV column (pandas default).
train_df.to_csv('./data/train.csv')
val_df.to_csv('./data/val.csv')
test_df.to_csv('./data/test.csv')

Num examples train-val-test
8000 1000 1000


# Tokenize the data

In [10]:
import pandas as pd
from multimodal_transformers.data import load_data_from_folder

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Make sure NaN values in the categorical columns are filled, and store every
# value as a string. The NaN check has to happen before the string conversion:
# converting first turns NaN into the text 'nan', which fillna no longer sees.
# NOTE(review): this only updates the in-memory `data_df`. The call below reads
# the CSV files in ./data/, so the fill affects the loaded datasets only if it
# is applied before those files are written.
for c in categorical_cols:
    data_df[c] = data_df[c].map(lambda v: "-9999999" if pd.isna(v) else str(v))

# Build the train/val/test datasets from the CSV files in ./data/. The text
# columns are joined with the tokenizer's separator token.
train_dataset, val_dataset, test_dataset = load_data_from_folder(
    './data/',
    text_cols,
    tokenizer,
    label_col=label_col,
    categorical_cols=categorical_cols,
    numerical_cols= numerical_cols,
    sep_text_token_str=tokenizer.sep_token,
)

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in

# Create Config and Load Model

In [11]:
from multimodal_transformers.model import AutoModelWithTabular, TabularConfig
from transformers import AutoConfig

# Start from the stock 'bert-base-uncased' configuration.
config = AutoConfig.from_pretrained('bert-base-uncased')

# One output class per distinct label value found in the training split.
num_labels = np.unique(train_dataset.labels).size

# Settings for the tabular part of the model; the feature widths are read
# from the second dimension of the training split's feature arrays.
tabular_config = TabularConfig(
    combine_feat_method='weighted_feature_sum_on_transformer_cat_and_numerical_feats',
    numerical_feat_dim=train_dataset.numerical_feats.shape[1],
    cat_feat_dim=train_dataset.cat_feats.shape[1],
    num_labels=num_labels,
)
config.tabular_config = tabular_config

# Load the pretrained weights into the tabular-aware model.
model = AutoModelWithTabular.from_pretrained('bert-base-uncased', config=config)


Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertWithTabular: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertWithTabular from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertWithTabular from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertWithTabular were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifi

In [12]:
import evaluate
# Select an evaluation metric: accuracy is what compute_metrics reports.
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Compute the evaluation metric for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is either
            an array of per-class scores with the classes on the last axis, or
            a tuple of model outputs whose first element is that array
            (assumed to be the logits when the model returns extra outputs).

    Returns:
        The result of ``metric.compute`` for the arg-max class predictions
        against ``labels``.
    """
    logits, labels = eval_pred
    # A model that returns more than the logits hands back a tuple here;
    # np.argmax must only ever see the score array itself.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

# Run trainer

In [None]:

from transformers import Trainer, TrainingArguments

# Define the training arguments.
# Checkpoints are written to ./outputs/model and logs to ./outputs/runs.
training_args = TrainingArguments(
    output_dir="./outputs/model",
    logging_dir="./outputs/runs",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=8,
    num_train_epochs=1,
    # Evaluate every `eval_steps` training steps. This replaces the
    # `evaluate_during_training` flag, which transformers 4.x no longer
    # accepts. NOTE(review): newer releases name this `eval_strategy`;
    # confirm against the version pinned in requirements.txt.
    evaluation_strategy="steps",
    logging_steps=25,
    eval_steps=250
)

# Train on the training split and evaluate on the validation split;
# compute_metrics turns the predictions into the reported metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1000 [00:00<?, ?it/s]

{'loss': 1.3731059265136718, 'learning_rate': 4.875e-05, 'epoch': 0.025, 'step': 25}
{'loss': 1.1602633666992188, 'learning_rate': 4.75e-05, 'epoch': 0.05, 'step': 50}
{'loss': 1.0620286560058594, 'learning_rate': 4.6250000000000006e-05, 'epoch': 0.075, 'step': 75}
{'loss': 0.9716357421875, 'learning_rate': 4.5e-05, 'epoch': 0.1, 'step': 100}
{'loss': 0.89804443359375, 'learning_rate': 4.375e-05, 'epoch': 0.125, 'step': 125}
{'loss': 0.86322998046875, 'learning_rate': 4.25e-05, 'epoch': 0.15, 'step': 150}
{'loss': 0.875430908203125, 'learning_rate': 4.125e-05, 'epoch': 0.175, 'step': 175}
{'loss': 0.817550048828125, 'learning_rate': 4e-05, 'epoch': 0.2, 'step': 200}
{'loss': 0.8577392578125, 'learning_rate': 3.875e-05, 'epoch': 0.225, 'step': 225}
{'loss': 0.96875, 'learning_rate': 3.7500000000000003e-05, 'epoch': 0.25, 'step': 250}


Evaluation:   0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.7792838366031647, 'eval_accuracy': 0.694, 'epoch': 0.25, 'step': 250}
{'loss': 0.7342431640625, 'learning_rate': 3.625e-05, 'epoch': 0.275, 'step': 275}
{'loss': 0.9091650390625, 'learning_rate': 3.5e-05, 'epoch': 0.3, 'step': 300}


In [None]:
# Load the TensorBoard notebook extension so the %tensorboard magic is available
%load_ext tensorboard

In [None]:
%tensorboard --logdir ./logs/runs --port=6006