# Modeling
-------------


In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import brier_score_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
import warnings

warnings.filterwarnings('ignore')

## Access the data

In [23]:
data = pd.read_parquet('./data/preprocessed_data.parquet')

In [24]:
data.head()

Unnamed: 0,review_id,product_id,product_category,total_votes,review,usefulness
41,R1FBO737KD9F2N,B00NG57H4S,Electronics,23,Great noise cancelling headphones for the pric...,0.826087
145,R227GSNWI6BSZV,B00ICNXESC,Electronics,20,"Garbage, lasted 8 months... warranty is useles...",1.0
265,R4PF7S0TOV9S7,B00XR1MW4G,Electronics,17,A long lasting bluetooth sound bazooka!\nThis ...,0.882353
274,R22LKIOKMSOG8A,B00XS3HGEO,Electronics,13,nice!\nThis is a nice little turntable. Don't ...,0.923077
304,R3SJTYZBYBG4EE,B00L108SAW,Electronics,99,Very good charger for the price! But has a dow...,1.0


In [25]:
# Split the data into train, val, and test sets
X_train, X_test, y_train, y_test = train_test_split(data['review'], data['usefulness'], test_size=0.2, random_state=0)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=0)

## Select modeling technique & Build model

In [5]:
train_pool = Pool(X_train.reset_index(drop=True), y_train.reset_index(drop=True), text_features=['review'], feature_names=['review'])
eval_pool  = Pool(X_val.reset_index(drop=True), y_val.reset_index(drop=True), text_features=['review'], feature_names=['review'])
test_pool  = Pool(X_test.reset_index(drop=True), y_test.reset_index(drop=True), text_features=['review'], feature_names=['review'])

model = CatBoostClassifier(text_features=['review'], verbose=False, loss_function='CrossEntropy', random_seed=0)

In [6]:
# Train the model
model.fit(train_pool, eval_set=eval_pool, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x22aae20cad0>

## Evaluate the model

In [7]:
# Make predictions on the test set
preds = model.predict(test_pool)

print(f"Brier loss = {brier_score_loss(preds, y_test.to_numpy()) :.5f}")
print(f"ROC_AUC = {roc_auc_score(y_test.to_numpy() > 0.5, preds) :.5f}")

Brier loss = 0.11326
ROC_AUC = 0.62692


In [10]:
model.save_model('catboost_model')

## Inference of summarization model

In [26]:
model = CatBoostClassifier()
model.load_model('catboost_model')

<catboost.core.CatBoostClassifier at 0x1f09ed137d0>

In [82]:
from transformers import pipeline, logging
logging.set_verbosity_error()


summarizer = pipeline("summarization", model="Falconsai/text_summarization", device='cuda:0')

In [28]:
_, X_test_with_id = train_test_split(data, test_size=0.2, random_state=0)
_, X_test_with_id = train_test_split(X_test_with_id, test_size=0.5, random_state=0)

In [30]:
probs = model.predict_proba(test_pool)[:, 1]

In [33]:
X_test_with_id["usefulness"] = probs

In [49]:
grouped = X_test_with_id.groupby('product_id').apply(lambda x: x.sort_values('usefulness', ascending=False))

top_reviews = grouped.groupby(level=0).head(5).groupby(level=0)['review'].apply(lambda x: '\n\n'.join(x))

In [83]:
from tqdm.auto import tqdm
from datasets import Dataset
from transformers import pipeline

summaries = []
reviews_dataset = Dataset.from_dict({"text": top_reviews.to_list()[:1000]})

for i in tqdm(range(0, len(reviews_dataset), 8)):
    batch = reviews_dataset.select(range(i, i+8))
    batch_summaries = summarizer(batch["text"], min_length=16, max_length=96)
    summaries.extend(batch_summaries)

  0%|          | 0/125 [00:00<?, ?it/s]

In [85]:
from evaluate import load

rouge = load("rouge")

## Ensure that after summarization meaning is preserved

In [93]:
results = rouge.compute(predictions=[sentence["summary_text"] for sentence in summaries], references=top_reviews[:1000])
print(f"ROUGE-1 F1 score: {results['rouge1'] * 100:.2f}")
print(f"ROUGE-2 F1 score: {results['rouge2'] * 100:.2f}")
print(f"ROUGE-L F1 score: {results['rougeL'] * 100:.2f}")

ROUGE-1 F1 score: 47.89
ROUGE-2 F1 score: 45.57
ROUGE-L F1 score: 47.18
