# Modeling
-------------


In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import brier_score_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
import warnings

warnings.filterwarnings('ignore')

## Access the data

In [2]:
data = pd.read_parquet('./data/preprocessed_data.parquet')

In [3]:
data.head()

Unnamed: 0,review_id,product_category,total_votes,review,usefulness
41,R1FBO737KD9F2N,Electronics,23,Great noise cancelling headphones for the pric...,0.826087
145,R227GSNWI6BSZV,Electronics,20,"Garbage, lasted 8 months... warranty is useles...",1.0
265,R4PF7S0TOV9S7,Electronics,17,A long lasting bluetooth sound bazooka!\nThis ...,0.882353
274,R22LKIOKMSOG8A,Electronics,13,nice!\nThis is a nice little turntable. Don't ...,0.923077
304,R3SJTYZBYBG4EE,Electronics,99,Very good charger for the price! But has a dow...,1.0


## Select modeling technique & Build model

In [4]:
# Split the data into train, val, and test sets
X_train, X_test, y_train, y_test = train_test_split(data['review'], data['usefulness'], test_size=0.2, random_state=0)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [5]:
train_pool = Pool(X_train.reset_index(drop=True), y_train.reset_index(drop=True), text_features=['review'], feature_names=['review'])
eval_pool  = Pool(X_val.reset_index(drop=True), y_val.reset_index(drop=True), text_features=['review'], feature_names=['review'])
test_pool  = Pool(X_test.reset_index(drop=True), y_test.reset_index(drop=True), text_features=['review'], feature_names=['review'])

model = CatBoostClassifier(text_features=['review'], verbose=False, loss_function='CrossEntropy', random_seed=0)

In [6]:
# Train the model
model.fit(train_pool, eval_set=eval_pool, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x22aae20cad0>

In [7]:
# Make predictions on the test set
preds = model.predict(test_pool)

print(f"Brier loss = {brier_score_loss(preds, y_test.to_numpy()) :.5f}")
print(f"ROC_AUC = {roc_auc_score(y_test.to_numpy() > 0.5, preds) :.5f}")

Brier loss = 0.11326
ROC_AUC = 0.62692


In [10]:
model.save_model('catboost_model')

## Inference of summarization model

In [27]:
model = CatBoostClassifier()
model.load_model('catboost_model')

<catboost.core.CatBoostClassifier at 0x22b6a6eb2f0>

In [8]:
from transformers import pipeline

summarizer = pipeline("summarization", model="Falconsai/text_summarization")

config.json:   0%|          | 0.00/1.49k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [14]:
probs = model.predict_proba(test_pool)[:, 1]

In [15]:
probs[:10]

array([0.92529049, 0.91536677, 0.9090101 , 0.75488401, 0.60369318,
       0.83708404, 0.88674995, 0.82113807, 0.4334201 , 0.69430861])

In [21]:
X_test[:10]

47355      For street photography or 100% landscapes it i...
7514086    Great Phone for a Great Price\nThis phone is b...
1320171    Will the Product Meet the 3-Year Warranty\nGot...
6604354    Kindle DX\nI purchased this for my husband who...
2982011    Good quality\nI use on a Plextor DVD/CD R/W an...
708382     Nice Auto-Diagnosis Device for Auto Diagnosis\...
1326176    The look & feel are and remain good. It does a...
1477329    First DSLR\nI am using a D7000 + 18-200mm VRII...
8717154    Piss Poor!\nThis really sucks. Did not even co...
419206     Not Anti Glare\nThis went on smoothly enough i...
Name: review, dtype: object

In [22]:
X_test.iloc[0]



In [26]:
print(summarizer(X_test.iloc[0], max_length=256, min_length=30, do_sample=False))

[{'summary_text': 'I have been using the X-T1 for about a year now . My motivation for purchasing it was to have a lightweight and compact camera for landscapes and occasional wildlife . I am a professional photographer -- photography is my full time job .'}]
