***I tried to predict ratings with CatBoostRegressor.***

* Applied several preprocessing operations,
* I used a pretrained embeddings for the text feature extraction stage [1],
* Used a tuned CatBoostRegressor for rating predictions (tuned with optuna)


## My Another Projects

* [Gemma 2B Text Summarization w/Zero-Shot Prompting](https://www.kaggle.com/code/banddaniel/gemma-2b-text-summarization-w-zero-shot-prompting)
* [Mammals Classification w/Ensemble Deep Learning](https://www.kaggle.com/code/banddaniel/mammals-classification-w-ensemble-deep-learning)


## References
1. https://huggingface.co/sentence-transformers/all-mpnet-base-v2

In [1]:
from IPython.display import clear_output
!pip install sentence-transformers
clear_output()

In [2]:
import os
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import  stopwords
from nltk.stem import PorterStemmer
import string


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

from catboost import CatBoostRegressor

from sentence_transformers import SentenceTransformer

# feature extractor
emmedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# <span style="color:#e74c3c;"> Reading </span> Data

In [3]:
data = pd.read_csv('/kaggle/input/crocs-clog-reviews/croc_reviews.csv')
data.head()

Unnamed: 0,id,review,date,rating
0,croc_review_0,!!!!!! E X C E L L E N T!!!!!!!!!!,"April 7, 2022",5.0
1,croc_review_1,"""They're crocs; people know what crocs are.""","April 3, 2021",5.0
2,croc_review_2,- Quick delivery and the product arrived when ...,"March 19, 2023",5.0
3,croc_review_3,"...amazing ""new"" color!! who knew?? love - lov...","July 17, 2022",5.0
4,croc_review_4,0 complaints from me; this is the 8th pair of ...,"June 4, 2021",5.0


# <span style="color:#e74c3c;"> Preprocessing </span>

In [4]:
# preprocessing functions
stop_words = set(stopwords.words('english'))

def text_preprocessing(text):
    text = text.lower()
    text = re.sub(r'https?://www.', '', text)
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)    
    return text

def drop_stopwords(text):
    dropped = [word for word in text.split() if word not in stop_words]
    final_text = ' '.join(dropped)
    return final_text


# applying preprocessing functions
data_processed = data.copy()
data_processed['review'] = data_processed['review'].apply(text_preprocessing).apply(drop_stopwords)

data_processed.head()

Unnamed: 0,id,review,date,rating
0,croc_review_0,e x c e l l e n,"April 7, 2022",5.0
1,croc_review_1,crocs people know crocs,"April 3, 2021",5.0
2,croc_review_2,quick delivery product arrived company said wo...,"March 19, 2023",5.0
3,croc_review_3,amazing new color knew love love love,"July 17, 2022",5.0
4,croc_review_4,complaints pair crocs bought like two months d...,"June 4, 2021",5.0


In [5]:
# train and test splitting
type_train_data, type_test_data = train_test_split(data_processed, test_size = 0.2, random_state = 55, shuffle = True)

type_train_data = type_train_data.reset_index(drop = True)
type_test_data = type_test_data.reset_index(drop = True)

# <span style="color:#e74c3c;"> SentenceTransformer Feature Extractor </span>

In [6]:
# creating text features (768 features)

train_embeddings = emmedding_model.encode(list(type_train_data['review'].values), show_progress_bar = True)
test_embeddings = emmedding_model.encode(list(type_test_data['review'].values), show_progress_bar = True)

Batches:   0%|          | 0/231 [00:00<?, ?it/s]

Batches:   0%|          | 0/58 [00:00<?, ?it/s]

In [7]:
X_train = train_embeddings
y_train = type_train_data['rating'].values

X_test = test_embeddings
y_test = type_test_data['rating'].values

# <span style="color:#e74c3c;"> CatBoostRegressor </span> 

In [8]:
# a tuned model
model = CatBoostRegressor(learning_rate = 0.06849305169297759, subsample = 0.35986600120369017, colsample_bylevel = 0.4231425586636978,
                          min_data_in_leaf = 96, depth = 5, iterations = 1000, verbose = 200)

# training
model.fit(X_train, y_train)

0:	learn: 0.9608818	total: 108ms	remaining: 1m 48s
200:	learn: 0.5854304	total: 13.3s	remaining: 52.7s
400:	learn: 0.4807109	total: 25.6s	remaining: 38.3s
600:	learn: 0.4070232	total: 38.6s	remaining: 25.6s
800:	learn: 0.3501940	total: 51.1s	remaining: 12.7s
999:	learn: 0.3037398	total: 1m 3s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x7b275ec38040>

# <span style="color:#e74c3c;"> Regression </span>  Predictions

In [9]:
# test predictions 
preds = model.predict(X_test)

# adding predictions to test dataframe
type_test_data['preds'] = preds

msa = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)

print('Mean absolute error: \t{0:.5f}'.format(msa))
print('Mean squared error: \t{0:.5f}'.format(mse))

Mean absolute error: 	0.40358
Mean squared error: 	0.50046


In [10]:
# checking for exceeding values

len(np.where(type_test_data['preds'] > 5)[0])

507

In [11]:
# a function for clipping prediction values that bigger than 5.0 (max value)

type_test_data['preds'] = type_test_data['preds'].apply(lambda X: 5.0 if X > 5.0 else X)

# <span style="color:#e74c3c;"> Final </span>  Predictions

In [12]:
# predictions after clipping

msa = mean_absolute_error(y_test, type_test_data['preds'])
mse = mean_squared_error(y_test, type_test_data['preds'])

print('Mean absolute error: \t{0:.5f}'.format(msa))
print('Mean squared error: \t{0:.5f}'.format(mse))

Mean absolute error: 	0.38166
Mean squared error: 	0.49690


In [13]:
type_test_data.head(10)

Unnamed: 0,id,review,date,rating,preds
0,croc_review_7353,fit true size looking shoe could stand day beh...,"January 23, 2022",5.0,4.298502
1,croc_review_707,every day wear great walking great comfort,"March 14, 2023",5.0,4.961751
2,croc_review_4886,worn around lot since got awesome great product,"April 3, 2021",5.0,4.559014
3,croc_review_1469,love pink color glad ordered go perfect lisa f...,"July 25, 2023",5.0,5.0
4,croc_review_7725,husband favorite ones color perform exactly pr...,"November 8, 2021",5.0,4.704875
5,croc_review_8643,narrow short wide foot high arch even get foot...,"July 22, 2021",2.0,2.071754
6,croc_review_8438,things good fast shipping excellent quality,"March 2, 2022",5.0,4.870465
7,croc_review_2265,got god son absolutely loves,"March 18, 2021",5.0,5.0
8,croc_review_334,came strap broken almost sliced extremely comfy,"August 11, 2021",1.0,2.053446
9,croc_review_180,arrived week late scuff front,"December 15, 2021",1.0,3.764175
