# Reading the Reivews and the Product Medatdata Dataset

In [1]:
import pandas as pd

In [2]:
# Function to load a .jsonl file
def load_jsonl(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(json.loads(line.strip()))
    return pd.DataFrame(data)

# Load the datasets
reviews_df = load_jsonl('Subscription_Boxes.jsonl')

# Data Preprocessing

## Filter duplicates and not verified review

In [3]:
reviews_df = reviews_df[reviews_df['verified_purchase'] == True] # only keep the review from verified purchase

In [4]:
print(reviews_df.columns)

Index(['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase'],
      dtype='object')


## Join the two datasets togather by parent_asin

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

reviews_df['input'] = reviews_df['title'] + ": " + reviews_df['text']
reviews_df['input'] = reviews_df['input'].str.strip()

# 删除那些 input 为空的行
reviews_df.dropna(subset=['input'], inplace=True)

reviews_df['labels'] = reviews_df['rating']

# 选择需要的列
data = reviews_df[['input', 'labels']]

In [6]:
from bs4 import BeautifulSoup

# 定义一个函数来去除 HTML
def remove_html(content):
    soup = BeautifulSoup(content, "lxml")  # 使用 lxml 作为解析器
    return soup.get_text()

# 应用这个函数到 'input' 列
data.loc[:, 'input_clean'] = data['input'].apply(remove_html)

# 查看结果
print(data[['input', 'input_clean']].head())

  soup = BeautifulSoup(content, "lxml")  # 使用 lxml 作为解析器


                                               input  \
0  USELESS: Absolutely useless nonsense and a com...   
1  Manufactured where?: With a couple of the item...   
2  Little bang for your buck.: Two SMALL stuffed ...   
3  New favorite box: Although I don’t remember si...   
4  Coctique: I loved every thing and could use it...   

                                         input_clean  
0  USELESS: Absolutely useless nonsense and a com...  
1  Manufactured where?: With a couple of the item...  
2  Little bang for your buck.: Two SMALL stuffed ...  
3  New favorite box: Although I don’t remember si...  
4  Coctique: I loved every thing and could use it...  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[:, 'input_clean'] = data['input'].apply(remove_html)


In [7]:
import html

def remove_html_and_decode(content):
    soup = BeautifulSoup(content, "lxml")
    text = soup.get_text()
    return html.unescape(text)

# 应用改进的函数去除 HTML 和解码 HTML 实体
data.loc[:, 'input_clean'] = data['input_clean'].apply(remove_html_and_decode)

# 查看结果
print(data[['input', 'input_clean']].head())

  soup = BeautifulSoup(content, "lxml")


                                               input  \
0  USELESS: Absolutely useless nonsense and a com...   
1  Manufactured where?: With a couple of the item...   
2  Little bang for your buck.: Two SMALL stuffed ...   
3  New favorite box: Although I don’t remember si...   
4  Coctique: I loved every thing and could use it...   

                                         input_clean  
0  USELESS: Absolutely useless nonsense and a com...  
1  Manufactured where?: With a couple of the item...  
2  Little bang for your buck.: Two SMALL stuffed ...  
3  New favorite box: Although I don’t remember si...  
4  Coctique: I loved every thing and could use it...  


In [8]:
# 删除原始的 'input' 列
data.drop('input', axis=1, inplace=True)
data['labels'] = data['labels'].astype(int)
data['labels'] = data['labels'] -1
# 重命名 'input_clean' 列为 'input'
data.rename(columns={'input_clean': 'input'}, inplace=True)
data.dropna(subset=['input'], inplace=True)
# 查看结果
print(data.head())

   labels                                              input
0       0  USELESS: Absolutely useless nonsense and a com...
1       1  Manufactured where?: With a couple of the item...
2       0  Little bang for your buck.: Two SMALL stuffed ...
3       4  New favorite box: Although I don’t remember si...
4       4  Coctique: I loved every thing and could use it...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop('input', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['labels'] = data['labels'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['labels'] = data['labels'] -1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs

In [9]:
# 分割数据集：先分割出60%的训练数据
train_data, temp_data = train_test_split(data, test_size=0.4, random_state=42)

# 再从剩余的40%中分割出50%验证数据和50%测试数据
validation_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# 保存为CSV文件
train_data.to_csv('train_data.csv', index=False)
validation_data.to_csv('validation_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

# Fine Tuning

In [10]:
import pandas as pd

train_data = pd.read_csv('train_data.csv')
validation_data = pd.read_csv('validation_data.csv')
test_data = pd.read_csv('test_data.csv')
print(train_data.head())

   labels                                              input
0       3  Lots of fun stuff!: This box was stuffed full ...
1       4  Where is the link to subscribe for 8-13?: We l...
2       4  Worth it!: Loved everything in there, definite...
3       4                Educational!: My grandkids love it!
4       4  Great small company to support: I love these b...


In [11]:
print(train_data.dtypes)

labels     int64
input     object
dtype: object


In [12]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(validation_data)
test_dataset = Dataset.from_pandas(test_data)

In [13]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")

def preprocess_function(examples):
    return tokenizer(examples['input'], truncation=True, padding='max_length', max_length=128)

train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/8557 [00:00<?, ? examples/s]

Map:   0%|          | 0/2852 [00:00<?, ? examples/s]

Map:   0%|          | 0/2853 [00:00<?, ? examples/s]

In [14]:
from transformers import AutoModelForSequenceClassification

num_labels = len(train_data['labels'].unique())
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels=num_labels, problem_type="single_label_classification")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
print(num_labels)

5


In [16]:
import numpy as np
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
np.object = object
np.bool = bool
np.int = int
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',          # 输出目录保存模型和日志
    evaluation_strategy="epoch",     # 每个 epoch 完成后进行评估
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True      # 训练结束时加载最佳模型
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {'accuracy': (predictions == labels).mean()}
    
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)


In [17]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.8442,0.71544,0.721599
2,0.6454,0.712754,0.724053
3,0.5595,0.732085,0.725456


TrainOutput(global_step=1605, training_loss=0.6729126356843848, metrics={'train_runtime': 653.1052, 'train_samples_per_second': 39.306, 'train_steps_per_second': 2.457, 'total_flos': 1688626458798336.0, 'train_loss': 0.6729126356843848, 'epoch': 3.0})

In [18]:
results = trainer.evaluate(test_dataset)
print(results)

{'eval_loss': 0.6793504953384399, 'eval_accuracy': 0.7315106905012267, 'eval_runtime': 20.752, 'eval_samples_per_second': 137.481, 'eval_steps_per_second': 8.626, 'epoch': 3.0}
