# Reading the Reivews Dataset

In [1]:
import pandas as pd

In [2]:
# Function to load a .jsonl file
def load_jsonl(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(json.loads(line.strip()))
    return pd.DataFrame(data)

# Load the datasets
reviews_df = load_jsonl('Subscription_Boxes.jsonl')

# Data Preprocessing

## Filter duplicates and not verified review

In [3]:
reviews_df = reviews_df[reviews_df['verified_purchase'] == True] # only keep the review from verified purchase

In [4]:
print(reviews_df.columns)

Index(['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase'],
      dtype='object')


## Concatanate title and review text

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

reviews_df['input'] = reviews_df['title'] + ": " + reviews_df['text']
reviews_df['input'] = reviews_df['input'].str.strip()

# Drop the rows which has empty content
reviews_df.dropna(subset=['input'], inplace=True)

reviews_df['labels'] = reviews_df['rating']

# 选择需要的列
data = reviews_df[['input', 'labels']]

NameError: name 'reviews_df' is not defined

## Escape from HTML

In [6]:
from bs4 import BeautifulSoup

# Define a function to remove html tag
def remove_html(content):
    soup = BeautifulSoup(content, "lxml")  # 使用 lxml 作为解析器
    return soup.get_text()

# 应用这个函数到 'input' 列
data.loc[:, 'input_clean'] = data['input'].apply(remove_html)

# 查看结果
print(data[['input', 'input_clean']].head())

  soup = BeautifulSoup(content, "lxml")  # 使用 lxml 作为解析器


                                               input  \
0  USELESS: Absolutely useless nonsense and a com...   
1  Manufactured where?: With a couple of the item...   
2  Little bang for your buck.: Two SMALL stuffed ...   
3  New favorite box: Although I don’t remember si...   
4  Coctique: I loved every thing and could use it...   

                                         input_clean  
0  USELESS: Absolutely useless nonsense and a com...  
1  Manufactured where?: With a couple of the item...  
2  Little bang for your buck.: Two SMALL stuffed ...  
3  New favorite box: Although I don’t remember si...  
4  Coctique: I loved every thing and could use it...  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[:, 'input_clean'] = data['input'].apply(remove_html)


In [7]:
import html

def remove_html_and_decode(content):
    soup = BeautifulSoup(content, "lxml")
    text = soup.get_text()
    return html.unescape(text)

# 应用改进的函数去除 HTML 和解码 HTML 实体
data.loc[:, 'input_clean'] = data['input_clean'].apply(remove_html_and_decode)

# 查看结果
print(data[['input', 'input_clean']].head())

  soup = BeautifulSoup(content, "lxml")


                                               input  \
0  USELESS: Absolutely useless nonsense and a com...   
1  Manufactured where?: With a couple of the item...   
2  Little bang for your buck.: Two SMALL stuffed ...   
3  New favorite box: Although I don’t remember si...   
4  Coctique: I loved every thing and could use it...   

                                         input_clean  
0  USELESS: Absolutely useless nonsense and a com...  
1  Manufactured where?: With a couple of the item...  
2  Little bang for your buck.: Two SMALL stuffed ...  
3  New favorite box: Although I don’t remember si...  
4  Coctique: I loved every thing and could use it...  


In [8]:
# 删除原始的 'input' 列
data.drop('input', axis=1, inplace=True)
data['labels'] = data['labels'].astype(int)
data['labels'] = data['labels'] -1
# 重命名 'input_clean' 列为 'input'
data.rename(columns={'input_clean': 'input'}, inplace=True)
data.dropna(subset=['input'], inplace=True)
# 查看结果
print(data.head())

   labels                                              input
0       0  USELESS: Absolutely useless nonsense and a com...
1       1  Manufactured where?: With a couple of the item...
2       0  Little bang for your buck.: Two SMALL stuffed ...
3       4  New favorite box: Although I don’t remember si...
4       4  Coctique: I loved every thing and could use it...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop('input', axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['labels'] = data['labels'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['labels'] = data['labels'] -1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs

# Split data into training, validation, test 3 parts

In [9]:
# 分割数据集：先分割出60%的训练数据
train_data, temp_data = train_test_split(data, test_size=0.4, random_state=42)

# 再从剩余的40%中分割出50%验证数据和50%测试数据
validation_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# 保存为CSV文件
train_data.to_csv('train_data.csv', index=False)
validation_data.to_csv('validation_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)