# Reading the Reivews and the Product Medatdata Dataset

In [26]:
import pandas as pd
import json

In [27]:
# Function to load a .jsonl file
def load_jsonl(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(json.loads(line.strip()))
    return pd.DataFrame(data)

# Load the datasets
reviews_df = load_jsonl('Magazine_Subscriptions.jsonl')
metadata_df = load_jsonl('meta_Magazine_Subscriptions.jsonl')
print(len(metadata_df))
print(len(reviews_df))

3391
71497


# Data Preprocessing

In [1]:
import pandas as pd

## Filter duplicates and not verified review

In [6]:
medadata_df = metadata_df.drop_duplicates(subset=['parent_asin']) # drop duplication row
reviews_df = reviews_df[reviews_df['verified_purchase'] == True] # only keep the review from verified purchase
reviews_df['review_date'] = pd.to_datetime(reviews_df['timestamp'], unit='ms')
medadata_df.rename(columns={'title': 'product_title'}, inplace=True)
reviews_df.rename(columns={'title': 'review_title'}, inplace=True)

In [7]:
print(medadata_df.columns)
print(reviews_df.columns)

Index(['main_category', 'product_title', 'average_rating', 'rating_number',
       'features', 'description', 'price', 'images', 'videos', 'store',
       'categories', 'details', 'parent_asin', 'bought_together', 'subtitle',
       'author'],
      dtype='object')
Index(['rating', 'review_title', 'text', 'images', 'asin', 'parent_asin',
       'user_id', 'timestamp', 'helpful_vote', 'verified_purchase',
       'review_date'],
      dtype='object')


## Join the two datasets togather by parent_asin

In [9]:
combined_df = pd.merge(reviews_df, metadata_df, on='parent_asin', how='left') # join the review and the product metadata
print(combined_df['title'].head())
print(combined_df.columns)

0                      Cooking With Paula Deen
1    Sports Illustrated KIDS    Print Magazine
2                    Outside    Print Magazine
3                  Us Weekly    Print Magazine
4      O, The Oprah Magazine    Print Magazine
Name: title, dtype: object
Index(['rating', 'review_title', 'text', 'images_x', 'asin', 'parent_asin',
       'user_id', 'timestamp', 'helpful_vote', 'verified_purchase',
       'review_date', 'main_category', 'title', 'average_rating',
       'rating_number', 'features', 'description', 'price', 'images_y',
       'videos', 'store', 'categories', 'details', 'bought_together',
       'subtitle', 'author'],
      dtype='object')


## Join the two datasets process the review

In [10]:
combined_df['review_content'] = combined_df.apply(lambda x: f"Product Title: {x['title']}, Rating: {x['rating']}, Review Title: {x['review_title']}, Review Text: {x['text']}, Reiview Data: {x['review_date']}", axis=1)

In [11]:
print(combined_df['review_content'].head())

0    Product Title: Cooking With Paula Deen, Rating...
1    Product Title: Sports Illustrated KIDS    Prin...
2    Product Title: Outside    Print Magazine, Rati...
3    Product Title: Us Weekly    Print Magazine, Ra...
4    Product Title: O, The Oprah Magazine    Print ...
Name: review_content, dtype: object


In [12]:
# 将所有评论聚合为一个列表，按照每个产品（parent_asin）
reviews_grouped = combined_df.groupby('parent_asin')['review_content'].apply(list).reset_index()

# 将列表转换为字符串，每条评论之间用新行分隔
reviews_grouped['reviews'] = reviews_grouped['review_content'].apply(lambda x: '\n'.join(x))

# 将这个聚合后的评论数据合并回产品元数据中
final_metadata_df = pd.merge(metadata_df, reviews_grouped[['parent_asin', 'reviews']], on='parent_asin', how='left')

In [13]:
print(final_metadata_df.columns)

Index(['main_category', 'title', 'average_rating', 'rating_number', 'features',
       'description', 'price', 'images', 'videos', 'store', 'categories',
       'details', 'parent_asin', 'bought_together', 'subtitle', 'author',
       'reviews'],
      dtype='object')


In [14]:
def format_features(features):
    if isinstance(features, list):
        return ', '.join(features)
    return ''

def format_description(description):
    if isinstance(description, list):
        return ' '.join(description)
    return ''

def format_details(details):
    if isinstance(details, dict):
        return ', '.join([f"{k}: {v}" for k, v in details.items()])
    return ''

# 格式化文本
def format_row(row):
    return (
        f"Title: {row['title']}, "
        f"Subtitle: {row.get('subtitle', '')}, "
        f"Features: {format_features(row['features'])}, "
        f"Description: {format_description(row['description'])}, "
        f"Average Rating: {row['average_rating']}, "
        f"Price: ${row['price']}, "
        f"Store: {row['store']}, "
        f"Categories: {', '.join(row.get('categories', []))}, "
        f"Details: {format_details(row['details'])}, "
        f"Author: {row.get('author', '')}, "
        f"Reviews: {row['reviews']}"
    )

# 应用格式化函数到每一行
final_metadata_df['formatted_text'] = final_metadata_df.apply(format_row, axis=1)

# 创建新的DataFrame仅包含formatted_text
result_df = final_metadata_df[['formatted_text']]

# 保存到本地文件
result_df.to_csv('formatted_metadata.csv', index=False)


In [15]:
import gc

# 将DataFrame变量设为None来释放引用
reviews_df = None
metadata_df = None
combined_df = None
final_metadata_df = None
result_df = None

# 强制进行垃圾回收
gc.collect()

180

# Fine Tuning

In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import Dataset
from transformers import AutoTokenizer

In [8]:
formatted_metadata = 'formatted_metadata.csv'

# 读取文件
# 因为文件没有列名，我们可以使用header=None让pandas不把第一行作为列名
# 并且可以通过names参数指定列名，这里我们将其命名为'formatted_text'
final_df = pd.read_csv(formatted_metadata)

In [33]:
tokenizer = AutoTokenizer.from_pretrained('gpt2', use_fast=True)
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [34]:
def split_text(text, max_length=1024):
    # 使用tokenizer的encode方法获取token ids，这里不直接生成tensor
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    
    # 按照max_length拆分token ids
    chunks = [token_ids[i:i + max_length] for i in range(0, len(token_ids), max_length)]
    
    # 将token ids列表转换回文本
    split_texts = [tokenizer.decode(chunk, clean_up_tokenization_spaces=True) for chunk in chunks]
    
    return split_texts

In [35]:
# 应用拆分函数
split_results = final_df['formatted_text'].apply(split_text)

# 展开拆分结果为新的DataFrame行
expanded_rows = []
for original_index, text_list in enumerate(split_results):
    for text in text_list:
        expanded_rows.append({'formatted_text': text})

expanded_df = pd.DataFrame(expanded_rows)

Token indices sequence length is longer than the specified maximum sequence length for this model (3412 > 1024). Running this sequence through the model will result in indexing errors


In [36]:
print(expanded_df)
dataset = Dataset.from_pandas(expanded_df)

                                         formatted_text
0     Title: GQ Print Access    Print Magazine, Subt...
1     Title: Hi-Fi +    Print Magazine, Subtitle: na...
2     Title: Paper Crafts, Subtitle: nan, Features:,...
3     Title: Horse Illustrated, Subtitle: nan, Featu...
4      with lots of useful information, Review Text:...
...                                                 ...
7109  Title: Karavan Istorij, Subtitle: nan, Feature...
7110  Title: V Magazine - Ny    Print Magazine, Subt...
7111  Title: Victorian Review    Print Magazine, Sub...
7112  Title: Visto    Print Magazine, Subtitle: nan,...
7113  Title: Modern Pioneer    Print Magazine, Subti...

[7114 rows x 1 columns]


In [68]:
def preprocess_function(examples):
    # 对文本进行Tokenize处理
    tokenized_inputs = tokenizer(examples['formatted_text'], truncation=True, padding='max_length', max_length=512, return_tensors="pt")
    # GPT-2的标准训练过程中，labels字段通常与input_ids相同
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].clone()
    return tokenized_inputs


batch_size = 3
tokenizer.pad_token = tokenizer.eos_token
tokenized_dataset = dataset.map(preprocess_function, batched=True, batch_size = batch_size)

Map:   0%|          | 0/7114 [00:00<?, ? examples/s]

In [69]:
from torch.optim import SGD
from transformers import get_scheduler

def create_optimizer_and_scheduler(model):
    # 定义优化器
    optimizer = SGD(model.parameters(), lr=0.0001, momentum=0.9)

    # 定义学习率调度器，这里以线性调度器为例
    num_training_steps = 500  # 假设的训练步数，你需要根据实际情况设定
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=500,  # 预热步数
        num_training_steps=num_training_steps
    )
    
    return optimizer, lr_scheduler


# 定义训练参数
training_args = TrainingArguments(
    output_dir="./gpt2_finetuned",
    overwrite_output_dir=True,
    num_train_epochs=4,
    gradient_accumulation_steps = 4,
    per_device_train_batch_size=batch_size,
    eval_steps=500,
    save_total_limit=2,
    auto_find_batch_size = True,
    fp16=True
)

# 使用Trainer进行训练
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,  # 注意这里需要根据实际情况调整
    optimizers = create_optimizer_and_scheduler(model)
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [70]:
# 开始训练
trainer.train()

Step,Training Loss
500,1.6521
1000,1.662
1500,1.6601
2000,1.6523


TrainOutput(global_step=2372, training_loss=1.6549579691122998, metrics={'train_runtime': 1726.1801, 'train_samples_per_second': 16.485, 'train_steps_per_second': 1.374, 'total_flos': 7435326062592000.0, 'train_loss': 1.6549579691122998, 'epoch': 4.0})

In [71]:
# 保存微调后的模型
model.save_pretrained("./gpt2_finetuned")

In [81]:
import torch

# 确保PyTorch能够使用GPU（如果可用）
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 加载预训练的模型和tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)  # 将模型移动到正确的设备上

# 输入文本
input_text = "Recommende me a high-rating magazine"
# 使用tokenizer编码输入文本，添加批次维度，并将其移动到模型所在的设备
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

# 生成文本
# 注意：你可以根据需要调整generate方法的参数来控制生成的文本
generated_ids = model.generate(input_ids, max_length=50, num_return_sequences=1)

# 将生成的token ids解码为文本，并移除特殊的token
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# 将生成的文本打印到控制台
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Recommende me a high-rating magazine.

I'm not sure if I'm going to be able to do that. I'm not sure if I'm going to be able to do that.

I'm not sure if I'm


In [82]:
model = GPT2LMHeadModel.from_pretrained("./gpt2_finetuned")

In [83]:
# 确保PyTorch能够使用GPU（如果可用）
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 加载预训练的模型和tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)  # 将模型移动到正确的设备上

# 输入文本
input_text = "Recommende me a high-rating magazine"
# 使用tokenizer编码输入文本，添加批次维度，并将其移动到模型所在的设备
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

# 生成文本
# 注意：你可以根据需要调整generate方法的参数来控制生成的文本
generated_ids = model.generate(input_ids, max_length=50, num_return_sequences=1)

# 将生成的token ids解码为文本，并移除特殊的token
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# 将生成的文本打印到控制台
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Recommende me a high-rating magazine.

I'm not sure if I'm going to be able to do that. I'm not sure if I'm going to be able to do that.

I'm not sure if I'm
