## From ChatGPT

In [None]:
import json
from datasets import Dataset

from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer


def _select_rows(encodings, row_indices):
    """Return a plain dict with only the given rows of each tokenizer column."""
    return {
        column: [values[i] for i in row_indices]
        for column, values in encodings.items()
    }


# Step 1: Load JSON data
# JSON is UTF-8 by specification; be explicit so the platform default
# encoding cannot corrupt non-ASCII review text.
with open("amazon_reviews.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Step 2: Preprocess data
reviews = [entry["text"] for entry in data]
ratings = [entry["rating"] for entry in data]
# NOTE(review): ratings below 4 (including neutral 3-star reviews) are all
# labelled "negative" -- confirm this is the intended binarisation.
sentiments = ["positive" if rating >= 4 else "negative" for rating in ratings]

# Step 3: Tokenize text
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Keep plain Python lists (no return_tensors): they can be sliced row by row
# and stored directly. If tensors are needed later, set the dataset format
# after loading instead of baking tensors into the saved data.
tokenized_reviews = tokenizer(reviews, padding=True, truncation=True)

# Step 4: Encode labels
label_mapping = {"positive": 1, "negative": 0}
encoded_labels = [label_mapping[sentiment] for sentiment in sentiments]

# Step 5: Split dataset (80% train / 10% validation / 10% test)
# The tokenizer output is a mapping of columns, not a sequence of rows, so it
# cannot be handed to train_test_split directly. Split row indices instead and
# gather the matching rows of every column afterwards.
row_indices = list(range(len(reviews)))
train_rows, holdout_rows, train_labels, holdout_labels = train_test_split(
    row_indices, encoded_labels, test_size=0.2, random_state=42
)
val_rows, test_rows, val_labels, test_labels = train_test_split(
    holdout_rows, holdout_labels, test_size=0.5, random_state=42
)

train_texts = _select_rows(tokenized_reviews, train_rows)
val_texts = _select_rows(tokenized_reviews, val_rows)
test_texts = _select_rows(tokenized_reviews, test_rows)

# Step 6: Create Dataset objects
train_dataset = Dataset.from_dict(
    {
        "input_ids": train_texts["input_ids"],
        "attention_mask": train_texts["attention_mask"],
        "labels": train_labels,
    }
)
val_dataset = Dataset.from_dict(
    {
        "input_ids": val_texts["input_ids"],
        "attention_mask": val_texts["attention_mask"],
        "labels": val_labels,
    }
)
test_dataset = Dataset.from_dict(
    {
        "input_ids": test_texts["input_ids"],
        "attention_mask": test_texts["attention_mask"],
        "labels": test_labels,
    }
)

# Step 7: Save dataset
train_dataset.save_to_disk("train_dataset")
val_dataset.save_to_disk("val_dataset")
test_dataset.save_to_disk("test_dataset")
