In [None]:
# Cleaning and Preparing the Data

In [1]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split

# Load the JSONL file into a DataFrame
file_path = "Cell_Phones_and_Accessories_5.json"

# Read the JSONL data
data = []
with open(file_path, 'r') as f:
    for line in f:
        data.append(json.loads(line))  # Load each JSON object
df = pd.DataFrame(data)

# Data Cleaning
# Drop rows with missing values
df.dropna(inplace=True)

# Standardize text fields
df['reviewText'] = df['reviewText'].str.lower()
df['reviewText'] = df['reviewText'].str.replace(r'[^\w\s]', '', regex=True)

# Convert 'reviewTime' to a datetime object
df['reviewTime'] = pd.to_datetime(df['reviewTime'], errors='coerce')

# Dataset Splitting
train, temp = train_test_split(df, test_size=0.2, random_state=42)  # 80% train
validation, test = train_test_split(temp, test_size=0.5, random_state=42)  # 10% validation, 10% test

# Save splits if needed
train.to_csv("train.csv", index=False)
validation.to_csv("validation.csv", index=False)
test.to_csv("test.csv", index=False)


In [None]:
# Baseline Predictor Model

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

# Check if necessary columns are present in the DataFrame
if 'reviewText' in df.columns and 'overall' in df.columns:
    # Split dataset into input (X) and target (y)
    X = df['reviewText']  # Feature: review text
    y = df['overall']     # Target: ratings

    # **1. Most Frequent Class Baseline**
    # Find the most frequent rating in the dataset
    most_frequent_rating = y.mode()[0]

    # Predict all ratings as the most frequent rating
    y_pred_most_frequent = [most_frequent_rating] * len(y)

    # Calculate accuracy for the most frequent class baseline
    accuracy_most_frequent = accuracy_score(y, y_pred_most_frequent)

    # **2. Mean Baseline**
    # Predict the rounded mean of ratings
    mean_rating = round(y.mean())
    y_pred_mean = [mean_rating] * len(y)

    # Calculate accuracy for the mean baseline
    accuracy_mean = accuracy_score(y, y_pred_mean)

    # Save baseline metrics to a CSV file
    baseline_metrics = pd.DataFrame({
        "Model": ["Most Frequent", "Mean"],
        "Accuracy": [accuracy_most_frequent, accuracy_mean],
        "Prediction": [most_frequent_rating, mean_rating]
    })
    baseline_metrics.to_csv("baseline_metrics.csv", index=False)
    print("Baseline metrics saved to 'baseline_metrics.csv'.")

    # Save baseline predictions for comparison
    baseline_predictions = pd.DataFrame({
        "Actual": y,
        "Most Frequent Prediction": y_pred_most_frequent,
        "Mean Prediction": y_pred_mean
    })
    baseline_predictions.to_csv("baseline_predictions.csv", index=False)
    print("Baseline predictions saved to 'baseline_predictions.csv'.")

    # Print metrics
    print(f"Most Frequent Class Baseline Accuracy: {accuracy_most_frequent:.2f}")
    print(f"Mean Baseline Accuracy: {accuracy_mean:.2f}")
    print(f"Most Frequent Prediction: {most_frequent_rating}")
    print(f"Mean Prediction: {mean_rating}")

else:
    print("The required columns 'reviewText' and 'overall' are not present in the DataFrame.")


Baseline metrics saved to 'baseline_metrics.csv'.
Baseline predictions saved to 'baseline_predictions.csv'.
Most Frequent Class Baseline Accuracy: 0.56
Mean Baseline Accuracy: 0.21
Most Frequent Prediction: 5.0
Mean Prediction: 4
