In [None]:
#============ Sentiment Analysis Review =============#
import json

# Location of the JSON Lines dataset this cell reads (one JSON record per line).
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
file_path = r'C:\Users\Administrator\PycharmProjects\PythonProjecteSemproj\reviewdata.jsonl'

def categorize_sentiment(rating):
    """Map a numeric rating to a sentiment label.

    Args:
        rating: A number, or anything ``float()`` can convert (e.g. "4.5").

    Returns:
        "positive" for ratings >= 3.5, "neutral" for 3.0 <= rating < 3.5,
        "negative" for ratings < 3.0, and "unknown" when the value cannot be
        converted to a number or is NaN.
    """
    try:
        rating = float(rating)
    except (ValueError, TypeError):
        return "unknown"

    if rating >= 3.5:
        return "positive"
    # The bands are contiguous: everything from 3.0 up to (not including) 3.5
    # is neutral, so values such as 3.45 are no longer reported as negative.
    if rating >= 3.0:
        return "neutral"
    if rating < 3.0:
        return "negative"
    # Only NaN fails all three comparisons above.
    return "unknown"

# Record keys carried over into the printed preview of each review.
fields = ["main_category", "title", "average_rating", "rating_number"]

# Preview pass: print the selected fields plus the derived sentiment for the
# first 50 lines of the dataset, reporting lines that fail to parse.
with open(file_path, 'r', encoding='utf-8') as f:
    for line_no, raw_line in enumerate(f, start=1):
        try:
            record = json.loads(raw_line)
        except json.JSONDecodeError as err:
            print(f"Line {line_no} is not valid JSON: {err}")
        else:
            # Keys missing from the record are shown with the placeholder "N/A".
            summary = {key: record.get(key, "N/A") for key in fields}
            summary["sentiment"] = categorize_sentiment(summary["average_rating"])

            print(f"Review {line_no}")
            print(json.dumps(summary, indent=2))
            print("=" * 50)

        # Stop after the 50th line has been handled, whether it parsed or not.
        if line_no == 50:
            break


In [None]:
#============ Sentiment Analysis Count =============#
import json

# Location of the JSON Lines dataset this cell reads (one JSON record per line).
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
file_path = r'C:\Users\Administrator\PycharmProjects\PythonProjecteSemproj\reviewdata.jsonl'

def categorize_sentiment(rating):
    """Map a numeric rating to a sentiment label.

    Args:
        rating: A number, or anything ``float()`` can convert (e.g. "4.5").

    Returns:
        "positive" for 3.5 <= rating <= 5.0, "neutral" for 3.0 <= rating < 3.5,
        "negative" for ratings < 3.0, and "unknown" for ratings above 5.0, NaN,
        or values that cannot be converted to a number.
    """
    try:
        rating = float(rating)
    except (ValueError, TypeError):
        return "unknown"

    if 3.5 <= rating <= 5.0:
        return "positive"
    # Half-open upper bound keeps the bands contiguous, so ratings between
    # 3.4 and 3.5 (e.g. 3.45) count as neutral instead of "unknown".
    if 3.0 <= rating < 3.5:
        return "neutral"
    if rating < 3.0:
        return "negative"
    # Reached for ratings above 5.0 and for NaN.
    return "unknown"

# Running tally per sentiment label, kept in display order.
sentiment_counts = dict.fromkeys(("positive", "neutral", "negative", "unknown"), 0)

# Single pass over the dataset: classify every record by its average rating.
with open(file_path, 'r', encoding='utf-8') as f:
    for line_no, raw_line in enumerate(f, start=1):
        try:
            record = json.loads(raw_line)
        except json.JSONDecodeError as err:
            # Unparseable lines are reported and tallied as "unknown".
            print(f"Line {line_no} is not valid JSON: {err}")
            label = "unknown"
        else:
            label = categorize_sentiment(record.get("average_rating"))
        sentiment_counts[label] += 1

# Print the aggregated totals, one label per line.
print("Sentiment Summary (Total Counts):")
for label, total in sentiment_counts.items():
    print(f"{label.capitalize()}: {total}")


In [None]:
#============ Sentiment Distribution by Product main category  =============#
import json
import matplotlib.pyplot as plt
from collections import defaultdict

# Location of the JSON Lines dataset this cell reads (one JSON record per line).
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
file_path = r'C:\Users\Administrator\PycharmProjects\PythonProjecteSemproj\reviewdata.jsonl'

# Step 1 & 2: Parse and assign sentiment
# Maps main category -> per-sentiment counters; unseen categories start at zero.
sentiment_counts_by_category = defaultdict(lambda: {"positive": 0, "neutral": 0, "negative": 0})

with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        try:
            review = json.loads(line)
        except json.JSONDecodeError:
            # Malformed lines are skipped silently.
            continue

        # Valid JSON that is not an object (a bare list, number, ...) has no
        # fields to read; skip it instead of failing on `.get`.
        if not isinstance(review, dict):
            continue

        # `dict.get` applies its default only when the key is absent, so a
        # record with an explicit null (or empty) category would otherwise be
        # counted under a `None` key and plotted with a blank label.
        category = review.get("main_category") or "Unknown"

        # Accept numbers and numeric strings alike; anything that cannot be
        # read as a number (missing, null, text, nested value) is skipped.
        try:
            rating = float(review.get("average_rating"))
        except (TypeError, ValueError, OverflowError):
            continue

        # Categorize sentiment
        if rating >= 3.5:
            sentiment = "positive"
        elif rating >= 3.0:
            sentiment = "neutral"
        elif rating < 3.0:
            sentiment = "negative"
        else:
            # Only NaN fails every comparison above.
            continue

        sentiment_counts_by_category[category][sentiment] += 1

# Step 3: Prepare for plotting
# One list of counts per sentiment, each aligned with `categories`.
categories = list(sentiment_counts_by_category)
positives, neutrals, negatives = (
    [sentiment_counts_by_category[cat][label] for cat in categories]
    for label in ("positive", "neutral", "negative")
)

# Step 4: Plot grouped bar chart
x = range(len(categories))
width = 0.25

fig, ax = plt.subplots(figsize=(14, 7))

# Each series is shifted by one bar width so the three bars of a category
# sit side by side around that category's tick.
bar_series = (
    (-width, positives, 'Positive', 'green'),
    (0, neutrals, 'Neutral', 'gray'),
    (width, negatives, 'Negative', 'red'),
)
for offset, heights, series_label, series_color in bar_series:
    ax.bar([pos + offset for pos in x], heights, width=width,
           label=series_label, color=series_color)

ax.set_xlabel('Main Category')
ax.set_ylabel('Number of Reviews')
ax.set_title('Sentiment Distribution by Main Category')
ax.set_xticks(x)
ax.set_xticklabels(categories, rotation=45, ha='right')
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
#============ Average product Rating by year  =============#
from mrjob.job import MRJob
from mrjob.step import MRStep
import json
import logging


class MRAverageRatingPerSyntheticYear(MRJob):
    """Average the ``average_rating`` field over fixed-size runs of records.

    Ratings are grouped ``records_per_year`` at a time, in the order they
    reach the reducer, and each group is labelled with a consecutive
    "synthetic" year starting at ``start_year``.

    NOTE(review): the year labels depend on the order in which the framework
    hands values to the reducer, not on any date in the data — confirm that
    this order matches the input file for the runner in use.
    """

    def configure_args(self):
        """Run MRJob's argument setup, then attach the bucketing parameters."""
        super().configure_args()
        # Number of consecutive ratings that make up one synthetic year.
        self.records_per_year = 20000
        # Label given to the first bucket; later buckets count up from here.
        self.start_year = 2003

    def steps(self):
        """Return the two steps: extract ratings, then re-key and average."""
        return [
            MRStep(mapper=self.mapper_extract_ratings),
            MRStep(mapper=self.mapper_assign_year,
                   reducer=self.reducer_compute_average)
        ]

    def mapper_extract_ratings(self, _, line):
        """Parse one JSON line and emit its rating when it lies in [0, 5].

        Every rating is emitted under the key ``None``. Records whose rating
        is missing, null or an empty string are skipped silently; lines that
        fail to parse or convert are logged and skipped.
        """
        try:
            record = json.loads(line)
            rating = record.get("average_rating")
            # Test for "absent" explicitly: a plain truthiness check would
            # also discard a numeric 0, which the range check below accepts.
            if rating is None or rating == "":
                return
            rating_val = float(rating)
            if 0 <= rating_val <= 5:
                yield None, rating_val
        except Exception as e:
            # Best effort: one bad record must not abort the whole job.
            logging.warning(f"Skipping line due to error: {e}")

    def mapper_assign_year(self, _, rating):
        """Re-key every rating under the single key ``"rating"``.

        No year is assigned here; sending all values to one key lets a single
        reducer call see every rating, and that call does the bucketing.
        """
        yield "rating", rating

    def reducer_compute_average(self, key, ratings):
        """Bucket the incoming ratings into synthetic years and average each.

        Yields ``(year, (average, count))`` where ``year`` is a string,
        ``average`` is rounded to two decimals and ``count`` is the number of
        ratings that fell into that bucket.
        """
        year_buckets = {}  # year label -> [running total, number of ratings]

        for position, rating in enumerate(ratings):
            year = str(self.start_year + position // self.records_per_year)
            bucket = year_buckets.setdefault(year, [0, 0])
            bucket[0] += rating
            bucket[1] += 1

        for year, (total, cnt) in year_buckets.items():
            yield year, (round(total / cnt, 2), cnt)


# Driver: run the job on the local dataset and print one table row per
# synthetic year, sorted by year.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    try:
        # NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
        file_path = r'C:\Users\Administrator\PycharmProjects\PythonProjecteSemproj\reviewdata.jsonl'
        job = MRAverageRatingPerSyntheticYear(args=[file_path])

        with job.make_runner() as runner:
            runner.run()

            print("\nAverage Ratings by Synthetic Year (every 20,000 records):")
            print("=" * 60)
            print("Year  |  Average Rating  |  Number of Ratings")
            print("-" * 60)

            # Let the job decode its own output: `cat_output()` is a raw byte
            # stream (with an empty chunk between output files), so splitting
            # each chunk on a tab can fail, and `eval()` on the value text
            # executes whatever that text contains. `parse_output` reads the
            # stream with the job's output protocol and yields (key, value).
            results = []
            for year_key, (avg_rating, count) in job.parse_output(runner.cat_output()):
                results.append((int(year_key), avg_rating, count))

            for year, avg_rating, count in sorted(results):
                print(f"{year}  |      {avg_rating:.2f}       |     {count:,}")

            print("=" * 60)

    except Exception as e:
        # Report any failure (setup, run or output parsing) without a traceback.
        logging.error(f"Job failed: {e}")


In [None]:
#============ Average product Rating by year over time   =============#
import matplotlib.pyplot as plt

# Replace this with the actual output from your MRJob
# Average rating per year label; the keys are year strings.
average_ratings_by_year = {
    '2003': 4.29, '2004': 4.27, '2005': 4.27,
    '2006': 4.25, '2007': 4.25, '2008': 4.25,
    '2009': 4.24, '2010': 4.24, '2011': 4.23,
    '2012': 4.24, '2013': 4.24, '2014': 4.23,
    '2015': 4.22, '2016': 4.23, '2017': 4.24,
    '2018': 4.23, '2019': 4.23, '2020': 4.21,
    '2021': 4.09, '2022': 4.08, '2023': 4.09,
}

# Year labels in ascending order, with the rating for each in the same order.
years = sorted(average_ratings_by_year)
ratings = list(map(average_ratings_by_year.get, years))

# Line chart of the averages on a fixed 0-5 rating axis.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(years, ratings, marker='o', linestyle='-', color='blue')
ax.set_title('Average Ratings Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Average Rating')
ax.set_ylim(0, 5)
ax.grid(True)
fig.tight_layout()
plt.show()
