# In [1]:
import json
import re
import statistics

# Step 1: Data Loading & Preprocessing

# Read and parse the JSON data (one JSON object per line).
# The encoding is passed explicitly because JSON text is UTF-8, whereas open()
# would otherwise use the platform default (e.g. cp1252 on Windows) and could
# fail on, or mis-decode, non-ASCII review text.
with open('C:\\Users\\Muhammad Omer Hafeez\\Desktop\\22i1859_Assignment 1\\Cell_Phones_and_Accessories_5.json', 'r', encoding='utf-8') as file:
    # Skip blank lines (e.g. an extra empty line at the end of the file),
    # which json.loads() would reject with a JSONDecodeError.
    data = [json.loads(line) for line in file if line.strip()]

# Filter necessary columns
# .get() keeps records that lack a field instead of aborting the whole run
# with a KeyError: a missing or null review body becomes '' and a missing
# 'overall' value becomes None.
filtered_data = [
    {'reviewText': d.get('reviewText') or '', 'overall': d.get('overall')}
    for d in data
]

# Stop words list
stop_words = {
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
    "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are",
    "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an",
    "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about",
    "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up",
    "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when",
    "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no",
    "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don",
    "should", "now"
}

# Matches every character that is neither a word character nor whitespace.
# Compiled once here so the pattern is not looked up again for every review.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')


# Text preprocessing
def preprocess_text(text):
    """
    Return a cleaned copy of ``text`` for word-level matching.

    The text is lowercased, every punctuation character is replaced by a
    space, and words found in ``stop_words`` are dropped. The remaining
    words are joined with single spaces.

    Args:
        text: The raw review text. ``None`` or an empty string is accepted.

    Returns:
        The cleaned text, or ``''`` when ``text`` is empty or ``None``.
    """
    if not text:
        return ''
    # Punctuation becomes a space rather than being deleted: deleting it
    # fuses words written without a following space ("good,bad" would turn
    # into "goodbad"), and it prevents the "s", "t" and "don" stop-word
    # entries from ever matching (e.g. "don't" must split into "don" + "t").
    text = _PUNCTUATION_RE.sub(' ', text.lower())
    # split() with no argument also collapses the runs of whitespace that
    # the substitution above can leave behind.
    return ' '.join(word for word in text.split() if word not in stop_words)

# Apply preprocessing to review texts
# Every record is updated in place: only its 'reviewText' value is replaced
# by the cleaned form, the other keys and the list order are left alone.
for record in filtered_data:
    record.update(reviewText=preprocess_text(record['reviewText']))

# Step 2: Thematic Analysis

# Determine key phrases or words for positive and negative reviews
# Words taken as markers of a favourable review.
positive_words = [
    'good', 'great', 'excellent', 'amazing', 'awesome',
    'fantastic', 'superb', 'wonderful', 'terrific', 'impressive',
    'outstanding', 'brilliant', 'stellar', 'splendid', 'phenomenal',
]
# Words taken as markers of an unfavourable review.
negative_words = [
    'bad', 'poor', 'terrible', 'awful', 'horrible',
    'dreadful', 'abysmal', 'atrocious', 'pathetic', 'disappointing',
    'lousy', 'mediocre', 'inferior', 'unsatisfactory', 'subpar',
]

# Step 3: Sentiment Analysis

# Define rules and weights
# Maps a sentiment word to the amount it adds to a review's score.
# Positive words carry weights from 0.65 to 0.8; negative words carry
# weights from -0.15 to -0.3.
word_weights = {
    # Positive words
    'good': 0.65,
    'great': 0.7,
    'excellent': 0.75,
    'amazing': 0.8,
    'awesome': 0.8,
    'fantastic': 0.8,
    'superb': 0.8,
    'wonderful': 0.75,
    'terrific': 0.7,
    'impressive': 0.7,
    'outstanding': 0.75,
    'brilliant': 0.75,
    'stellar': 0.7,
    'splendid': 0.7,
    'phenomenal': 0.8,
    # Negative words
    'bad': -0.15,
    'poor': -0.2,
    'terrible': -0.25,
    'awful': -0.3,
    'horrible': -0.3,
    'dreadful': -0.3,
    'abysmal': -0.3,
    'atrocious': -0.3,
    'pathetic': -0.2,
    'disappointing': -0.2,
    'lousy': -0.2,
    'mediocre': -0.2,
    'inferior': -0.2,
    'unsatisfactory': -0.2,
    'subpar': -0.2,
}

# Calculate sentiment scores
# A review's score is the running total of the weights of its words; a word
# with no entry in word_weights adds 0. Each result carries the review text
# and its 'overall' value alongside the computed score.
sentiment_scores = []
for entry in filtered_data:
    total = 0
    for token in entry['reviewText'].split():
        total += word_weights.get(token, 0)
    sentiment_scores.append({
        'reviewText': entry['reviewText'],
        'overall': entry['overall'],
        'score': total,
    })

# Define threshold for categorization
# Scores strictly above positive_threshold are 'positive', scores strictly
# below negative_threshold are 'negative', everything in between (both
# bounds included) is 'neutral'.
# NOTE: negative_threshold is itself greater than zero, so a score of 0 is
# labelled 'negative'.
positive_threshold = 0.75
negative_threshold = 0.07

# Categorize reviews
categorized_reviews = []
for scored in sentiment_scores:
    value = scored['score']
    if value > positive_threshold:
        label = 'positive'
    elif value < negative_threshold:
        label = 'negative'
    else:
        label = 'neutral'
    categorized_reviews.append({'reviewText': scored['reviewText'], 'sentiment': label})

# Step 4: Storage

# Save categorized reviews to a text file
# The encoding is fixed to UTF-8 so that review text containing characters
# outside the platform's default encoding cannot abort the write with a
# UnicodeEncodeError, and so the file is encoded the same way on every
# platform.
with open('resultfile.txt', 'w', encoding='utf-8') as output_file:
    # categorized_reviews holds one entry per sentiment_scores entry, in the
    # same order, so the two lists are walked in lockstep.
    for review, scored in zip(categorized_reviews, sentiment_scores):
        score = scored['score']
        # One record per review: text, label and score, then a blank line.
        output_file.write(f"Review: {review['reviewText']}\nSentiment: {review['sentiment']}\nScore: {score}\n\n")
