# Haroon Wajid
# 21i-1763
# DS-C

# 1. Data Loading 

In [49]:
# import json

# Read JSON data from a file line by line
with open("inputcell.json", "r") as file:
    for line in file:
        try:
            # Parse JSON data from each line
            json_data = json.loads(line)
            
        except json.JSONDecodeError as e:
            print("Error decoding JSON:", e)
# Open the JSON file
with open("inputcell.json", "r") as file:
    # Read and print the first 10 lines
    for i, line in enumerate(file):
        print(line)
        if i == 3:  # Stop after printing 3 lines
            break


{"reviewerID": "A30TL5EWN6DFXT", "asin": "120401325X", "reviewerName": "christina", "helpful": [0, 0], "reviewText": "They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again", "overall": 4.0, "summary": "Looks Good", "unixReviewTime": 1400630400, "reviewTime": "05 21, 2014"}

{"reviewerID": "ASY55RVNIL0UD", "asin": "120401325X", "reviewerName": "emily l.", "helpful": [0, 0], "reviewText": "These stickers work like the review says they do. They stick on great and they stay on the phone. They are super stylish and I can share them with my sister. :)", "overall": 5.0, "summary": "Really great product.", "unixReviewTime": 1389657600, "reviewTime": "01 14, 2014"}

{"reviewerID": "A2TMXE2AFO7ONB", "asin": "120401325X", "reviewerName": "Erica", "helpful": [0, 0], "reviewText": "These are awesome and make my phone look so stylish! I have only used one so far and

# Preprocessing

In [50]:
import json
import re

def load_json_iteratively(file_path):
    """Loads JSON data line by line for memory efficiency."""
    with open(file_path, 'r') as file:
        for line in file:
            yield json.loads(line)

def preprocess_text_inplace(text):
    """Preprocesses text in-place for potential performance gains."""
    # Remove punctuation and convert to lowercase
    text = re.sub(r'[^\w\s]', '', text).lower()
    return text


def remove_stop_words_set(text, stop_words):
    """Removes stop words using a set for efficient membership checks."""
    words = text.split()
    filtered_words = {word for word in words if word not in stop_words}
    return ' '.join(filtered_words)

# Load stop words from a local file (replace with your file path)
with open('custom_stopwords.txt', 'r') as file:
    stop_words = set(file.read().splitlines())

# Preprocess data using a generator for memory optimization
def preprocess_data_generator(file_path):
    for review in load_json_iteratively(file_path):
        text = preprocess_text_inplace(review['reviewText'])
        text = remove_stop_words_set(text, stop_words)
        yield text, review['overall']

# Step 1: Data Loading & Preprocessing
preprocessed_data = list(preprocess_data_generator('inputcell.json'))

# Explore preprocessed dataset
print("Number of total preprocessed reviews:", len(preprocessed_data))
print("Sample preprocessed review:", preprocessed_data[8])


Number of preprocessed reviews: 194439
Sample preprocessed review: ('10pm case protects 7am lasts end battery usually start day buttons phone 55 full around glad good house leave return charges im build solid access', 5.0)


# 2. Thematic Analysis

In [51]:
def separate_reviews(preprocessed_data, rating_threshold=4):
    """Separates reviews into positive and negative based on a rating threshold."""
    positive_reviews = []
    negative_reviews = []
    for review, rating in preprocessed_data:
        if rating >= rating_threshold:
            positive_reviews.append(review)
        elif rating <= (rating_threshold - 2):  # Adjust threshold for negative reviews
            negative_reviews.append(review)
    return positive_reviews, negative_reviews

def count_words(words):
    """Counts word occurrences using a dictionary."""
    word_counts = {}
    for word in words:
        word_counts[word] = word_counts.get(word, 0) + 1
    return word_counts

def sort_and_print_top_words(word_counts, label, num_words=5):
    """Sorts and prints the top words for a given label."""
    sorted_words = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)[:num_words]
    print(f"\nTop {num_words} most common {label} words:")
    for word, count in sorted_words:
        print(f"{word}: {count}")

# Load and preprocess data
file_path = 'inputcell.json'
json_data = json.loads(line)
preprocessed_data = [(preprocess_text(review['reviewText']), review['overall']) for review in data]

# Separate reviews and count words
positive_reviews, negative_reviews = separate_reviews(preprocessed_data)
positive_word_counts = count_words(' '.join(positive_reviews).split())
negative_word_counts = count_words(' '.join(negative_reviews).split())

# Print results
sort_and_print_top_words(positive_word_counts, "positive")
sort_and_print_top_words(negative_word_counts, "negative")



Top 5 most common positive words:
the: 731041
and: 388935
i: 384546
it: 357198
a: 355952

Top 5 most common negative words:
the: 119284
i: 64810
it: 62416
to: 54703
and: 52320


# 3. Semantic Analysis and storage

In [52]:
def split_and_lowercase(text):
    """Splits text into words and converts them to lowercase."""
    return re.sub(r"[^\w]", " ", text.lower()).split()

def count_word_occurrences(words, word_sets):
    """Counts occurrences of words in each given word set within a list of words."""
    counts = {tuple(word_set): 0 for word_set in word_sets}
    for word in words:
        for word_set in word_sets:
            if word in word_set:
                counts[tuple(word_set)] += 1
    return counts

def analyze_sentiment(review_text, positive_words, negative_words, neutral_words=None):
    """Analyzes sentiment of a review text based on weighted word lists."""
    words = split_and_lowercase(review_text)
    word_counts = count_word_occurrences(words, [positive_words, negative_words] + [neutral_words] if neutral_words else [positive_words, negative_words])
    sentiment_score = 0
    
    for word_set, weight in [(positive_words, 0.65), (negative_words, -0.2)]:
        sentiment_score += word_counts.get(tuple(word_set), 0) * weight
    
    if neutral_words:
        sentiment_score += word_counts.get(tuple(neutral_words), 0) * 0.1  # Adjust neutral weight

    # Define thresholds for classification based on your data and desired accuracy
    if sentiment_score > 0.2:
        return "Positive"
    elif sentiment_score < -0.2:
        return "Negative"
    else:
        return "Neutral"


# Define word lists with weights and (optional) neutral words
positive_words = {"good", "excellent", "great", "wonderful", "awesome",
    "fantastic", "amazing", "superb", "terrific", "outstanding",
    "marvelous", "splendid", "brilliant", "phenomenal", "remarkable",
    "stellar", "spectacular", "fabulous", "incredible", "glorious",
    "majestic", "unbelievable", "top-notch", "first-rate", "praiseworthy"}  

negative_words = {"bad", "terrible", "horrible","disappointing", "awful",
    "horrible", "subpar", "mediocre", "inferior", "low-quality",
    "unsatisfactory", "unreliable", "defective", "faulty", "displeasing",
    "disgusting", "unpleasant", "worse", "crap", "junk",
    "garbage", "useless", "flawed", "cheap", "shoddy"}

neutral_words = {"the", "is", "it", "this", "that", "and", 
                 "or", "but", "with", "for", "as", "on", "at",
                 "in", "of", "to", "from"}  


# Example review text
review_text = "product was  terrible and bad and horrible."

# Analyze sentiment
sentiment = analyze_sentiment(review_text, positive_words, negative_words, neutral_words)
print("Sentiment of example text:", sentiment)
#---------------------------------------------------------------------------
# Categorize reviews and print results
sentiment_results = categorize_reviews(preprocessed_data, positive_words, negative_words)

print("Some of the sentiment Results are:")
for i in range(3, 7):
    review, sentiment = sentiment_results[i]
    print(f"Review: {review}... | Sentiment: {sentiment}")

# Write results to a file
with open("sentiment_results.txt", "w") as file:
    for index, (review, sentiment) in enumerate(sentiment_results):
        if 3 <= index < 100:
            file.write(f"Review: {review} | Sentiment: {sentiment}\n")


Sentiment of example text: Negative
Some of the sentiment Results are:
Review: item arrived in great time and was in perfect cond... | Sentiment: Positive
Review: awesome stays on and looks great can be used on mu... | Sentiment: Positive
Review: these make using the home button easy my daughter ... | Sentiment: Neutral
Review: came just as described it doesnt come unstuck and ... | Sentiment: Positive
