In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy
import re

from preprocessing import ReviewPreprocessor
from embeddings import EmbeddingsManager
from sentiments import Sentiments

# Create an instance of classes used in the script
sentimentsManager: Sentiments = None
embeddingManager: EmbeddingsManager = None
preprocessor: ReviewPreprocessor = None

seed = 1967
file_path = "digital_music.jsonl"
topicGeneral = os.path.splitext(os.path.basename(file_path))[0]
topicPath = os.path.join("data", topicGeneral)
if not os.path.exists(topicPath):
    os.makedirs(topicPath)
# Create a directory for the topic and seed
topicSeedPath = os.path.join(topicPath, str(seed))
if not os.path.exists(topicSeedPath):
    os.makedirs(topicSeedPath)


In [2]:

# Set result file (CSV format, containing original and adjusted ratings)
result_file = os.path.join(topicGeneral, f"{topicGeneral}_results.csv")

# Instantiate the embeddings manager
#embeddingManager = EmbeddingsManager(cachePath = topicPath)
# Initialize sentiment cache using the instance method
sentimentsManager = Sentiments(cachePath=topicPath)
# Instantiate the ReviewPreprocessor class (which now handles cache initialization)
preprocessor = ReviewPreprocessor(cachePath = topicPath)

In [3]:
label_text = "text"
label_rating = "rating"
reviewsToProcess: int = 1000


In [4]:

# Load a random sample of reviews from the file.
original_reviews, original_indices = preprocessor.LoadReviews(
    file_path, reviewsToProcess, label_text, label_rating, seed
)
print(f"\nLoaded {len(original_reviews)} reviews.")


Loaded 1000 reviews.


In [5]:

print("Preprocessing reviews...")
# Build a dict mapping review text to its rating (this is useful for caching).
reviews_dict: dict[str, float] = {
    str(review[label_text]): review[label_rating] for review in original_reviews
}
# Call the class method on the preprocessor instance.
preprocessed_reviews = preprocessor.PreprocessReviews(reviews_dict)
del original_reviews,original_indices

Preprocessing reviews...


In [6]:
# Define the analyze_sentiment function
def analyze_sentiment(pairs: tuple[str, str],sid = None) -> dict[str, dict]:
    """
    Analyze sentiment for a list of noun/adjective pairs using VADER.
    Args:
        pairs (list of tuples): List of tuples containing noun/adjective pairs.
        sid (SentimentIntensityAnalyzer): Optional VADER sentiment analyzer instance.
        If not provided, a new instance will be created.
    Returns:
        dict: A dictionary with sentiment scores for each pair, using VADER
        format (compound, pos, neg, neu).
    """
    if sid is None:
        sid = SentimentIntensityAnalyzer()
    scores: dict[str, dict] = {}

    for noun, adj in pairs:
        phrase = f"{adj} {noun}"
        score = sid.polarity_scores(phrase)
        scores[phrase] = score
    return scores

In [7]:
# Extract adjective-noun pairs
pairs: list[tuple[str, str]] = []
nouns: list[str] = []
# The dictionary associates the original review with its corrected form and
# the adjective-noun pairs.
reviews_dict: dict[tuple[str,str, str]] = {}

# Load the SpaCy model
nlp = spacy.load("en_core_web_sm")

for rawReview, rawReviewData in preprocessed_reviews.items():

    # Check if the data we are about to calculate are already in the cache.
    # If so, skip the calculation and use the cached data.
    cachedReview = preprocessor.GetReviewFromCache(rawReview)
    if cachedReview is not None and "pairs" in cachedReview:
        # Use the cached data
        pairs = cachedReview["pairs"]
        nouns = cachedReview["nouns"]
    else:
        # If not, process the review to extract adjective-noun pairs.
        # split sentences on hard punctuation (periods, exclamation marks, question marks)
        sentences = re.split(r'(?<=[.!?]) +', rawReviewData["corrected"])
        pairs = []
        nouns = []
        for sentence in sentences:
            sentence = sentence.strip()
            if len(sentence) < 4:
                continue
            # Process the sentence with SpaCy.
            # This is the core idea of the method: we assume that the sentiment in a review
            # is mainly expressed by nouns combined with adjectives, like in "good music"
            # or "awful service"
            # The extraction uses Spacy.
            # - "amod" means adjectival modifier (e.g., "good" in "good music")
            # - "acomp" means adjectival complement (e.g., "good" in "the product is good")
            # - "nsubj" means nominal subject (e.g., "product" in "the product is good") 
            doc = nlp(sentence)
            for token in doc:
                if token.pos_ == "NOUN":
                    # Token "children" are the words that depend on it.
                    for child in token.children:
                        if child.dep_ == "amod":
                            # adjective modifier (e.g., "good" in "good music")
                            pairs.append((token.text, child.text))
                            nouns.append(token.text)
                elif token.dep_ == "acomp":
                    # adjectival complement (e.g., "good" in "the product is good").
                    # Now search its subject (the noun).
                    subjects = [child for child in token.head.children if child.dep_ == "nsubj"]
                    if subjects:
                        # Found, we can add the pair
                        pairs.append((subjects[0].text, token.text))
                        nouns.append(subjects[0].text)

        # Lemmatization is useful for cases where singual and plural forms are used
        # interchangeably, like "good music" and "good musics".
        pairs = [(preprocessor.LemmatizeText(noun), adj) for noun, adj in pairs]
        # Remove duplicates from pairs
        pairs = sorted(list(set(pairs)))
        # Recalculate the nouns based on the pairs
        nouns = sorted(list(set([noun for noun, _ in pairs])))

        # Add the pairs to the preprocessing cache.
        # Note the use of item as the key, which is the original review text.
#        preprocessor.AddSubitemsToReviewCache(rawReview, {"pairs": pairs})
#        preprocessor.AddSubitemsToReviewCache(rawReview, {"nouns": nouns})

    # Add the pairs to the review_dict for later sentiment analysis.
    # Differently, the review_dict uses the corrected review text as the key.
    reviews_dict[rawReview] = {
        "O-Score": rawReviewData["score"],
        "readable": rawReviewData["readable"],
        "corrected": rawReviewData["corrected"],
        "nouns": nouns,
        "pairs": pairs
    }

# Print the results
if 0:
    for rawReviewData in reviews_dict.values():
        print(f"Review: {rawReviewData['corrected'][:64]}")
        print(f"\tNouns: {rawReviewData['nouns']}")
        print(f"\tPairs: {rawReviewData['pairs']}")


In [None]:

filtered_reviews_dict: dict[str, list[dict]] = {}

sid = SentimentIntensityAnalyzer()
index: int = 0

print("\nCalculating sentiment scores for the reviews...")
for rawReview, rawReviewData in reviews_dict.items():

    index += 1
    if (index % 100) == 0:
        print(f"\n{index} of {len(reviews_dict)}")

    pairs = rawReviewData["pairs"]
    # Calculate the sentiment scores for the pairs, then filter out
    # those with a compound score below 0.05
    scores = analyze_sentiment(pairs = pairs, sid = sid)
    filtered_pairs = [
        (pair.split()[1], pair.split()[0]) 
        for pair, score in scores.items()
        if abs(score['compound']) >= 0.05
    ]
    V_Pairs: float = np.sum([score['compound'] for score in scores.values()])
    # Skip if no pair meets the criteria
    if not filtered_pairs:
        continue
     # Calculate and store:
    # - V-whole: the compound score of the review (VADER on the whole review)
    # - O-Score: the original score of the review (from the dataset)
    V_Whole = sid.polarity_scores(rawReview)["compound"]
    O_Score = rawReviewData["O-Score"]

    # Add a new key to the filtered_reviews_dict dictionary. We also store
    # compound, it will be used later.
    filtered_reviews_dict[rawReview] = {
        "readable": rawReviewData["readable"],
        "corrected": rawReviewData["corrected"],
        "pairs": filtered_pairs,
        "nouns": sorted(list(set([noun for noun, _ in filtered_pairs]))),
        "V-pairs": V_Pairs,
        "O-Score": O_Score,
        "V-whole": V_Whole
    }

    # Also update the cache, as the pairs and nouns may have changed.
    # We are not interested in storing the scores.
    preprocessor.AddSubitemsToReviewCache(rawReview, {"pairs": filtered_pairs})
    preprocessor.AddSubitemsToReviewCache(rawReview, {"nouns": filtered_reviews_dict[rawReview]["nouns"]})

print(f"{len(filtered_reviews_dict)} of {len(reviews_dict)} have relevant sentiments.")



Calculating sentiment scores for the reviews...
1 of 985 have relevant sentiments.
2 of 985 have relevant sentiments.
3 of 985 have relevant sentiments.
4 of 985 have relevant sentiments.
5 of 985 have relevant sentiments.
6 of 985 have relevant sentiments.
7 of 985 have relevant sentiments.
8 of 985 have relevant sentiments.
9 of 985 have relevant sentiments.
10 of 985 have relevant sentiments.
11 of 985 have relevant sentiments.
12 of 985 have relevant sentiments.
13 of 985 have relevant sentiments.
14 of 985 have relevant sentiments.
15 of 985 have relevant sentiments.
16 of 985 have relevant sentiments.
17 of 985 have relevant sentiments.
18 of 985 have relevant sentiments.
19 of 985 have relevant sentiments.
20 of 985 have relevant sentiments.
21 of 985 have relevant sentiments.
22 of 985 have relevant sentiments.
23 of 985 have relevant sentiments.
24 of 985 have relevant sentiments.
25 of 985 have relevant sentiments.
26 of 985 have relevant sentiments.
27 of 985 have relevant 

In [None]:
# In this last step, we invoke a LLM to parse the sentiment score, the so-called "L-score".
# The LLM will be asked to parse the review text and the noun list, and return a score.

# Create a dictionary to store the results
parsed_scores: dict[str, dict] = {}
index: int = 0

print("\nCalculating LLM scores for the reviews...")
# Iterate through each review in the filtered_reviews_dict
for rawReview, rawReviewData in filtered_reviews_dict.items():
    index += 1
    if (index % 100) == 0:
        print(f"\n{index} of {len(filtered_reviews_dict)}")
    # Invoke parseScore() and store the result in the dictionary
    parsed_scores[rawReview] = sentimentsManager.parseScore(rawReviewData["readable"], rawReviewData["nouns"])
    # Calculate the LLM score as the sum of the parsed scores, then
    # add it to the review dictionary.
    plusValues = sum([score for score in parsed_scores[rawReview].values() if score > 0])
    minusValues = sum([score for score in parsed_scores[rawReview].values() if score < 0])
    neutralValues = sum([score for score in parsed_scores[rawReview].values() if score == 0])
    llm_score = sum(parsed_scores[rawReview].values())
    # Update the review dictionary with the LLM score
    rawReviewData["L-score"] = llm_score
    rawReviewData["L-scoreP"] = plusValues
    rawReviewData["L-scoreM"] = minusValues
    rawReviewData["L-scoreN"] = neutralValues
    # Add the LLM score to the cache
    preprocessor.AddSubitemsToReviewCache(rawReview, {"L-score": llm_score})
    preprocessor.AddSubitemsToReviewCache(rawReview, {"L-scoreP": plusValues})
    preprocessor.AddSubitemsToReviewCache(rawReview, {"L-scoreM": minusValues})
    preprocessor.AddSubitemsToReviewCache(rawReview, {"L-scoreN": neutralValues})
print("\nDone.")



CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC

In [None]:

# Print the filtered results from the new dictionary
if 0:
    for rawReview, rawReviewData in filtered_reviews_dict.items():
        print(f"Review: {rawReview[:32]}...")
        print(f"\tNouns: {rawReviewData['nouns']}")
        print(f"\tPairs: {rawReviewData['pairs']}")
        print(f"\tO-Score (stars): {rawReviewData['O-Score']:.2f}")
        print(f"\tL-Score: {rawReviewData['L-score']:.2f}")


In [None]:
# Now write the results to a CSV file with adjusted rewiews and all the scores.
import csv
import time
result_file = os.path.join(topicSeedPath, f"scores.csv")
with open(result_file, mode='w', newline='', encoding='utf-8') as csvfile:

    fieldnames = [
        'timestamp','O-score',
        'L-score','L-scoreP','L-scoreM','L-scoreN',
        'V-Whole','readable','corrected','review'
    ]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for rawReview, rawReviewData in filtered_reviews_dict.items():
        writer.writerow({
            'timestamp': time.strftime("%Y-%m-%d %H:%M:%S"),
            'O-score': preprocessed_reviews[rawReview]["score"],
            'L-score': f"{rawReviewData['L-score']:.2f}",
            'L-scoreP': f"{rawReviewData['L-scoreP']:.2f}",
            'L-scoreM': f"{rawReviewData['L-scoreM']:.2f}",
            'L-scoreN': f"{rawReviewData['L-scoreN']:.2f}",
            'V-Whole': f"{rawReviewData['V-whole']:.2f}",
            'readable': f"{rawReviewData['readable']}",
            'corrected': rawReviewData["corrected"],
            'review': rawReview
        })

In [None]:
# now select reviews and save them to a file for human scoring.
import random
result_file = os.path.join(topicSeedPath, f"scores.csv")
# Select up to 100 reviews
num_reviews_to_select = min(100, len(filtered_reviews_dict))
random.seed(seed)  # Set the seed for reproducibility
selected_reviews = random.sample(list(filtered_reviews_dict.items()), num_reviews_to_select)
# Save the selected reviews to a csv file


In [None]:
import csv
selected_reviews_file = os.path.join(topicSeedPath, f"selected_reviews.csv")
# Check if the file already exists and ask the user if they want to overwrite it
if os.path.exists(selected_reviews_file):
    overwrite = input(f"{selected_reviews_file} already exists. Overwrite it? (y/n): ")
    if overwrite.lower() != 'y':
        print("Exiting without overwriting the file.")
        exit()

with open(selected_reviews_file, mode='w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['readable','hscore','review']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for rawReview, rawReviewData in selected_reviews:
        writer.writerow({
            'readable': f"{rawReviewData['readable']}",
            'hscore': 0,
            'review': rawReview
        })

In [None]:
print("The code stops here. Please manually grade the reviews and run the next script to plot the results.")
exit(0)

The code stops here. Please manually grade the reviews and run the next script to plot the results.


In [16]:
# Combine human scores and preprocessing cache for ML training
X = []
Y = []

# Read the updated file with human scores
with open(selected_reviews_file, mode='r', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        review = row['review']
        hscore = float(row['hscore'])  # Convert human score to float

        # Retrieve data from the preprocessing cache
        cached_data = preprocessor.GetReviewFromCache(review)
        if cached_data:
            O_score = cached_data.get("score", 0)
            L_scoreP = cached_data.get("L-scoreP", 0)
            L_scoreM = cached_data.get("L-scoreM", 0)
            L_scoreN = cached_data.get("L-scoreN", 0)

            # Append features to X and target to Y
            X.append([O_score, L_scoreP, L_scoreM, L_scoreN])
            Y.append(hscore)

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Split the data into training and testing sets (85% train, 15% test)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=seed)

# Train a Random Forest Regressor
model = RandomForestRegressor(random_state=seed)
model.fit(X_train, Y_train)

# Make predictions on the test set
Y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

Mean Squared Error: 0.40
R^2 Score: 0.80


In [18]:
import pickle

model_file = os.path.join(topicSeedPath, "ML_model.pkl")
with open(model_file, "wb") as f:
    pickle.dump(model, f)

print(f"Model saved to {model_file}")


Model saved to data\Magazine_Subscriptions\1967\ML_model.pkl


In [20]:
import random

# Select another review from the filtered_reviews_dict

random.seed()  # Ensure reproducibility
random_review_key = random.choice(list(filtered_reviews_dict.keys()))
random_review_data = filtered_reviews_dict[random_review_key]

# Extract features for prediction
features = [
    random_review_data["O-Score"],
    random_review_data["L-scoreP"],
    random_review_data["L-scoreM"],
    random_review_data["L-scoreN"]
]

# Predict the human score
predicted_hscore = model.predict([features])[0]

# Print the readable review and the predicted score
print(f"Readable Review: {random_review_data['readable']}")
print(f"Predicted Human Score: {predicted_hscore:.2f}")

Readable Review: i paid for a two year subscription, after two issues they changed to a 'web only' magazine and stopped sending me issues. so now i paid to subscribe to some blog, worthless!
Predicted Human Score: 3.71


In [None]:
grades: dict[str, int] = {}

if 0:
    f = open(os.path.join(topicSeedPath, "grades.txt"), "w", encoding="utf-8")
    for rawReview, rawReviewData in filtered_reviews_dict.items():
        grades[rawReview] = sentimentsManager.assignGradeToReview(rawReviewData["readable"])
        f.write(f"{grades[rawReview]}, \"{rawReviewData["readable"]}\"\n")
        f.flush()
    f.close()
