In [None]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('archive/sentiment_analysis_results.csv')

# Filter out reviews with fewer than 50 characters
df = df[df['review_text'].str.len() >= 50]

# Print column names
print("Column names in the dataset:")
print(df.columns.tolist())


In [4]:
import pandas as pd
import numpy as np
from transformers import pipeline
import torch

# —1) Load your DataFrame (or assume it's already in memory)—
# df = pd.read_csv('zeroshot_analysis_results.csv')

# —2) Zero‐shot pipeline on MPS (if available)—
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    hypothesis_template="The reader found the book to be {}.",
    device=torch.device("mps")  # or "cuda"/-1 for CPU
)

candidate_labels = [
    "Very Helpful",
    "Somewhat Helpful",
    "Somewhat Harmful",
    "Very Harmful",
]

# —3) Select only unlabeled rows—
to_label = df["predicted_label"].isna()
pending_idxs = df.index[to_label].tolist()

# —4) Process in batches—
batch_size = 50
for batch_idxs in np.array_split(pending_idxs, np.ceil(len(pending_idxs) / batch_size)):
    texts   = df.loc[batch_idxs, "review_text"].tolist()
    outputs = classifier(texts, candidate_labels)

    # —5) Unpack top label & score—
    preds  = [o["labels"][0] for o in outputs]
    scores = [o["scores"][0] for o in outputs]

    # —6) Write back using the same index labels—
    df.loc[batch_idxs, "predicted_label"]   = preds
    df.loc[batch_idxs, "confidence_score"]  = scores

    # —7) (Optional) persist progress—
    df.to_csv("zeroshot_analysis_results.csv", index=False)

    print(f"Processed {len(batch_idxs)} reviews; total left: {len(pending_idxs) - len(batch_idxs)}")


# import pandas as pd
# from transformers import pipeline
# import torch

# # 1. Set up zero-shot classifier
# classifier = pipeline(
#     "zero-shot-classification",
#     model="facebook/bart-large-mnli",
#     hypothesis_template="The reader found the book to be {}.",
#     device=torch.device("mps")
# )

# # 2. Define candidate labels
# candidate_labels = [
#     "Very Helpful",
#     "Somewhat Helpful", 
#     "Somewhat Harmful",
#     "Very Harmful"
# ]

# # 3. Function to classify one review
# def classify_review(text):
#     out = classifier(text, candidate_labels, multi_label=False)
#     # take the top-scoring label
#     top_label = out["labels"][0]
#     top_score = out["scores"][0]
#     return pd.Series({"predicted_label": top_label, "score": top_score})

# # 4. Process all reviews in batches and save results as we go
# batch_size = 50
# total_rows = len(df)
# records_processed = 0

# for i in range(0, total_rows, batch_size):
#     end_idx = min(i + batch_size, total_rows)
#     records_processed += end_idx - i
#     print(f"\nProcessing reviews {i} to {end_idx} out of {total_rows}")
#     print(f"Total records processed so far: {records_processed}")
    
#     # Process batch
#     batch_results = df["review_text"].iloc[i:end_idx].apply(classify_review)
    
#     # Update dataframe
#     df.loc[i:end_idx-1, "predicted_label"] = batch_results["predicted_label"]
#     df.loc[i:end_idx-1, "confidence_score"] = batch_results["score"]
    
#     # Save progress
#     df.to_csv('zeroshot_analysis_results.csv', index=False)
    
#     # Print sample of processed results
#     print("\nSample of recently processed results:")
#     print(df.iloc[i:end_idx][["name", "review_text", "predicted_label", "confidence_score"]].head(3))

Processed 50 reviews; total left: 277951
Processed 50 reviews; total left: 277951
Processed 50 reviews; total left: 277951
Processed 50 reviews; total left: 277951
Processed 50 reviews; total left: 277951
Processed 50 reviews; total left: 277951
Processed 50 reviews; total left: 277951
Processed 50 reviews; total left: 277951
Processed 50 reviews; total left: 277951
Processed 50 reviews; total left: 277951
Processed 50 reviews; total left: 277951
Processed 50 reviews; total left: 277951
Processed 50 reviews; total left: 277951
Processed 50 reviews; total left: 277951
Processed 50 reviews; total left: 277951
Processed 50 reviews; total left: 277951
Processed 50 reviews; total left: 277951
Processed 50 reviews; total left: 277951
Processed 50 reviews; total left: 277951
Processed 50 reviews; total left: 277951
Processed 50 reviews; total left: 277951
Processed 50 reviews; total left: 277951
Processed 50 reviews; total left: 277951
Processed 50 reviews; total left: 277951
Processed 50 rev

KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# in df where author_clean is "Malcolm Gladwell"
# Create a spider chart of the predicted_label counts

# Count occurrences of each label
label_counts = df[df['author_clean'] == 'Malcolm Gladwell']['predicted_label'].value_counts()

# Create a spider chart
categories = ['Very Helpful', 'Somewhat Helpful', 'Somewhat Harmful', 'Very Harmful']
values = label_counts.reindex(categories, fill_value=0)

import matplotlib.pyplot as plt

# Create a spider chart
# Set up the figure and polar subplot
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, polar=True)

# Calculate angles for each category
angles = np.linspace(0, 2*np.pi, len(categories), endpoint=False)

# Close the plot by appending the first value
values = np.concatenate((values, [values[0]]))
angles = np.concatenate((angles, [angles[0]]))

# Plot the data
ax.plot(angles, values)

# Fill the area
ax.fill(angles, values, alpha=0.25)

# Set the labels
ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories)

# Add title
plt.title('Distribution of Predicted Labels for Malcolm Gladwell Reviews')

# Show the plot
plt.tight_layout()
plt.show()





In [None]:
# Filter dataframe for Malcolm Gladwell reviews and export to CSV
gladwell_df = df[df['author_clean'] == 'Malcolm Gladwell']
gladwell_df.to_csv('malcolm_gladwell_reviews.csv', index=False)


In [None]:
import pandas as pd

# Read the CSV file
gladwell_df = pd.read_csv('malcolm_gladwell_reviews.csv')


# Get unique names and their counts
print("Unique Names and Counts:")
print(gladwell_df['name'].value_counts())
print("\n")

# Get count of each predicted label
print("Predicted Label Distribution:")
print(gladwell_df['predicted_label'].value_counts())
print("\n")

# Extract adjectives from review text and get most frequent ones
import re
from collections import Counter
from nltk.corpus import stopwords
import nltk

# Download required NLTK data
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# Function to extract adjectives
def extract_adjectives(text):
    # Convert to lowercase and remove special characters
    text = re.sub(r'[^\w\s]', '', text.lower())
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Get POS tags
    pos_tags = nltk.pos_tag(tokens)
    # Extract adjectives (JJ, JJR, JJS)
    adjectives = [word for word, tag in pos_tags if tag.startswith('JJ')]
    return adjectives

# Get all adjectives from review texts
all_adjectives = []
for review in gladwell_df['review_text']:
    all_adjectives.extend(extract_adjectives(review))

# Remove stopwords
stop_words = set(stopwords.words('english'))
all_adjectives = [adj for adj in all_adjectives if adj not in stop_words]

# Save all adjectives to a text file
with open('all_adjectives.txt', 'w') as f:
    f.write(' '.join(all_adjectives))
