In [1]:
!pip install vaderSentiment
import pandas as pd
import json
import gzip
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sentiment = SentimentIntensityAnalyzer()

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [2]:
# Mount Google Drive at /content/drive; the google.colab module is only
# available inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
def read_jsonl_with_error_handling(file_path):
    """Load a JSON Lines file into a DataFrame, skipping lines that fail to parse.

    Every non-blank line is decoded with ``json.loads``. A line that is not
    valid JSON is reported on stdout (1-based line number, decoder message and
    the stripped line) and then skipped, so one damaged record does not abort
    the load. Blank or whitespace-only lines are ignored without a report.

    Args:
        file_path: Path of a UTF-8 encoded JSON Lines file.

    Returns:
        pandas.DataFrame built from the successfully decoded lines, in file
        order. It is empty when no line could be decoded.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            stripped = line.strip()
            if not stripped:
                # A blank line carries no record; it is not a parse error.
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error parsing line {line_num}: {e}")
                print(f"Problematic line: {stripped}")
    return pd.DataFrame(records)

# Load Appliances.jsonl (path is relative to the working directory); lines that
# fail to parse are printed and skipped by the loader rather than raising.
df = read_jsonl_with_error_handling('Appliances.jsonl')

Error parsing line 352807: Unterminated string starting at: line 1 column 74 (char 73)
Problematic line: {"rating": 5.0, "title": "Great, especially with SF Bay K Cups", "text": "I am reviewing the &#34;upgraded&#34; version. It seems from other reviews they've solved the problem where sometimes the machine gets clogged. I have had no problems yet. But another idea, if you are worried - San Francisco Bay K Cups, sold on Amazon are both the cheapest K cups out there and surprisingly strong and delicious. Amazon sel


In [5]:
# Load the first MAX_REVIEWS lines of the gzipped JSON Lines archive on Drive.
file_path = '/content/drive/My Drive/Electronics.json.gz'
MAX_REVIEWS = 3_000_000  # cap on the number of lines read from the archive

data_list = []
with gzip.open(file_path, 'rb') as f:
    for i, line in enumerate(f):
        if i >= MAX_REVIEWS:
            break
        text = line.decode('utf-8')
        if not text.strip():
            # Blank line: no record to decode.
            continue
        try:
            data = json.loads(text)
        except json.JSONDecodeError as e:
            # Report and skip a damaged record instead of aborting the whole
            # load, in the same format as read_jsonl_with_error_handling().
            print(f"Error parsing line {i + 1}: {e}")
            print(f"Problematic line: {text.strip()}")
            continue
        data_list.append(data)
df1 = pd.DataFrame(data_list)

In [6]:
def add_sentiment_label(text):
    """Prefix ``text`` with a sentence naming its VADER sentiment bucket.

    The bucket is chosen from the ``compound`` value returned by the
    module-level ``sentiment`` analyzer:

    * ``>= 0.75``            -> very positive
    * ``(0.05, 0.75)``       -> positive
    * ``[-0.05, 0.05]``      -> neutral
    * ``(-0.75, -0.05)``     -> negative
    * anything else          -> very negative

    Returns:
        The label sentence followed by the unchanged ``text``.
    """
    score = sentiment.polarity_scores(text)['compound']

    # The checks run from the highest band downwards, so each one only needs
    # the lower bound of its band.
    if score >= 0.75:
        prefix = "This is a very positive review: "
    elif score > 0.05:
        prefix = "This is a positive review: "
    elif score >= -0.05:
        prefix = "This is a neutral review: "
    elif score > -0.75:
        prefix = "This is a negative review: "
    else:
        prefix = "This is a very negative review: "

    return prefix + text

def word_count(text):
    """Return the number of whitespace-separated words in ``text``.

    Missing values (``None``, or the float NaN pandas uses for absent text)
    and other non-text inputs count as 0 words instead of raising, so the
    function can be applied to a review column that has gaps.
    """
    if not isinstance(text, (str, bytes)):
        return 0
    return len(text.split())

In [7]:
# Plain-text export for product B004VV8GOQ.
filtered_df = df[df['asin'] == 'B004VV8GOQ']

# Drop reviews with no text (NaN) so only real strings reach the cleaning
# and word-count steps.
filtered_text = filtered_df['text'].dropna()

# Strip <br> tags, and remove <a>...</a> links together with their link text.
filtered_text = filtered_text.str.replace(r'<br\s*/?>', '', regex=True)
filtered_text = filtered_text.str.replace(r'<a[^>]*>(.*?)</a>', '', regex=True)

# Keep reviews of at least 25 words.
filtered_text = filtered_text[filtered_text.apply(word_count) >= 25]

formatted_reviews = pd.DataFrame({'review': filtered_text})

# One JSON object per line; force_ascii=False leaves non-ASCII characters unescaped.
output_path = 'IceBucket_text.json'
formatted_reviews.to_json(output_path, orient='records', lines=True, force_ascii=False)

In [8]:
# Plain-text export for product B002C8HR9A.
filtered_df = df[df['asin'] == 'B002C8HR9A']

# Drop reviews with no text (NaN) so only real strings reach the cleaning
# and word-count steps.
filtered_text = filtered_df['text'].dropna()

# Strip <br> tags, and remove <a>...</a> links together with their link text.
filtered_text = filtered_text.str.replace(r'<br\s*/?>', '', regex=True)
filtered_text = filtered_text.str.replace(r'<a[^>]*>(.*?)</a>', '', regex=True)

# Keep reviews of at least 25 words.
filtered_text = filtered_text[filtered_text.apply(word_count) >= 25]

formatted_reviews = pd.DataFrame({'review': filtered_text})

# One JSON object per line; force_ascii=False leaves non-ASCII characters unescaped.
output_path = 'WashingMachine_text.json'
formatted_reviews.to_json(output_path, orient='records', lines=True, force_ascii=False)

In [9]:
# Plain-text export for product B00007E7C8 (Electronics frame, 'reviewText' column).
filtered_df = df1[df1['asin'] == 'B00007E7C8']

# Drop reviews with no text (NaN) so only real strings reach the cleaning
# and word-count steps.
filtered_text = filtered_df['reviewText'].dropna()

# Strip <br> tags, and remove <a>...</a> links together with their link text.
filtered_text = filtered_text.str.replace(r'<br\s*/?>', '', regex=True)
filtered_text = filtered_text.str.replace(r'<a[^>]*>(.*?)</a>', '', regex=True)

# Keep reviews of at least 25 words.
filtered_text = filtered_text[filtered_text.apply(word_count) >= 25]

formatted_reviews = pd.DataFrame({'review': filtered_text})

# One JSON object per line; force_ascii=False leaves non-ASCII characters unescaped.
output_path = 'Headphones_text.json'
formatted_reviews.to_json(output_path, orient='records', lines=True, force_ascii=False)

In [10]:
# Plain-text export for product B0016CWV3U (Electronics frame, 'reviewText' column).
filtered_df = df1[df1['asin'] == 'B0016CWV3U']

# Drop reviews with no text (NaN) so only real strings reach the cleaning
# and word-count steps.
filtered_text = filtered_df['reviewText'].dropna()

# Strip <br> tags, and remove <a>...</a> links together with their link text.
filtered_text = filtered_text.str.replace(r'<br\s*/?>', '', regex=True)
filtered_text = filtered_text.str.replace(r'<a[^>]*>(.*?)</a>', '', regex=True)

# Keep reviews of at least 25 words.
filtered_text = filtered_text[filtered_text.apply(word_count) >= 25]

formatted_reviews = pd.DataFrame({'review': filtered_text})

# One JSON object per line; force_ascii=False leaves non-ASCII characters unescaped.
output_path = 'AlarmClock_text.json'
formatted_reviews.to_json(output_path, orient='records', lines=True, force_ascii=False)

With Sentiment: the same four product exports, with a VADER sentiment label prepended to each review


In [11]:
# Sentiment-labelled export for product B004VV8GOQ.
filtered_df = df[df['asin'] == 'B004VV8GOQ']

# Drop reviews with no text (NaN): neither the string cleaning nor the
# sentiment labeller can handle a non-string value.
filtered_text = filtered_df['text'].dropna()

# Clean the HTML first so the sentiment is scored on the same text that is
# exported: <br> tags are stripped, <a>...</a> links are removed with their text.
filtered_text = filtered_text.str.replace(r'<br\s*/?>', '', regex=True)
filtered_text = filtered_text.str.replace(r'<a[^>]*>(.*?)</a>', '', regex=True)

# Apply the 25-word minimum to the review itself, before the label is
# prepended. The label adds 5 or 6 words depending on its wording, so counting
# afterwards would make the cutoff depend on the sentiment.
filtered_text = filtered_text[filtered_text.apply(word_count) >= 25]

filtered_text = filtered_text.apply(add_sentiment_label)

formatted_reviews = pd.DataFrame({'review': filtered_text})

# One JSON object per line; force_ascii=False leaves non-ASCII characters unescaped.
output_path = 'SentimentIceBucket_text.json'
formatted_reviews.to_json(output_path, orient='records', lines=True, force_ascii=False)

In [12]:
# Sentiment-labelled export for product B002C8HR9A.
filtered_df = df[df['asin'] == 'B002C8HR9A']

# Drop reviews with no text (NaN): neither the string cleaning nor the
# sentiment labeller can handle a non-string value.
filtered_text = filtered_df['text'].dropna()

# Clean the HTML first so the sentiment is scored on the same text that is
# exported: <br> tags are stripped, <a>...</a> links are removed with their text.
filtered_text = filtered_text.str.replace(r'<br\s*/?>', '', regex=True)
filtered_text = filtered_text.str.replace(r'<a[^>]*>(.*?)</a>', '', regex=True)

# Apply the 25-word minimum to the review itself, before the label is
# prepended. The label adds 5 or 6 words depending on its wording, so counting
# afterwards would make the cutoff depend on the sentiment.
filtered_text = filtered_text[filtered_text.apply(word_count) >= 25]

filtered_text = filtered_text.apply(add_sentiment_label)

formatted_reviews = pd.DataFrame({'review': filtered_text})

# One JSON object per line; force_ascii=False leaves non-ASCII characters unescaped.
output_path = 'SentimentWashingMachine_text.json'
formatted_reviews.to_json(output_path, orient='records', lines=True, force_ascii=False)

In [13]:
# Sentiment-labelled export for product B00007E7C8 (Electronics frame, 'reviewText' column).
filtered_df = df1[df1['asin'] == 'B00007E7C8']

# Drop reviews with no text (NaN): neither the string cleaning nor the
# sentiment labeller can handle a non-string value.
filtered_text = filtered_df['reviewText'].dropna()

# Clean the HTML first so the sentiment is scored on the same text that is
# exported: <br> tags are stripped, <a>...</a> links are removed with their text.
filtered_text = filtered_text.str.replace(r'<br\s*/?>', '', regex=True)
filtered_text = filtered_text.str.replace(r'<a[^>]*>(.*?)</a>', '', regex=True)

# Apply the 25-word minimum to the review itself, before the label is
# prepended. The label adds 5 or 6 words depending on its wording, so counting
# afterwards would make the cutoff depend on the sentiment.
filtered_text = filtered_text[filtered_text.apply(word_count) >= 25]

filtered_text = filtered_text.apply(add_sentiment_label)

formatted_reviews = pd.DataFrame({'review': filtered_text})

# One JSON object per line; force_ascii=False leaves non-ASCII characters unescaped.
output_path = 'SentimentHeadphones_text.json'
formatted_reviews.to_json(output_path, orient='records', lines=True, force_ascii=False)

In [14]:
# Sentiment-labelled export for product B0016CWV3U (Electronics frame, 'reviewText' column).
filtered_df = df1[df1['asin'] == 'B0016CWV3U']

# Drop reviews with no text (NaN): neither the string cleaning nor the
# sentiment labeller can handle a non-string value.
filtered_text = filtered_df['reviewText'].dropna()

# Clean the HTML first so the sentiment is scored on the same text that is
# exported: <br> tags are stripped, <a>...</a> links are removed with their text.
filtered_text = filtered_text.str.replace(r'<br\s*/?>', '', regex=True)
filtered_text = filtered_text.str.replace(r'<a[^>]*>(.*?)</a>', '', regex=True)

# Apply the 25-word minimum to the review itself, before the label is
# prepended. The label adds 5 or 6 words depending on its wording, so counting
# afterwards would make the cutoff depend on the sentiment.
filtered_text = filtered_text[filtered_text.apply(word_count) >= 25]

filtered_text = filtered_text.apply(add_sentiment_label)

formatted_reviews = pd.DataFrame({'review': filtered_text})

# One JSON object per line; force_ascii=False leaves non-ASCII characters unescaped.
output_path = 'SentimentAlarmClock_text.json'
formatted_reviews.to_json(output_path, orient='records', lines=True, force_ascii=False)