In [1]:
import pandas as pd
import requests
import os
import json
import gzip

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re


In [2]:
# INPUT_JSON = "goodreads_reviews_romance.json"
# OUTPUT_CSV = "goodreads_reviews_romance_full.csv"

# def convert_json_to_csv(input_json, output_csv, chunk_size=100000):
#     """Converts a JSON (line-by-line) dataset to CSV format for easier use."""
#     data = []
#     count = 0

#     with gzip.open(input_json, 'rt', encoding="utf-8") if input_json.endswith('.gz') else open(input_json, "r", encoding="utf-8") as f:
#         for i, line in enumerate(f):
#             try:
#                 data.append(json.loads(line))
                
#                 if len(data) >= chunk_size: 
#                     df = pd.DataFrame(data)
#                     df.to_csv(output_csv, mode='a', header=(i == 0), index=False)
#                     data = []  
                    
#                 count += 1
#             except json.JSONDecodeError:
#                 print(f"Skipping malformed JSON on line {i+1}")

#     if data:
#         df = pd.DataFrame(data)
#         df.to_csv(output_csv, mode='a', header=(count == len(data)), index=False)

#     print(f"Conversion complete! Total records processed: {count}")
#     print(f"Saved full dataset as {output_csv}")

# # Run conversion
# convert_json_to_csv(INPUT_JSON, OUTPUT_CSV)


In [17]:
# DEAL WITH SMALL SAMPLE
FILENAME_JSON = "goodreads_reviews_romance.json"

def load_json_lines(filename, sample_size=10000):
    """Loads a JSON file where each line is a separate JSON object."""
    data = []
    with open(filename, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            try:
                data.append(json.loads(line))
                if len(data) >= sample_size:
                    break
            except json.JSONDecodeError:
                print(f"Skipping malformed JSON on line {i+1}")
    
    df = pd.DataFrame(data)
    return df

df_reviews = load_json_lines(FILENAME_JSON, sample_size=20000)


df_reviews.head()

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,8842281e1d1347389f2ab93d60773d4d,1893,5347a776a1703b823ce029d68ae98275,5,** spoiler alert ** \n So the other day Elizab...,Tue Oct 10 19:08:05 -0700 2006,Sun Feb 19 02:04:48 -0800 2017,Sun Oct 01 00:00:00 -0700 2006,,155,4
1,72fb0d0087d28c832f15776b0d936598,17939501,719711cc71eec0bb54d2d97322c0e11b,5,"It is very hard to believe this is all true, b...",Fri Nov 08 20:17:30 -0800 2013,Fri Nov 08 20:22:59 -0800 2013,,,0,0
2,72fb0d0087d28c832f15776b0d936598,15706923,6a870a66f732183b60214d57fa553093,2,Ehhhhhh. \n Really nothing to rave about. It w...,Fri Nov 08 20:14:53 -0800 2013,Fri Nov 08 20:16:50 -0800 2013,,,0,0
3,72fb0d0087d28c832f15776b0d936598,7840190,f73a70f64564d4a8f4cfb2d2e9d5836f,4,Enjoyable read! I liked that Connie is not a t...,Fri Nov 08 19:57:47 -0800 2013,Fri Nov 08 20:01:41 -0800 2013,,,0,0
4,72fb0d0087d28c832f15776b0d936598,15463724,547aeff3c7ee5b4a39a93cd2a720b001,4,There are definitely too many books lately wit...,Wed Oct 30 11:19:32 -0700 2013,Wed Oct 30 11:20:53 -0700 2013,,,0,0


In [19]:
# SEE DATA TYPES
df_reviews.dtypes

user_id         object
book_id         object
review_id       object
rating           int64
review_text     object
date_added      object
date_updated    object
read_at         object
started_at      object
n_votes          int64
n_comments       int64
dtype: object

In [21]:
# SEE SHAPE
df_reviews.shape

(20000, 11)

In [5]:
columns_to_keep = ['book_id', 'review_text', 'rating', 'n_votes', 'n_comments']
df_clean = df_reviews[columns_to_keep].copy()

# drop rows where review_text is missing
df_clean['review_text'] = df_clean['review_text'].astype(str)
df_clean.dropna(subset=['review_text'], inplace=True)

In [None]:
import pandas as pd
import spacy
from collections import defaultdict
import yake

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Load your DataFrame (assuming it's already available as df)
# df = pd.read_csv("your_data.csv")  # Modify this line based on your data source

def preprocess_text(text):
    """Preprocesses text: lowercasing, lemmatization, stopword removal."""
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

def extract_keywords(text, max_keywords=5):
    """Extract keywords using YAKE!"""
    kw_extractor = yake.KeywordExtractor(n=2, top=max_keywords)
    keywords = kw_extractor.extract_keywords(text)
    return [kw[0] for kw in keywords]

# Group reviews by book_id
book_reviews = df_reviews.groupby("book_id")["review_text"].apply(lambda x: " ".join(x.dropna()))

# Extract salient features for each book
book_features = {}
for book_id, reviews in book_reviews.items():
    processed_text = preprocess_text(reviews)
    keywords = extract_keywords(processed_text)
    book_features[book_id] = keywords

# Convert to DataFrame for better readability
book_features_df = pd.DataFrame(list(book_features.items()), columns=["book_id", "salient_features"])

In [16]:
book_features_df.head(10)

Unnamed: 0,book_id,salient_features
0,10000269,"[Linda Howard, late Linda, Howard favorite, An..."
1,10000270,"[Angie Dare, Dare rocky, autumn afternoon, lat..."
2,10004056,"[Opiniao completa, reading Opiniao, good read,..."
3,10029845,[]
4,10033635,"[Dead Run, decision Paris, action Taylor, Tayl..."
5,10033916,"[Blackwater Brides, book Blackwater, Melodrama..."
6,10034342,"[John Jocelyn, book write, heroine Joceline, H..."
7,10047683,"[Creed Legacy, Brody Davis, HEA brody, Davis t..."
8,10049095,"[read book, wait read, great, book, wait]"
9,10052373,"[Marianne Willman, Witching Hour, Jill Gregory..."


In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /home/kayweeee/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kayweeee/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
df_clean.dtypes

book_id        object
review_text    object
rating          int64
n_votes         int64
n_comments      int64
dtype: object

In [7]:
df_clean.head()

Unnamed: 0,book_id,review_text,rating,n_votes,n_comments
0,1893,** spoiler alert ** \n So the other day Elizab...,5,155,4
1,17939501,"It is very hard to believe this is all true, b...",5,0,0
2,15706923,Ehhhhhh. \n Really nothing to rave about. It w...,2,0,0
3,7840190,Enjoyable read! I liked that Connie is not a t...,4,0,0
4,15463724,There are definitely too many books lately wit...,4,0,0


In [11]:
import os
import zipfile
import nltk

# Define your target nltk_data directory (the one your code is using)
nltk_data_dir = '/home/kayweeee/booksage/nltk_data'

# Path to the corpora folder under that directory
corpora_dir = os.path.join(nltk_data_dir, 'corpora')

# Path to the wordnet folder and wordnet.zip file
wordnet_dir = os.path.join(corpora_dir, 'wordnet')
wordnet_zip = os.path.join(corpora_dir, 'wordnet.zip')

# Check if the wordnet folder exists; if not, try extracting it from the zip file.
if not os.path.isdir(wordnet_dir):
    if os.path.exists(wordnet_zip):
        print(f"Found {wordnet_zip}. Extracting WordNet...")
        with zipfile.ZipFile(wordnet_zip, 'r') as zip_ref:
            zip_ref.extractall(corpora_dir)
        print("Extraction complete.")
    else:
        print("Error: wordnet.zip not found in the expected location:", wordnet_zip)
else:
    print("WordNet directory already exists.")

# Verify that NLTK can now find WordNet:
try:
    wordnet_path = nltk.data.find('corpora/wordnet')
    print("WordNet found at:", wordnet_path)
except LookupError as e:
    print("LookupError:", e)


WordNet directory already exists.
WordNet found at: /home/kayweeee/booksage/nltk_data/corpora/wordnet


In [9]:
import string
from nltk.stem import WordNetLemmatizer


def preprocess_text(text, use_lemmatization=True):
    text = text.lower()
    
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    text = re.sub(r'\s+', ' ', text).strip()
    
    tokens = word_tokenize(text)
    
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Optionally: you can add stemming or lemmatization here
    if use_lemmatization:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

    processed_text = " ".join(tokens)
    return processed_text


In [None]:
df_clean['processed_review'] = df_clean['review_text'].apply(lambda x: preprocess_text(x, use_lemmatization=True))


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/home/kayweeee/nltk_data'
    - '/home/kayweeee/booksage/venv/nltk_data'
    - '/home/kayweeee/booksage/venv/share/nltk_data'
    - '/home/kayweeee/booksage/venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/home/kayweeee/booksage/nltk_data'
**********************************************************************
