In [1]:
import pandas as pd
from tqdm import tqdm
from matplotlib import pyplot as plt
import numpy as np
import gzip
import json
import seaborn as sns

In [3]:
filename = '/data/amazon/Books.json.gz'  

In [5]:
book_interaction = []
with gzip.open(filename , 'rb') as gzip_file:
    for line in gzip_file:  # Read one line.
        line = line.rstrip()
        if line:  # Any JSON data on it?
            obj = json.loads(line)
            book_interaction.append({key: obj[key] for key in ["overall", "reviewerID", "unixReviewTime", "asin"]})
            
            
book_interaction = pd.DataFrame(book_interaction)
book_interaction = book_interaction[~book_interaction["overall"].isna()]

In [6]:
user_counts = book_interaction["reviewerID"].value_counts()
users = list(user_counts[(user_counts >= 30) & (user_counts <= 300)].index) # filter out users who read more than 300 or less than 30 books
book_interaction = book_interaction[book_interaction["reviewerID"].isin(users)]

In [7]:
len(list(user_counts[(user_counts >= 30) & (user_counts <= 300)].index))

156879

In [8]:
len(users)

156879

In [9]:
book_interaction = book_interaction.sort_values(["reviewerID", "unixReviewTime"])
book_interaction["date"] = pd.to_datetime(book_interaction.unixReviewTime, unit="s")

In [11]:
unique_date_test = book_interaction.groupby(["reviewerID"]).date.value_counts()
unique_date_test = pd.DataFrame(unique_date_test)
unique_date_test.columns = ["count"]
unique_date_test = unique_date_test.reset_index()
unique_date_test = unique_date_test.groupby("reviewerID").size().reset_index()
date_test = book_interaction.groupby(["reviewerID"]).size().reset_index()
unique_date_test.columns = ['reviewerID', "unique"]
date_test.columns = ['reviewerID', "all"]
unique_date_test= unique_date_test.merge(date_test, how="left", on="reviewerID")
unique_date_test["percentage"] = unique_date_test["unique"]/unique_date_test["all"]

In [13]:
unique_date_user = unique_date_test[unique_date_test.percentage >= 0.9].reviewerID.tolist() # filter out users whose 10% books are read at the same day

In [14]:
book_interaction_filter = book_interaction[book_interaction["reviewerID"].isin(unique_date_user)]

In [16]:
gr = book_interaction_filter.sort_values('unixReviewTime').groupby('reviewerID')
book_interaction_filter['order'] = gr.cumcount()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_interaction_filter['order'] = gr.cumcount()


In [20]:
book_interaction_filter.to_csv("book_interaction_filter.csv") # save data file

In [12]:
book_interaction_filter = pd.read_csv("book_interaction_filter.csv")

In [13]:
book_asins = book_interaction_filter.asin.unique()

In [16]:
np.save("book_asin_filter.npy", book_asins) # save book ids 