In [1]:
import gzip
import json
import re
import os
import sys
import numpy as np
import pandas as pd

In [2]:
# Function to process a chunk of data
def process_data(chunk, columns=None):
    # If columns is not None, keep only those columns
    if columns is not None:
        chunk = chunk[columns]
    return chunk

# Function to read data in chunks and process each chunk
def load_data(file_name, head = None, columns=None, chunksize = 1000):
    chunks = []
    count = 0
    with gzip.open(file_name) as fin:
        for chunk in pd.read_json(fin, lines=True, chunksize=chunksize):
            # Process the chunk
            processed_chunk = process_data(chunk, columns)
            chunks.append(processed_chunk)
            
            count += 1
            # break if reaches the head-th chunk
            if (head is not None) and (count > head):
                break

    # Combine all chunks into a single DataFrame
    df = pd.concat(chunks, ignore_index=True)
    
    return df

In [3]:
DIR = 'C:\\Users\\jesse\\Desktop\\Honors Project\\goodreads_data\\raw\\'
ya_interactions = load_data(DIR + 'goodreads_interactions_young_adult.json.gz', head = 1)

In [4]:
ya_interactions

Unnamed: 0,user_id,book_id,review_id,is_read,rating,review_text_incomplete,date_added,date_updated,read_at,started_at
0,8842281e1d1347389f2ab93d60773d4d,18667753,be53fe83a6fc83474052f84692f6e90a,False,0,,Wed Mar 29 00:12:52 -0700 2017,Wed Mar 29 00:12:52 -0700 2017,,
1,8842281e1d1347389f2ab93d60773d4d,428263,2030f56879ebcc307a4b9cd8c83200e8,False,0,,Mon Mar 27 22:01:42 -0700 2017,Mon Mar 27 22:01:42 -0700 2017,,
2,8842281e1d1347389f2ab93d60773d4d,11387515,2fd3cd1acb30b099c135e358669639da,False,0,,Thu Jan 26 13:35:10 -0800 2017,Thu Jan 26 13:35:10 -0800 2017,,
3,8842281e1d1347389f2ab93d60773d4d,8684868,d29b8238762d70b7c2b67941bc81fbe0,True,3,,Tue Dec 17 13:42:25 -0800 2013,Tue Dec 17 13:47:26 -0800 2013,Sun Dec 15 00:00:00 -0800 2013,Sat Dec 14 00:00:00 -0800 2013
4,8842281e1d1347389f2ab93d60773d4d,8423493,357c8c178fd0e06cff5c025649231672,True,2,,Sun Dec 08 01:26:12 -0800 2013,Tue Dec 27 05:37:48 -0800 2016,Tue Dec 10 00:00:00 -0800 2013,
...,...,...,...,...,...,...,...,...,...,...
1995,f8a89075dc6de14857561522e729f82c,10429092,a84fe4c0b491547bd41da90d60c66875,False,0,,Tue Feb 26 17:46:41 -0800 2013,Tue Feb 26 17:46:41 -0800 2013,,
1996,f8a89075dc6de14857561522e729f82c,16148644,cc0f19a7125193c7af40991dad7acad9,False,0,,Tue Feb 26 17:46:40 -0800 2013,Tue Feb 26 17:46:40 -0800 2013,,
1997,f8a89075dc6de14857561522e729f82c,14739821,436346cde89624897615cb050701fa1e,False,0,,Tue Feb 26 17:46:37 -0800 2013,Tue Feb 26 17:46:37 -0800 2013,,
1998,f8a89075dc6de14857561522e729f82c,7849034,c94f1d483d5649c4c19b080a55700f62,False,0,,Tue Feb 26 17:43:15 -0800 2013,Tue Feb 26 17:43:15 -0800 2013,,


In [5]:
ya_interactions.columns

Index(['user_id', 'book_id', 'review_id', 'is_read', 'rating',
       'review_text_incomplete', 'date_added', 'date_updated', 'read_at',
       'started_at'],
      dtype='object')

ya_interactions keep: ['user_id', 'book_id', 'review_id', 'is_read', 'rating']

In [6]:
ya_interactions = load_data(DIR + 'goodreads_interactions_young_adult.json.gz', columns = ['user_id', 'book_id', 'is_read', 'rating'])
ya_interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34919254 entries, 0 to 34919253
Data columns (total 4 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   user_id  object
 1   book_id  int64 
 2   is_read  bool  
 3   rating   int64 
dtypes: bool(1), int64(2), object(1)
memory usage: 832.5+ MB


In [7]:
#for int and float types, print out the min and max, followed by the column name
for col in ya_interactions.columns:
    if ya_interactions[col].dtype == 'int64' or ya_interactions[col].dtype == 'float64':
        print(ya_interactions[col].min(), ya_interactions[col].max(), col)

50 36524503 book_id
0 5 rating


In [8]:
ya_interactions['book_id'] = ya_interactions['book_id'].astype('int32')
ya_interactions['rating'] = ya_interactions['rating'].astype('int8')

In [9]:
#check for duplicate book_id, user_id in ya_interactions
ya_interactions[['book_id', 'user_id']].duplicated().sum()

0

In [10]:
ya_interactions.replace('', np.nan, inplace=True)

In [11]:
#check for duplicate book_id, user_id in ya_interactions
ya_interactions[['book_id', 'user_id']].duplicated().sum()

0

In [12]:
ya_interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34919254 entries, 0 to 34919253
Data columns (total 4 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   user_id  object
 1   book_id  int32 
 2   is_read  bool  
 3   rating   int8  
dtypes: bool(1), int32(1), int8(1), object(1)
memory usage: 466.2+ MB


no nulls either

In [13]:
#how many false values are there in ya_interactions['is_read']?
ya_interactions['is_read'].value_counts()

is_read
False    19196505
True     15722749
Name: count, dtype: int64

More than half of the interactions involve books that the users haven't read yet -- in some way they are expressing interest in them though, likely as to-read options

In [None]:
ya_interactions = ya_interactions.drop(columns=['review_text_incomplete', 'date_added', 'date_updated', 'read_at', 'started_at'])

In [None]:
ya_interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34919254 entries, 0 to 34919253
Data columns (total 5 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   user_id    object
 1   book_id    int64 
 2   review_id  object
 3   is_read    bool  
 4   rating     int64 
dtypes: bool(1), int64(2), object(2)
memory usage: 1.1+ GB


In [None]:
interactions_read = ya_interactions[ya_interactions['is_read'] == True]
interactions_read.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15722749 entries, 3 to 34919253
Data columns (total 5 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   user_id    object
 1   book_id    int64 
 2   review_id  object
 3   is_read    bool  
 4   rating     int64 
dtypes: bool(1), int64(2), object(2)
memory usage: 614.8+ MB


In [None]:
interactions_interested= ya_interactions[ya_interactions['is_read'] == False]

In [None]:
#export to parquet
interactions_interested.to_parquet('C:\\Users\\jesse\\Desktop\\Honors Project\\ya_interactions_interested.parquet')