In [1]:
import pandas as pd
import json
import gzip


def parse(path):
    g = gzip.open(path, "rb")
    for l in g:
        yield json.loads(l)


def getDF(path, stop=0):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
        if stop > 0 and i == stop:
            break
    return pd.DataFrame.from_dict(df, orient="index")

In [2]:
df = pd.DataFrame(columns=["overall", "verified", "reviewText", "summary", "sentiment"])

In [3]:
peek = 1_000_000
stop = 1000
i = 0

for review in parse("./data/All_Amazon_Review.json.gz"):
    i += 1
    if stop > 0 and stop == i:
        break
    if i % peek == 0:
        print(f"Processed {i / peek:.2f} million reviews")

    if "overall" not in review or "verified" not in review or "reviewText" not in review or "summary" not in review:
        continue
    df.loc[i] = [review["overall"], review["verified"], review["reviewText"], review["summary"], 0]



In [4]:
df.head()

Unnamed: 0,overall,verified,reviewText,summary,sentiment
1,1.0,False,Alexa is not able to control my lights. If I a...,"VERY Buggy, doesn't work.",0
2,4.0,False,"Alexa works great for me so far, but I'm also ...",So Far So Good,0
3,1.0,False,Weak!!\n\nAlexa doesn't even recognize the nam...,Time waster,0
4,2.0,False,Can only control one of two bulbs from one of ...,Buggy,0
5,1.0,False,this worked great then randomly stopped. pleas...,stopped working,0


In [5]:
df.shape

(997, 5)

In [6]:
df_size =  df.memory_usage(index=True).sum()
print("size of df in bytes: ", df_size)

size of df in bytes:  40877


In [7]:
df1 = getDF("./data/All_Amazon_Review.json.gz", stop)

In [8]:
df1.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote
0,1.0,False,"12 11, 2015",A27BTSGLXK2C5K,B017O9P72A,Jacob M. Wessler,Alexa is not able to control my lights. If I a...,"VERY Buggy, doesn't work.",1449792000,
1,4.0,False,"12 8, 2015",A27ZJ1NCBFP1HZ,B017O9P72A,Greg,"Alexa works great for me so far, but I'm also ...",So Far So Good,1449532800,5.0
2,1.0,False,"12 7, 2015",ACCQIOZMFN4UK,B017O9P72A,Da-Gr8-1,Weak!!\n\nAlexa doesn't even recognize the nam...,Time waster,1449446400,11.0
3,2.0,False,"12 5, 2015",A3KUPJ396OQF78,B017O9P72A,Larry Russlin,Can only control one of two bulbs from one of ...,Buggy,1449273600,
4,1.0,False,"02 2, 2018",A1U1RE1ZI19E1H,B017O9P72A,Rebekah,this worked great then randomly stopped. pleas...,stopped working,1517529600,2.0


In [9]:
df1.shape

(1000, 10)

In [10]:
df1_size =  df1.memory_usage(index=True).sum()
print("size of df1 in bytes: ", df1_size)

size of df1 in bytes:  81000


In [11]:
print("ratio of reading the whole file vs parsing the required columns: ", df1_size / df_size)

ratio of reading the whole file vs parsing the required columns:  1.9815544193556278
