In [None]:
import os
import math
import json
import gzip
import pandas as pd
from tqdm.notebook import tqdm
pd.set_option('display.max_columns', None)

In [None]:
def convert_size(size_bytes):
    """Convert a byte count into a human-readable ``(value, unit)`` pair.

    The value is scaled by powers of 1024 and rounded to two decimals; the
    unit is one of "B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB".

    Args:
        size_bytes: Non-negative number of bytes (int or float).

    Returns:
        tuple: ``(value, unit)``. A tuple is returned for every input,
        including 0, so callers can always unpack two values.

    Raises:
        ValueError: If ``size_bytes`` is negative.
    """
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    if size_bytes < 0:
        raise ValueError("size_bytes must be non-negative")
    value = float(size_bytes)
    i = 0
    # Repeated division by 1024 is exact in binary floating point and avoids
    # math.log edge cases (log(0), sub-byte sizes mapping to index -1).
    # The index is capped at the largest unit so huge values stay in "YB".
    while value >= 1024 and i < len(size_name) - 1:
        value /= 1024
        i += 1
    return round(value, 2), size_name[i]

In [None]:
# Open both JSON-lines dumps as chunked readers: with chunksize set, read_json
# yields DataFrames of at most that many rows instead of loading the whole file.
rev_chunks = pd.read_json(
    './data/Electronics.json',
    orient='columns',
    lines=True,
    chunksize=1000000,
)
metad_chunks = pd.read_json(
    './data/meta_Electronics.json',
    orient='columns',
    lines=True,
    chunksize=100000,
)

## Metadata

In [None]:
# Extract Metadata with Computers Filter
# Each chunk is filtered to main_cat == 'Computers' and pickled on its own.
# Extraction stops once the pickles written by this loop total ~3000 MB.
# The running total is kept in bytes: summing human-readable values would mix
# units (a 900 KB file and a 900 MB file would both add 900).
os.makedirs('./data/metadata/', exist_ok=True)
max_folder_bytes = 3000 * 1024 ** 2
folder_size = 0
for idx, mchunk in tqdm(enumerate(metad_chunks, start=1)):
    mchunk = mchunk[mchunk['main_cat'].isin(['Computers'])]
    chunk_path = f'./data/metadata/md_{idx}.pkl'
    mchunk.to_pickle(chunk_path)
    folder_size += os.path.getsize(chunk_path)
    if folder_size >= max_folder_bytes:
        # Reported in MB, rounded to two decimals
        print('Current Folder Size: ', round(folder_size / 1024 ** 2, 2))
        break

0it [00:00, ?it/s]

Current Folder Size:  3543.5699999999997


In [None]:
# Load metadata and append to dataframe
# Only the md_<n>.pkl chunk files are read: LaptopsMetaData.pkl is later saved
# into this same folder and would otherwise be concatenated back in on a re-run.
# Frames are collected first and concatenated once; concatenating inside the
# loop re-copies the accumulated frame on every iteration.
meta_files = sorted(
    name for name in os.listdir('./data/metadata/')
    if name.startswith('md_') and name.endswith('.pkl')
)
meta_frames = [
    pd.read_pickle(os.path.join('./data/metadata/', name))
    for name in tqdm(meta_files)
]
# pd.concat raises on an empty list, so fall back to an empty frame
meta_df = pd.concat(meta_frames) if meta_frames else pd.DataFrame()
meta_df = meta_df.reset_index(drop=True)
convert_size(meta_df.memory_usage(deep=True).sum())

(3.36, 'GB')

In [None]:
# Extract products
# Map each laptop title to its row position in meta_df. A row counts as a laptop
# when the fourth entry of its category value contains 'laptops' (case-insensitive).
# Titles are dict keys, so a repeated title keeps only its last position.
laptops = {}
for idx, (category, title) in enumerate(zip(meta_df['category'], meta_df['title'])):
    try:
        is_laptop = 'laptops' in category[3].lower()
    except (IndexError, KeyError, TypeError, AttributeError):
        # category is not subscriptable, has no entry 3, or entry 3 is not a string
        continue
    if is_laptop:
        laptops[title] = idx

In [None]:
# Number of distinct titles collected in the laptops dict
len(laptops)

10989

In [None]:
# Keep only the laptop rows
# The values of laptops are row positions (they come from enumerate), so the
# rows are selected positionally with iloc rather than through index labels.
# This rebinds meta_df, so the cell is meant to run once after laptops is built.
meta_df = meta_df.iloc[list(laptops.values())]
convert_size(meta_df.memory_usage(deep=True).sum())

(326.26, 'MB')

In [None]:
# Persist the laptop-only metadata. NOTE: the file lands in ./data/metadata/ next to
# the md_*.pkl chunk files, so anything that loads every *.pkl there reads it too.
meta_df.to_pickle('./data/metadata/LaptopsMetaData.pkl')

## Reviews

In [None]:
# Extract Reviews
# Each review chunk is pickled on its own. Extraction stops once the pickles
# written by this loop total ~3000 MB.
# The running total is kept in bytes: summing human-readable values would mix
# units (a 900 KB file and a 900 MB file would both add 900).
os.makedirs('./data/reviews/', exist_ok=True)
max_folder_bytes = 3000 * 1024 ** 2
folder_size = 0
for idx, rchunk in tqdm(enumerate(rev_chunks, start=1)):
    chunk_path = f'./data/reviews/rev_{idx}.pkl'
    rchunk.to_pickle(chunk_path)
    folder_size += os.path.getsize(chunk_path)
    if folder_size >= max_folder_bytes:
        # Reported in MB, rounded to two decimals
        print('Current Folder Size: ', round(folder_size / 1024 ** 2, 2))
        break

0it [00:00, ?it/s]

Current Folder Size:  3336.98


In [None]:
# Load reviews and append to dataframe
# Frames are collected first and concatenated once; concatenating inside the
# loop re-copies the accumulated frame on every iteration. File names are
# sorted so the row order does not depend on os.listdir's arbitrary order.
rev_files = sorted(
    name for name in os.listdir('./data/reviews/') if name.endswith('.pkl')
)
rev_frames = [
    pd.read_pickle(os.path.join('./data/reviews/', name))
    for name in tqdm(rev_files)
]
# pd.concat raises on an empty list, so fall back to an empty frame
rev_df = pd.concat(rev_frames) if rev_frames else pd.DataFrame()
rev_df = rev_df.reset_index(drop=True)
convert_size(rev_df.memory_usage(deep=True).sum())

  0%|          | 0/7 [00:00<?, ?it/s]

(6.39, 'GB')

In [None]:
# Add product metadata
# Inner join on asin: a row is kept only when its asin appears in both frames.
fdf = rev_df.merge(meta_df, how='inner', on='asin')

In [None]:
# Save the merged reviews + metadata frame to disk
fdf.to_pickle('./data/FinalDf.pkl')

## Analysis

In [None]:
# Reload the merged frame from the pickle written to ./data/FinalDf.pkl
fdf = pd.read_pickle('./data/FinalDf.pkl')

In [None]:
# Keep only these twelve columns of the merged frame
fdf = fdf.loc[:, [
    'verified',
    'asin',
    'title',
    'reviewerName',
    'reviewText',
    'summary',
    'vote',
    'description',
    'brand',
    'feature',
    'date',
    'price',
]]

In [None]:
# (row count, column count) of fdf
fdf.shape

(42669, 12)

In [None]:
# Review text at position 5 among rows whose first description entry does not
# contain 'restored' (rows with an empty description are kept as well)
(
    fdf
    .loc[~fdf['description'].apply(lambda desc: len(desc) > 0 and 'restored' in desc[0])]
    .iloc[5]['reviewText']
)

'It was a gift for my sister but she loves it. It\'s fast and has a nice big 17" screen. She\'s very happy!\nThanks.'

In [None]:
# Review text of the row labelled 20
fdf.loc[20, 'reviewText']

'It is an excellent laptop. However I wished it had more USB ports. Comparing it with Asus Zenbook, this laptop is $200 cheaper and more rigid.'