In [1]:
import ast
import gc
import pickle
import random

import numpy as np
import pandas as pd
from tqdm import tqdm

# Seed both random sources: the stdlib `random` module and NumPy's global
# RNG. The reviewer subsample further down is drawn with np.random.choice,
# which random.seed() alone does not make reproducible.
random.seed(2020)
np.random.seed(2020)

In [2]:
def to_df(file_path, max_rows=1000001):
    """Load a file holding one dict literal per line into a DataFrame.

    Each non-blank line is parsed with ``ast.literal_eval``, which accepts
    Python literals (dicts, lists, strings, numbers) but, unlike ``eval``,
    never executes code found in the file.

    Parameters
    ----------
    file_path : str
        Path of the text file to read.
    max_rows : int or None, optional
        Maximum number of records to load. The default of 1000001 matches
        the previous hard-coded cap (the old ``i > 1000000`` check ran after
        the counter had been incremented). ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        One row per parsed line, numbered from 0; the columns are the union
        of the record keys, with NaN where a record lacks a key.

    Raises
    ------
    ValueError, SyntaxError
        If a non-blank line is not a valid literal.
    """
    records = []
    with open(file_path, 'r') as fin:
        for line in tqdm(fin):
            if max_rows is not None and len(records) >= max_rows:
                break
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            records.append(ast.literal_eval(line))
    # Build the frame once from the list of records rather than through an
    # intermediate {row_number: record} dict.
    return pd.DataFrame(records)

In [3]:
# Parse the review dump into a DataFrame (to_df reads one record per line).
review_df = to_df('./Electronics_5.json')

1000000it [01:30, 11026.40it/s]


In [4]:
# Preview the first rows to check which columns were parsed.
review_df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AO94DHGC771SJ,528881469,amazdnu,"[0, 0]",We got this GPS for my husband who is an (OTR)...,5.0,Gotta have GPS!,1370131200,"06 2, 2013"
1,AMO214LNFCEI4,528881469,Amazon Customer,"[12, 15]","I'm a professional OTR truck driver, and I bou...",1.0,Very Disappointed,1290643200,"11 25, 2010"
2,A3N7T0DY83Y4IG,528881469,C. A. Freeman,"[43, 45]","Well, what can I say. I've had this unit in m...",3.0,1st impression,1283990400,"09 9, 2010"
3,A1H8PY3QHMQQA0,528881469,"Dave M. Shaw ""mack dave""","[9, 10]","Not going to write a long review, even thought...",2.0,"Great grafics, POOR GPS",1290556800,"11 24, 2010"
4,A24EV6RXELQZ63,528881469,Wayne Smith,"[0, 0]",I've had mine for a year and here's what we go...,1.0,"Major issues, only excuses for support",1317254400,"09 29, 2011"


In [6]:
# Cache the parsed reviews on disk; a later cell reloads them from this file.
with open('./reviews.pkl', 'wb') as fout:
    pickle.dump(review_df, fout, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
# Distinct `asin` values present in the loaded reviews; used below to restrict
# the metadata to items that were actually reviewed.
unique_asin = review_df['asin'].unique()

In [8]:
# The reviews are already saved to ./reviews.pkl, so drop the in-memory frame
# and run a garbage-collection pass before loading the metadata file.
del review_df
gc.collect()

29

In [9]:
# Parse the metadata file, keep only the rows whose `asin` occurs in the
# reviews, and renumber the surviving rows from 0.
meta_df = to_df('./meta_Electronics.json')
meta_df = meta_df.loc[meta_df['asin'].isin(unique_asin)].reset_index(drop=True)

498196it [00:59, 8419.27it/s] 


In [10]:
# Preview the filtered metadata rows.
meta_df.head()

Unnamed: 0,asin,imUrl,description,categories,title,price,salesRank,related,brand
0,528881469,http://ecx.images-amazon.com/images/I/51FnRkJq...,"Like its award-winning predecessor, the Intell...","[[Electronics, GPS & Navigation, Vehicle GPS, ...",Rand McNally 528881469 7-inch Intelliroute TND...,299.99,,"{'also_viewed': ['B006ZOI9OY', 'B00C7FKT2A', '...",
1,594451647,http://ecx.images-amazon.com/images/I/51RjSETO...,HDTV Adapter Kit for NOOK HD and NOOK HD+\nThi...,"[[Electronics, Computers & Accessories, Touch ...",Barnes &amp; Noble HDTV Adapter Kit for NOOK H...,49.95,,"{'also_bought': ['B009L7EEZA', 'B00AGAYQEU', '...",
2,594481813,http://ecx.images-amazon.com/images/I/41K7ymN5...,Power up your device with this Barnes &amp; No...,"[[Electronics, eBook Readers & Accessories, Po...",Barnes &amp; Noble OV/HB-ADP Universal Power Kit,19.65,,"{'also_bought': ['B00AAKLIIS', 'B00A668GUO', '...",Barnes &amp; Noble
3,972683275,http://ecx.images-amazon.com/images/I/41hYJ9Mw...,The VideoSecu TV mount is a mounting solution ...,"[[Electronics, Accessories & Supplies, Audio &...",VideoSecu 24&quot; Long Arm TV Wall Mount Low ...,29.99,{},"{'also_bought': ['B000X3KOD2', 'B0074FGR74', '...",VideoSecu
4,1400532620,http://ecx.images-amazon.com/images/I/519ca3cu...,Barnes & Noble Nook eReader - no 3GMeet nook. ...,"[[Electronics, eBook Readers & Accessories]]",Barnes &amp; Noble Nook eReader - no 3G,74.95,{'Electronics': 23071},"{'also_bought': ['B0035CLBT4', 'B004X18N24', '...",Barnes &amp; Noble


In [14]:
# Persist the filtered metadata. The context manager closes the file handle
# as soon as the dump completes instead of leaving it open.
with open('./meta.pkl', 'wb') as f:
    pickle.dump(meta_df, f, pickle.HIGHEST_PROTOCOL)

In [15]:
# Reload both cached frames from disk.
reviews = pd.read_pickle('./reviews.pkl')
meta = pd.read_pickle('./meta.pkl')

# Keep only the columns the remaining cells use.
reviews_df = reviews.loc[:, ['reviewerID', 'asin', 'unixReviewTime']]
meta_df = meta.loc[:, ['asin', 'categories']]

# Release the full-width frames.
del reviews
del meta
gc.collect()

0

In [16]:
# Collapse each nested category listing to a single label: the last entry of
# the last inner list. Re-running this cell would index into the label string
# itself, so execute it only once per load.
meta_df['categories'] = meta_df['categories'].apply(lambda paths: paths[-1][-1])

In [17]:
# Preview the metadata after reducing `categories` to a single label per row.
meta_df.head()

Unnamed: 0,asin,categories
0,528881469,Trucking GPS
1,594451647,Chargers & Adapters
2,594481813,Power Adapters
3,972683275,TV Ceiling & Wall Mounts
4,1400532620,eBook Readers & Accessories


In [18]:
# Preview the three review columns kept for the remaining steps.
reviews_df.head()

Unnamed: 0,reviewerID,asin,unixReviewTime
0,AO94DHGC771SJ,528881469,1370131200
1,AMO214LNFCEI4,528881469,1290643200
2,A3N7T0DY83Y4IG,528881469,1283990400
3,A1H8PY3QHMQQA0,528881469,1290556800
4,A24EV6RXELQZ63,528881469,1317254400


In [19]:
# (rows, columns) of both frames before the reviewer subsample is drawn.
print(meta_df.shape, reviews_df.shape)

(37570, 2) (1000001, 3)


In [20]:
# Sample up to 100000 distinct reviewers without replacement, then keep only
# their reviews and the metadata rows of the items they reviewed.
# NOTE: np.random.choice draws from NumPy's global RNG; that RNG is seeded
# through np.random.seed (random.seed has no effect on it).
all_user_id = reviews_df['reviewerID'].unique()
# Cap the sample size at the population size: with replace=False,
# np.random.choice raises ValueError when `size` exceeds the population.
select_user_id = np.random.choice(all_user_id, size=min(100000, len(all_user_id)), replace=False)
reviews_df = reviews_df[reviews_df['reviewerID'].isin(select_user_id)]
meta_df = meta_df[meta_df['asin'].isin(reviews_df['asin'].unique())]

In [21]:
# (rows, columns) of both frames after restricting to the sampled reviewers.
print(meta_df.shape, reviews_df.shape)

(37392, 2) (536662, 3)


In [22]:
def build_map(df, col_name):
    """Replace the values of ``df[col_name]`` with dense integer ids, in place.

    The distinct values of the column are sorted and numbered 0..n-1 in that
    order; the column is then overwritten with those numbers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column is rewritten (the frame is modified).
    col_name : str
        Name of the column to encode.

    Returns
    -------
    (dict, list)
        The value -> id mapping, and the sorted list of distinct values
        (so ``key[id]`` recovers the original value).
    """
    key = df[col_name].unique().tolist()
    key.sort()
    m = {value: idx for idx, value in enumerate(key)}
    # Look every value up in the mapping; an unknown value raises KeyError.
    df[col_name] = df[col_name].map(m.__getitem__)
    return m, key

In [23]:
# Integer-encode the id columns in place. Each call returns the value -> id
# dict and the sorted list of original values.
# Note: only meta_df['asin'] is encoded here; reviews_df['asin'] is remapped
# with asin_map in a later cell.
asin_map, asin_key = build_map(meta_df, 'asin')
cate_map, cate_key = build_map(meta_df, 'categories')
revi_map, revi_key = build_map(reviews_df, 'reviewerID')

In [24]:
# Sizes of the encoded vocabularies and the number of review rows.
user_count = len(revi_map)
item_count = len(asin_map)
cate_count = len(cate_map)
example_count = len(reviews_df.index)
print(user_count, item_count, cate_count, example_count)

100000 37392 711 536662


In [25]:
# Order the metadata by encoded item id and renumber the rows from 0.
meta_df = meta_df.sort_values(by='asin')
meta_df = meta_df.reset_index(drop=True)

In [26]:
# Translate the raw asin of each review to its integer id; an asin missing
# from asin_map raises KeyError.
reviews_df['asin'] = reviews_df['asin'].map(asin_map.__getitem__)
# Order each reviewer's rows by review time, renumber the rows from 0 and fix
# the column order.
reviews_df = (
    reviews_df
    .sort_values(by=['reviewerID', 'unixReviewTime'])
    .reset_index(drop=True)
    .loc[:, ['reviewerID', 'asin', 'unixReviewTime']]
)

In [27]:
# Category id of every metadata row as an int32 array. Since meta_df was
# sorted by encoded asin, position i is the category of item id i, assuming
# each asin appears exactly once in meta_df (TODO confirm).
cate_list = np.array(meta_df['categories'].values, dtype=np.int32)

In [28]:
# Write four objects back-to-back into one pickle file. A reader must call
# pickle.load four times on the same handle, in this order: reviews frame,
# category array, count tuple, key-list tuple.
with open('./remap.pkl', 'wb') as fout:
    pickle.dump(reviews_df, fout, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(cate_list, fout, protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(
        (user_count, item_count, cate_count, example_count),
        fout,
        protocol=pickle.HIGHEST_PROTOCOL,
    )
    pickle.dump((asin_key, cate_key, revi_key), fout, protocol=pickle.HIGHEST_PROTOCOL)