In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pyarrow as pa
import pyarrow.parquet as pq

import os
import json
import gc

In [2]:
base_dir = '/kaggle/input/yelp-dataset'

# Build the absolute path of every raw Yelp JSON dump in one pass. The target
# names are matched positionally with the file names listed below; the trailing
# notes are the on-disk sizes of each file.
(
    business_json_path,
    tip_json_path,
    checkin_json_path,
    user_json_path,
    review_json_path,
) = (
    os.path.join(base_dir, json_name)
    for json_name in (
        'yelp_academic_dataset_business.json',  # 119 MB
        'yelp_academic_dataset_tip.json',       # 181 MB
        'yelp_academic_dataset_checkin.json',   # 287 MB
        'yelp_academic_dataset_user.json',      # 3.37 GB
        'yelp_academic_dataset_review.json',    # 5.35 GB
    )
)


In [91]:
# The dump is in JSON Lines form (one JSON object per line), so it has to be
# decoded line by line: json.load(fp) on the whole file fails with an
# "Extra data" error as soon as it reaches the second object.
# The encoding is pinned to UTF-8 so decoding does not depend on the locale
# default of the machine running the notebook.
with open(checkin_json_path, encoding='utf-8') as fp:
    # Skip whitespace-only lines (e.g. a trailing newline) instead of letting
    # json.loads raise JSONDecodeError on them.
    json_data = [json.loads(line) for line in fp if line.strip()]
     
# json_data.keys()

In [92]:
len(json_data)

131930

In [93]:
# Turn the list of decoded records into a DataFrame (one row per record) and
# report its dimensions and column labels.
df = pd.DataFrame(json_data)
print(f"df.shape:  {df.shape}")
print(f"df.columns:  {df.columns}")


df.shape:  (131930, 2)
df.columns:  Index(['business_id', 'date'], dtype='object')


In [94]:
'''
business_json_path: 150346
    df.shape:  (150346, 14)
    df.columns:  Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours'],
      dtype='object')
      
tip_json_path: 908915
    df.shape:  (908915, 5)
    df.columns:  Index(['user_id', 'business_id', 'text', 'date', 'compliment_count'], 
        dtype='object')

checkin_json_path: 131930
    df.shape:  (131930, 2)
    df.columns:  Index(['business_id', 'date'], dtype='object')

user_json_path: 1987897
    df.shape:  (1987897, 22)
    df.columns:  Index(['user_id', 'name', 'review_count', 'yelping_since', 'useful', 
        'funny', 'cool', 'elite', 'friends', 'fans', 'average_stars', 'compliment_hot',
       'compliment_more', 'compliment_profile', 'compliment_cute',
       'compliment_list', 'compliment_note', 'compliment_plain',
       'compliment_cool', 'compliment_funny', 'compliment_writer',
       'compliment_photos'],
      dtype='object')
      
review_json_path: 6990280
    df.shape:  (6990280, 9)
    df.columns:  Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',
       'cool', 'text', 'date'],
      dtype='object')
'''

"\nbusiness_json_path: 150346\n    df.shape:  (150346, 14)\n    df.columns:  Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',\n       'latitude', 'longitude', 'stars', 'review_count', 'is_open',\n       'attributes', 'categories', 'hours'],\n      dtype='object')\n      \ntip_json_path: 908915\n    df.shape:  (908915, 5)\n    df.columns:  Index(['user_id', 'business_id', 'text', 'date', 'compliment_count'], \n        dtype='object')\n\ncheckin_json_path: 131930\n    df.shape:  (131930, 2)\n    df.columns:  Index(['business_id', 'date'], dtype='object')\n\nuser_json_path: 1987897\n    df.shape:  (1987897, 22)\n    df.columns:  Index(['user_id', 'name', 'review_count', 'yelping_since', 'useful', \n        'funny', 'cool', 'elite', 'friends', 'fans', 'average_stars', 'compliment_hot',\n       'compliment_more', 'compliment_profile', 'compliment_cute',\n       'compliment_list', 'compliment_note', 'compliment_plain',\n       'compliment_cool', 'compliment_funny', 'c

In [95]:
df.dtypes

business_id    object
date           object
dtype: object

In [96]:
df.head(n=3)

Unnamed: 0,business_id,date
0,---kPU91CF4Lq2-WlRu9Lw,"2020-03-13 21:10:56, 2020-06-02 22:18:06, 2020..."
1,--0iUa4sNDFiZFrAdIWhZQ,"2010-09-13 21:43:09, 2011-05-04 23:08:15, 2011..."
2,--30_8IhuyMHbSOcNWd6DQ,"2013-06-14 23:29:17, 2014-08-13 23:20:22"


In [97]:
def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` to smaller dtypes.

    Every numeric (int, float, bool) column is rewritten in place:

    * columns whose values are all whole numbers become the narrowest
      unsigned (when the minimum is >= 0) or signed integer dtype that can
      hold their [min, max] range;
    * all other numeric columns become ``float32`` (this is lossy for values
      that need more than single precision).

    Integer dtypes cannot hold missing values, so any column containing
    NaN/NA first has them replaced by the sentinel ``column_min - 1``; the
    names of those columns are reported back to the caller.

    Non-numeric columns (object/string, datetime, categorical, ...) and
    complex columns are left untouched.

    Progress and before/after memory figures are printed to stdout.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    tuple[pandas.DataFrame, list]
        ``props`` itself (same object) and the list of column names whose
        missing values were filled with the sentinel.
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = []  # Keeps track of columns that have missing values filled in.
    for col in props.columns:
        col_dtype = props[col].dtype
        # A plain "dtype != object" test would let datetime, categorical and
        # string dtypes through, and the arithmetic below fails on those.
        if not pd.api.types.is_numeric_dtype(col_dtype):
            continue
        if pd.api.types.is_complex_dtype(col_dtype):
            continue

        # Print current column type
        print("******************************")
        print("Column: ",col)
        print("dtype before: ",col_dtype)

        mx = props[col].max()
        mn = props[col].min()

        # Integer does not support NA, therefore, NA needs to be filled
        if props[col].isna().any():
            NAlist.append(col)
            sentinel = mn - 1
            # Assign the result back: fillna(inplace=True) on props[col] acts
            # on a temporary and is a no-op under pandas copy-on-write.
            props[col] = props[col].fillna(sentinel)
            # The sentinel is now the smallest value in the column, so it has
            # to drive the signed/unsigned decision and the range check.
            mn = sentinel

        # Test whether every value is a whole number. The comparison is done
        # element-wise: summing the residuals would let +0.5 and -0.5 cancel
        # out and silently truncate fractional data to integers.
        if (pd.api.types.is_integer_dtype(props[col].dtype)
                or pd.api.types.is_bool_dtype(props[col].dtype)):
            IsInt = True
        else:
            values = props[col].to_numpy(dtype=np.float64)
            # Non-finite values (inf, or NaN in an all-missing column) cannot
            # be stored in an integer dtype, so such columns stay float.
            IsInt = bool(np.isfinite(values).all()
                         and (values == np.trunc(values)).all())

        # Make Integer/unsigned Integer datatypes
        if IsInt:
            if mn >= 0:
                candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
            else:
                candidates = (np.int8, np.int16, np.int32, np.int64)
            # Pick the first (narrowest) dtype whose inclusive range covers
            # the column; if none does, the column keeps its current dtype.
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= mn and mx <= limits.max:
                    props[col] = props[col].astype(candidate)
                    break

        # Make float datatypes 32 bit
        else:
            props[col] = props[col].astype(np.float32)

        # Print new column type
        print("dtype after: ",props[col].dtype)
        print("******************************")

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

In [98]:
# Shrink the numeric columns of df and report its shape on either side of the
# call; a garbage-collection pass is forced before and after.
gc.collect()
print(f"BEFORE df.shape:  {df.shape}")
df, nalist = reduce_mem_usage(df)
print(f"AFTER df.shape:  {df.shape}")
gc.collect()

BEFORE df.shape:  (131930, 2)
Memory usage of properties dataframe is : 2.013214111328125  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  2.013214111328125  MB
This is  100.0 % of the initial size
AFTER df.shape:  (131930, 2)


0

In [99]:
df.dtypes

business_id    object
date           object
dtype: object

In [16]:
# df.to_csv('./review.csv', header=True, index=False, compression='gzip')

In [17]:
# ! rm /kaggle/working/review.csv

In [51]:
# Inspect the row index of df.
# NOTE(review): the recorded output below (stop=150346) equals the business
# dataset row count listed in the notes cell, not the 131930 check-in rows, so
# it looks stale from an earlier run -- re-run after loading to confirm.
df.index

RangeIndex(start=0, stop=150346, step=1)

In [86]:
# Convert df to an Arrow table and write it to ./checkin.parquet. The table is
# passed straight to write_table without being bound to a name, then a
# garbage-collection pass is forced.
# NOTE(review): this cell's recorded execution count (86) is lower than the
# load cell's (91), so the file on disk may have been written from an earlier
# `df` -- confirm by running the notebook top to bottom.
pq.write_table(pa.Table.from_pandas(df), './checkin.parquet')
gc.collect()


0

In [101]:
# Read ./checkin.parquet back into a DataFrame, replacing the in-memory df, so
# the cells that follow inspect what was actually stored on disk.
df = pq.read_table('./checkin.parquet').to_pandas()

In [102]:
df.shape

(131930, 2)

In [103]:
df.dtypes

business_id    object
date           object
dtype: object

In [105]:
df.memory_usage().sum() / 1024**2 

2.013214111328125

In [74]:
# Preview the first rows of df.
# NOTE(review): the recorded output below has the tip dataset's columns
# (user_id, business_id, text, date, compliment_count), not the two check-in
# columns, and this cell's execution count (74) predates the check-in reload
# (101) -- the output looks stale; re-run to confirm.
df.head()

Unnamed: 0,user_id,business_id,text,date,compliment_count
0,AGNUgVwnZUey3gcPCJ76iw,3uLgwr0qeCNMjKenHJwPGQ,Avengers time with the ladies.,2012-05-18 02:17:21,0
1,NBN4MgHP9D3cw--SnauTkA,QoezRbYQncpRqyrLH6Iqjg,They have lots of good deserts and tasty cuban...,2013-02-05 18:35:10,0
2,-copOvldyKh1qr-vzkDEvw,MYoRNLb5chwjQe3c_k37Gg,It's open even when you think it isn't,2013-08-18 00:56:08,0
3,FjMQVZjSqY8syIO-53KFKw,hV-bABTK-glh5wj31ps_Jw,Very decent fried chicken,2017-06-27 23:05:38,0
4,ld0AperBXk1h6UbqmM80zw,_uN0OudeJ3Zl_tf6nxg5ww,Appetizers.. platter special for lunch,2012-10-06 19:43:09,0
