In [4]:
import pandas as pd
import seaborn as sns
import json
import itertools
import numpy as np
import os
from functools import partial
import seaborn as sns

import nest_asyncio
nest_asyncio.apply()

import stan

## Define data conversion functions

In [42]:
def peek(iterable):
    """Look at the first element of an iterable without losing it.

    Parameters
    ----------
    iterable : iterable
        Any iterable. Iterators and generators are advanced by one element;
        containers such as lists are accepted as well.

    Returns
    -------
    tuple or None
        ``None`` when ``iterable`` yields nothing. Otherwise ``(first, rest)``
        where ``first`` is the first element and ``rest`` is an iterator that
        yields every element of the input again, ``first`` included.
    """
    # next() only accepts iterators. iter() returns an iterator unchanged and
    # turns plain containers (list, tuple, str, ...) into one, so both work.
    iterator = iter(iterable)
    try:
        first = next(iterator)
    except StopIteration:
        return None
    # Put the consumed element back in front of the remaining ones.
    return first, itertools.chain([first], iterator)

In [43]:
def json_to_feather(filename, new_filename_base, records_per_file = 1000000, pipe_func = None):
    """Convert a JSON-lines file into a single Feather file, chunk by chunk.

    The input is parsed one line at a time and written to temporary Feather
    files of at most ``records_per_file`` rows each, named
    ``{new_filename_base}_tmp_{i}.feather``. Every temporary file is then read
    back, optionally passed through ``pipe_func``, and deleted. The pieces are
    concatenated and saved as ``{new_filename_base}.feather``.

    Parameters
    ----------
    filename : str
        Path to a text file holding one JSON document per line.
    new_filename_base : str
        Output path without the ``.feather`` extension; also used as the
        prefix of the temporary chunk files.
    records_per_file : int, default 1000000
        Maximum number of records written to each temporary chunk file.
        Must be at least 1.
    pipe_func : callable, optional
        Function that takes a DataFrame and returns a DataFrame. It is applied
        to every chunk after it is read back and before concatenation.

    Returns
    -------
    None
        The result is written to disk.

    Raises
    ------
    ValueError
        If ``records_per_file`` is smaller than 1, or (raised by
        ``pandas.concat``) if the input contains no records.
    """
    if records_per_file < 1:
        # A chunk size below 1 can never make progress through the input.
        raise ValueError("records_per_file must be a positive integer")

    file_num = 0
    # The context manager closes the input even when parsing or writing fails.
    # JSON text is UTF-8, so do not depend on the platform default encoding.
    with open(filename, encoding = "utf-8") as json_file:
        # Parse lazily; whitespace-only lines (e.g. a trailing newline) are skipped.
        records = (json.loads(line) for line in json_file if line.strip())

        peek_res = peek(records)
        while peek_res is not None:
            # peek() hands back an iterator that still includes the peeked record.
            _, records = peek_res
            data = pd.DataFrame.from_records(records, nrows = records_per_file)
            data.to_feather(f"{new_filename_base}_tmp_{file_num}.feather")
            peek_res = peek(records)
            file_num += 1

    dfs = list()
    for read_num in range(file_num):
        tmp_filename = f"{new_filename_base}_tmp_{read_num}.feather"
        small_df = pd.read_feather(tmp_filename)
        if pipe_func is not None:
            small_df = small_df.pipe(pipe_func)

        dfs.append(small_df)
        # The chunk is held in memory now, so its temporary file can go.
        os.remove(tmp_filename)

    # reset_index() gives the combined frame a fresh 0..n-1 index and keeps
    # each chunk's own row labels as an "index" column in the output.
    data = pd.concat(dfs, axis = 0).reset_index()
    data.to_feather(f"{new_filename_base}.feather")

In [44]:
def starts_with(df, start_str):
    """List the column names of ``df`` that begin with ``start_str``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are inspected.
    start_str : str
        Prefix to look for.

    Returns
    -------
    list
        Matching column labels, in their original order.
    """
    column_index = df.columns
    has_prefix = column_index.str.startswith(start_str)
    return column_index[has_prefix].tolist()

In [45]:
def pipeable_drop(df, labels):
    """Return ``df`` without the columns named in ``labels``.

    Written as a plain function so it can be handed to ``DataFrame.pipe`` or
    pre-configured with ``functools.partial``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    labels : label or list of labels
        Column(s) to remove.

    Returns
    -------
    pandas.DataFrame
        New frame with the given columns removed.
    """
    trimmed = df.drop(labels = labels, axis = "columns")
    return trimmed

def pipeable_drop_startswith(df, labels, start):
    """Drop the columns in ``labels`` and every column whose name starts with ``start``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    labels : label or list of labels
        Columns to remove by exact name.
    start : str
        Prefix; remaining columns whose names begin with it are removed too.

    Returns
    -------
    pandas.DataFrame
        New frame without the named and the prefixed columns.
    """
    # Explicit labels go first; the prefix match runs on what is left.
    without_labels = df.drop(columns = labels)
    prefixed = starts_with(without_labels, start)
    return without_labels.drop(columns = prefixed)

## Convert Data to Feather

In [46]:
# Businesses: convert the JSON-lines dump into yelp_business.feather.
filename = "yelp_academic_dataset_business.json"
new_filename_base = "yelp_business"

# Applied by json_to_feather to every chunk: removes the listed columns.
business_drop = partial(pipeable_drop, labels = ["address", "is_open", "attributes", "hours"])

# Writes yelp_business.feather; temporary yelp_business_tmp_<n>.feather files are deleted along the way.
json_to_feather(filename, new_filename_base, pipe_func = business_drop)

In [47]:
# Users: convert the JSON-lines dump into yelp_user.feather.
# NOTE: `filename` and `new_filename_base` are reassigned in each conversion cell.
filename = "yelp_academic_dataset_user.json"
new_filename_base = "yelp_user"

# Applied to every chunk: removes the listed columns by name, then every
# remaining column whose name starts with "compliment".
users_drop = partial(pipeable_drop_startswith, 
                               labels = ["name", "useful", "funny", "cool", "elite", "friends", "fans"],
                               start = "compliment")

# Writes yelp_user.feather; temporary yelp_user_tmp_<n>.feather files are deleted along the way.
json_to_feather(filename, new_filename_base, pipe_func = users_drop)

In [48]:
# Reviews: convert the JSON-lines dump into yelp_review.feather.
# NOTE: `filename` and `new_filename_base` are reassigned in each conversion cell.
filename = "yelp_academic_dataset_review.json"
new_filename_base = "yelp_review"

# Applied to every chunk: removes the "text" column.
review_drop = partial(pipeable_drop, labels = ["text"])

# Writes yelp_review.feather; temporary yelp_review_tmp_<n>.feather files are deleted along the way.
json_to_feather(filename, new_filename_base, pipe_func = review_drop)