#### Split the large JSON file into smaller files of at most 100,000 lines each

In [None]:
from itertools import chain, islice

def chunks(iterable, n):
    """Lazily split *iterable* into consecutive chunks of at most *n* items.

    Each yielded chunk is itself a lazy iterator drawing from one shared
    underlying iterator, so a chunk must be fully consumed before the next
    one is requested; otherwise the chunk boundaries shift.

    Args:
        iterable: Any iterable (for example an open text file).
        n: Maximum number of items per chunk. ``n - 1`` is handed to
            ``islice``, so ``n`` must be at least 1.

    Yields:
        An iterator over the next (up to) *n* items of *iterable*.
    """
    iterator = iter(iterable)
    # Looping over the iterator ends the generator cleanly on exhaustion.
    # A bare next() call here would leak StopIteration, which PEP 479
    # (Python 3.7+) converts into a RuntimeError.
    for first in iterator:
        yield chain([first], islice(iterator, n - 1))

# Maximum number of lines per output file: 10 * 10**4 == 100,000.
l = 10 * 10**4

file_large = 'layer1.json'
with open(file_large, encoding='utf-8') as source:
    # Every part is a lazy view onto `source`; it is written out in full
    # before the next part is requested.
    numbered_parts = enumerate(chunks(source, l))
    for part_no, part_lines in numbered_parts:
        # Output files are named <input name>.<part number>, starting at 0.
        part_path = '{}.{}'.format(file_large, part_no)
        with open(part_path, 'w', encoding='utf-8') as sink:
            sink.writelines(part_lines)

#### Use sed and jq to fix the JSON formatting of each chunk
* the first file needs its trailing "," replaced with a closing "]"
* middle files need an opening "[" added and their trailing "," replaced with a closing "]"
* the last file needs an opening "[" added

In [None]:
# First chunk: replace the trailing "," on the last line with "]" (edited in
# place via `sed -i`), then let jq slurp the file and write the compact
# result to layer1.json.0.valid.
!sed -i '$ s/,$/]/' layer1.json.0 && cat layer1.json.0 | jq -c --slurp . > layer1.json.0.valid

In [None]:
for idx in range(1, 10):
    # Middle chunks (1-9): prepend a "[" line, replace the trailing "," on the
    # last line with "]", then write jq's compact slurped output to
    # layer1.json.<idx>.valid.
    # NOTE: `sed -i` edits the chunk in place, so re-running this cell
    # prepends another "[" to files that were already fixed.
    !sed -i '1s/^/[\n/' layer1.json.{idx} && sed -i '$ s/,$/]/' layer1.json.{idx} && cat layer1.json.{idx} | jq -c --slurp . > layer1.json.{idx}.valid

In [None]:
# Last chunk (10): only a "[" line is prepended (edited in place via
# `sed -i`); no closing-bracket substitution is applied here. jq then writes
# the compact slurped output to layer1.json.10.valid.
!sed -i '1s/^/[\n/' layer1.json.10 && cat layer1.json.10 | jq -c --slurp . > layer1.json.10.valid

#### Load json files into compressed pickle files for reusability

In [None]:
import pandas as pd
import json
from pandas.io.json import json_normalize

# Chunks 0 through 10 are the ".valid" files written by the sed/jq cells
# above (first chunk, middle chunks 1-9, and the last chunk 10).
files = ["layer1.json.{}.valid".format(idx) for idx in range(11)]

for each in files:
    # Column 0 of the parsed frame holds the values that are flattened with
    # json_normalize; the resulting frames are stacked into one table.
    tmp_df = pd.read_json(each, lines=True)
    new_df = pd.concat([json_normalize(x) for x in tmp_df[0]],
                       ignore_index=True)

    # add ingredient and instruction counts and shorten url
    # (column-wise map avoids building a row object for every record)
    new_df["ingr_count"] = new_df["ingredients"].map(len)
    new_df["instr_count"] = new_df["instructions"].map(len)
    # "collection" is the part of the url after "//", up to the first "/"
    # or "?".
    new_df["collection"] = new_df["url"].map(
        lambda url: url.split("//")[-1].split("/")[0].split('?')[0])
    new_df.to_pickle("./" + each + ".gz", compression="gzip")