In [1]:
# -*- coding: utf-8 -*-
"""Convert the Yelp Dataset Challenge dataset from json format to csv.

For more information on the Yelp Dataset Challenge please visit http://yelp.com/dataset_challenge

"""
import argparse
import collections
import collections.abc
import csv
import json
import os
import sys

In [3]:
def get_superset_of_column_names_from_file(json_file_path):
    """Collect every flattened column name that occurs in a JSON-lines file.

    Args:
        json_file_path: Path to a text file holding one JSON object per line.

    Returns:
        A set containing the union of the keys that ``get_column_names``
        produces for each record in the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    column_names = set()
    # Decode explicitly as UTF-8 (the same encoding the CSV is written with)
    # instead of relying on the platform default.
    with open(json_file_path, encoding='utf-8') as fin:
        for line in fin:
            if not line.strip():
                # A blank line carries no record and would make json.loads fail.
                continue
            # Updating a set from a dict adds the dict's keys.
            column_names.update(get_column_names(json.loads(line)))
    return column_names

def get_column_names(line_contents, parent_key=''):
    """Flatten one parsed JSON record into a single-level dict.

    Nested mappings are expanded recursively; each leaf value is stored under
    the dot-joined path of keys that leads to it (for example ``"a.b"``).

    Args:
        line_contents: Mapping holding one decoded JSON object.
        parent_key: Dotted path of the enclosing mapping. The empty string
            (the default) marks the top level.

    Returns:
        A dict mapping each flattened, dot-separated key to its leaf value.
    """
    flattened = {}
    for k, v in line_contents.items():
        column_name = "{0}.{1}".format(parent_key, k) if parent_key else k
        # The ABC aliases were removed from the top-level ``collections``
        # namespace in Python 3.10; they live in ``collections.abc``.
        if isinstance(v, collections.abc.MutableMapping):
            flattened.update(get_column_names(v, column_name))
        else:
            flattened[column_name] = v
    return flattened

def get_nested_value(d, key):
    """Look up a possibly dot-separated key inside nested mappings.

    Args:
        d: The mapping to search (one decoded JSON record).
        key: A plain key, or a dotted path such as ``"outer.inner"`` in which
            every segment before the last names a nested mapping.

    Returns:
        The value stored at the path, or ``None`` when a segment is missing
        or when the path runs through a value that is not a mapping (for
        example ``None`` or a string).
    """
    current = d
    remaining = key
    while True:
        # A column such as "a.b" can be requested from a record in which "a"
        # is null or a scalar; such a value has no ``.get`` to descend into.
        if not isinstance(current, collections.abc.Mapping):
            return None
        if '.' not in remaining:
            return current.get(remaining)
        head, remaining = remaining.split('.', 1)
        current = current.get(head)

def get_row(line_contents, column_names):
    """Build the list of CSV cell strings for one parsed JSON record.

    Every column name is resolved with ``get_nested_value``. A result of
    ``None`` becomes an empty string; any other value is rendered through
    ``'{0}'.format``.

    Args:
        line_contents: Mapping holding one decoded JSON object.
        column_names: Iterable of (possibly dotted) column names.

    Returns:
        A list of strings, one per column name, in iteration order.
    """
    looked_up = (get_nested_value(line_contents, name) for name in column_names)
    return ['' if value is None else '{0}'.format(value) for value in looked_up]

def read_and_write_file(json_file_path, csv_file_path, column_names):
    """Convert a JSON-lines file into a CSV file with the given columns.

    Args:
        json_file_path: Path to the input file, one JSON object per line.
        csv_file_path: Path of the CSV file to create (overwritten if present).
        column_names: Iterable of (possibly dotted) column names. A set is
            written in sorted order; any other iterable keeps its own order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Fix the column order once so the header and every row agree. Set
    # iteration order for strings differs between interpreter runs, so sets
    # are sorted to keep the CSV layout reproducible.
    if isinstance(column_names, (set, frozenset)):
        columns = sorted(column_names)
    else:
        columns = list(column_names)

    # Read with the same explicit UTF-8 encoding that is used for writing.
    with open(json_file_path, encoding='utf-8') as fin, \
            open(csv_file_path, 'w', newline='', encoding='utf-8') as fout:
        csv_writer = csv.writer(fout)
        csv_writer.writerow(columns)  # Write header
        for line in fin:
            if not line.strip():
                # A blank line carries no record and would make json.loads fail.
                continue
            csv_writer.writerow(get_row(json.loads(line), columns))

In [5]:
import json
import csv
import argparse
import collections.abc
import sys
import os

def get_superset_of_column_names_from_file(json_file_path):
    """Collect every flattened column name that occurs in a JSON-lines file.

    Args:
        json_file_path: Path to a text file holding one JSON object per line.

    Returns:
        A set containing the union of the keys that ``get_column_names``
        produces for each record in the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    column_names = set()
    # Decode explicitly as UTF-8 (the same encoding the CSV is written with)
    # instead of relying on the platform default.
    with open(json_file_path, encoding='utf-8') as fin:
        for line in fin:
            if not line.strip():
                # A blank line carries no record and would make json.loads fail.
                continue
            # Updating a set from a dict adds the dict's keys.
            column_names.update(get_column_names(json.loads(line)))
    return column_names

def get_column_names(line_contents, parent_key=''):
    """Flatten one parsed JSON record into a single-level dict.

    Nested mappings are expanded recursively; each leaf value is stored under
    the dot-joined path of keys that leads to it (for example ``"a.b"``).

    Args:
        line_contents: Mapping holding one decoded JSON object.
        parent_key: Dotted path of the enclosing mapping. The empty string
            (the default) marks the top level.

    Returns:
        A dict mapping each flattened, dot-separated key to its leaf value.
    """
    flattened = {}
    for key, value in line_contents.items():
        path = key if not parent_key else "{0}.{1}".format(parent_key, key)
        if not isinstance(value, collections.abc.MutableMapping):
            flattened[path] = value
            continue
        # Descend into the nested mapping, carrying the path built so far.
        flattened.update(get_column_names(value, path))
    return flattened

def get_nested_value(d, key):
    """Look up a possibly dot-separated key inside nested mappings.

    Args:
        d: The mapping to search (one decoded JSON record).
        key: A plain key, or a dotted path such as ``"outer.inner"`` in which
            every segment before the last names a nested mapping.

    Returns:
        The value stored at the path, or ``None`` when a segment is missing
        or when the path runs through a value that is not a mapping (for
        example ``None`` or a string).
    """
    current = d
    remaining = key
    while True:
        # A column such as "a.b" can be requested from a record in which "a"
        # is null or a scalar; such a value has no ``.get`` to descend into.
        if not isinstance(current, collections.abc.Mapping):
            return None
        if '.' not in remaining:
            return current.get(remaining)
        head, remaining = remaining.split('.', 1)
        current = current.get(head)

def get_row(line_contents, column_names):
    """Build the list of CSV cell strings for one parsed JSON record.

    Every column name is resolved with ``get_nested_value``. A result of
    ``None`` becomes an empty string; any other value is rendered through
    ``'{0}'.format``.

    Args:
        line_contents: Mapping holding one decoded JSON object.
        column_names: Iterable of (possibly dotted) column names.

    Returns:
        A list of strings, one per column name, in iteration order.
    """
    looked_up = (get_nested_value(line_contents, name) for name in column_names)
    return ['' if value is None else '{0}'.format(value) for value in looked_up]

def read_and_write_file(json_file_path, csv_file_path, column_names):
    """Convert a JSON-lines file into a CSV file with the given columns.

    Args:
        json_file_path: Path to the input file, one JSON object per line.
        csv_file_path: Path of the CSV file to create (overwritten if present).
        column_names: Iterable of (possibly dotted) column names. A set is
            written in sorted order; any other iterable keeps its own order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Fix the column order once so the header and every row agree. Set
    # iteration order for strings differs between interpreter runs, so sets
    # are sorted to keep the CSV layout reproducible.
    if isinstance(column_names, (set, frozenset)):
        columns = sorted(column_names)
    else:
        columns = list(column_names)

    # Read with the same explicit UTF-8 encoding that is used for writing.
    with open(json_file_path, encoding='utf-8') as fin, \
            open(csv_file_path, 'w', newline='', encoding='utf-8') as fout:
        csv_writer = csv.writer(fout)
        csv_writer.writerow(columns)  # Write header
        for line in fin:
            if not line.strip():
                # A blank line carries no record and would make json.loads fail.
                continue
            csv_writer.writerow(get_row(json.loads(line), columns))

# Entry point: pick the input file, check that it exists, then write a CSV
# next to it holding the superset of all flattened columns.
if __name__ == '__main__':
    if 'ipykernel' in sys.modules:
        # Running in an interactive environment: there are no command-line
        # arguments to parse, so a fixed dataset location is used.
        json_file = os.path.expanduser('~/Desktop/archive/yelp_academic_dataset_user.json')
    else:
        parser = argparse.ArgumentParser(
            description='Convert Yelp Dataset Challenge data from JSON format to CSV.'
        )

        parser.add_argument(
            'json_file',
            type=str,
            help='The JSON file to convert.'
        )

        args = parser.parse_args()

        json_file = os.path.expanduser(args.json_file)

    if not os.path.isfile(json_file):
        raise FileNotFoundError(f"File not found: {json_file}")

    # Drop only a trailing ".json". Splitting on the first ".json" anywhere
    # in the path would truncate paths whose directories contain that text.
    suffix = '.json'
    base_path = json_file[:-len(suffix)] if json_file.endswith(suffix) else json_file
    csv_file = '{0}.csv'.format(base_path)

    column_names = get_superset_of_column_names_from_file(json_file)
    read_and_write_file(json_file, csv_file, column_names)

In [None]:
import json
import csv
import argparse
import collections.abc
import sys
import os

def get_superset_of_column_names_from_file(json_file_path):
    """Collect every flattened column name that occurs in a JSON-lines file.

    Args:
        json_file_path: Path to a text file holding one JSON object per line.

    Returns:
        A set containing the union of the keys that ``get_column_names``
        produces for each record in the file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    column_names = set()
    # Decode explicitly as UTF-8 (the same encoding the CSV is written with)
    # instead of relying on the platform default.
    with open(json_file_path, encoding='utf-8') as fin:
        for line in fin:
            if not line.strip():
                # A blank line carries no record and would make json.loads fail.
                continue
            # Updating a set from a dict adds the dict's keys.
            column_names.update(get_column_names(json.loads(line)))
    return column_names

def get_column_names(line_contents, parent_key=''):
    """Flatten one parsed JSON record into a single-level dict.

    Nested mappings are expanded recursively; each leaf value is stored under
    the dot-joined path of keys that leads to it (for example ``"a.b"``).

    Args:
        line_contents: Mapping holding one decoded JSON object.
        parent_key: Dotted path of the enclosing mapping. The empty string
            (the default) marks the top level.

    Returns:
        A dict mapping each flattened, dot-separated key to its leaf value.
    """
    flattened = {}
    for key, value in line_contents.items():
        path = key if not parent_key else "{0}.{1}".format(parent_key, key)
        if not isinstance(value, collections.abc.MutableMapping):
            flattened[path] = value
            continue
        # Descend into the nested mapping, carrying the path built so far.
        flattened.update(get_column_names(value, path))
    return flattened

def get_nested_value(d, key):
    """Look up a possibly dot-separated key inside nested mappings.

    Args:
        d: The mapping to search (one decoded JSON record).
        key: A plain key, or a dotted path such as ``"outer.inner"`` in which
            every segment before the last names a nested mapping.

    Returns:
        The value stored at the path, or ``None`` when a segment is missing
        or when the path runs through a value that is not a mapping (for
        example ``None`` or a string).
    """
    current = d
    remaining = key
    while True:
        # A column such as "a.b" can be requested from a record in which "a"
        # is null or a scalar; such a value has no ``.get`` to descend into.
        if not isinstance(current, collections.abc.Mapping):
            return None
        if '.' not in remaining:
            return current.get(remaining)
        head, remaining = remaining.split('.', 1)
        current = current.get(head)

def get_row(line_contents, column_names):
    """Build the list of CSV cell strings for one parsed JSON record.

    Every column name is resolved with ``get_nested_value``. A result of
    ``None`` becomes an empty string; any other value is rendered through
    ``'{0}'.format``.

    Args:
        line_contents: Mapping holding one decoded JSON object.
        column_names: Iterable of (possibly dotted) column names.

    Returns:
        A list of strings, one per column name, in iteration order.
    """
    looked_up = (get_nested_value(line_contents, name) for name in column_names)
    return ['' if value is None else '{0}'.format(value) for value in looked_up]

def read_and_write_file(json_file_path, csv_file_path, column_names):
    """Convert a JSON-lines file into a CSV file with the given columns.

    Args:
        json_file_path: Path to the input file, one JSON object per line.
        csv_file_path: Path of the CSV file to create (overwritten if present).
        column_names: Iterable of (possibly dotted) column names. A set is
            written in sorted order; any other iterable keeps its own order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Fix the column order once so the header and every row agree. Set
    # iteration order for strings differs between interpreter runs, so sets
    # are sorted to keep the CSV layout reproducible.
    if isinstance(column_names, (set, frozenset)):
        columns = sorted(column_names)
    else:
        columns = list(column_names)

    # Read with the same explicit UTF-8 encoding that is used for writing.
    with open(json_file_path, encoding='utf-8') as fin, \
            open(csv_file_path, 'w', newline='', encoding='utf-8') as fout:
        csv_writer = csv.writer(fout)
        csv_writer.writerow(columns)  # Write header
        for line in fin:
            if not line.strip():
                # A blank line carries no record and would make json.loads fail.
                continue
            csv_writer.writerow(get_row(json.loads(line), columns))

# Entry point: pick the input file, check that it exists, then write a CSV
# next to it holding the superset of all flattened columns.
if __name__ == '__main__':
    if 'ipykernel' in sys.modules:
        # Running in an interactive environment: there are no command-line
        # arguments to parse, so a fixed dataset location is used.
        json_file = os.path.expanduser('~/Desktop/archive/yelp_academic_dataset_user.json')
    else:
        parser = argparse.ArgumentParser(
            description='Convert Yelp Dataset Challenge data from JSON format to CSV.'
        )

        parser.add_argument(
            'json_file',
            type=str,
            help='The JSON file to convert.'
        )

        args = parser.parse_args()

        json_file = os.path.expanduser(args.json_file)

    if not os.path.isfile(json_file):
        raise FileNotFoundError(f"File not found: {json_file}")

    # Drop only a trailing ".json". Splitting on the first ".json" anywhere
    # in the path would truncate paths whose directories contain that text.
    suffix = '.json'
    base_path = json_file[:-len(suffix)] if json_file.endswith(suffix) else json_file
    csv_file = '{0}.csv'.format(base_path)

    column_names = get_superset_of_column_names_from_file(json_file)
    read_and_write_file(json_file, csv_file, column_names)