In [1]:
# Convert Yelp Data Challenge streaming JSON files into CSV

import csv
import json
import os

# Input files of the Yelp academic dataset, given relative to the working
# directory. `files` is the full list handed to the generic converter.
files = [
    "../data/yelp_academic_dataset_review.json",
    "../data/yelp_academic_dataset_tip.json",
    "../data/yelp_academic_dataset_user.json",
    "../data/yelp_academic_dataset_checkin.json",
    "../data/yelp_academic_dataset_business.json",
]

# Named handles for the individual entries, in the same order as `files`.
(
    YELP_REVIEW_FILE,
    YELP_TIP_FILE,
    YELP_USER_FILE,
    YELP_CHECKIN_FILE,
    YELP_BUSINESS_FILE,
) = files


In [2]:
def _csv_cell(value):
    """Convert one decoded JSON value into the value written to a CSV cell.

    Lists become a semicolon-delimited string, strings have double quotes and
    backslashes removed, and every other value is returned unchanged.
    """
    if isinstance(value, list):
        # str() each element so lists of numbers do not break the join.
        return ';'.join(str(element) for element in value)
    if isinstance(value, str):
        # Aggressive quoting and escape char handling.
        return value.replace('"', '').replace('\\', '')
    return value


def convertForLoadCSV(files):
    """Convert line-delimited JSON files into CSV files.

    For every path in ``files`` a sibling file named ``<path>.csv`` is written.
    The header row is taken from the keys of the first record, and every
    record (including that first one) is written as a data row in header
    order. Keys missing from a later record produce an empty cell; keys that
    do not appear in the header are not written.

    Args:
        files: iterable of paths to files holding one JSON object per line.

    Raises:
        OSError: if an input file cannot be read or an output file written.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    for json_file in files:
        # newline='' is what the csv module expects for files it writes to;
        # without it the writer's "\r\n" terminator is translated again on
        # platforms whose native line ending is not "\n".
        with open(json_file, "r", encoding="utf-8") as source, \
                open(json_file + '.csv', 'w', newline='', encoding="utf-8") as csvfile:
            writer = csv.writer(csvfile, escapechar='\\', quotechar='"', quoting=csv.QUOTE_ALL)
            header = None
            for line in source:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                item = json.loads(line)
                if header is None:
                    # The first record supplies the column names and is then
                    # written as data like every other record.
                    header = list(item.keys())
                    writer.writerow(header)
                writer.writerow([_csv_cell(item.get(key)) for key in header])

In [3]:
def _import_csv_writer(handle):
    """Return a csv writer with the quoting settings used for every import file."""
    return csv.writer(handle, escapechar='\\', quotechar='"', quoting=csv.QUOTE_ALL)


def _json_records(handle):
    """Yield one decoded JSON object per non-blank line of ``handle``."""
    for line in handle:
        if line.strip():
            yield json.loads(line)


def _friend_ids(friends):
    """Normalize a user's ``friends`` value to a list of user ids.

    A list is returned as is and a missing value becomes an empty list.
    NOTE(review): some dataset releases appear to store friends as a single
    comma-separated string, with the literal "None" for users without friends;
    that form is split here instead of being iterated character by character.
    Confirm against the dataset in use.
    """
    if isinstance(friends, str):
        parts = (part.strip() for part in friends.split(','))
        return [part for part in parts if part and part != 'None']
    return friends or []


def convertForNeo4jImport():
    """Write node and relationship CSV files under ``../data/import/``.

    Reads the user, business and review JSON files (one object per line) and
    produces:

    * ``user.csv`` and ``friends.csv``  - User nodes and FRIENDS relationships
    * ``business.csv``                  - Business nodes
    * ``review.csv``, ``wrote.csv`` and ``review_of.csv`` - Review nodes plus
      the WROTE (User -> Review) and REVIEW_OF (Review -> Business)
      relationships

    Raises:
        OSError: if an input file cannot be read or an output file written.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks one of the fields that is exported.
    """
    # The output directory may not exist on a fresh checkout.
    os.makedirs('../data/import', exist_ok=True)

    # Create User csv file and FRIENDS relationship csv
    with open(YELP_USER_FILE, 'r', encoding='utf-8') as file, \
            open('../data/import/user.csv', 'w', newline='', encoding='utf-8') as csvfile, \
            open('../data/import/friends.csv', 'w', newline='', encoding='utf-8') as friendscsv:
        writer = _import_csv_writer(csvfile)
        writer.writerow([':ID(User)', ':LABEL', 'name'])
        friendwriter = _import_csv_writer(friendscsv)
        friendwriter.writerow([":START_ID(User)", ":END_ID(User)", ":TYPE"])
        for item in _json_records(file):
            writer.writerow([item['user_id'], "User", item['name']])
            for friend in _friend_ids(item.get('friends')):
                friendwriter.writerow([item['user_id'], friend, 'FRIENDS'])

    # Create Business csv file
    with open(YELP_BUSINESS_FILE, 'r', encoding='utf-8') as file, \
            open('../data/import/business.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = _import_csv_writer(csvfile)
        writer.writerow([':ID(Business)', ':LABEL', 'name'])
        for item in _json_records(file):
            try:
                writer.writerow([item['business_id'], 'Business', item['name']])
            except Exception:
                # Show the offending record, then re-raise with the original
                # traceback intact.
                print(item)
                raise

    # Create Review csv file, WROTE relationship csv and REVIEW_OF relationship csv
    with open(YELP_REVIEW_FILE, 'r', encoding='utf-8') as file, \
            open('../data/import/review.csv', 'w', newline='', encoding='utf-8') as csvfile, \
            open('../data/import/wrote.csv', 'w', newline='', encoding='utf-8') as wrotefile, \
            open('../data/import/review_of.csv', 'w', newline='', encoding='utf-8') as review_of_file:
        writer = _import_csv_writer(csvfile)
        writer.writerow([':ID(Review)', ':LABEL', 'stars:int', 'text'])
        wrotewriter = _import_csv_writer(wrotefile)
        wrotewriter.writerow([':START_ID(User)', ':END_ID(Review)', ':TYPE'])
        reviewwriter = _import_csv_writer(review_of_file)
        reviewwriter.writerow([':START_ID(Review)', ':END_ID(Business)', ':TYPE'])

        for item in _json_records(file):
            # Keep each review on one physical line and drop the characters
            # that interact with CSV quoting/escaping.
            text = item['text'].replace('\n', ' ').replace('"', '').replace('\\', '')

            # (:Review {review_id, stars, text})
            writer.writerow([item['review_id'], 'Review', item['stars'], text])

            # (User)-[:WROTE]->(:Review)
            wrotewriter.writerow([item['user_id'], item['review_id'], 'WROTE'])

            # (:Review)-[:REVIEW_OF]->(:Business)
            reviewwriter.writerow([item['review_id'], item['business_id'], 'REVIEW_OF'])


In [None]:
# convertForLoadCSV requires the list of input paths; calling it with no
# argument raises TypeError.
convertForLoadCSV(files)

In [None]:
# Generate the node and relationship CSV files under ../data/import/.
convertForNeo4jImport()