In [6]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import argparse
import json

# Category labels that mark a metadata record as a restaurant.
# make_metadata_df keeps a record when at least one entry of its 'category'
# list appears in this array. Matching is exact and case-sensitive; note that
# the list contains both 'Takeout Restaurant' and 'Takeout restaurant'.
# The array uses the fixed-width dtype '<U31' (31 characters, the length of the
# longest label, 'Traditional American restaurant'); numpy silently truncates
# longer strings, so widen the dtype before adding a longer label.
# NOTE(review): 'Restaurant supply store' is in the list, so records carrying
# only that category also pass the filter — confirm this is intended.
RESTS = np.array(['American restaurant', 'Angler fish restaurant',
       'Armenian restaurant', 'Asian fusion restaurant',
       'Asian restaurant', 'Australian restaurant', 'Austrian restaurant',
       'Barbecue restaurant', 'Breakfast restaurant', 'Brunch restaurant',
       'Buffet restaurant', 'Burrito restaurant',
       'Cheesesteak restaurant', 'Chicken restaurant',
       'Chicken wings restaurant', 'Chinese noodle restaurant',
       'Chinese restaurant', 'Chophouse restaurant',
       'Continental restaurant', 'Delivery Chinese restaurant',
       'Delivery Restaurant', 'Dessert restaurant',
       'Down home cooking restaurant', 'European restaurant',
       'Family restaurant', 'Fast food restaurant', 'Filipino restaurant',
       'Fine dining restaurant', 'Fish & chips restaurant',
       'German restaurant', 'Gluten-free restaurant', 'Greek restaurant',
       'Hamburger restaurant', 'Hawaiian restaurant',
       'Health food restaurant', 'Hoagie restaurant',
       'Hot dog restaurant', 'Indian restaurant', 'Irish restaurant',
       'Israeli restaurant', 'Italian restaurant', 'Japanese restaurant',
       'Korean restaurant', 'Latin American restaurant',
       'Lebanese restaurant', 'Lunch restaurant', 'Meat dish restaurant',
       'Mediterranean restaurant', 'Mexican restaurant',
       'Mexican torta restaurant', 'Middle Eastern restaurant',
       'Mongolian barbecue restaurant', 'New American restaurant',
       'Organic restaurant', 'Pan-Asian restaurant',
       'Peruvian restaurant', 'Pho restaurant', 'Pizza restaurant',
       'Ramen restaurant', 'Restaurant', 'Restaurant or cafe',
       'Restaurant supply store', 'Rice restaurant', 'Seafood restaurant',
       'Small plates restaurant', 'Soul food restaurant',
       'Soup restaurant', 'Southeast Asian restaurant',
       'Southern restaurant (US)', 'Southwestern restaurant (US)',
       'Spanish restaurant', 'Sushi restaurant', 'Taco restaurant',
       'Taiwanese restaurant', 'Takeout Restaurant', 'Takeout restaurant',
       'Tex-Mex restaurant', 'Thai restaurant',
       'Traditional American restaurant', 'Traditional restaurant',
       'Vegan restaurant', 'Vegetarian restaurant',
       'Venezuelan restaurant', 'Vietnamese restaurant',
       'Western restaurant'], dtype='<U31')

def parse(path):
    """Lazily read a JSON Lines file, yielding one decoded object per line.

    Parameters
    ----------
    path : str or path-like
        Location of a text file holding one JSON document per line.

    Yields
    ------
    object
        The result of ``json.loads`` for each non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened (raised on first iteration, since this
        is a generator).
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle once the generator is exhausted,
    # explicitly closed, or garbage-collected; the previous bare open() leaked it.
    with open(path, 'r') as handle:
        for line in handle:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            yield json.loads(line)


def make_metadata_df(fl):
    """Build a DataFrame of restaurant businesses from a metadata JSON Lines file.

    A record is kept when its ``'category'`` value is not null and shares at
    least one label with the module-level ``RESTS`` array. Records whose
    category is null or absent are skipped.

    Parameters
    ----------
    fl : str or path-like
        Path to the metadata file; it is read through ``parse``.

    Returns
    -------
    pandas.DataFrame
        One row per kept record with columns ``Name``, ``gmap_id``,
        ``address``, ``avg_rating``, ``relative_results`` and
        ``num_of_reviews`` (note the capitalised ``Name``).

    Raises
    ------
    KeyError
        If a kept record lacks one of the copied fields.
    """
    # Built once: a plain-str set makes each per-record check a cheap hash
    # lookup instead of re-iterating the whole numpy array for every record.
    restaurant_labels = frozenset(str(label) for label in RESTS)
    source_fields = ('name', 'gmap_id', 'address', 'avg_rating',
                     'relative_results', 'num_of_reviews')

    rest_records = []
    print('Processing metadata')
    for record in tqdm(parse(fl)):
        categories = record.get('category')
        if categories is None:
            continue
        if not restaurant_labels.isdisjoint(categories):
            rest_records.append([record[field] for field in source_fields])

    return pd.DataFrame(rest_records,
                        columns=['Name', 'gmap_id', 'address', 'avg_rating',
                                 'relative_results', 'num_of_reviews'])

def make_reviews_df(fl, min_char=0, max_char=10000):
    """Build a DataFrame of reviews whose text length lies in a given range.

    A review is kept when its ``'text'`` value is not null and
    ``min_char <= len(text) < max_char`` (lower bound inclusive, upper bound
    exclusive). Reviews whose text is null or absent are skipped.

    Parameters
    ----------
    fl : str or path-like
        Path to the reviews JSON Lines file; it is read through ``parse``.
    min_char : int, default 0
        Minimum text length (inclusive).
    max_char : int, default 10000
        Maximum text length (exclusive).

    Returns
    -------
    pandas.DataFrame
        One row per kept review with columns ``name``, ``rating``, ``text``
        and ``gmap_id``.

    Raises
    ------
    KeyError
        If a kept review lacks ``'name'``, ``'rating'`` or ``'gmap_id'``.
    """
    reviews = []
    print('Processing reviews data')
    for review in tqdm(parse(fl)):
        text = review.get('text')
        if text is None:
            continue
        if min_char <= len(text) < max_char:
            reviews.append([review['name'],
                            review['rating'],
                            text,
                            review['gmap_id']])
    return pd.DataFrame(reviews, columns=['name', 'rating', 'text', 'gmap_id'])



In [7]:
# `os` is not part of the notebook's imports cell; importing it here keeps
# this cell from raising NameError after Restart Kernel -> Run All.
import os

# Show the kernel's working directory (relative paths resolve against it).
os.getcwd()

'/app/notebooks'

In [None]:
# --- Configuration ---------------------------------------------------------
# The kernel's working directory is /app/notebooks (see the os.getcwd() cell),
# so a relative "data/..." path would resolve under notebooks/. Use the same
# absolute data directory as the metadata file.
# NOTE(review): confirm the reviews file really lives in /app/data.
reviews_file_path = "/app/data/review-Massachusetts_10.json"
meta_file_path = "/app/data/meta-Massachusetts.json"

# These settings were referenced by this cell but never assigned in the
# notebook, which made it fail with NameError. Adjust as needed.
min_char = 0        # inclusive lower bound on review length (make_reviews_df default)
max_char = 10000    # exclusive upper bound on review length (make_reviews_df default)
max_num_reviews = float("inf")  # no cap; set an int to drop places with that many reviews or more
output_file_path = "/app/data/restaurant_reviews.csv"  # TODO(review): placeholder output location

# --- Load --------------------------------------------------------------------
reviews_df = make_reviews_df(reviews_file_path, min_char=min_char, max_char=max_char)
# The original passed the undefined name `metadata_file_path` here.
meta_df = make_metadata_df(meta_file_path)

# --- Filter and join ---------------------------------------------------------
# Keep only businesses below the review-count cap, then attach their metadata
# to every review of the same place.
meta_df = meta_df[meta_df['num_of_reviews'] < max_num_reviews]
combined_df = reviews_df.merge(meta_df, on="gmap_id", how="inner")

# --- Aggregate: one row per (Name, address) with all review texts joined -----
# Note: groupby drops rows whose Name or address is missing (pandas default
# dropna=True), so such businesses do not appear in the output.
sub_df = combined_df.loc[:, ['text', 'Name', 'address']]
sub_df = sub_df.groupby(["Name", "address"]).agg({"text": "|||||".join}).reset_index()

sub_df.to_csv(output_file_path)