In [2]:
import pandas as pd
import numpy as np
import json
import csv
import seaborn as sns
from glob import glob

In [3]:
# Root folder of the local Yelp data (Windows-style path, trailing separator included).
PATH = 'd:\\data\\yelpdata\\'
# Sub-folders derived from the root: the JSON/CSV tables and the photo files.
DATA = PATH + 'dataset\\'
PHOT = PATH + 'photos\\'

### Getting Ontario businesses

In [3]:
business_cols = ['business_id','name','neighborhood','address','city','state','postal_code','latitude','longitude',
       'stars','review_count','is_open','attributes','categories','hours']
# business.json is line-delimited JSON: one object per line becomes one table row.
# The context manager closes the file as soon as it has been read, and blank lines
# are skipped instead of failing inside json.loads.
# Values are labelled by position, so business_cols must list the keys in the order
# they appear in each JSON object.
with open(f'{DATA}business.json', encoding='utf-8') as business_file:
    business_df = pd.DataFrame(
        [list(json.loads(line).values()) for line in business_file if line.strip()],
        columns=business_cols,
    )

In [4]:
# Keep only the businesses whose state code is 'ON', renumber the rows from 0,
# and save the subset (the row index is written as the first, unnamed CSV column).
business_on = business_df.loc[business_df['state'] == 'ON'].reset_index(drop=True)
business_on.to_csv(f'{DATA}business_on.csv')

In [5]:
# Distinct business ids of the Ontario subset; used as the membership filter
# for the other tables.
ontario_business_ids = {*business_on['business_id']}

In [6]:
# The full business table is not used again below (only business_on is);
# unbind it so its memory can be reclaimed.
del business_df

### Getting Ontario reviews

Each review comes with its date, which is great because it lets us bring in the weather data for that day. We can then look at the effects of weather on both individual and aggregate user review scores. A simple hypothesis, which we can test, is that people tend to give better reviews in "good" weather. It would be even better if we had the exact time of day (which I'm sure Yelp has), but we can't have everything.

In [7]:
review_cols = ['review_id','user_id','business_id','stars','date','text','useful','funny','cool']
# review.json is line-delimited JSON: one object per line becomes one table row.
# The context manager closes the file as soon as it has been read, and blank lines
# are skipped instead of failing inside json.loads.
# Values are labelled by position, so review_cols must list the keys in the order
# they appear in each JSON object.
with open(f'{DATA}review.json', encoding='utf-8') as review_file:
    review_df = pd.DataFrame(
        [list(json.loads(line).values()) for line in review_file if line.strip()],
        columns=review_cols,
    )

Keep only the reviews of Ontario/GTA businesses.

In [9]:
# Reviews whose business_id belongs to an Ontario business, renumbered from 0.
review_on = review_df.loc[review_df['business_id'].isin(ontario_business_ids)].reset_index(drop=True)

In [11]:
# Flatten every review onto one line and strip the characters that clash with CSV
# quoting/escaping: newlines become spaces, double quotes and backslashes are removed.
# Literal (non-regex) replacements through the .str accessor are vectorised and leave
# missing values untouched instead of raising AttributeError on them.
review_on['text'] = (
    review_on['text']
    .str.replace('\n', ' ', regex=False)
    .str.replace('"', '', regex=False)
    .str.replace('\\', '', regex=False)
)

In [12]:
# Save the Ontario reviews with an explicit quote character and a backslash
# escape character (the row index is written as the first, unnamed column).
review_on.to_csv(
    f'{DATA}review_on.csv',
    quotechar='"',
    escapechar='\\',
)

In [13]:
# Distinct ids of the users who wrote at least one of the Ontario reviews.
ontario_user_ids = {*review_on['user_id']}

In [14]:
# The full review table is not used again below (only review_on is);
# unbind it so its memory can be reclaimed.
del review_df

### Users who have reviewed Ontario businesses

In [105]:
user_cols = ['user_id','name','review_count','yelping_since','friends','useful','funny','cool','fans','elite',
            'average_stars','compliment_hot','compliment_more','compliment_profile','compliment_cute','compliment_list',
            'compliment_note','compliment_plain','compliment_cool','compliment_funny','compliment_writer','compliment_photos']
# user.json is line-delimited JSON: one object per line becomes one table row.
# The context manager closes the file as soon as it has been read, and blank lines
# are skipped instead of failing inside json.loads.
# Values are labelled by position, so user_cols must list the keys in the order
# they appear in each JSON object.
with open(f'{DATA}user.json', encoding='utf-8') as user_file:
    user_df = pd.DataFrame(
        [list(json.loads(line).values()) for line in user_file if line.strip()],
        columns=user_cols,
    )

In [109]:
# Users who reviewed at least one Ontario business, renumbered from 0 and saved
# (the row index is written as the first, unnamed CSV column).
user_on = user_df.loc[user_df['user_id'].isin(ontario_user_ids)].reset_index(drop=True)
user_on.to_csv(f'{DATA}user_on.csv')

In [110]:
# The full user table is not used again below (only user_on is);
# unbind it so its memory can be reclaimed.
del user_df

### Photos for Ontario businesses

In [111]:
photos_cols = ['caption','photo_id','business_id','label']
# photos.json is line-delimited JSON: one object per line becomes one table row.
# The context manager closes the file as soon as it has been read, and blank lines
# are skipped instead of failing inside json.loads.
# Values are labelled by position, so photos_cols must list the keys in the order
# they appear in each JSON object.
with open(f'{DATA}photos.json', encoding='utf-8') as photos_file:
    photos_df = pd.DataFrame(
        [list(json.loads(line).values()) for line in photos_file if line.strip()],
        columns=photos_cols,
    )

In [115]:
# Photo records attached to Ontario businesses, renumbered from 0 and saved
# (the row index is written as the first, unnamed CSV column).
photos_on = photos_df.loc[photos_df['business_id'].isin(ontario_business_ids)].reset_index(drop=True)
photos_on.to_csv(f'{DATA}photos_on.csv')

In [116]:
# The full photo table is not used again below (only photos_on is);
# unbind it so its memory can be reclaimed.
del photos_df

### Tips for Ontario businesses

In [118]:
tip_cols = ['text','date','likes','business_id','user_id']
# tip.json is line-delimited JSON: one object per line becomes one table row.
# The context manager closes the file as soon as it has been read, and blank lines
# are skipped instead of failing inside json.loads.
# Values are labelled by position, so tip_cols must list the keys in the order
# they appear in each JSON object.
with open(f'{DATA}tip.json', encoding='utf-8') as tip_file:
    tip_df = pd.DataFrame(
        [list(json.loads(line).values()) for line in tip_file if line.strip()],
        columns=tip_cols,
    )

In [119]:
# Tips left on Ontario businesses, renumbered from 0 and saved
# (the row index is written as the first, unnamed CSV column).
tip_on = tip_df.loc[tip_df['business_id'].isin(ontario_business_ids)].reset_index(drop=True)
tip_on.to_csv(f'{DATA}tip_on.csv')

In [120]:
# The full tip table is not used again below (only tip_on is);
# unbind it so its memory can be reclaimed.
del tip_df

### Checkin times for Ontario businesses

In [121]:
checkin_cols = ['time','business_id']
# checkin.json is line-delimited JSON: one object per line becomes one table row.
# The context manager closes the file as soon as it has been read, and blank lines
# are skipped instead of failing inside json.loads.
# Values are labelled by position, so checkin_cols must list the keys in the order
# they appear in each JSON object.
with open(f'{DATA}checkin.json', encoding='utf-8') as checkin_file:
    checkin_df = pd.DataFrame(
        [list(json.loads(line).values()) for line in checkin_file if line.strip()],
        columns=checkin_cols,
    )

In [122]:
# Check-in records of Ontario businesses, renumbered from 0 and saved
# (the row index is written as the first, unnamed CSV column).
checkin_on = checkin_df.loc[checkin_df['business_id'].isin(ontario_business_ids)].reset_index(drop=True)
checkin_on.to_csv(f'{DATA}checkin_on.csv')

In [123]:
# The full check-in table is not used again below (only checkin_on is);
# unbind it so its memory can be reclaimed.
del checkin_df

### Friends list

Make a separate file only for people with friends on Yelp.

In [14]:
# Reload the saved Ontario users table from disk as the starting point for the friends file.
friends = pd.read_csv(filepath_or_buffer=f'{DATA}user_on.csv')

In [15]:
# Drop the users whose 'friends' field is the literal text '[]'.
friends = friends.loc[friends['friends'].ne('[]')]

In [17]:
# Renumber the remaining rows from 0, discarding the old row labels.
friends = friends.reset_index(drop=True)

In [22]:
# 'Unnamed: 0' is the row index that was written into user_on.csv and read back as
# an ordinary column. Rebinding instead of mutating in place, together with
# errors='ignore', makes this cell safe to re-run: a second run is a no-op rather
# than a KeyError on the already-removed column.
friends = friends.drop(columns='Unnamed: 0', errors='ignore')

In [24]:
# Save the users-with-friends table (the row index is written as the first, unnamed column).
friends.to_csv(path_or_buf=f'{DATA}user_on_friends.csv')

### Smaller reviews file...

In [26]:
# Reload the saved Ontario reviews table from disk.
smallrev = pd.read_csv(filepath_or_buffer=f'{DATA}review_on.csv')

In [29]:
# Keep just the identifier columns and the star rating; the other columns are dropped.
smallrev = smallrev.loc[:, ['review_id', 'user_id', 'business_id', 'stars']]

In [31]:
# Save the slimmed-down reviews table (the row index is written as the first, unnamed column).
smallrev.to_csv(path_or_buf=f'{DATA}review_on_small.csv')