In [9]:
import pandas as pd
import json
import os
import seaborn

%matplotlib inline

# schemas

```json


{
    "business_id":"encrypted business id",
    "name":"business name",
    "neighborhood":"hood name",
    "address":"full address",
    "city":"city",
    "state":"state -- if applicable --",
    "postal_code":"postal code",
    "latitude":latitude,
    "longitude":longitude,
    "stars":star rating, rounded to half-stars,
    "review_count":number of reviews,
    "is_open":0/1 (closed/open),
    "attributes":["an array of strings: each array element is an attribute"],
    "categories":["an array of strings of business categories"],
    "hours":["an array of strings of business hours"],
    "type": "business"
}

{
    "review_id":"encrypted review id",
    "user_id":"encrypted user id",
    "business_id":"encrypted business id",
    "stars":star rating, rounded to half-stars,
    "date":"date formatted like 2009-12-19",
    "text":"review text",
    "useful":number of useful votes received,
    "funny":number of funny votes received,
    "cool": number of cool review votes received,
    "type": "review"
}

{
    "user_id":"encrypted user id",
    "name":"first name",
    "review_count":number of reviews,
    "yelping_since": date formatted like "2009-12-19",
    "friends":["an array of encrypted ids of friends"],
    "useful":"number of useful votes sent by the user",
    "funny":"number of funny votes sent by the user",
    "cool":"number of cool votes sent by the user",
    "fans":"number of fans the user has",
    "elite":["an array of years the user was elite"],
    "average_stars":floating point average like 4.31,
    "compliment_hot":number of hot compliments received by the user,
    "compliment_more":number of more compliments received by the user,
    "compliment_profile": number of profile compliments received by the user,
    "compliment_cute": number of cute compliments received by the user,
    "compliment_list": number of list compliments received by the user,
    "compliment_note": number of note compliments received by the user,
    "compliment_plain": number of plain compliments received by the user,
    "compliment_cool": number of cool compliments received by the user,
    "compliment_funny": number of funny compliments received by the user,
    "compliment_writer": number of writer compliments received by the user,
    "compliment_photos": number of photo compliments received by the user,
    "type":"user"
}


{
    "time":["an array of check ins with the format day-hour:number of check ins from hour to hour+1"],
    "business_id":"encrypted business id",
    "type":"checkin"
}


{
    "text":"text of the tip",
    "date":"date formatted like 2009-12-19",
    "likes":compliment count,
    "business_id":"encrypted business id",
    "user_id":"encrypted user id",
    "type":"tip"
}

[
    {
        "photo_id": (encrypted photo id),
        "business_id" : (encrypted business id),
        "caption" : (the photo caption, if any),
        "label" : (the category the photo belongs to, if any)
    },
    {...}
]

```

In [5]:
# Path template for the Yelp dataset files; the placeholder takes the entity name.
BASE_PATH_YELP = '/media/i008/duzy1/yelp_dataset_challenge_round9/yelp_academic_dataset_{}.json'
# Photo-id -> business-id file (judging by its name) from the separate photo dataset.
PHOTOS = '/media/i008/duzy1/2016_yelp_dataset_challenge_photos/photo_id_to_business_id.json'

# One concrete path per entity, all produced from the shared template.
y_bus, y_checkin, y_review, y_tip, y_usr = (
    BASE_PATH_YELP.format(entity)
    for entity in ('business', 'checkin', 'review', 'tip', 'user')
)

In [6]:
#helpers

def yelp_data_generator(file_path):
    """Lazily yield one parsed JSON value per non-blank line of ``file_path``.

    Parameters
    ----------
    file_path : str
        Path to a text file holding one JSON document per line.

    Yields
    ------
    object
        Whatever ``json.loads`` decodes from each non-blank line.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the locale default.
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            record = line.strip()
            # A blank line (e.g. a trailing newline at end of file) is not a record.
            if not record:
                continue
            yield json.loads(record)
            

In [7]:
# One lazy record stream per dataset file; the generator bodies (and therefore
# the file reads) do not run until a stream is iterated.
business, checkins, reviews, tips, users, photos = map(
    yelp_data_generator,
    (y_bus, y_checkin, y_review, y_tip, y_usr, PHOTOS),
)

In [8]:
# Row caps handed to DataFrame.from_records; None means no limit.
NROWS = None
# The review file is really big, so only this many records are loaded from it.
NROWS_REVIEWS = 50000

# Each frame is built from a freshly created generator. A generator can be
# consumed only once, so reusing one would make a re-run of this cell load
# nothing (or, for the capped review frame, a different slice of the file).
df_business = pd.DataFrame.from_records(yelp_data_generator(y_bus), nrows=NROWS)
df_checkins = pd.DataFrame.from_records(yelp_data_generator(y_checkin), nrows=NROWS)
df_reviews = pd.DataFrame.from_records(yelp_data_generator(y_review), nrows=NROWS_REVIEWS)
df_tips = pd.DataFrame.from_records(yelp_data_generator(y_tip), nrows=NROWS)
df_users = pd.DataFrame.from_records(yelp_data_generator(y_usr), nrows=NROWS)
df_photos = pd.DataFrame.from_records(yelp_data_generator(PHOTOS), nrows=NROWS)




# brainstorming


#### 1) exploratory, basic preprocessing and questions to ask the dataset to get a feel of it.


@business
>- process categories represented as lists [business-ds]
- stars distribution based on business/categories
- stars vs number of reviews
- plot on a globe (map) to see where the places actually are.


@reviews
>- useful vs rating
- funny vs rating
- stars distribution (higher resolution than for business)


@checkins
>- process checkin-array to display them as a proper histogram


@users
>- some graph/network analysis could be helpful here, but that's not simple to implement



#### 2) Ideas for more advanced classification, regression, clustering, exploration tasks


>- figure out useful reviews (by text)
- figure out rating (stars) from review text
- image captioning -> generate image description.
- figure out candidates for fake reviews/reviewers
- find users that reviewed a lot of places which are far apart from each other
- 






In [11]:
# Show a bounded preview rather than rendering the whole review frame.
df_reviews.head(10)

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,type,useful,user_id
0,2aFiy99vNLklCx3T_tGS9A,0,2011-10-10,0,NxL8SIC5yqOdnlXCg18IBg,5,If you enjoy service by someone who is as comp...,review,0,KpkOkG6RIf4Ra25Lhhxf1A
1,2aFiy99vNLklCx3T_tGS9A,0,2010-12-29,0,pXbbIgOXvLuTi_SPs1hQEQ,5,After being on the phone with Verizon Wireless...,review,1,bQ7fQq1otn9hKX-gXRsrgA
2,2aFiy99vNLklCx3T_tGS9A,0,2011-04-29,0,wslW2Lu4NYylb1jEapAGsw,5,Great service! Corey is very service oriented....,review,0,r1NUhdNmL6yU9Bn-Yx6FTw
3,2LfIuF3_sX6uwe-IR-P0jQ,1,2014-07-14,0,GP6YEearUWrzPtQYSF1vVg,5,Highly recommended. Went in yesterday looking ...,review,0,aW3ix1KNZAvoM8q-WghA3Q
4,2LfIuF3_sX6uwe-IR-P0jQ,0,2014-01-15,0,25RlYGq2s5qShi-pn3ufVA,4,I walked in here looking for a specific piece ...,review,0,YOo-Cip8HqvKp_p9nEGphw
5,2LfIuF3_sX6uwe-IR-P0jQ,1,2013-04-28,0,Uf1Ki1yyH_JDKhLvn2e4FQ,5,What a great place! Modern on Melrose has amaz...,review,2,bgl3j8yJcRO-00NkUYsXGQ
6,2LfIuF3_sX6uwe-IR-P0jQ,0,2014-10-12,0,oFmVZh-La7SuvpHrH_Al4Q,4,A hidden gem! Found a beautiful buffet for a g...,review,0,CWKF9de-nskLYEqDDCfubg
7,2LfIuF3_sX6uwe-IR-P0jQ,0,2012-09-18,0,bRvdVt88MJ_YMTlLbjDLxQ,5,This place is a great for those vintage/mid ce...,review,2,GJ7PTY7huYORFKKg3db3Gw
8,2LfIuF3_sX6uwe-IR-P0jQ,0,2015-10-11,0,zNUSxqflZKgKD1NQH3jdFA,5,This is the place to go for all your Mid Centu...,review,0,rxqp9eXZj1jYTn0UIsm3Hg
9,2LfIuF3_sX6uwe-IR-P0jQ,0,2015-04-05,0,LkP1l7sZIwOV6IKNLqQp_A,5,"Great items at a good price. Helpful, easy to...",review,0,UU0nHQtHPMAfLidk8tOHTg
