In [18]:
import ijson
from geopy.distance import geodesic

USER_FILE_URL = "D:/dataset/yelp_academic_dataset_user.json"
REVIEW_FILE_URL = "D:/dataset/yelp_academic_dataset_review.json"
BUSINESS_FILE_URL = "D:/dataset/yelp_academic_dataset_business.json"

In [4]:
# count the number of active users
def activeUserCount():
    f = open(USER_FILE_URL, encoding="utf-8")
    users = ijson.items(f, "", multiple_values=True)
    active_users = (user for user in users if user["review_count"] >= 50)
    count = 0
    for user in active_users:
        count = count + 1
    print("The total number of active user: " + str(count))
activeUserCount()

The total number of active user: 189978


In [25]:
# count the number of businesses who have review count >= 1000
def businessCount(lbound: int = 0):
    f = open(BUSINESS_FILE_URL, encoding="utf-8")
    businesses = ijson.items(f, "", multiple_values=True)
    active_businesses = (business for business in businesses if business["review_count"] >= lbound)
    count = 0
    for business in active_businesses:
        count = count + 1
        if count <= 10:
            print(business["business_id"])
    print("The total number of businesses who have review count at least " + str(lbound) + ": " + str(count))
businessCount(1000)

bP6goJODwRnM3AVy45Kn9w
vecuat0jOia-CJveW3ngDw
J8Ha6yIvGoU-E31jnCq7Ew
QdzRS1s0tSltIokm2xV-kA
3cLBtLFiH8IRdlEy9S8RiQ
oug5bLTWP_YTtj1C3_X6Xw
bZiIIUcpgxh8mpKMDhdqbA
U9lrX8Nviajz-74dF6zL7g
Un6u2cECyV4nZb_HGZ-uTA
PySoEDAeoksJcVCJi8Sjzg
The total number of businesses who have review count at least 1000: 418


In [29]:
# count the user who visit business at least n times.
def activeUserCountForBusiness(bid: str, n:int = 2):
    f = open(REVIEW_FILE_URL, encoding="utf-8")
    reviews = ijson.items(f, "", multiple_values=True)
    reviews = (review for review in reviews if review["business_id"] == bid)
    users = {}
    review_count = 0
    for review in reviews:
        review_count = review_count + 1
        user_id = review["user_id"]
        if user_id in users:
            users[user_id] = users[user_id] + 1
        else:
            users[user_id] = 1
    print("The total number of reviews of business " + bid + " : " + str(review_count))
    user_count = 0
    for key in users.keys():
        if users[key] >= n:
            user_count = user_count + 1
    print("The total number of user who visit business " + bid + " at least " + str(n) + " times: " + str(user_count))
activeUserCountForBusiness("bP6goJODwRnM3AVy45Kn9w")

The total number of reviews of business bP6goJODwRnM3AVy45Kn9w : 1031
The total number of user who visit business bP6goJODwRnM3AVy45Kn9w at least 2 times: 27


In [8]:
# count the number of business in different cities
def businessCountOfCities():
    f = open(BUSINESS_FILE_URL, encoding="utf-8")
    businesses = ijson.items(f, "", multiple_values=True)
    businesses = (business for business in businesses if business["city"])
    cities = {}
    for business in businesses:
        city = business["city"]
        if city in cities:
            cities[city] = cities[city] + 1
        else:
            cities[city] = 1
    ordered_cities = sorted(cities.items(), key=lambda x:x[1], reverse=True)
    for (key, value) in ordered_cities:
        print(key + ": " + str(value))
businessCountOfCities()

Austin: 22416
Portland: 18203
Vancouver: 13330
Atlanta: 12612
Orlando: 10637
Boston: 8263
Columbus: 6634
Boulder: 2542
Cambridge: 2433
Beaverton: 2252
Richmond: 1791
Burnaby: 1725
Kissimmee: 1713
Decatur: 1411
Winter Park: 1288
Somerville: 1265
Quincy: 1093
Brookline: 986
North Vancouver: 926
Tigard: 916
Waltham: 883
Newton: 823
Smyrna: 820
Altamonte Springs: 783
Lake Oswego: 761
Dublin: 734
Salem: 721
Sanford: 672
Natick: 669
Medford: 668
Coquitlam: 652
Woburn: 618
Watertown: 607
Peabody: 602
Burlington: 595
Oviedo: 584
Braintree: 577
Westerville: 569
Arlington: 545
Surrey: 540
Malden: 538
New Westminster: 522
Winter Garden: 517
Lake Mary: 512
Clermont: 492
Clackamas: 484
Marietta: 476
Beverly: 476
Tucker: 472
Danvers: 472
Longwood: 469
Norwood: 461
Milwaukie: 456
Hilliard: 453
Sandy Springs: 450
Allston: 436
Saugus: 434
Brighton: 430
Dedham: 416
Tualatin: 412
Dorchester: 405
Apopka: 403
Happy Valley: 403
Weymouth: 397
Lynn: 393
Revere: 392
Jamaica Plain: 359
Louisville: 359
Belmont: 

In [12]:
# find n business examples in city
def findBusinessExampleInCity(city: str, n: int):
    f = open(BUSINESS_FILE_URL, encoding="utf-8")
    businesses = ijson.items(f, "", multiple_values=True)
    businesses = (business for business in businesses if business["city"] == city)
    for business in businesses:
        print("name: " + business["name"] + " id: " + business["business_id"])
        print("latitude: " + str(business["latitude"]) + " longitude: " + str(business["longitude"]))
        n = n - 1
        if n < 0:
            break
findBusinessExampleInCity("Austin", 10)

name: Lane Wells Jewelry Repair id: N3_Gs3DnX4k9SgpwJxdEfw
latitude: 30.346169 longitude: -97.711458
name: Capital City Barber Shop id: tXvdYGvlEceDljN8gt2_3Q
latitude: 30.1727062 longitude: -97.7999199
name: DoubleTree by Hilton Hotel Austin id: nTIhpR7MhsALPwg_Hh14EA
latitude: 30.3263767015 longitude: -97.7045434713
name: PS Property Management Company id: 8XyEpVdAO0o6iVkVxkWosQ
latitude: 30.2464652 longitude: -97.7787377
name: McKinley Chiropractic id: NVfOn7TdnHbaGH97CVB_Qg
latitude: 30.244902 longitude: -97.857409
name: El Pollo Rey id: Xw8tuI30T-xihpzwBV-zJg
latitude: 30.243493 longitude: -97.730141
name: Austin Regional Clinic: ARC Quarry Lake id: s8eTWEtW5WcnL2kUxrOVmw
latitude: 30.4016572 longitude: -97.7429434
name: Eurasia Sushi Bar & Seafood id: NRPemqVb4qpWFF0Avq_6OQ
latitude: 30.2345329 longitude: -97.8772622
name: Austin Community College - Rio Grande Campus id: XymxusqpIsZFmN_QNQW_fw
latitude: 30.2763428964 longitude: -97.7479623166
name: Texas Global Passport Services 

In [20]:
# find local competitors by distance (distance: km)
def findLocalCompetitors(bid: str, latitude: float, longitude: float, distance: float = 20):
    f = open(BUSINESS_FILE_URL, encoding="utf-8")
    businesses = ijson.items(f, "", multiple_values=True)
    businesses = (business for business in businesses if business["latitude"] and business["longitude"] and business["is_open"] == 1)
    count = 0
    for business in businesses:
        position = (business["latitude"], business["longitude"])
        if geodesic((latitude, longitude), position).km <= distance:
            count = count + 1
            print("name: " + business["name"] + " id: " + business["business_id"])
            print("city: " + business["city"])
    print("number of local competitors around " + str(distance) + "km: " + str(count))
findLocalCompetitors("N3_Gs3DnX4k9SgpwJxdEfw", 30.346169, -97.711458, 1)

name: Lane Wells Jewelry Repair id: N3_Gs3DnX4k9SgpwJxdEfw
city: Austin
name: Matt Holm id: 9AQ1io2V6_AXM5Yo_H337A
city: Austin
name: Crates On Skates id: ZfKHogPGqQpzgNFSFjfICw
city: Austin
name: Texas Tees, Etc id: vctfNR9lnFFHovLtsQKupw
city: Austin
name: Legacy Tax Consulting id: rXmI1bUxBJi_mqYRHiLfNw
city: Austin
name: Arepas Grill id: PYH2RJzpOnwvqY1IYsIu8A
city: Austin
name: Kim Phung Restaurant id: HSpFgxTcf9lihq1R-pcytQ
city: Austin
name: Eudaimonia Recovery Homes id: OZX1uBuExwS-stqRPf8DDw
city: Austin
name: Lone Star Cab id: T-VFDbG1WXJhGKAB9GYouA
city: Austin
name: Holden Roofing id: 5atcNkqLyNGv5tBamuSYEA
city: Austin
name: America's Carpet Outlet id: SKRGFfy82Uju2itJoz0WRg
city: Austin
name: Long Play Lounge id: 2RQRTGN7EI0MSXPb_auuTw
city: Austin
name: Discount Electronics id: z0upGkm6F_fQYtpA1h-x1g
city: Austin
name: Austin Pro Siding, Windows & Roofing id: eDAolGfQ3lWkXHICKtKI3A
city: Austin
name: Boss Nails id: 63fqU8G2kmGvaFkGNN7w3Q
city: Austin
name: E&G Hair Braid

name: Distinctive Wood Crafts id: SF4S2zXtG4SFhTIAwACejw
city: Austin
name: Slavin-Nadal School Of Ballet id: S4nydI15bCa9ETuaEX10FA
city: Austin
name: AB Moving - Austin id: vusaGvJz66GNWgaa1lB1vw
city: Austin
name: Gethsemane Lutheran Children's Ministry id: Tyx7L0jSDkTrswe33p_eUw
city: Austin
name: North Lamar Transit Center id: E4ir5mPiFMb28VgYdYT6RQ
city: Austin
name: Josco Supply & Showroom id: kclxV5Keu_P1NUgqrUeNCg
city: Austin
name: Happy Clouds Smoke Shop id: 1LmF4hWtGhk_S5iiQHWiVQ
city: Austin
name: Coolmasters Air Conditioning & Heating id: feow-Gm2bmGHn2s5Txn0CA
city: Austin
name: Phoenix Vapor Shop id: IKc5_joVfuVL-WfYKlFjSg
city: Austin
name: Planet Fitness id: y1lHqO7K0bbgLzOhBczSxw
city: Austin
name: Fine Wood Repair id: iqtAz6VVKcRYbJigrURzkA
city: Austin
name: MSS Contracting id: jBVPKKHxbmEh_MfGiyEJtg
city: Austin
name: Carter's Transmission & Auto Service id: GQvc0X3KgIdmft-nFO-7jg
city: Austin
name: Adagio Spa id: Njl98S2qWtaWeRNO3HZ02Q
city: Austin
name: Austin T