# NLP on Yelp Open Dataset for Review Classification - Data Preparation

## Import Libraries

In [1]:
import pandas as pd
import numpy as np

## Data Preparation

Durante questa fase abbiamo caricato i file JSON dei business e delle review, selezionato solamente le colonne di interesse e salvato il risultato in un nuovo dataset in formato JSON per le fasi successive.

Durante tale processo abbiamo scartato i business chiusi (e le relative review) per alleggerire il carico di review presenti nel dataset di origine.

In [2]:
# Locations of the raw Yelp dumps (read with lines=True, i.e. one JSON record per line).
reviewPath = './data/yelp_academic_dataset_review.json'
businessPath = './data/yelp_academic_dataset_business.json'

# Number of review records loaded per chunk when streaming the review file.
chunkSize = 100_000

# Column dtypes handed to pd.read_json when loading the review file.
rtypes = dict(
    stars=np.float16,
    useful=np.int32,
    funny=np.int32,
    cool=np.int32,
    text=str,
)

In [3]:
%%time

# Load the whole business file at once; lines=True parses one JSON object per line.
bs = pd.read_json(path_or_buf=businessPath, lines=True)

CPU times: user 2.84 s, sys: 556 ms, total: 3.4 s
Wall time: 3.4 s


In [4]:
# Preview the first rows of the freshly loaded business frame.
bs.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,1,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'..."
1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ..."
2,bvN78flM8NLprQ1a1y5dRg,The Reclaimory,4720 Hawthorne Ave,Portland,OR,97214,45.511907,-122.613693,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Antiques, Fashion, Used, Vintage & Consignment...","{'Thursday': '11:0-18:0', 'Friday': '11:0-18:0..."
3,oaepsyvc0J17qwi8cfrOWg,Great Clips,2566 Enterprise Rd,Orange City,FL,32763,28.914482,-81.295979,3.0,8,1,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Beauty & Spas, Hair Salons",
4,PE9uqAjdw0E4-8mjGl3wVA,Crossfit Terminus,1046 Memorial Dr SE,Atlanta,GA,30316,33.747027,-84.353424,4.0,14,1,"{'GoodForKids': 'False', 'BusinessParking': '{...","Gyms, Active Life, Interval Training Gyms, Fit...","{'Monday': '16:0-19:0', 'Tuesday': '16:0-19:0'..."


In [5]:
# Clean the business dataframe: keep only businesses that are still open and
# only the columns needed for the join with the reviews.
if 'is_open' in bs.columns:
    # Guarded so the cell can be re-run safely: after the first run `is_open`
    # has been removed and filtering on it again would raise a KeyError.
    # Selecting the kept columns explicitly (instead of listing every column to
    # drop) also keeps the result stable if the raw file gains extra columns.
    bs = (
        bs.loc[bs['is_open'] == 1, ['business_id', 'stars', 'review_count']]
        .rename(columns={'stars': 'meanStars', 'review_count': 'reviewCount'})
    )
# Fail loudly if the frame does not have the expected cleaned schema.
assert list(bs.columns) == ['business_id', 'meanStars', 'reviewCount'], \
    f"unexpected business columns: {list(bs.columns)}"



In [6]:
# Preview the business frame after cleaning.
bs.head()

Unnamed: 0,business_id,meanStars,reviewCount
0,6iYb2HFDywm3zjuRg0shjw,4.0,86
1,tCbdrRPZA0oiIYSmHG3J0w,4.0,126
2,bvN78flM8NLprQ1a1y5dRg,4.5,13
3,oaepsyvc0J17qwi8cfrOWg,3.0,8
4,PE9uqAjdw0E4-8mjGl3wVA,4.0,14


In [7]:
%%time
# Stream the review file in chunks of `chunkSize` records so the full dump
# never has to be held in memory at once.
review = pd.read_json(reviewPath, lines=True,
                      dtype=rtypes,
                      chunksize=chunkSize)
chunkList = []
for chunkReview in review:
    # Keep only the columns used downstream and give the per-review rating a
    # name that cannot be confused with the business-level rating.
    chunkReview = chunkReview.drop(['review_id','useful','funny','cool', 'user_id'], axis=1)
    chunkReview = chunkReview.rename(columns={'stars': 'reviewStars'})
    # Inner join: reviews whose business_id is not present in `bs` are discarded.
    chunkMerged = pd.merge(bs, chunkReview, on='business_id', how='inner')
    # Report against the real chunk length: the final chunk is normally shorter
    # than `chunkSize`, so printing `chunkSize` would misstate the denominator.
    print(f"{chunkMerged.shape[0]} out of {len(chunkReview):,} related reviews")
    chunkList.append(chunkMerged)
# Stack the per-chunk results into one frame with a fresh 0..n-1 index.
df = pd.concat(chunkList, ignore_index=True, join='outer', axis=0)

77431 out of 100,000 related reviews
76794 out of 100,000 related reviews
77136 out of 100,000 related reviews
77267 out of 100,000 related reviews
78070 out of 100,000 related reviews
79371 out of 100,000 related reviews
91631 out of 100,000 related reviews
91609 out of 100,000 related reviews
86662 out of 100,000 related reviews
75255 out of 100,000 related reviews
75165 out of 100,000 related reviews
75227 out of 100,000 related reviews
74574 out of 100,000 related reviews
76393 out of 100,000 related reviews
86019 out of 100,000 related reviews
89643 out of 100,000 related reviews
93400 out of 100,000 related reviews
76016 out of 100,000 related reviews
75950 out of 100,000 related reviews
76230 out of 100,000 related reviews
76265 out of 100,000 related reviews
76432 out of 100,000 related reviews
77578 out of 100,000 related reviews
89857 out of 100,000 related reviews
90317 out of 100,000 related reviews
90179 out of 100,000 related reviews
75351 out of 100,000 related reviews
7

In [8]:
# Preview the merged business/review frame.
df.head()

Unnamed: 0,business_id,meanStars,reviewCount,reviewStars,text,date
0,6iYb2HFDywm3zjuRg0shjw,4.0,86,5.0,Stopped in on a busy Friday night. Despite the...,2018-03-04 00:59:21
1,6iYb2HFDywm3zjuRg0shjw,4.0,86,2.0,Went there about 1 PM on a Monday. It wasn't ...,2018-08-14 05:22:00
2,6iYb2HFDywm3zjuRg0shjw,4.0,86,5.0,This was the place the be on Friday Night! If ...,2018-03-17 14:22:48
3,6iYb2HFDywm3zjuRg0shjw,4.0,86,4.0,Went to this place with my family over the wee...,2018-04-04 21:16:50
4,6iYb2HFDywm3zjuRg0shjw,4.0,86,4.0,"Stopped on a midweek afternoon, and so glad th...",2018-04-28 19:17:04


In [9]:
# Align the join key with the camelCase naming used by the other columns.
df = df.rename({'business_id': 'businessId'}, axis='columns')

In [10]:
# Preview again to confirm the column rename.
df.head()

Unnamed: 0,businessId,meanStars,reviewCount,reviewStars,text,date
0,6iYb2HFDywm3zjuRg0shjw,4.0,86,5.0,Stopped in on a busy Friday night. Despite the...,2018-03-04 00:59:21
1,6iYb2HFDywm3zjuRg0shjw,4.0,86,2.0,Went there about 1 PM on a Monday. It wasn't ...,2018-08-14 05:22:00
2,6iYb2HFDywm3zjuRg0shjw,4.0,86,5.0,This was the place the be on Friday Night! If ...,2018-03-17 14:22:48
3,6iYb2HFDywm3zjuRg0shjw,4.0,86,4.0,Went to this place with my family over the wee...,2018-04-04 21:16:50
4,6iYb2HFDywm3zjuRg0shjw,4.0,86,4.0,"Stopped on a midweek afternoon, and so glad th...",2018-04-28 19:17:04


In [12]:
# Persist the merged frame as JSON so later stages can reload it.
df.to_json(path_or_buf='./data/yelp_academic_base_dataset.json', index=True)