In [1]:
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd 
import plotly.express as px
from pandasql import sqldf

In [2]:
yelp_academic_dataset_business_json_path = '/home/016037047/yelp_data/yelp_academic_dataset_business.json'
yelp_business_dataset_json = pd.read_json(yelp_academic_dataset_business_json_path, lines=True)
#printing the overview of the business dataset
print(yelp_business_dataset_json.shape)
print('No of records in business dataset',yelp_business_dataset_json.shape[0])
print('No of features in business dataset',yelp_business_dataset_json.shape[1])
yelp_business_dataset_json.head()

(150346, 14)
No of records in business dataset 150346
No of features in business dataset 14


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [3]:
yelp_business_dataset_json.categories.value_counts()[:10]

Beauty & Spas, Nail Salons    1012
Restaurants, Pizza             935
Nail Salons, Beauty & Spas     934
Pizza, Restaurants             823
Restaurants, Mexican           728
Restaurants, Chinese           708
Mexican, Restaurants           672
Chinese, Restaurants           651
Food, Coffee & Tea             508
Beauty & Spas, Hair Salons     493
Name: categories, dtype: int64

In [4]:
df_categories = yelp_business_dataset_json.assign(categories = yelp_business_dataset_json.categories.str.split(', ')).explode('categories')
df_categories.sample(3)


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
149339,qvQ_IfX6guDk4Y4fsK3HDA,Braeutigam Orchards,2795 Turkey Hill Ln,Belleville,IL,62221,38.480958,-89.916106,4.5,26,1,"{'RestaurantsPriceRange2': '1', 'BikeParking':...",Food,"{'Monday': '8:0-18:0', 'Tuesday': '8:0-18:0', ..."
36665,UHSNKp8AqaYvY0jVuxCBSw,Certegy Check Services,11601 Roosevelt Blvd N,St. Petersburg,FL,33716,27.877463,-82.653546,1.0,6,1,,Financial Services,
31098,KAZodlE7Dyad2plZ_cVkkQ,Shu's Idaho Running Company,1758 W State St,Boise,ID,83702,43.624538,-116.21208,5.0,36,1,"{'RestaurantsPriceRange2': '2', 'BusinessAccep...",Shopping,"{'Monday': '10:0-18:0', 'Tuesday': '10:0-18:0'..."


In [5]:
df_categories.categories.value_counts()[:20]

Restaurants                  52268
Food                         27781
Shopping                     24395
Home Services                14356
Beauty & Spas                14292
Nightlife                    12281
Health & Medical             11890
Local Services               11198
Bars                         11065
Automotive                   10773
Event Planning & Services     9895
Sandwiches                    8366
American (Traditional)        8139
Active Life                   7687
Pizza                         7093
Coffee & Tea                  6703
Fast Food                     6472
Breakfast & Brunch            6239
American (New)                6097
Hotels & Travel               5857
Name: categories, dtype: int64

In [6]:
df_categories=df_categories.dropna()
df_categories.isna().sum()

business_id     0
name            0
address         0
city            0
state           0
postal_code     0
latitude        0
longitude       0
stars           0
review_count    0
is_open         0
attributes      0
categories      0
hours           0
dtype: int64

In [7]:
df_restaurant=df_categories[df_categories.categories == 'Restaurants'].head(5000)
df_shopping=df_categories[df_categories.categories == 'Shopping'].head(5000)
df_activelife=df_categories[df_categories.categories == 'Active Life'].head(5000)
df_beauty=df_categories[df_categories.categories == 'Beauty & Spas'].head(5000)
df_auto=df_categories[df_categories.categories == 'Automotive'].head(5000)

In [8]:
multi_frame=[df_restaurant,df_shopping,df_activelife,df_beauty,df_auto]

In [9]:
df_multicat=pd.concat(multi_frame)

In [10]:
df_multicat.shape

(25000, 14)

In [11]:
yelp_academic_review_json_path = '/home/016037047/yelp_data/yelp_academic_dataset_review.json'

In [12]:
size = 600000
review = pd.read_json(yelp_academic_review_json_path, lines=True,
                      dtype={'review_id':str,'user_id':str,
                             'business_id':str,'stars':int,
                             'date':str,'text':str,'useful':int,
                             'funny':int,'cool':int},
                      chunksize=size)
chunk_list = []

for chunk_review in review:
    chunk_review = chunk_review.drop(['review_id','useful','funny','cool'], axis=1)
    chunk_review = chunk_review.rename(columns={'stars': 'review_stars'})
    chunk_merged = pd.merge(df_multicat, chunk_review, on='business_id', how='inner')
    print(f"{chunk_merged.shape[0]} out of {size:,} related reviews")
    chunk_list.append(chunk_merged)
multicat_review = pd.concat(chunk_list, ignore_index=True, join='outer', axis=0)

495615 out of 600,000 related reviews
207874 out of 600,000 related reviews
102789 out of 600,000 related reviews
74073 out of 600,000 related reviews
60836 out of 600,000 related reviews
38941 out of 600,000 related reviews
34551 out of 600,000 related reviews
16656 out of 600,000 related reviews
16125 out of 600,000 related reviews
12460 out of 600,000 related reviews
4798 out of 600,000 related reviews
0 out of 600,000 related reviews


In [15]:
df=multicat_review[["text","date","categories","review_stars"]]

In [16]:
df

Unnamed: 0,text,date,categories,review_stars
0,This is nice little Chinese bakery in the hear...,2014-05-26 01:09:53,Restaurants,4
1,This is the bakery I usually go to in Chinatow...,2013-10-05 15:19:06,Restaurants,4
2,"A delightful find in Chinatown! Very clean, an...",2013-10-25 01:34:57,Restaurants,5
3,I ordered a graduation cake for my niece and i...,2018-05-20 17:58:57,Restaurants,5
4,HK-STYLE MILK TEA: FOUR STARS\n\nNot quite su...,2013-10-25 02:31:35,Restaurants,4
...,...,...,...,...
1064713,I LOVE working out at at Linda's studio. I've ...,2019-07-02 23:23:46,Active Life,5
1064714,I've been coming to Pilates by Linda for over ...,2019-05-11 01:57:09,Active Life,5
1064715,I recently traveled to the area to visit my mo...,2012-02-04 21:44:48,Active Life,5
1064716,I just started pilates about 6 weeks ago and I...,2012-02-15 14:12:06,Active Life,5


In [18]:
df

Unnamed: 0,text,date,categories,review_stars
0,This is nice little Chinese bakery in the hear...,2014-05-26 01:09:53,Restaurants,4
1,This is the bakery I usually go to in Chinatow...,2013-10-05 15:19:06,Restaurants,4
2,"A delightful find in Chinatown! Very clean, an...",2013-10-25 01:34:57,Restaurants,5
3,I ordered a graduation cake for my niece and i...,2018-05-20 17:58:57,Restaurants,5
4,HK-STYLE MILK TEA: FOUR STARS\n\nNot quite su...,2013-10-25 02:31:35,Restaurants,4
...,...,...,...,...
1064713,I LOVE working out at at Linda's studio. I've ...,2019-07-02 23:23:46,Active Life,5
1064714,I've been coming to Pilates by Linda for over ...,2019-05-11 01:57:09,Active Life,5
1064715,I recently traveled to the area to visit my mo...,2012-02-04 21:44:48,Active Life,5
1064716,I just started pilates about 6 weeks ago and I...,2012-02-15 14:12:06,Active Life,5


In [45]:
df_sample=df.sample(frac=1).reset_index(drop=True)
df_sample.shape
df_sample.categories.value_counts()

Restaurants      518786
Active Life      153596
Beauty & Spas    145245
Automotive       129866
Shopping         117225
Name: categories, dtype: int64

In [68]:
df_test=df_sample[0:564718]
df_test.categories.value_counts()

Restaurants      274938
Active Life       81607
Beauty & Spas     77062
Automotive        68805
Shopping          62306
Name: categories, dtype: int64

### Had to use some other conditions to balance the dataset, as  dataframe operations were taking a lot of time 

In [110]:
to_remove = np.random.choice(df_test[df_test['categories']=="Restaurants"].index,size=198000,replace=False)
df_testv1=df_test.drop(to_remove)

In [111]:
df_testv1.categories.value_counts()

Active Life      81607
Beauty & Spas    77062
Restaurants      76938
Automotive       68805
Shopping         62306
Name: categories, dtype: int64

In [112]:
to_remove = np.random.choice(df_testv1[df_testv1['categories']=="Restaurants"].index,size=8000,replace=False)
df_testv2=df_testv1.drop(to_remove)

In [108]:
df_testv1.shape

(366718, 4)

### Basically ran the below code for almost all the values to bring them close to each other

In [101]:
categories=df_testv2.categories.unique()

In [102]:
categories

array(['Beauty & Spas', 'Restaurants', 'Automotive', 'Active Life',
       'Shopping'], dtype=object)

In [118]:
to_remove = np.random.choice(df_testv2[df_testv2['categories']=="Active Life"].index,size=500,replace=False)
df_testv2=df_testv2.drop(to_remove)

In [119]:
df_testv2.categories.value_counts()

Restaurants      68938
Automotive       68805
Active Life      68607
Beauty & Spas    68562
Shopping         62306
Name: categories, dtype: int64

In [120]:
categories=['Beauty & Spas', 'Automotive', 'Active Life','Restaurants']

In [121]:
# creating a for loop to remove the data as all of the categories listed have almost similar values
for i in categories:
    to_remove = np.random.choice(df_testv2[df_testv2['categories']==i].index,size=6000,replace=False)
    df_testv2=df_testv2.drop(to_remove)

In [122]:
df_testv2.categories.value_counts()

Restaurants      62938
Automotive       62805
Active Life      62607
Beauty & Spas    62562
Shopping         62306
Name: categories, dtype: int64

In [123]:
df_testv2.to_csv("tagging_dataset.csv")

In [124]:
df_testv2.shape

(313218, 4)