<a href="https://colab.research.google.com/github/jahid-jabed/Grad_Thesis/blob/main/Yelp_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Mount Google Drive at /content/drive; the dataset paths used in later cells
# all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install simplejson, which the import cell below binds to the name `json`.
!pip install simplejson

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simplejson
  Downloading simplejson-3.17.6-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (130 kB)
[K     |████████████████████████████████| 130 kB 26.4 MB/s 
[?25hInstalling collected packages: simplejson
Successfully installed simplejson-3.17.6


In [3]:
import argparse
import collections
import csv
import simplejson as json

In [4]:
def read_and_write_file(json_file_path, csv_file_path, column_names):
    """Read in the json dataset file and write it out to a csv file, given the column names.

    The input is read one line at a time and each non-blank line is parsed as
    a standalone JSON document. The output starts with a header row made of
    `column_names`, followed by one row per input record (built by `get_row`).

    Args:
        json_file_path: Path of the line-delimited JSON file to read.
        csv_file_path: Path of the CSV file to create (overwritten if present).
        column_names: Iterable of flattened (dot-joined) column names.
    """
    # Freeze the column order once so the header and every data row agree,
    # even if the caller passed a set or a one-shot iterator.
    columns = list(column_names)
    # csv.writer needs a text-mode file on Python 3 (binary mode raises
    # TypeError); newline='' leaves line-ending handling to the csv module.
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as fout:
        csv_file = csv.writer(fout)
        csv_file.writerow(columns)
        with open(json_file_path, encoding='utf-8') as fin:
            for line in fin:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                line_contents = json.loads(line)
                csv_file.writerow(get_row(line_contents, columns))

In [5]:
def get_superset_of_column_names_from_file(json_file_path):
    """Scan the json dataset file and return the union of all flattened column names.

    Each line of the file is parsed as its own JSON document and flattened
    with `get_column_names`; the keys of every record are merged into one set.
    """
    seen_columns = set()
    with open(json_file_path) as source:
        for raw_record in source:
            flattened = get_column_names(json.loads(raw_record))
            seen_columns |= set(flattened.keys())
    return seen_columns

In [6]:
def get_column_names(line_contents, parent_key=''):
    """Return a dict mapping flattened key names to their leaf values.

    Nested mappings are walked recursively and their keys are joined to the
    parent key with a dot. A nested mapping that is empty contributes no
    entries.

    Example:
        line_contents = {
            'a': {
                'b': 2,
                'c': 3,
                },
        }
        will return: {'a.b': 2, 'a.c': 3}
    The keys will be the column names for the eventual csv file.

    Args:
        line_contents: Mapping to flatten.
        parent_key: Prefix prepended (with a dot) to every key; empty at the
            top level.
    """
    # Imported locally so this block is self-contained: the ABCs live in
    # collections.abc on Python 3 (the collections.MutableMapping alias was
    # removed in 3.10).
    from collections.abc import MutableMapping

    column_names = {}
    # dict.iteritems() is Python 2 only; items() works on Python 3.
    for k, v in line_contents.items():
        column_name = "{0}.{1}".format(parent_key, k) if parent_key else k
        if isinstance(v, MutableMapping):
            column_names.update(get_column_names(v, column_name))
        else:
            column_names[column_name] = v
    return column_names

In [7]:
def get_nested_value(d, key):
    """Return a dictionary item given a dictionary `d` and a flattened key from `get_column_names`.

    Example:
        d = {
            'a': {
                'b': 2,
                'c': 3,
                },
        }
        key = 'a.b'
        will return: 2

    Args:
        d: Mapping to look the value up in.
        key: Flattened key whose dot-separated parts name successive levels.

    Returns:
        The value stored at `key`, or None when any part of the path is
        missing or an intermediate value is not a mapping (for example
        `{'a': None}` looked up with 'a.b').
    """
    from collections.abc import Mapping

    # A non-mapping cannot contain the rest of the path; treat it as missing
    # rather than failing on the membership test or the subscript below.
    if not isinstance(d, Mapping):
        return None
    if '.' not in key:
        return d.get(key)
    # Peel off only the first path component; the remainder is resolved recursively.
    base_key, sub_key = key.split('.', 1)
    if base_key not in d:
        return None
    return get_nested_value(d[base_key], sub_key)

In [8]:
def get_row(line_contents, column_names):
    """Return a csv compatible row given column names and a dict.

    Args:
        line_contents: Parsed record (possibly nested) to pull values from.
        column_names: Iterable of flattened column names; the row follows
            its iteration order.

    Returns:
        A list of strings, one per column. Values that are missing or None
        become the empty string; everything else is rendered with
        `'{0}'.format(value)`.
    """
    row = []
    for column_name in column_names:
        line_value = get_nested_value(line_contents, column_name)
        if line_value is None:
            row.append('')
        else:
            # Python 3 strings are already text, so no `unicode` check or
            # manual UTF-8 encoding is needed (the old check raised NameError,
            # and formatting encoded bytes would yield "b'...'").
            row.append('{0}'.format(line_value))
    return row

In [None]:
# Extract yelp_dataset.tar from Drive into the Yelp_Dataset folder
# (-x extract, -v list each file, -f archive path, -C destination directory).
!tar -xvf '/content/drive/My Drive/Thesis Works/Datasets/yelp_dataset.tar' -C '/content/drive/My Drive/Thesis Works/Datasets/Yelp_Dataset/'     #[run this cell to extract tar files]

Dataset_User_Agreement.pdf
yelp_academic_dataset_business.json
yelp_academic_dataset_checkin.json
yelp_academic_dataset_review.json
yelp_academic_dataset_tip.json
yelp_academic_dataset_user.json


In [None]:
# Extract yelp_photos.tar from Drive into the same Yelp_Dataset folder
# (-v prints every extracted file name, so the cell output is long).
!tar -xvf '/content/drive/My Drive/Thesis Works/Datasets/yelp_photos.tar' -C '/content/drive/My Drive/Thesis Works/Datasets/Yelp_Dataset/'     #[run this cell to extract tar files]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
photos/gkU1O_ALJ2Zw3hFSyCBLzA.jpg
photos/ryZtCUxFiMvPC6AyuDgcSQ.jpg
photos/U5_2a9xq8YiYpv2fcaoZcQ.jpg
photos/hMl6ACYBPpttkgrR_Wn3zw.jpg
photos/K0IHqxaqTAI1e5lpq5r1gA.jpg
photos/-m8tLTaK5A_k5vqxWt3IVg.jpg
photos/qV-72UQUaCwWbwk1HlBGCw.jpg
photos/J5WH2GBo9xPrKTKYbI0HrQ.jpg
photos/MHe4MAU0IHlC5yKfiLa7HQ.jpg
photos/7A84mJqezh1S9S5L1dLA_A.jpg
photos/NoUQfbSpnC-bP9r_SPxvJg.jpg
photos/P9727tOjND3I9VCwlZ_SRA.jpg
photos/zRSpUeTa8NiCbjtqA17LyQ.jpg
photos/BGOZMHHGwKWiDVFvgBLoXA.jpg
photos/R866XVEsHQUFWIEuR8MW1g.jpg
photos/4PG3mcn7VO5zllDY9__F7g.jpg
photos/uElA3cBTni2aSxV_nDZ51A.jpg
photos/je0PGaGAsa2fx4YmhFXY8g.jpg
photos/J4UtoNiZvGs4DuEggQ3iaA.jpg
photos/2AHCoWgyEsvXataD4w-2eQ.jpg
photos/tSebhacaDKXudgbzEr2-0g.jpg
photos/QQ9vK155vQR_pJdZpqrGlg.jpg
photos/YYkpuzbuYHnuTkGjpxFXoQ.jpg
photos/rA5sXazQ1J5ub4llelIIPA.jpg
photos/aiSnIbncuhucv9sicJN5Xw.jpg
photos/S8ICAZcneLnDc8Cp5f-jGA.jpg
photos/SM71XyureMdmgrsuzEr6sg.jpg
photos/-uGxVqv-Dh

In [17]:
# def json_to_csv(json_file):
#     """Convert a yelp dataset file from json to csv."""

#     parser = argparse.ArgumentParser(
#             description='Convert Yelp Dataset Challenge data from JSON format to CSV.',
#             )

#     parser.add_argument(
#             json_file,
#             type=str,
#             help='The json file to convert.',
#             )

#     args = parser.parse_args()

#     json_file = args.json_file
#     csv_file = '{0}.csv'.format(json_file.split('.json')[0])

#     column_names = get_superset_of_column_names_from_file(json_file)
#     read_and_write_file(json_file, csv_file, column_names)
#     return csv_file

In [4]:
# load json
import numpy as np
import pandas as pd

# Location of the line-delimited review file on the mounted Drive.
path_to_json_file = '/content/drive/My Drive/Thesis Works/Datasets/Yelp_Dataset/yelp_academic_dataset_review.json'

# Explicit dtypes handed to read_json for the stars/useful/funny/cool fields.
r_dtype = dict(stars=np.float16, useful=np.int32, funny=np.int32, cool=np.int32)

# Collects the filtered chunks; rebound to the concatenated frame at the end.
dfReviews = []

with open(path_to_json_file, 'r') as json_file:
    # With chunksize set, read_json yields successive frames of up to 1000
    # lines instead of loading the whole file at once.
    reader = pd.read_json(json_file, orient="records", lines=True, dtype=r_dtype, chunksize=1000)
    for chunk in reader:
        # Discard the id columns, then keep only rows dated 2017-12-01 or later.
        without_ids = chunk.drop(columns=['review_id', 'user_id'])
        reduced_chunk = without_ids.query("`date` >= '2017-12-01'")
        dfReviews.append(reduced_chunk)

dfReviews = pd.concat(dfReviews, ignore_index=True)

In [5]:
# Display the concatenated reviews frame built in the previous cell.
dfReviews

Unnamed: 0,business_id,stars,useful,funny,cool,text,date
0,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,cPepkJeRMtHapc_b2Oe_dw,4.0,1,0,1,I was really between 3 and 4 stars for this on...,2018-07-17 03:30:07
2,kq5Ghhh14r-eCxlVmlyd8w,5.0,0,0,0,My boyfriend and I tried this deli for the fir...,2018-08-23 21:39:38
3,Zx7n8mdt8OzLRXVzolXNhQ,5.0,0,0,0,Amazing biscuits and (fill in the blank). Grea...,2018-04-27 23:03:21
4,I6L0Zxi5Ww0zEWSAVgngeQ,4.0,0,0,0,The cafe was extremely cute. We came at 8am an...,2018-07-07 20:50:12
...,...,...,...,...,...,...,...
3080574,6WaI-IN8ql0xpEKlb4q8tg,5.0,1,0,0,We redesigned my moms dress and mad it complet...,2022-01-17 20:59:01
3080575,2vLksaMmSEcGbjI5gywpZA,5.0,2,1,2,"This spot offers a great, affordable east week...",2021-03-31 16:55:10
3080576,R1khUUxidqfaJmcpmGd4aw,4.0,1,0,0,This Home Depot won me over when I needed to g...,2019-12-30 03:56:30
3080577,Rr9kKArrMhSLVE9a53q-aA,5.0,1,0,0,For when I'm feeling like ignoring my calorie-...,2022-01-19 18:59:27


In [6]:
# Concatenate every review text of a business into one string, using '<|>'
# as the separator between reviews. Only the `text` column is selected:
# joining the numeric/datetime columns as strings fails, which older pandas
# handled by dropping those columns with a warning and newer pandas reports
# as an error. The result is a one-column frame indexed by business_id.
dfBusinessReviews = dfReviews.groupby('business_id')[['text']].agg('<|>'.join)

  results[key] = self.aggregate(func)
  results[key] = self.aggregate(func)


In [8]:
# Preview the first five rows of the per-business joined review text.
dfBusinessReviews.head(5)

Unnamed: 0_level_0,text
business_id,Unnamed: 1_level_1
---kPU91CF4Lq2-WlRu9Lw,Ate here for the 1st time on Saturday 08/07/20...
--0iUa4sNDFiZFrAdIWhZQ,"This place makes the best, most authentic pupu..."
--30_8IhuyMHbSOcNWd6DQ,Action Karate Jamison is a fantastic school. ...
--7PUidqRWpRSpXebiyxTg,Lunchtime while visiting a family member at th...
--7jw19RH9JKXgFohspgQw,"Would not recommend. Apparently eating pizza, ..."
