In [11]:
%reload_ext dotenv
%dotenv

In [12]:
import os
import time
import json
import random
import warnings
import string
from pathlib import Path
from itertools import takewhile, repeat
from math import ceil

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sqlalchemy import create_engine, types

warnings.filterwarnings('ignore')

# Operational Data Generation

The [Yelp Dataset](https://www.yelp.com/dataset) is intended to be used for research. Because of that, some of the information present in its entities is designed for analytics, such as aggregated data (e.g. number of stars) or other transformations.
The idea of this notebook is to take this data and transform it into something as close as possible to a real-world operational data scenario (with the minimum required number of transformations). This data will be further used in other data mesh steps. Some important assumptions/takeaways of this scenario are described in each domain section below. For more details about the dataset see [Yelp Dataset Documentation](https://www.yelp.com/dataset/documentation/main).

**Running this script:** Create a `.env` file with the environment variables described in the secrets section. They will be loaded automatically when running this notebook.

In [13]:
# Database credentials and location, read from the environment
# (populated from the .env file by the %dotenv magic).
POSTGRESQL_USER = os.getenv("POSTGRESQL_USER")
POSTGRESQL_PASSWORD = os.getenv("POSTGRESQL_PASSWORD")
POSTGRESQL_HOST = os.getenv("POSTGRESQL_HOST")
POSTGRESQL_PORT = os.getenv("POSTGRESQL_PORT")

# Folder holding the raw Yelp JSON-lines files.
DATA_FOLDER = Path('../data')
# Number of lines to be processed at a time
CHUNK_SIZE = 1000

# One JSON-lines file per Yelp entity.
BUSINESS_DATASET_PATH = DATA_FOLDER / 'yelp_academic_dataset_business.json'
USERS_DATASET_PATH = DATA_FOLDER / 'yelp_academic_dataset_user.json'
CHECKIN_DATASET_PATH = DATA_FOLDER / 'yelp_academic_dataset_checkin.json'
REVIEWS_DATASET_PATH = DATA_FOLDER / 'yelp_academic_dataset_review.json'
TIP_DATASET_PATH = DATA_FOLDER / 'yelp_academic_dataset_tip.json'

# One target database per domain.
BUSINESS_DATABASE_NAME = "domain_business"
USERS_DATABASE_NAME = "domain_user"
CHECKIN_DATABASE_NAME = "domain_checkin"
EVALUATIONS_DATABASE_NAME = "domain_evaluations"

# Set to maintain reproducibility on check-in domain
random.seed(42)

**Support Functions**

In [14]:
def rawincount(path: Path) -> int:
    """Count the number of lines of a file without loading all into memory

    The file is read in binary mode in 1 MiB blocks and newline bytes are
    counted. A non-empty file whose last line is not terminated by a
    newline still has that last line counted.

    Args:
        path: Path of the file to count the lines.
    Return:
        int: number of lines on the file.
    Ref:
        https://stackoverflow.com/questions/845058/how-to-get-line-count-of-a-large-file-cheaply-in-python
    """
    block_size = 1024 * 1024
    n_lines = 0
    last_byte = b""
    # The context manager guarantees the handle is closed, even on error.
    with open(path, 'rb') as f:
        while True:
            # Read from the raw (unbuffered) stream to avoid double buffering.
            buf = f.raw.read(block_size)
            if not buf:
                break
            n_lines += buf.count(b'\n')
            last_byte = buf[-1:]
    # Account for a final line that has no trailing newline.
    if last_byte and last_byte != b'\n':
        n_lines += 1
    return n_lines

def random_id_generator(length: int = 22) -> str:
    """Build a random alphanumeric identifier.

    Characters are drawn with replacement from ASCII letters and digits
    using the module-level `random` generator.

    Args:
        length: Number of characters in the identifier.
    Return:
        str: The generated identifier.
    """
    alphabet = string.ascii_letters + string.digits
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)
    

## Business Domain

The business domain contains information about each business in Yelp, such as address, location, categories and other variables. To populate this domain we only need to drop the columns that we assume do not exist as operational information, since they are aggregations from other domains (i.e. `stars` and `review_count` from the **evaluations** domain).

In [None]:
# Peek at the first raw record so the reader can see the input schema.
with open(BUSINESS_DATASET_PATH, mode="r", encoding="utf-8") as f:
    print(f"Example: {f.readline()}")

business_engine = create_engine(
    f"postgresql://{POSTGRESQL_USER}:{POSTGRESQL_PASSWORD}@{POSTGRESQL_HOST}:{POSTGRESQL_PORT}/{BUSINESS_DATABASE_NAME}"
)

# Line count is only used to give tqdm a total number of chunks.
n_lines = rawincount(BUSINESS_DATASET_PATH)
with business_engine.connect() as connection:
    for chunk in tqdm(
        pd.read_json(BUSINESS_DATASET_PATH, chunksize=CHUNK_SIZE, lines=True),
        total=ceil(n_lines / CHUNK_SIZE),
    ):
        chunk = chunk.rename(columns={"business_id": "id"})
        # `stars` and `review_count` are aggregations owned by the
        # evaluations domain, so they are not operational business data.
        chunk = chunk.drop(columns=["stars", "review_count"])
        chunk.to_sql(
            "business",
            con=connection,
            if_exists="append",
            index=False,
            method="multi",
            dtype={
                # With a SQLAlchemy connection pandas only accepts SQLAlchemy
                # types here; the raw string "TEXT PRIMARY KEY" raised
                # ValueError. A primary-key constraint cannot be expressed
                # through `dtype` and has to be created on the table separately.
                "id": types.TEXT,
                "attributes": types.JSON,
                "hours": types.JSON,
                "is_open": types.BOOLEAN,
            },
        )

## Users Domain

The users domain contains basic information about the users, such as their names, their friends, when they joined Yelp, and more. Populating this domain requires populating 3 tables: `users`, `elite_members` and `friends`.

In [7]:
# Engine bound to the users-domain database.
users_engine = create_engine(
    f"postgresql://{POSTGRESQL_USER}:{POSTGRESQL_PASSWORD}@{POSTGRESQL_HOST}:{POSTGRESQL_PORT}/{USERS_DATABASE_NAME}"
)

# Show the first raw user record as an example of the input schema.
with open(USERS_DATASET_PATH, mode="r") as f:
    example = json.loads(f.readline())
example.pop("friends")  # Removed only for a more readable output
print(f"Example: {example}")

### Tables: users and elite_members 

First we are going to populate 2 tables: `users`, which contains basic information about a user, and `elite_members`, which contains all the elite members by year. Here we follow the same approach used for the business dataset by removing data that belongs to other domains (e.g. `review_count`, `funny` and many others).

In [None]:
def generate_elite_member_table(df: pd.DataFrame) -> pd.DataFrame:
    """Transform the `elite` variable into a table

    This function gets the `elite` string variable (years as elite
    with a ',' separator) and transforms it into a DataFrame with
    'user_id' and 'year' as columns of the data.

    Args:
        df: Dataframe that contains the elite members variable. It must
            have an 'id' column (the user id) and an 'elite' column.

    Returns:
        pd.Dataframe: Dataframe with 'user_id' and 'year' as columns,
            one row per distinct (user, year) pair. The two columns are
            present even when no user in `df` was ever an elite member.
    """
    columns = ["user_id", "year"]
    records = []
    # zip over the two columns avoids the per-row Series that iterrows builds.
    for user_id, elite in zip(df["id"], df["elite"]):
        if not isinstance(elite, str):
            if pd.isna(elite):
                continue  # Missing value: user was never an elite member
            elite = str(elite)
        for token in elite.split(","):
            token = token.strip()
            if not token:
                continue  # Empty string: user was never an elite member
            # Just a small correction where all 2020 year appears as 20,20
            year = 2020 if token == "20" else int(token)
            records.append({"user_id": user_id, "year": year})
    # drop_duplicates collapses the two rows produced by "20,20".
    return pd.DataFrame(records, columns=columns).drop_duplicates()


# Line count is only used to give tqdm a total number of chunks.
n_lines = rawincount(USERS_DATASET_PATH)
with users_engine.connect() as connection:
    for chunk in tqdm(
        pd.read_json(USERS_DATASET_PATH, chunksize=CHUNK_SIZE, lines=True),
        total=ceil(n_lines / CHUNK_SIZE),
    ):
        chunk = chunk.rename(columns={"user_id": "id"})

        # Users table: keep only the operational columns; counters and
        # compliments are aggregations that belong to other domains.
        users_chunk = chunk[["id", "name", "yelping_since"]]
        users_chunk.to_sql(
            "users",
            con=connection,
            if_exists="append",
            index=False,
            method="multi",
        )

        # Elite Table
        elite_chunk = generate_elite_member_table(chunk)
        if elite_chunk.empty:
            # No elite member in this chunk: there is nothing to insert, and
            # writing a frame without rows could create a malformed table.
            continue
        elite_chunk.to_sql(
            "elite_members",
            con=connection,
            if_exists="append",
            index=False,
            method="multi",
        )


### Tables: friends

The `friends` table represents the connections between users as a bi-directional graph, using the user ids as foreign keys (and nodes). Some users in the friends lists do not exist in the `users` database. To solve this problem, only the friends that exist as users are inserted.

In [12]:
# Collect every known user id so friends that are not in the dataset
# can be filtered out before inserting.
with open(USERS_DATASET_PATH, mode="r", encoding="utf-8") as f:
    users_list = {json.loads(line)["user_id"] for line in f}

# Line count is only used to give tqdm a total number of chunks.
n_lines = rawincount(USERS_DATASET_PATH)
with users_engine.connect() as connection:
    for chunk in tqdm(
        pd.read_json(USERS_DATASET_PATH, chunksize=CHUNK_SIZE, lines=True),
        total=ceil(n_lines / CHUNK_SIZE),
        desc="Chunks",
    ):
        # Work on a copy so the column assignments below do not target a
        # view of the original chunk.
        chunk = chunk[["user_id", "friends"]].copy()
        chunk["friends"] = chunk["friends"].str.split(", ")
        # Keep only friends that exist as users; sorting makes the order of
        # the exploded rows deterministic (exploding a set is not).
        chunk["friends"] = chunk["friends"].transform(
            lambda x: sorted(set(x).intersection(users_list))
        )
        # One row per (user, friend) edge; users left with no valid friend
        # produce a NaN row that is removed by dropna.
        chunk = chunk.explode("friends")
        chunk = chunk.rename(
            columns={"user_id": "previous_user", "friends": "next_user"}
        )
        chunk = chunk.dropna()
        chunk.to_sql(
            "friends",
            con=connection,
            if_exists="append",
            index=False,
            method="multi",
        )


{'user_id': 'qVc8ODYU5SZjKXVBgXdI7w', 'name': 'Walker', 'review_count': 585, 'yelping_since': '2007-01-25 16:47:26', 'useful': 7217, 'funny': 1259, 'cool': 5994, 'elite': '2007', 'fans': 267, 'average_stars': 3.91, 'compliment_hot': 250, 'compliment_more': 65, 'compliment_profile': 55, 'compliment_cute': 56, 'compliment_list': 18, 'compliment_note': 232, 'compliment_plain': 844, 'compliment_cool': 467, 'compliment_funny': 467, 'compliment_writer': 239, 'compliment_photos': 180}


## Checkin Domain

This domain contains information about each check-in made by a user at a business. To populate this database some additional information is required: as can be seen in the example below, the dataset made available by Yelp does not contain the user that made the check-in at the business. This is expected, since this information is sensitive and could be used to identify a user. Since we are simulating a real-world dataset, this information is important, so our approach is to randomly select (with repetition) an existing `user_id` for each business check-in. This is not an optimal way to simulate it, but it should be sufficient to create products and dashboards based on this information.

<div class="alert alert-block alert-info"> <b>Note:</b> To maintain reproducibility, the random seed is fixed at the beginning of this notebook.</div>

In [None]:
checkin_engine = create_engine(
    f"postgresql://{POSTGRESQL_USER}:{POSTGRESQL_PASSWORD}@{POSTGRESQL_HOST}:{POSTGRESQL_PORT}/{CHECKIN_DATABASE_NAME}"
)

# Peek at the first raw record so the reader can see the input schema.
with open(CHECKIN_DATASET_PATH, mode="r", encoding="utf-8") as f:
    print(f"Example: {f.readline()}")

with open(USERS_DATASET_PATH, mode="r", encoding="utf-8") as f:
    users_list = set([json.loads(line)["user_id"] for line in f])

# Sampling needs an ordered sequence, not a set. Sorting gives a stable
# order so that, together with the fixed `random` seed, the assigned
# users are reproducible across runs.
user_pool = sorted(users_list)

# Line count is only used to give tqdm a total number of chunks.
n_lines = rawincount(CHECKIN_DATASET_PATH)
with checkin_engine.connect() as connection:
    for chunk in tqdm(
        pd.read_json(CHECKIN_DATASET_PATH, chunksize=CHUNK_SIZE, lines=True),
        total=ceil(n_lines / CHUNK_SIZE),
        desc="Chunks",
    ):
        # Each record holds all check-in timestamps of a business as a single
        # ", "-separated string; turn that into one row per check-in.
        chunk["date"] = chunk["date"].str.split(", ")
        chunk = chunk.explode("date")
        # After the explode the chunk no longer has CHUNK_SIZE rows, so the
        # generated columns must be sized from the actual row count.
        n_checkins = len(chunk)
        # Uses the stdlib `random` generator, which is the one seeded in the
        # configuration cell.
        chunk["user_id"] = random.choices(user_pool, k=n_checkins)
        chunk["id"] = [random_id_generator() for _ in range(n_checkins)]
        chunk = chunk.rename(columns={"date": "checkin_date"})
        chunk.to_sql(
            "checkins",
            con=connection,
            if_exists="append",
            index=False,
            method="multi",
        )


## Evaluations Domain

The evaluations domain contains information about tips and reviews made by users on businesses. See the [Yelp Dataset Documentation](https://www.yelp.com/dataset/documentation/main) for more complete information about the variables.

In [19]:
# Engine bound to the evaluations-domain database (reviews and tips).
evaluations_engine = create_engine(
    f"postgresql://{POSTGRESQL_USER}:{POSTGRESQL_PASSWORD}@{POSTGRESQL_HOST}:{POSTGRESQL_PORT}/{EVALUATIONS_DATABASE_NAME}"
)

{"review_id":"KU_O5udG6zpxOg-VcAEodg","user_id":"mh_-eMZ6K5RLWhZyISBhwA","business_id":"XQfwVwDr-v0ZS3_CbbE5Xw","stars":3.0,"useful":0,"funny":0,"cool":0,"text":"If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience. \n\nThe food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences where we spent way too long waiting. We usually opt for another diner or restaurant on the weekends, in order to be done quicker.","date":"2018-07-07 22:09:11"}



### Tables: reviews

In [None]:
# Peek at the first raw record so the reader can see the input schema.
with open(REVIEWS_DATASET_PATH, mode="r") as f:
    first_review = f.readline()
    print(f"Example: {first_review}")

# Source column name -> column name used in the `reviews` table.
review_column_names = {
    "review_id": "id",
    "useful": "useful_count",
    "funny": "funny_count",
    "cool": "cool_count",
    "text": "content",
    "date": "review_date",
}

# Line count is only used to give tqdm a total number of chunks.
n_lines = rawincount(REVIEWS_DATASET_PATH)
with evaluations_engine.connect() as connection:
    review_chunks = pd.read_json(
        REVIEWS_DATASET_PATH, chunksize=CHUNK_SIZE, lines=True
    )
    n_chunks = ceil(n_lines / CHUNK_SIZE)
    for chunk in tqdm(review_chunks, total=n_chunks, desc="Chunks"):
        chunk = chunk.rename(columns=review_column_names)
        chunk.to_sql(
            "reviews",
            con=connection,
            if_exists="append",
            index=False,
            method="multi",
        )


### Tables: tips

In [None]:
# Peek at the first raw record so the reader can see the input schema.
with open(TIP_DATASET_PATH, mode="r", encoding="utf-8") as f:
    print(f"Example: {f.readline()}")

# Line count is only used to give tqdm a total number of chunks.
n_lines = rawincount(TIP_DATASET_PATH)
with evaluations_engine.connect() as connection:
    for chunk in tqdm(
        pd.read_json(TIP_DATASET_PATH, chunksize=CHUNK_SIZE, lines=True),
        total=ceil(n_lines / CHUNK_SIZE),
        desc="Chunks",
    ):
        chunk = chunk.rename(columns={"text": "content", "date": "tips_date"})
        # The source rows are given a generated identifier here. Size the list
        # from the chunk itself: the last chunk is usually shorter than
        # CHUNK_SIZE and a fixed-size list would raise a length mismatch.
        chunk["id"] = [random_id_generator() for _ in range(len(chunk))]
        chunk.to_sql(
            "tips",
            con=connection,
            if_exists="append",
            index=False,
            method="multi",
        )
