In [1]:
import json
from datetime import datetime
from pathlib import Path

import pandas as pd
import requests
from sqlalchemy import select
from sqlalchemy.orm import aliased

from database import Database
from utils import DistributorApi

In [2]:
# Anchor all paths on the absolute form of the current working directory.
base_dir = Path(".").resolve()
# Sub-directory named "data" directly below the working directory.
data_dir = base_dir.joinpath("data")

In [3]:
# Client object used below to fetch the rules. DistributorApi comes from the
# project-local `utils` module and is constructed without arguments here.
# NOTE(review): connection settings/credentials are presumably resolved inside
# DistributorApi itself — confirm against utils.
api = DistributorApi()

In [4]:
# Fetch the rules through the API client created earlier.
rules = api.get_all_rules()

# Flatten the payload into a table (nested keys are joined with "_") and keep
# only the columns listed below, in this exact order. A missing column raises
# KeyError.
df_rules = pd.json_normalize(rules, sep="_").loc[
    :,
    [
        # identification / bookkeeping
        "id",
        "name",
        "description",
        "tag",
        "edit_state",
        "obsolete",
        "updated_by",
        "updated_on",
        # rule conditions
        "credential_level",
        "credential_list",
        "provider_level",
        "provider_list",
        "hotel_level",
        "hotel_list",
        "destination_level",
        "destination_list",
        "rate",
        "refundable",
        "market_level",
        "market_list",
        "meal_level",
        "meal_list",
        "check_in_level",
        "check_in_from",
        "check_in_to",
        "booking_date_level",
        "booking_date_from",
        "booking_date_to",
        "range_level",
        "range_from",
        "range_to",
        "max_release",
        "days_of_week_level",
        "days_of_week_list",
        "age",
        "room_level",
        "room_list",
        "num_of_nights_level",
        "num_of_nights_list",
        "hours_level",
        "hours_list",
        "dynamic_commission",
    ],
]

In [5]:
# Boolean row mask over df_rules: True only for rules whose conditions are
# restricted purely by explicit credential / provider / hotel lists.
# NOTE(review): the name `filter` shadows Python's built-in of the same name;
# it is kept because the following cell indexes df_rules with it.
filter = (
    # tag is one of: empty (-1), product (1), customer request (3)
    df_rules["tag"].isin([-1, 1, 3])
    # level 1 = "list only" for credentials, providers and hotels
    & df_rules[["credential_level", "provider_level", "hotel_level"]]
    .eq(1)
    .all(axis=1)
    # 0 = "all" (no restriction) for every remaining condition
    & df_rules[
        [
            "destination_level",
            "rate",
            "refundable",
            "market_level",
            "meal_level",
            "check_in_level",
            "booking_date_level",
            "range_level",
            "max_release",
            "days_of_week_level",
            "age",
            "room_level",
            "num_of_nights_level",
            "hours_level",
        ]
    ]
    .eq(0)
    .all(axis=1)
)

In [6]:
# Keep only the rows selected by the boolean mask. The explicit copy makes the
# result independent of df_rules, so columns can be added to it later.
df_rules_filtered = df_rules.loc[filter].copy()
# Display (rows, columns) of the filtered table.
df_rules_filtered.shape

(622, 42)

In [7]:
# Append one "<x>_count" column per list column, holding len() of each row's
# list. The three columns are appended in the order given here.
df_rules_filtered = df_rules_filtered.assign(
    credential_count=lambda frame: frame["credential_list"].map(len),
    provider_count=lambda frame: frame["provider_list"].map(len),
    hotel_count=lambda frame: frame["hotel_list"].map(len),
)

In [8]:
# "cartesian" = number of (credential, provider, hotel) combinations a rule
# expands to, i.e. the product of the three list lengths. Rules are then
# ordered from the largest expansion to the smallest.
df_rules_filtered = df_rules_filtered.assign(
    cartesian=lambda frame: frame["credential_count"]
    .mul(frame["provider_count"])
    .mul(frame["hotel_count"])
).sort_values("cartesian", ascending=False)

In [9]:
# Expand each rule into one row per (credential, provider, hotel) combination.
# The explodes are applied one after another on purpose: each multiplies the
# rows produced by the previous one, yielding the full cross product.
df_exploded = df_rules_filtered.explode("credential_list")
df_exploded = df_exploded.explode("provider_list")
df_exploded = df_exploded.explode("hotel_list")
# Display (rows, columns) of the expanded table.
df_exploded.shape

(1924669, 46)

In [10]:
# Build one plain (credential, provider, hotel) tuple per row so that a whole
# combination can be compared as a single value. The tuples are assigned
# positionally, so the repeated index labels left by explode() do not matter.
df_exploded["combination_zip"] = list(
    df_exploded[["credential_list", "provider_list", "hotel_list"]].itertuples(
        index=False, name=None
    )
)

In [11]:
# Keep only the first row of every (credential, provider, hotel) combination.
# "First" follows the current row order of df_exploded, so the rule that
# appears earliest keeps the combination and later rules lose it.
# Reassigning instead of using inplace=True gives the same result while
# following the pandas recommendation to avoid in-place mutation.
df_exploded = df_exploded.drop_duplicates(subset=["combination_zip"], keep="first")

In [12]:
# Collapse the surviving rows back to one row per rule id, collecting the
# distinct credential / provider / hotel values that remain for that rule.
# Series.unique() returns values in order of first appearance, so the lists
# are deterministic between runs (iteration order of a set is arbitrary).
# .tolist() is required: the aggregation must return a plain list, not an array.
# Rules that lost every combination to de-duplication have no rows left and
# therefore do not appear in the result at all.
# NOTE(review): each column is aggregated independently, so the cross product
# of the three resulting lists may contain combinations that were dropped as
# duplicates — confirm this is acceptable for downstream use.
df_exploded = (
    df_exploded.groupby("id")
    .agg(
        {
            column: lambda values: values.unique().tolist()
            for column in ("credential_list", "provider_list", "hotel_list")
        }
    )
    .reset_index()
)

In [13]:
# Left-join the filtered rules with their reduced lists on "id".
# - The three list columns exist on both sides, so pandas suffixes them with
#   "_x" (from df_rules_filtered) and "_y" (from df_exploded).
# - indicator=True adds a "_merge" column: "left_only" marks rules that have
#   no row in df_exploded, "both" marks rules that do.
df_duplicates = pd.merge(
    df_rules_filtered,
    df_exploded,
    how="left",
    on="id",
    indicator=True,
)

In [14]:
# Write the merged table to a CSV file in the current working directory,
# leaving out the DataFrame index.
df_duplicates.to_csv(path_or_buf="df_duplicates.csv", index=False)