In [1]:
import io
import logging
import urllib

import numpy as np
import pandas as pd
import requests
from config import Config
from requests_ntlm import HttpNtlmAuth
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
class HotelData:
    """Download the ``HotelList`` report for one destination as CSV text.

    The report URL is built from ``Config.SSRS_BASE_URL`` and the destination
    code; the request is authenticated with NTLM using the credentials held
    in ``Config``.
    """

    def __init__(self, destination):
        """Prepare the report URL, credentials and encoded query string.

        Args:
            destination: Destination code inserted into the report path
                (the notebook calls this with ``"DU"`` and ``"OM"``).
        """
        # Import the submodule explicitly: a bare ``import urllib`` does not
        # guarantee that ``urllib.parse`` has been loaded.
        from urllib.parse import quote, urlencode

        self.ssrs_url = (
            Config.SSRS_BASE_URL + destination + " Reports/Main Data/HotelList"
        )
        self.ssrs_usr = Config.SSRS_USERNAME
        self.ssrs_pwd = Config.SSRS_PASSWORD
        self.payload = [
            ("Active", True),
            ("rs:Command", "Render"),
            ("rs:Format", "CSV"),
            ("rc:ItemPath", "table1"),
        ]

        # quote_via=quote percent-encodes spaces as %20 instead of "+".
        self.params = urlencode(self.payload, quote_via=quote)

    def get(self, timeout=(10, 300)):
        """Request the report and return its body decoded as UTF-8.

        Args:
            timeout: Passed straight to ``requests.get``; either a single
                number of seconds or a ``(connect, read)`` tuple. ``None``
                disables the timeout and restores the old wait-forever
                behaviour.

        Returns:
            The response body as ``str`` when the server answers with HTTP
            200, otherwise ``None`` (the failure is logged as a warning).

        Raises:
            requests.exceptions.RequestException: On connection errors or
                when the timeout expires.
        """
        # The context manager closes the streamed response on every path, so
        # the connection is released even when the body is never read.
        with requests.get(
            self.ssrs_url,
            params=self.params,
            stream=True,
            auth=HttpNtlmAuth(self.ssrs_usr, self.ssrs_pwd),
            timeout=timeout,
        ) as response:
            if response.status_code == 200:
                return response.content.decode("utf8")

            logging.warning(
                "Hotel list request to %s failed with HTTP status %s",
                self.ssrs_url,
                response.status_code,
            )
            return None

In [3]:
class HotelDataReadCsv(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that parses hotel-list CSV text into a DataFrame."""

    def __init__(self):
        # Nothing to configure.
        pass

    def fit(self, X, y=None):
        """Do nothing and return this transformer unchanged."""
        return self

    def transform(self, X):
        """Parse CSV text into a typed DataFrame.

        Args:
            X: CSV content as a string, or ``None``.

        Returns:
            A DataFrame restricted to the hotel columns listed below, with
            explicit dtypes and the two date columns parsed using the
            ``%d-%b-%y`` format; ``None`` when ``X`` is ``None``.
        """
        if X is None:
            return None

        column_types = {
            "HotelID": int,
            "HotelName": str,
            "Country": str,
            "State": str,
            "City": str,
            "HotelTypeID": str,
            "Longitude": float,
            "Latitude": float,
            "Giata": pd.Int64Dtype(),
            "SaleMail": str,
            "IsActive": str,
        }
        date_columns = ["CreateDate", "LastChangeDate"]

        # Same column set as before; read_csv ignores the order of usecols.
        selected_columns = [
            "HotelID",
            "HotelName",
            "Country",
            "State",
            "City",
            "HotelTypeID",
            "Longitude",
            "Latitude",
            "Giata",
            "SaleMail",
            *date_columns,
            "IsActive",
        ]

        csv_buffer = io.StringIO(X)
        return pd.read_csv(
            csv_buffer,
            usecols=selected_columns,
            dtype=column_types,
            parse_dates=date_columns,
            date_format="%d-%b-%y",
        )


class HotelDataEncoder(BaseEstimator, TransformerMixin):
    """Pipeline step that filters placeholder hotels and normalizes names.

    ``transform`` builds and returns a new DataFrame; the frame passed in is
    left unmodified, so the step can safely be re-run on the same input.
    """

    def __init__(self):
        # HotelID values removed from the data set by ``transform``.
        self.exclude = [
            1,  # NO ACCOMMODATION
            2,  # FLIGHT ONLY PAX
            3,  # TOUR ONLY PAX
            4,  # Transfer Only Pax
            5,  # SPLIT BOOKINGS
            5000,  # TEST
            5005,  # TEST HOTEL
            90001,  # HOTEL SHOP RESERVATIONS
            191680,  # ROULETTE OFFER
            202356,  # TEST HOTEL
            203441,  # ROULETTE HOTEL RAK
            209384,  # TEST HOTEL - BUGFIX - DO NOT ACCESS
            100,  # HOTEL SHOP RESERVATIONS
            90018,  # HOTEL SHOP RESERVATIONS
            209385,  # TEST HOTEL - BUGFIX - DO NOT ACCESS
            217636,  # TEST HOTEL
            218648,  # CRUISE
            218736,  # TEST HOTEL
            219137,  # PP_NOACCOM
            219138,  # PP_NOACCOM
        ]

    def fit(self, X, y=None):
        """Do nothing and return this transformer unchanged."""
        return self

    def transform(self, X):
        """Return a cleaned copy of the hotel frame.

        Steps, in order:
          1. Drop rows whose ``HotelID`` is in ``self.exclude``.
          2. Rename ``HotelName`` to ``HotelName_GWG`` and collapse runs of
             whitespace in it.
          3. Derive a new ``HotelName``: the text before the first "(",
             with "-" and "+" replaced by spaces, " AND " replaced by
             " & ", and whitespace collapsed again.
          4. Fill missing ``SaleMail`` with "undefined" and lower-case it.
          5. Sort by ``Country`` then ``HotelName`` (index labels are kept).

        Args:
            X: DataFrame with at least the ``HotelID``, ``HotelName``,
                ``SaleMail`` and ``Country`` columns, or ``None``.

        Returns:
            A new DataFrame, or ``None`` when ``X`` is ``None``.
        """
        if X is None:
            return None

        # Boolean selection followed by rename yields a fresh frame, so none
        # of the assignments below write back into the caller's X.
        keep = ~X["HotelID"].isin(self.exclude)
        hotels = X.loc[keep].rename(columns={"HotelName": "HotelName_GWG"})

        # split()/join(" ") trims the ends and collapses inner whitespace.
        hotels["HotelName_GWG"] = hotels["HotelName_GWG"].str.split().str.join(" ")

        cleaned = hotels["HotelName_GWG"].str.split("(").str[0]
        for old, new in (("-", " "), ("+", " "), (" AND ", " & ")):
            cleaned = cleaned.str.replace(old, new, regex=False)
        hotels["HotelName"] = cleaned.str.split().str.join(" ")

        hotels["SaleMail"] = hotels["SaleMail"].fillna("undefined").str.lower()

        return hotels.sort_values(["Country", "HotelName"])

In [4]:
from sklearn.pipeline import Pipeline

In [5]:
# Two stages: parse the raw CSV text, then filter and normalize the frame.
pipeline = Pipeline(
    steps=[
        ("csv_reader", HotelDataReadCsv()),
        ("data_encoder", HotelDataEncoder()),
    ],
)

In [6]:
# One cleaned frame per destination whose download succeeded.
dfs = []

for destination in ["DU", "OM"]:
    data = HotelData(destination).get()

    # HotelData.get() returns None on a non-200 response. Skip that
    # destination explicitly instead of failing later with an opaque
    # TypeError ("DU") or being dropped silently by pd.concat ("OM").
    if data is None:
        logging.warning(
            "No hotel data returned for destination %r; skipping it", destination
        )
        continue

    df = pipeline.fit_transform(data)

    # Only the AE and SA hotels are kept from the DU report.
    if destination == "DU":
        df = df[df["Country"].isin(["AE", "SA"])]

    dfs.append(df)

In [7]:
# Stack the per-destination frames row-wise and renumber the rows from 0.
hotel_list = pd.concat(objs=dfs, ignore_index=True)

In [8]:
from database import Database, select

db = Database()

# Read the three mapping columns needed for the join below. The connection
# is opened in a context manager so it is closed as soon as the read is
# done; previously it was never closed.
# NOTE(review): assumes db.engine is a SQLAlchemy engine whose connections
# support the context-manager protocol. Confirm against the database module.
with db.engine.connect() as connection:
    hotel_mapping = pd.read_sql(
        sql=select(
            db.mapping_hotel.c.external_code,
            db.mapping_hotel.c.external_name,
            db.mapping_hotel.c.hotel_id,
        ),
        con=connection,
    )

  Table(


In [9]:
# Left join: every row of hotel_list is kept; hotels without a matching
# external_code get missing values in the mapping columns.
merged_list = pd.merge(
    hotel_list,
    hotel_mapping,
    how="left",
    left_on="HotelID",
    right_on="external_code",
)

In [10]:
# Flag rows where the GWG name equals the mapped external name exactly;
# a missing value on either side compares as False.
name_gwg = merged_list["HotelName_GWG"]
merged_list["Comparison"] = name_gwg.eq(merged_list["external_name"])

In [11]:
# Write the merged table to the current working directory, omitting the index.
merged_list.to_csv(path_or_buf="merged_list.csv", index=False)