In [1]:
import sys
import os
# Install the notebook's dependencies into the interpreter that runs this kernel
# (`{sys.executable} -m pip`), each pinned to an exact version.
# NOTE: stdout is redirected to /dev/null, so pip's normal progress output is hidden;
# only what pip writes to stderr remains visible.
!{sys.executable} -m pip install langchain==0.0.335 --no-warn-script-location > /dev/null
!{sys.executable} -m pip install pygpt4all==1.1.0 --no-warn-script-location > /dev/null
!{sys.executable} -m pip install gpt4all==1.0.12 --no-warn-script-location > /dev/null
!{sys.executable} -m pip install transformers==4.35.1 --no-warn-script-location > /dev/null
!{sys.executable} -m pip install datasets==2.14.6 --no-warn-script-location > /dev/null
!{sys.executable} -m pip install tiktoken==0.4.0 --no-warn-script-location > /dev/null
!{sys.executable} -m pip install chromadb==0.4.15 --no-warn-script-location > /dev/null
!{sys.executable} -m pip install sentence_transformers==2.2.2 --no-warn-script-location > /dev/null


In [48]:
import requests
import contextlib
import pandas as pd
import time
import io

from tqdm import tqdm
from langchain.llms import GPT4All
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.indexes import VectorstoreIndexCreator
from langchain.embeddings import HuggingFaceEmbeddings
from datasets import load_dataset


class RAGBot:
    """
    A class to handle model downloading, dataset management, model loading, vector database
    creation, retrieval mechanisms, and inference for a response generation bot.

    Typical call order: ``get_model`` -> ``set_data_path`` -> ``load_model`` ->
    ``build_vectordb`` -> ``retrieval_mechanism`` -> ``inference``.

    Attributes
    ----------
    model_path : str
        The file path where the model is stored.
    data_path : str
        The file path where the dataset is stored.
    user_input : str
        The input provided by the user for generating a response.
    model : str
        The name of the model being used.
    context_verbosity : bool
        Whether retrieval and inference print the retrieved context / prompt.
    llm : object or None
        The loaded GPT4All model, set by ``load_model``.
    index : object or None
        The vector store index, set by ``build_vectordb``.
    prompt : object or None
        The prompt template prepared by ``retrieval_mechanism``.
    """

    # Location of the pre-staged Falcon weights used by both dropdown options.
    _FALCON_MODEL_PATH = "/home/common/data/Big_Data/GenAI/llm_models/nomic-ai--gpt4all-falcon-ggml/ggml-model-gpt4all-falcon-q4_0.bin"

    def __init__(self):
        """
        Initializes the RAGBot with default values for model path, data path,
        user input, and model. Objects created by later steps start as ``None``
        so that calling a method too early fails with a clear message.
        """
        self.model_path = ""
        self.data_path = ""
        self.user_input = ""
        self.model = ""
        self.context_verbosity = False
        self.llm = None
        self.index = None
        self.prompt = None

    def get_model(self, model, chunk_size: int = 10000, url: str = None):
        """
        Resolves the local path of the requested model and, if the file is not
        present, downloads it in chunks from ``url``.

        Currently both supported model names resolve to the Falcon weights at a
        fixed local path. Any other name leaves ``model_path`` unchanged.

        Parameters
        ----------
        model : str
            The name of the model to be used.
        chunk_size : int, optional
            The size of each chunk of data to download at a time, by default 10000.
        url : str, optional
            Where to download the model from when it is missing locally. There is
            no built-in download location, so this must be supplied in that case.

        Raises
        ------
        ValueError
            If no model path is known for ``model``.
        FileNotFoundError
            If the model file is missing and no ``url`` was given.
        requests.HTTPError
            If the download request returns an error status.
        """

        self.model = model

        if self.model == "Falcon":
            self.model_path = self._FALCON_MODEL_PATH
        elif self.model == "More Models Coming Soon!":
            print("More models coming soon, defaulting to Falcon for now!")
            self.model_path = self._FALCON_MODEL_PATH

        if not self.model_path:
            raise ValueError(f"No model path is configured for model {model!r}.")

        if os.path.isfile(self.model_path):
            print('model already exists in path.')
            return

        if not url:
            raise FileNotFoundError(
                f"Model file not found at {self.model_path!r} and no download url was provided."
            )

        target_dir = os.path.dirname(self.model_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        # Download to a temporary name first so an interrupted transfer is never
        # mistaken for a complete model on the next call.
        partial_path = self.model_path + ".part"
        # Stream the response since the file is large; be prepared to wait.
        with requests.get(url, stream=True) as response:
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in tqdm(response.iter_content(chunk_size=chunk_size)):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, self.model_path)

    # NOTE: the dialogue-dataset download below is disabled; the notebook uses the
    # fixed file configured by set_data_path() instead.
    # def download_dataset(self, dataset):
    #     """
    #     Downloads the specified dataset and saves it to the data path.

    #     Parameters
    #     ----------
    #     dataset : str
    #         The name of the dataset to be downloaded.
    #     """
    #     self.data_path = dataset + '_dialogues.txt'

    #     if not os.path.isfile(self.data_path):

    #         datasets = {"robot maintenance": "FunDialogues/customer-service-robot-support", 
    #                     "basketball coach": "FunDialogues/sports-basketball-coach", 
    #                     "physics professor": "FunDialogues/academia-physics-office-hours",
    #                     "grocery cashier" : "FunDialogues/customer-service-grocery-cashier"}

    #         # Download the dialogue from hugging face
    #         dataset = load_dataset(f"{datasets[dataset]}")
    #         # Convert the dataset to a pandas dataframe
    #         dialogues = dataset['train']
    #         df = pd.DataFrame(dialogues, columns=['id', 'description', 'dialogue'])
    #         # Print the first 5 rows of the dataframe
    #         df.head()
    #         # only keep the dialogue column
    #         dialog_df = df['dialogue']

    #         # save the data to txt file
    #         dialog_df.to_csv(self.data_path, sep=' ', index=False)
    #     else:
    #         print('data already exists in path.')
    def set_data_path(self):
        """
        Sets the data path to the preprocessed hotel dataset.
        """
        self.data_path = 'hotel_data_for_vector_db.txt'


    def load_model(self, n_threads, max_tokens, repeat_penalty, n_batch, top_k, temp):
        """
        Loads the GPT4All model found at ``model_path`` with the given settings.

        Parameters
        ----------
        n_threads : int
            The number of threads for parallel processing.
        max_tokens : int
            The maximum number of tokens for model prediction (passed as ``n_predict``).
        repeat_penalty : float
            The penalty for repeated tokens in generation.
        n_batch : int
            Forwarded to GPT4All as ``n_batch``.
        top_k : int
            The number of top k tokens to be considered in sampling.
        temp : float
            The sampling temperature, forwarded to GPT4All as ``temp``.
        """
        # Callbacks support token-wise streaming
        callbacks = [StreamingStdOutCallbackHandler()]

        self.llm = GPT4All(model=self.model_path, callbacks=callbacks, verbose=False,
                           n_threads=n_threads, n_predict=max_tokens, repeat_penalty=repeat_penalty, 
                           n_batch=n_batch, top_k=top_k, temp=temp)

    def build_vectordb(self, chunk_size, overlap):
        """
        Builds a vector database from the dataset for retrieval purposes.

        Parameters
        ----------
        chunk_size : int
            The size of text chunks for vectorization.
        overlap : int
            The overlap size between chunks.
        """
        loader = TextLoader(self.data_path)
        # Text Splitter
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
        # Embed the document and store into chroma DB
        self.index = VectorstoreIndexCreator(embedding= HuggingFaceEmbeddings(), text_splitter=text_splitter).from_loaders([loader])

    def retrieval_mechanism(self, user_input, top_k=1, context_verbosity = False, rag_off= False):
        """
        Prepares the prompt for the user's query, retrieving relevant document
        snippets from the vector database unless retrieval is switched off.

        Parameters
        ----------
        user_input : str
            The user's input or query.
        top_k : int, optional
            The number of top results to return, by default 1.
        context_verbosity : bool, optional
            If True, additional context information is printed, by default False.
        rag_off : bool, optional
            If True, disables the retrieval-augmented generation, by default False.
            No vector search is performed in that case.

        Raises
        ------
        RuntimeError
            If retrieval is enabled but ``build_vectordb`` has not been called.
        """

        self.user_input = user_input
        self.context_verbosity = context_verbosity

        if rag_off:
            # Raw-model mode: the prompt carries only the question, so the vector
            # database is neither needed nor queried.
            template = """Question: {question}
            Answer: This is the response: """
            self.prompt = PromptTemplate(template=template, input_variables=["question"])
        else:     
            if getattr(self, "index", None) is None:
                raise RuntimeError("build_vectordb() must be called before retrieval.")

            # perform a similarity search and retrieve the context from our documents
            results = self.index.vectorstore.similarity_search(self.user_input, k=top_k)
            # join all context information into one string 
            context = "\n".join([document.page_content for document in results])
            if self.context_verbosity:
                print("Retrieving information related to your question...")
                print(f"Found this content which is most similar to your question: {context}")

            template = """ Don't just repeat the following context, use it in combination with your knowledge to improve your answer to the question:{context}

            Question: {question}
            """
            self.prompt = PromptTemplate(template=template, input_variables=["context", "question"]).partial(context=context)


    def inference(self):
        """
        Performs inference to generate a response based on the user's query.

        Returns
        -------
        str
            The generated response.

        Raises
        ------
        RuntimeError
            If ``load_model`` or ``retrieval_mechanism`` has not been called yet.
        """

        if getattr(self, "llm", None) is None or getattr(self, "prompt", None) is None:
            raise RuntimeError("load_model() and retrieval_mechanism() must be called before inference().")

        if self.context_verbosity:
            print(f"Your Query: {self.prompt}")

        llm_chain = LLMChain(prompt=self.prompt, llm=self.llm)
        print("Processing the information with gpt4all...\n")
        response = llm_chain.run(self.user_input)

        return  response  
    

In [50]:
import ipywidgets as widgets
from IPython.display import display, HTML

# One bot instance is shared by every click of the Submit button.
bot = RAGBot()

# Settings seen on the previous submit. They start as None so that the first
# submit always loads the model and builds the vector database.
(previous_threads, previous_max_tokens, previous_top_k, previous_dataset,
 previous_chunk_size, previous_overlap, previous_temp) = (None,) * 7

# Output area into which each response is rendered.
output = widgets.Output()

def process_inputs(b):
    """
    Process inputs from the interactive chat interface.

    This function is triggered by a button click in the IPython widgets interface. It captures 
    user inputs from the widget elements (dropdown, sliders, text inputs), resolves the model and
    data path, reloads the model and rebuilds the vector database only when their settings have
    changed, performs the retrieval mechanism, and generates a response to the user's query. The
    response is then displayed in a styled HTML format within the Jupyter Notebook.

    Parameters
    ----------
    b : ipywidgets.widgets.widget_button.Button
        The button widget that triggers this function. This parameter is required by the
        widget framework but is not directly used in the function.

    Notes
    -----
    - This function is designed to be used as a callback for an IPython button widget.
    - It utilizes global variables to access the widget elements and their values.
    - The function updates global variables to keep track of previous parameter values for 
      efficient reloading of models and rebuilding of vector databases.
    - The dataset is taken from ``bot.data_path`` (set by ``bot.set_data_path()``); no dataset
      widget is consulted.
    - The "Top K" slider value is used both for model sampling and for the number of retrieved
      document chunks.
    - Standard output and error output are captured and redirected to suppress unnecessary console logs,
      while relevant output is displayed via the IPython display mechanism.
    - The response text is HTML-escaped before being embedded in the styled block.
    """
    global previous_threads, previous_max_tokens, previous_top_k, previous_dataset, previous_chunk_size, previous_overlap, previous_temp

    # Local import keeps this callback self-contained; needed for html.escape below.
    import html

    with output:
        output.clear_output()
        # Suppress output
        f = io.StringIO()
        with contextlib.redirect_stdout(f), contextlib.redirect_stderr(f):

            # Gather values from the widgets
            model = model_dropdown.value
            query = query_text.value
            top_k = top_k_slider.value
            chunk_size = chunk_size_input.value
            overlap = overlap_input.value
            threads = threads_slider.value
            max_tokens = max_token_input.value
            rag_off = rag_off_checkbox.value
            temp = temp_slider.value

            # Nothing to answer: skip the expensive model / vector DB work.
            if not query or not query.strip():
                display(HTML("<i>Please type a query before submitting.</i>"))
                return

            bot.get_model(model = model)
            bot.set_data_path()
            # The data file chosen by set_data_path() is what decides whether the
            # vector database has to be rebuilt.
            dataset = bot.data_path
            # bot.download_dataset(dataset = dataset)
            if threads != previous_threads or max_tokens != previous_max_tokens or top_k != previous_top_k or temp != previous_temp:
                print("loading model due incorporate new parameters")
                bot.load_model(n_threads=threads, max_tokens=max_tokens, repeat_penalty=1.50, n_batch=threads, top_k=top_k, temp=temp)
                # Update previous values
                previous_threads = threads
                previous_max_tokens = max_tokens
                previous_top_k = top_k
                previous_temp = temp
            if dataset != previous_dataset or chunk_size != previous_chunk_size or overlap != previous_overlap:
                print("rebuilding vector DB due to changing dataset, overlap, or chunk")
                bot.build_vectordb(chunk_size = chunk_size, overlap = overlap)
                previous_dataset = dataset
                previous_chunk_size = chunk_size
                previous_overlap = overlap
            # Forward the slider value so the vector search honours "Top K".
            bot.retrieval_mechanism(user_input = query, top_k = top_k, rag_off = rag_off)
            response = bot.inference()

            # Model output may contain markup characters; escape them so the text is
            # shown literally instead of being interpreted as HTML.
            safe_response = html.escape(str(response))

            styled_response = f"""
            <div style="
                background-color: lightblue;
                border-radius: 15px;
                padding: 10px;
                font-family: Arial, sans-serif;
                color: black;
                max-width: 600px;
                word-wrap: break-word;
                margin: 10px;
                font-size: 14px;">
                {safe_response}
            </div>
            """
            display(HTML(styled_response))

def create_chat_interface():
    """
    Build the chat UI widgets, publish them as module-level globals, and display them.

    The left column holds the settings (model, top k, temperature, RAG toggle, chunk size,
    overlap, dataset, threads, max tokens); the right column holds the query box and the
    Submit button, which is wired to ``process_inputs``. The shared ``output`` widget is
    displayed below the controls.

    The dataset dropdown is defined here with a single, read-only entry so that the
    layout (and any code reading ``dataset_dropdown.value``) works on a fresh kernel.
    """
    global model_dropdown, query_text, top_k_slider, rag_off_checkbox, chunk_size_input, overlap_input, dataset_dropdown, threads_slider, max_token_input, repeat_penalty_input, temp_slider
    # Model selection dropdown
    model_dropdown = widgets.Dropdown(
        options=['Falcon', 'More Models Coming Soon!'],
        description='Model:',
        disabled=False,
    )

    # User query text input
    query_layout = widgets.Layout(width='400px', height='400px')  # Adjust the width as needed
    query_text = widgets.Text(
        placeholder='Type your query here',
        description='Query:',
        disabled=False, 
        layout=query_layout
    )

    # Vector search top k slider
    top_k_slider = widgets.IntSlider(
        value=2,
        min=1,
        max=4,
        step=1,
        description='Top K:',
        disabled=False,
        continuous_update=False,
        orientation='horizontal',
        readout=True,
        readout_format='d'
    )

    # Model Temperature slider
    temp_slider = widgets.FloatSlider(
        value=0.7,
        min=0.1,
        max=1.4,
        step=0.1,
        description='Temperature:',
        disabled=False,
        continuous_update=False,
        orientation='horizontal',
        readout=True,
        readout_format='.1f'
    )

    # RAG OFF TOGGLE
    rag_off_checkbox = widgets.Checkbox(
        value=False,
        description='RAG OFF?',
        disabled=False,
        indent=False,  # Set to True if you want the checkbox to be indented
        tooltip='Turns off RAG and Performs Inference with Raw Model and Prompt Only'
    )

    # Chunk size number input
    chunk_size_input = widgets.BoundedIntText(
        value=500,
        min=5,
        max=5000,
        step=1,
        description='Chunk Size:',
        disabled=False
    )

    # Overlap number input
    overlap_input = widgets.BoundedIntText(
        value=50,
        min=0,
        max=1000,
        step=1,
        description='Overlap:',
        disabled=False
    )

    # Dataset selection dropdown. Only one dataset is available, so the control is
    # shown read-only; without this definition the layout below raises NameError.
    dataset_dropdown = widgets.Dropdown(
        options=['hotel reviews'],
        description='Dataset:',
        disabled=True,
    )

    # Number of threads slider
    threads_slider = widgets.IntSlider(
        value=64,
        min=2,
        max=200,
        step=1,
        description='Threads:',
        disabled=False,
        continuous_update=False,
        orientation='horizontal',
        readout=True,
        readout_format='d'
    )

    # Max token number input
    max_token_input = widgets.BoundedIntText(
        value=50,
        min=5,
        max=500,
        step=5,
        description='Max Tokens:',
        disabled=False
    )

    # Group the widgets except the query text into a VBox
    left_column = widgets.VBox([model_dropdown, top_k_slider, temp_slider, rag_off_checkbox, chunk_size_input, 
                                overlap_input, dataset_dropdown, threads_slider, max_token_input])

    # Submit button
    submit_button = widgets.Button(description="Submit")
    submit_button.on_click(process_inputs)

    right_column = widgets.VBox([query_text, submit_button])

    # Use HBox to position the VBox and query text side by side
    interface_layout = widgets.HBox([left_column, right_column])


    # Display the layout
    display(interface_layout, output)

# Build the widgets and render the chat interface below this cell.
create_chat_interface()

HBox(children=(VBox(children=(Dropdown(description='Model:', options=('Falcon', 'More Models Coming Soon!'), v…

Output()

In [4]:
from datasets import load_dataset
import pandas as pd

In [5]:
# Remove pandas' display limits so rows, columns and long text cells are shown
# in full. Large frames will then render completely, so display slices only.
for _display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(_display_option, None)
del _display_option

In [6]:
# Load the hotel reviews dataset by its identifier via the `datasets` library.
dataset = load_dataset("traversaal-ai-hackathon/hotel_datasets")

In [7]:
# Show the available splits with their feature names and row counts.
print("Dataset Structure:", dataset)

Dataset Structure: DatasetDict({
    train: Dataset({
        features: ['hotel_name', 'hotel_description', 'review_title', 'review_text', 'rate', 'tripdate', 'hotel_url', 'hotel_image', 'price_range', 'rating_value', 'review_count', 'street_address', 'locality', 'country'],
        num_rows: 5997
    })
})


In [8]:
# Convert the 'train' split into a pandas DataFrame for exploration.
df = pd.DataFrame(dataset['train'])

In [20]:
# Preview the first two rows.
df.head(2)

Unnamed: 0,hotel_name,hotel_description,review_title,review_text,rate,tripdate,hotel_url,hotel_image,price_range,rating_value,review_count,street_address,locality,country
0,Romance Istanbul Hotel,"Romance Istanbul Hotel has 39 rooms.Every room is elegantly furnished and harmonizes the modern life style with the traditional Ottoman touch. Romance Istanbul sits at the intersection of the old city’s most important part. With its luxuriously inspiring design and landmark old city location, steeped in the history of its surroundings, Romance Istanbul Hotel welcomes you with exceptional designed rooms and world-renowned Turkish hospitality. Our colleagues deliver the most personal service. It is perfectly placed and perfectly designed to enhance all that Istanbul has to offer. Each room offers a private bathroom and shower. Each is equipped with a satellite TV and free wifi connection. The rooms size change between 20 m2 and 45 m2. It includes 7 suite rooms: 1 Royal Suite, 4 Grand Suite, 1 Romance Suite and 1 Premium Suite, 2 Luxury Room With Terrace, 22 Deluxe Room, 8 City Room.","An exceptional boutique hotel, great value for your money",,,February 2020,https://www.tripadvisor.com/Hotel_Review-g293974-d8364987-Reviews-Romance_Istanbul_Hotel-Istanbul.html,https://media-cdn.tripadvisor.com/media/photo-s/1c/02/78/ba/romance-istanbul-hotel.jpg,$ (Based on Average Nightly Rates for a Standard Room from our Partners),5.0,4023,Hudavendigar Cd. No:5 Sirkeci,Istanbul,Turkiye
1,Romance Istanbul Hotel,"Romance Istanbul Hotel has 39 rooms.Every room is elegantly furnished and harmonizes the modern life style with the traditional Ottoman touch. Romance Istanbul sits at the intersection of the old city’s most important part. With its luxuriously inspiring design and landmark old city location, steeped in the history of its surroundings, Romance Istanbul Hotel welcomes you with exceptional designed rooms and world-renowned Turkish hospitality. Our colleagues deliver the most personal service. It is perfectly placed and perfectly designed to enhance all that Istanbul has to offer. Each room offers a private bathroom and shower. Each is equipped with a satellite TV and free wifi connection. The rooms size change between 20 m2 and 45 m2. It includes 7 suite rooms: 1 Royal Suite, 4 Grand Suite, 1 Romance Suite and 1 Premium Suite, 2 Luxury Room With Terrace, 22 Deluxe Room, 8 City Room.",You can’t get better than this.,,,March 2021,https://www.tripadvisor.com/Hotel_Review-g293974-d8364987-Reviews-Romance_Istanbul_Hotel-Istanbul.html,https://media-cdn.tripadvisor.com/media/photo-s/1c/02/78/ba/romance-istanbul-hotel.jpg,$ (Based on Average Nightly Rates for a Standard Room from our Partners),5.0,4023,Hudavendigar Cd. No:5 Sirkeci,Istanbul,Turkiye


In [22]:
# Preview the last two rows.
df.tail(2)

Unnamed: 0,hotel_name,hotel_description,review_title,review_text,rate,tripdate,hotel_url,hotel_image,price_range,rating_value,review_count,street_address,locality,country
5995,Hotel Campanile Paris Bercy Village,"In the east of Paris, the hotel Campanile Bercy boasts an ideal location for exploring The City of LightsNature-lovers staying at Campanile Bercy will enjoy its privileged vicinity, taking walks in Bercy Park and its four gardens, strolling around the village ""Cour Saint Emilion"" as well as trying restaurants and cafes.",For one night only,We stayed here for one night on our way to Italy and that was enough. The room was clean and quite comfortable but very small like a poor relation of Premier Inn. It’s in an area without much to offer in terms of anything to do in the evening and the hotel doesn’t really offer much either. Depends what you’re after and it is perfectly adequate just not somewhere you want to do anything other than sleep.,3.0,September 2019,https://www.tripadvisor.com/Hotel_Review-g187147-d233766-Reviews-or30-Hotel_Campanile_Paris_Bercy_Village-Paris_Ile_de_France.html,https://media-cdn.tripadvisor.com/media/photo-s/27/c7/d3/ab/chambre-double.jpg,$ (Based on Average Nightly Rates for a Standard Room from our Partners),4.0,1625,17 rue Baron le Roy,Paris,France
5996,Hotel Campanile Paris Bercy Village,"In the east of Paris, the hotel Campanile Bercy boasts an ideal location for exploring The City of LightsNature-lovers staying at Campanile Bercy will enjoy its privileged vicinity, taking walks in Bercy Park and its four gardens, strolling around the village ""Cour Saint Emilion"" as well as trying restaurants and cafes.",Nice hotel in Paris Bercy,"The Hotel is close to the metro station Cour Saint-Emilion (line 14), in a nice area, with plenty of shops and restaurants, in Bercy Village. Within 15 minutes you can reach the city center / Chatelet Les Halles, still using metro line 14. The hotel staff was very friendly. The rooms are clean, maybe a little uninspired. Breakfast is very nice, the pains au chocolat are top of class! Overall: a nice stay in a not so overcrowded area of Paris.",4.0,September 2019,https://www.tripadvisor.com/Hotel_Review-g187147-d233766-Reviews-or30-Hotel_Campanile_Paris_Bercy_Village-Paris_Ile_de_France.html,https://media-cdn.tripadvisor.com/media/photo-s/27/c7/d3/ab/chambre-double.jpg,$ (Based on Average Nightly Rates for a Standard Room from our Partners),4.0,1625,17 rue Baron le Roy,Paris,France


In [10]:
# Number of rows in the frame.
df.shape[0]

5997

In [23]:
# Inspect three randomly chosen rows. No random_state is passed, so the rows
# shown will differ from run to run.
df.sample(3)

Unnamed: 0,hotel_name,hotel_description,review_title,review_text,rate,tripdate,hotel_url,hotel_image,price_range,rating_value,review_count,street_address,locality,country
4533,Dream Downtown,"Located between New York City's Meatpacking District and Chelsea neighborhood, Dream Downtown is a lifestyle hotel with an emphasis on service and after-dark possibilities. Extraordinary in design and unique in character, the hotel offers everything today's urban traveler seeks, including 315 loft-style guest rooms and suites, luxurious amenities and unique entertainment spaces.",Great Hotel with Amazing Service,"WOW what a great hotel and above and beyond service, will defiantly Stay again thank you Ester for helping me out for this stay and many more to come, Great Location, close to trains and very clean hotel",5.0,December 2023,https://www.tripadvisor.com/Hotel_Review-g60763-d2173604-Reviews-or10-Dream_Downtown-New_York_City_New_York.html,https://media-cdn.tripadvisor.com/media/photo-s/04/0b/67/7b/dream-downtown.jpg,$$ (Based on Average Nightly Rates for a Standard Room from our Partners),4.5,4451,355 West 16th Street,New York City,United States
2209,San Remo Hotel,"We are a family-owned historic boutique pension-style hotel located in North Beach, one of San Francisco's oldest and most popular tourist destinations, and only a few blocks from Fisherman's Wharf. Built in 1906, our hotel maintains an atmosphere of Old-World charm and ambiance with specially selected antique furnishings - no two rooms are alike. The walls of the hotel are filled with a collection of memorabilia and history. Our style is simplicity and sanctuary - there are no room telephones, no televisions, and on the order of European pensions, shared baths which are meticulously cleaned daily, and are private when in use, but for all guests to use. Our rooms are cozy and small, our prices are proportional. Our staff is available while our Front Desk is open to help you find anything your heart desires. A truly unique, non-standardized hotel experience.",Mom & Daughter Trip,,,January 2020,https://www.tripadvisor.com/Hotel_Review-g60713-d81360-Reviews-San_Remo_Hotel-San_Francisco_California.html,https://dynamic-media-cdn.tripadvisor.com/media/photo-o/18/7d/6c/2f/front-of-hotel.jpg?w=500&h=-1&s=1,$ (Based on Average Nightly Rates for a Standard Room from our Partners),4.0,1389,2237 Mason St,San Francisco,United States
471,Hotel Amira Istanbul,"Hotel Amira Istanbul is a boutique hotel located at the heart of the centuries old empire's capital. Some remarkable neighbors of the Hotel Amira Istanbul are the great Hagia Sophia, the magnificent Blue Mosque, the mystic Basilica Cistern, the imperial Topkapi Palace and the colorful Grand Bazaar. Amira Hotel offers you spaciously designed 32 rooms in Sultanahmet area, featuring very comfortable visco mattresses, local touches like ottoman style hand-made ceiling paintings. Along with its world famous cordial guest services, Amira Hotel offers a rich, complimentary buffet breakfast, free Wi-Fi access, a gym and massage services. Also our concierge would be more than happy to assist you in every aspect of your stay, whether business or pleasure.",Great hotel in Old Town Istanbul,"This is a boutique hotel in the middle of the old city. Is is within walking distance of all of the main attractions, the Hippodrome, the Blue Mosque, Hagia Sophia and so on. The hotel is clean and modern, the staff are competent and incredibly accomodating.. I would stay there again.",5.0,August 2023,https://www.tripadvisor.com/Hotel_Review-g293974-d1674691-Reviews-or30-Hotel_Amira_Istanbul-Istanbul.html,https://dynamic-media-cdn.tripadvisor.com/media/photo-o/1c/48/d2/47/exterior.jpg?w=500&h=-1&s=1,$ (Based on Average Nightly Rates for a Standard Room from our Partners),5.0,4662,Kucuk Ayasofya Mah. Mustafa Pasa Sok. No: 43,Istanbul,Türkiye


In [16]:
# Column dtypes and non-null counts; shows which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5997 entries, 0 to 5996
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   hotel_name         5997 non-null   object 
 1   hotel_description  5197 non-null   object 
 2   review_title       5997 non-null   object 
 3   review_text        4867 non-null   object 
 4   rate               4867 non-null   float64
 5   tripdate           5997 non-null   object 
 6   hotel_url          5997 non-null   object 
 7   hotel_image        5997 non-null   object 
 8   price_range        5997 non-null   object 
 9   rating_value       5997 non-null   float64
 10  review_count       5997 non-null   int64  
 11  street_address     5997 non-null   object 
 12  locality           5997 non-null   object 
 13  country            5997 non-null   object 
dtypes: float64(2), int64(1), object(11)
memory usage: 656.0+ KB


In [17]:
def check_column_data(df, column):
    """Print a short profile of one DataFrame column.

    The report contains the number of missing (None/NaN) entries, the
    frequency of every distinct value with missing values counted as their
    own category, and the total number of distinct values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is expected to contain ``column``.
    column : str
        Name of the column to profile.

    Returns
    -------
    None
        Everything is written to stdout. If ``column`` is not one of
        ``df.columns`` a single "not found" message is printed instead.
    """
    # Guard clause: nothing to profile when the column is absent.
    if column not in df.columns:
        print(f"Column '{column}' not found in the DataFrame.")
        return

    values = df[column]

    # Missing-value tally
    missing_total = values.isnull().sum()
    print(f"Number of None/NaN values in '{column}': {missing_total}")

    # Frequency table; dropna=False keeps None/NaN as a row of its own
    frequency_table = values.value_counts(dropna=False)
    print(f"Unique values in '{column}' (including None/NaN):")
    print(frequency_table)

    # Distinct-value count, again treating None/NaN as a value
    distinct_total = values.nunique(dropna=False)
    print(f"\nTotal number of unique values (including None/NaN) in '{column}': {distinct_total}\n")

In [18]:
check_column_data(df, 'hotel_name')

Number of None/NaN values in 'hotel_name': 0
Unique values in 'hotel_name' (including None/NaN):
hotel_name
Romance Istanbul Hotel                                                         40
Motto by Hilton New York City Chelsea                                          40
The Bryant Park Hotel                                                          40
Pod Times Square                                                               40
Tempo by Hilton New York Times Square                                          40
Pod 51 Hotel                                                                   40
Hotel Riu Plaza New York Times Square                                          40
Park Central Hotel New York                                                    40
Hyatt Grand Central New York                                                   40
Moxy NYC East Village                                                          40
The Knickerbocker                                                       

In [19]:
check_column_data(df, 'hotel_description')

Number of None/NaN values in 'hotel_description': 800
Unique values in 'hotel_description' (including None/NaN):
hotel_description
None                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [21]:
check_column_data(df, 'rate')

Number of None/NaN values in 'rate': 1130
Unique values in 'rate' (including None/NaN):
rate
5.0    3435
NaN    1130
4.0     622
1.0     308
3.0     288
2.0     214
Name: count, dtype: int64

Total number of unique values (including None/NaN) in 'rate': 6



In [24]:
check_column_data(df, 'price_range')

Number of None/NaN values in 'price_range': 0
Unique values in 'price_range' (including None/NaN):
price_range
$$ (Based on Average Nightly Rates for a Standard Room from our Partners)     3120
$ (Based on Average Nightly Rates for a Standard Room from our Partners)      2757
$$$ (Based on Average Nightly Rates for a Standard Room from our Partners)     120
Name: count, dtype: int64

Total number of unique values (including None/NaN) in 'price_range': 3



In [26]:
check_column_data(df, 'rating_value')

Number of None/NaN values in 'rating_value': 0
Unique values in 'rating_value' (including None/NaN):
rating_value
4.5    2677
5.0    1520
4.0    1520
3.5     200
3.0      80
Name: count, dtype: int64

Total number of unique values (including None/NaN) in 'rating_value': 5



In [27]:
check_column_data(df, 'street_address')

Number of None/NaN values in 'street_address': 0
Unique values in 'street_address' (including None/NaN):
street_address
Hudavendigar Cd. No:5 Sirkeci                                        40
113 West 24th Street                                                 40
40 West 40th Street                                                  40
400 W 42nd St                                                        40
1568 Broadway                                                        40
230 East 51st Street                                                 40
305 W 46th St                                                        40
870 7th Ave                                                          40
109 East 42nd Street At Grand Central Terminal                       40
112 East 11th Street                                                 40
6 Times Square SE Corner Of 42nd St                                  40
228 West 47th Street                                                 40
20 W 29th Street

In [28]:
check_column_data(df, 'locality')

Number of None/NaN values in 'locality': 0
Unique values in 'locality' (including None/NaN):
locality
Istanbul         1200
San Francisco    1200
London           1200
Paris            1200
New York City    1197
Name: count, dtype: int64

Total number of unique values (including None/NaN) in 'locality': 5



In [29]:
check_column_data(df, 'country')

Number of None/NaN values in 'country': 0
Unique values in 'country' (including None/NaN):
country
United States     2397
United Kingdom    1200
France            1200
Turkiye            720
Türkiye            480
Name: count, dtype: int64

Total number of unique values (including None/NaN) in 'country': 5



### Clean country column

In [30]:
# The raw data holds two spellings, 'Turkiye' and 'Türkiye'; fold the
# diacritic spelling into the plain one so both share a single label.
df['country'] = df['country'].replace('Türkiye', 'Turkiye')

In [38]:
# Rename every remaining 'Turkiye' entry to the label 'Turkey'.
df['country'] = df['country'].replace('Turkiye', 'Turkey')

In [39]:
check_column_data(df, 'country')

Number of None/NaN values in 'country': 0
Unique values in 'country' (including None/NaN):
country
United States     2397
Turkey            1200
United Kingdom    1200
France            1200
Name: count, dtype: int64

Total number of unique values (including None/NaN) in 'country': 4



In [35]:
check_column_data(df, 'tripdate')

Number of None/NaN values in 'tripdate': 0
Unique values in 'tripdate' (including None/NaN):
tripdate
 December 2023     1387
 January 2024       972
 November 2023      649
 October 2023       472
 September 2023     330
 August 2023        238
 July 2023          169
 June 2023          126
 May 2023           102
 January 2020        83
 February 2020       78
 April 2023          72
 July 2022           66
 May 2022            65
 March 2023          63
 October 2022        63
 September 2022      62
 June 2022           61
 February 2023       56
 December 2022       51
 August 2022         50
 January 2023        49
 November 2022       44
 December 2019       42
 August 2021         42
 March 2022          38
 January 2022        35
 March 2020          33
 November 2021       33
 April 2022          32
 September 2021      29
 October 2021        26
 August 2020         25
 July 2021           24
 June 2021           23
 September 2019      22
 December 2021       22
 February 

In [34]:
def count_unique_values(df):
    """Print how many distinct values each column of ``df`` holds.

    One line is written to stdout per column, in column order. Missing
    values (None/NaN) count as a value of their own because ``nunique`` is
    called with ``dropna=False``. Nothing is returned.
    """
    # Lazily build one report line per column, then emit them in order.
    report_lines = (
        f"Column '{name}' has {df[name].nunique(dropna=False)} unique values."
        for name in df.columns
    )
    for report_line in report_lines:
        print(report_line)

count_unique_values(df)

Column 'hotel_name' has 150 unique values.
Column 'hotel_description' has 131 unique values.
Column 'review_title' has 5314 unique values.
Column 'review_text' has 4868 unique values.
Column 'rate' has 6 unique values.
Column 'tripdate' has 76 unique values.
Column 'hotel_url' has 600 unique values.
Column 'hotel_image' has 150 unique values.
Column 'price_range' has 3 unique values.
Column 'rating_value' has 5 unique values.
Column 'review_count' has 147 unique values.
Column 'street_address' has 150 unique values.
Column 'locality' has 5 unique values.
Column 'country' has 4 unique values.


### Clean price range column

In [36]:
def convert_price_range(price_range):
    """Map a raw price-range string to a short label.

    Parameters
    ----------
    price_range : str
        Either one of the three raw strings ("$", "$$" or "$$$" followed by
        the "(Based on Average Nightly Rates ...)" suffix) or a label that
        this function has already produced.

    Returns
    -------
    str
        "Cheap", "Moderate" or "Expensive"; "Unknown" for any other input.

    Notes
    -----
    Already-converted labels are returned unchanged. The result is written
    back into the same column, so without this pass-through a second run of
    the cell would turn every value into "Unknown".
    """
    mapping = {
        "$ (Based on Average Nightly Rates for a Standard Room from our Partners)": "Cheap",
        "$$ (Based on Average Nightly Rates for a Standard Room from our Partners)": "Moderate",
        "$$$ (Based on Average Nightly Rates for a Standard Room from our Partners)": "Expensive"
    }
    # Keep labels from an earlier conversion intact so the function is idempotent.
    if price_range in mapping.values():
        return price_range
    return mapping.get(price_range, "Unknown")

df['price_range'] = df['price_range'].apply(convert_price_range)


In [44]:
check_column_data(df, 'price_range')

Number of None/NaN values in 'price_range': 0
Unique values in 'price_range' (including None/NaN):
price_range
Moderate     3120
Cheap        2757
Expensive     120
Name: count, dtype: int64

Total number of unique values (including None/NaN) in 'price_range': 3



In [45]:
# Columns carried into the text-oriented subset of the hotel review data.
columns_to_include = [
    'hotel_name',
    'hotel_description',
    'review_title',
    'review_text',
    'price_range',
    'street_address',
    'locality',
    'country'
]

# Take an explicit copy so that adding or changing columns on df_subset works
# on an independent frame instead of raising pandas' SettingWithCopyWarning.
df_subset = df[columns_to_include].copy()

In [51]:
# Work on an independent copy holding only the columns used in the export.
df_subset = df[['hotel_name', 'hotel_description', 'review_title', 'review_text', 'price_range',
                'street_address', 'locality', 'country', 'rate', 'tripdate', 'rating_value', 'review_count']].copy()

# Text written in place of a missing (None/NaN) field. df.info() reports
# missing hotel_description, review_text and rate values; formatting them
# directly would put the literal words "None"/"nan" into the documents.
MISSING_FIELD_TEXT = "Not available"


def _field_text(value):
    """Return ``value`` as stripped text, or the placeholder when it is missing."""
    if pd.isna(value):
        return MISSING_FIELD_TEXT
    # Stripping also removes the leading space that the tripdate values
    # appear to carry in the value_counts output.
    return str(value).strip()


def _review_document(row):
    """Build the multi-line text document for one review row."""
    return (
        f"Hotel Name: {_field_text(row['hotel_name'])}\n"
        f"Hotel Description: {_field_text(row['hotel_description'])}\n"
        f"Review Title: {_field_text(row['review_title'])}\n"
        f"Review Text: {_field_text(row['review_text'])}\n"
        f"Trip Date: {_field_text(row['tripdate'])}\n"
        f"Price: {_field_text(row['price_range'])}\n"
        f"User Rating: {_field_text(row['rate'])}\n"
        f"Average Rating: {_field_text(row['rating_value'])}\n"
        f"Total Review Count: {_field_text(row['review_count'])}\n"
        f"Address: {_field_text(row['street_address'])}, "
        f"{_field_text(row['locality'])}, {_field_text(row['country'])}\n"
    )


# Concatenate the columns into a single text column
df_subset['combined_text'] = df_subset.apply(_review_document, axis=1)

# Export this combined column to a text file, one blank-line-separated
# document per review.
with open('hotel_data_for_vector_db.txt', 'w', encoding='utf-8') as file:
    for text in df_subset['combined_text']:
        file.write(text + "\n\n")


In [53]:
# Path of the text export to preview
file_path = 'hotel_data_for_vector_db.txt'

# Number of leading lines to show; change this value to preview more or fewer
preview_line_count = 40

# Iterating over the file stops at end-of-file on its own, so a file shorter
# than preview_line_count no longer yields a run of empty prints the way
# repeated readline() calls did.
with open(file_path, 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file):
        if line_number >= preview_line_count:
            break
        # Each line already ends with its own newline; suppress print's extra
        # one so the preview is not double-spaced.
        print(line, end='')


Hotel Name: Romance Istanbul Hotel

Hotel Description: Romance Istanbul Hotel has 39 rooms.Every room is elegantly furnished and harmonizes the modern life style with the traditional Ottoman touch. Romance Istanbul sits at the intersection of the old city’s most important part. With its luxuriously inspiring design and landmark old city location, steeped in the history of its surroundings, Romance Istanbul Hotel welcomes you with exceptional designed rooms and world-renowned Turkish hospitality. Our colleagues deliver the most personal service. It is perfectly placed and perfectly designed to enhance all that Istanbul has to offer. Each room offers a private bathroom and shower. Each is equipped with a satellite TV and free wifi connection. The rooms size change between 20 m2 and 45 m2. It includes 7 suite rooms: 1 Royal Suite, 4 Grand Suite, 1 Romance Suite and 1 Premium Suite, 2 Luxury Room With Terrace, 22 Deluxe Room, 8 City Room.

Review Title: An exceptional boutique hotel, great

In [54]:
check_column_data(df, 'hotel_image')

Number of None/NaN values in 'hotel_image': 0
Unique values in 'hotel_image' (including None/NaN):
hotel_image
https://media-cdn.tripadvisor.com/media/photo-s/1c/02/78/ba/romance-istanbul-hotel.jpg                                40
https://media-cdn.tripadvisor.com/media/photo-s/2a/d3/d1/a6/exterior.jpg                                              40
https://media-cdn.tripadvisor.com/media/photo-s/28/8d/23/73/our-front-yard-in-spring.jpg                              40
https://media-cdn.tripadvisor.com/media/photo-s/24/36/15/1c/pod-times-square.jpg                                      40
https://media-cdn.tripadvisor.com/media/photo-s/2a/d3/da/de/exterior.jpg                                              40
https://media-cdn.tripadvisor.com/media/photo-s/11/16/f8/44/pod-51-hotel.jpg                                          40
https://media-cdn.tripadvisor.com/media/photo-s/29/6e/7f/99/zny-usp.jpg                                               40
https://media-cdn.tripadvisor.com/media/ph