## Data Preprocessing

In [None]:
# !pip install pyspark

In [None]:
import pandas as pd
from datasets import load_dataset

from PIL import Image
from io import BytesIO
import requests
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer, GPT2TokenizerFast
import torch

In [None]:
# Load the hotel dataset through `datasets.load_dataset`; the result is
# indexed by split name in the next cell.
dataset = load_dataset("traversaal-ai-hackathon/hotel_datasets")

In [None]:
# Materialize the 'train' split as a pandas DataFrame; every cleaning step
# below reads from and writes to this `df`.
df = pd.DataFrame(dataset['train'])

### Clean 'price_range' Column by converting to description

In [None]:
def convert_price_range(price_range):
    """Translate a raw price-range string into a short descriptive label.

    Args:
        price_range: The raw cell value, expected to be one of the
            "$ (...)", "$$ (...)" or "$$$ (...)" strings.

    Returns:
        "Cheap", "Moderate" or "Expensive" for the three known strings,
        and "Unknown" for anything else (including missing values).
    """
    # All known values share the same explanatory suffix; only the number of
    # leading dollar signs differs.
    suffix = " (Based on Average Nightly Rates for a Standard Room from our Partners)"
    labels_by_symbol = (("$", "Cheap"), ("$$", "Moderate"), ("$$$", "Expensive"))
    lookup = {symbol + suffix: label for symbol, label in labels_by_symbol}

    try:
        return lookup[price_range]
    except KeyError:
        return "Unknown"

# Overwrite the raw price strings with their labels. Re-running this cell on
# the already-converted column maps every value to "Unknown", because the
# labels are not keys of the mapping.
df['price_range'] = df['price_range'].apply(convert_price_range)

### Clean 'country' Column by normalizing Turkiye/Türkiye to Turkey

In [None]:
# Both spellings ('Turkiye' and 'Türkiye') are collapsed to 'Turkey'; all
# other country values are left unchanged.
df['country'] = df['country'].replace(['Turkiye', 'Türkiye'], 'Turkey')

## Convert 'hotel_image' URLs to JPEG Bytes

In [None]:
#df['hotel_image'].to_csv('hotel_images.csv')

In [None]:
def convert_image(image_url, timeout=10):
    """Download an image and re-encode it as JPEG bytes.

    The image is not resized; it is only decoded and saved again as JPEG so
    that downstream code always receives the same format.

    Args:
        image_url: URL of the image to fetch. Missing values (None, NaN) and
            blank strings are treated as "no image".
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The JPEG-encoded image as ``bytes``, or ``None`` when the URL is
        missing or the image could not be downloaded or decoded.
    """
    # Missing cells arrive as None/NaN from pandas; skip them without a request.
    if not isinstance(image_url, str) or not image_url.strip():
        print(f"Error processing image {image_url}: missing or invalid URL")
        return None

    try:
        # Without a timeout a single stalled server would hang the whole apply().
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()

        output_buffer = BytesIO()
        with Image.open(BytesIO(response.content)) as img:
            # JPEG cannot store alpha or palette modes, so normalize to RGB first.
            img.convert("RGB").save(output_buffer, format="JPEG")

        return output_buffer.getvalue()
    except Exception as e:
        # Best effort: report the failure and let the caller see a missing value.
        print(f"Error processing image {image_url}: {e}")
        return None
    
# Checkpoint identifier shared by the model, tokenizer and image processor.
pretrained_model = "nlpconnect/vit-gpt2-image-captioning"

# Load the three pieces of the captioning pipeline from the same checkpoint.
image_processor = ViTImageProcessor.from_pretrained(pretrained_model)
tokenizer = GPT2TokenizerFast.from_pretrained(pretrained_model)
model = VisionEncoderDecoderModel.from_pretrained(pretrained_model)

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# Generation settings forwarded to model.generate() by predict_step.
max_length = 16
num_beams = 4
gen_kwargs = dict(max_length=max_length, num_beams=num_beams)

def predict_step(image_file):
    """Generate a caption for one encoded image.

    Args:
        image_file: Encoded image bytes (for example the output of
            ``convert_image``). ``None`` or empty bytes mean "no image".

    Returns:
        The string form of the list of decoded captions (``str(preds)``), or
        ``None`` when there is no image or captioning failed.
    """
    # Nothing to caption when the download step produced no bytes.
    if not image_file:
        return None

    try:
        with Image.open(BytesIO(image_file)) as img:
            # Force three channels; grayscale or CMYK input would otherwise
            # reach the image processor in a different mode.
            rgb_img = img.convert("RGB")

        # process the image into a tensor
        pixel_values = image_processor(images=rgb_img, return_tensors="pt").pixel_values
        pixel_values = pixel_values.to(device)

        output_ids = model.generate(pixel_values, **gen_kwargs)

        preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        preds = [pred.strip() for pred in preds]
    except Exception as e:
        # Do not reference the image here: it may never have been opened.
        print(f"Error processing image: {e}")
        return None

    return str(preds)

In [None]:
# Draw a single random row to try the image pipeline on. No random_state is
# passed, so a different row is picked on every run.
test_df = df.sample(1)
test_df

In [None]:
# Step 1: apply convert_image to the sampled row's 'hotel_image' value and
# store the result in 'converted_urls'.
test_df['converted_urls'] = test_df['hotel_image'].apply(convert_image)
#test_df
# Step 2: caption the result of step 1 with predict_step.
test_df['img_preds'] = test_df['converted_urls'].apply(predict_step)
# check results
print(test_df['img_preds'])

In [None]:
# Decode the stored bytes of the sampled row back into a PIL image; as the
# cell's last expression it is rendered as the cell output.
Image.open(BytesIO(test_df['converted_urls'].iloc[0]))

In [None]:
# Try another image: draw a fresh random row from df. This rebinds test_df to
# a new frame, so any columns added to the previous test_df are not present.
test_df = df.sample(1)

In [None]:
# test_df is a freshly sampled row at this point, so the image bytes and the
# caption must be recomputed before they can be printed.
test_df['converted_urls'] = test_df['hotel_image'].apply(convert_image)
test_df['img_preds'] = test_df['converted_urls'].apply(predict_step)
# check results
print(test_df['img_preds'])

In [None]:
# Render the image of the current test_df row. Requires the 'converted_urls'
# column to have been computed for this sample.
Image.open(BytesIO(test_df['converted_urls'].iloc[0]))

In [None]:
# import pandas as pd
# from pyspark.sql import Row
# from pyspark.sql.functions import pandas_udf

In [None]:
# img_df = spark.createDataFrame([
#     Row("image_url":df['hotel_image'][:5])
# ])

# img_df.show()

In [None]:
# @pandas_udf('long')
# def pandas_convert_image(series: pd.Series) -> pd.Series:
#     return convert_image(series)

### Make Cleaned Dataset Text File for RAG

In [None]:
def _format_hotel_record(row):
    """Render one review row as the labelled, newline-terminated text block."""
    address = f"{row['street_address']}, {row['locality']}, {row['country']}"
    labelled_lines = [
        f"Hotel Name: {row['hotel_name']}",
        f"Hotel Description: {row['hotel_description']}",
        f"Review Title: {row['review_title']}",
        f"Review Text: {row['review_text']}",
        f"Trip Date: {row['tripdate']}",
        f"Price: {row['price_range']}",
        f"User Rating: {row['rate']}",
        f"Average Rating: {row['rating_value']}",
        f"Total Review Count: {row['review_count']}",
        f"Address: {address}",
    ]
    return "\n".join(labelled_lines) + "\n"


# Keep only the columns that feed the text export; copy so df stays untouched.
df_subset = df[[
    'hotel_name', 'hotel_description', 'review_title', 'review_text',
    'price_range', 'street_address', 'locality', 'country', 'rate',
    'tripdate', 'rating_value', 'review_count',
]].copy()

# One labelled text block per row.
df_subset['combined_text'] = df_subset.apply(_format_hotel_record, axis=1)

# Write every block to the text file, each followed by two extra newlines.
with open('hotel_data_for_vector_db.txt', 'w', encoding='utf-8') as file:
    file.writelines(text + "\n\n" for text in df_subset['combined_text'])

In [10]:
file_path = 'hotel_data_for_vector_db.txt'

# Preview the start of the exported file to check its layout.
with open(file_path, 'r', encoding='utf-8') as file:
    for _ in range(40):  # Adjust 40 to change how many lines are read
        line = file.readline()
        # Each line keeps its trailing newline and print() adds another, so
        # the preview shows a blank line after every line of the file.
        print(line)


Hotel Name: Romance Istanbul Hotel

Hotel Description: Romance Istanbul Hotel has 39 rooms.Every room is elegantly furnished and harmonizes the modern life style with the traditional Ottoman touch. Romance Istanbul sits at the intersection of the old city’s most important part. With its luxuriously inspiring design and landmark old city location, steeped in the history of its surroundings, Romance Istanbul Hotel welcomes you with exceptional designed rooms and world-renowned Turkish hospitality. Our colleagues deliver the most personal service. It is perfectly placed and perfectly designed to enhance all that Istanbul has to offer. Each room offers a private bathroom and shower. Each is equipped with a satellite TV and free wifi connection. The rooms size change between 20 m2 and 45 m2. It includes 7 suite rooms: 1 Royal Suite, 4 Grand Suite, 1 Romance Suite and 1 Premium Suite, 2 Luxury Room With Terrace, 22 Deluxe Room, 8 City Room.

Review Title: An exceptional boutique hotel, great