In [18]:
import pandas as pd
from tqdm import tqdm
import os

In [35]:
# Input: relative path of the CSV that gets loaded into `df`.
TUI_DATA_PATH = "../../data/TUI Musement Backup Accommodation Data.csv"

# Output: folder and file name are declared separately and joined into the
# final parquet path; the bare expression at the end displays that path.
SAVE_FOLDER = "../poi_desc_emb"
FILE_NAME = "tui-musement-backup-accommodation-embedded.parquet"
SAVE_PATH = os.path.join(SAVE_FOLDER, FILE_NAME)
SAVE_PATH

'../poi_desc_emb/tui-musement-backup-accommodation-embedded.parquet'

## Load Data

In [20]:
# Read the source CSV (path set in the configuration cell) into a DataFrame.
df = pd.read_csv(filepath_or_buffer=TUI_DATA_PATH)

In [21]:
import re

def remove_html_tags(text):
    """Strip HTML/XML-style tags from ``text`` and return what is left.

    Everything from a ``<`` up to the nearest following ``>`` is deleted
    (non-greedy match). Nothing is inserted in place of a tag, so the text on
    either side of a removed tag ends up directly adjacent.

    Args:
        text: Markup string to clean. ``None`` and float NaN (the value pandas
            uses for a missing cell) are treated as missing text.

    Returns:
        str: ``text`` without tags, or ``''`` when ``text`` is missing.
    """
    # re.sub raises TypeError on non-strings; map missing values to empty text
    # instead (NaN is the only float that is not equal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # DOTALL lets '.' cross line breaks, so a tag whose attributes are wrapped
    # over several lines is still removed.
    clean = re.compile('<.*?>', re.DOTALL)
    return re.sub(clean, '', text)

# Example usage: nested tags are removed and only the text between them is printed
html_text = "<p>This is a <b>bold</b> paragraph.</p>"
clean_text = remove_html_tags(html_text)
print(clean_text)


This is a bold paragraph.


In [22]:
# Blank out missing descriptions (NaN) before stripping tags so that every row
# ends up holding a string; the tokenizer further down only accepts text.
df['Description'] = df['Description'].fillna('').apply(remove_html_tags)

In [23]:
# Rename the "Id" column to "Expedia_Id", then preview the first five rows.
df = df.rename({"Id": "Expedia_Id"}, axis="columns")

df.head(5)

Unnamed: 0,Expedia_Id,Name,Type,Brand,Categories,Tags,Contexts,Rating,Reviews,Lat,Lng,Address,Address Object,Timezone,Modes,Price/day,Description,Small Image,Medium Image,Images
0,1039,Riu Palace Paradise Island - Adults Only - All...,lodging,RIU Resorts,,"Near Ocean,Spa,Swimming,Coffee,Tea/Coffee,Room...",,8.0,2478,25.083853,-77.31633,"6307 Casino Drive,Paradise Island,New Providen...","{""street1"":""6307 Casino Drive"",""city"":""Paradis...",,TUI_MUSEMENT,532.21,Adults-only beach propertyCatch some rays at t...,https://images.trvl-media.com/lodging/1000000/...,https://images.trvl-media.com/lodging/1000000/...,https://images.trvl-media.com/lodging/1000000/...
1,6919,Hotel Riu Plaza The Gresham Dublin,lodging,RIU Plaza,,"Near Ocean,Tea/Coffee,Room Service,Chinese (Ma...",,8.8,1384,53.351585,-6.260934,"23 Upper O Connell Street,Dublin,Dublin,Ireland","{""street1"":""23 Upper O Connell Street"",""city"":...",,TUI_MUSEMENT,198.9,"Upscale eco-certified hotel, walk to O'Connell...",https://images.trvl-media.com/lodging/1000000/...,https://images.trvl-media.com/lodging/1000000/...,https://images.trvl-media.com/lodging/1000000/...
2,24625,Hotel Riu Plaza Fisherman's Wharf,lodging,RIU Plaza,,"Near Ocean,Near Mountain,Coffee,Tea/Coffee,Air...",,8.8,6325,37.806866,-122.41378,"2500 Mason St,San Francisco,California,United ...","{""street1"":""2500 Mason St"",""city"":""San Francis...",,TUI_MUSEMENT,139.02,Eco-certified San Francisco hotel in Fisherman...,https://images.trvl-media.com/lodging/1000000/...,https://images.trvl-media.com/lodging/1000000/...,https://images.trvl-media.com/lodging/1000000/...
3,28648,Riu Palace Antillas - Adults Only - All Inclusive,lodging,RIU Resorts,,"Near Ocean,Spa,Coffee,Tea/Coffee,Room Service,...",,7.8,1220,12.56542,-70.04856,"J E Irausquin Boulevard 77,Noord,Noord,Aruba","{""street1"":""J E Irausquin Boulevard 77"",""city""...",,TUI_MUSEMENT,775.61,Adults-only beach propertyCatch some rays at t...,https://images.trvl-media.com/lodging/1000000/...,https://images.trvl-media.com/lodging/1000000/...,https://images.trvl-media.com/lodging/1000000/...
4,54490,Grupotel Mayorazgo,lodging,Grupotel,,Near Mountain,,8.2,583,40.42214,-3.7095,"Flor Baja 3,Madrid,Madrid,Spain","{""street1"":""Flor Baja 3"",""city"":""Madrid"",""stat...",,TUI_MUSEMENT,134.49,"Upscale hotel, walk to Gran ViaA restaurant, a...",https://images.trvl-media.com/lodging/1000000/...,https://images.trvl-media.com/lodging/1000000/...,https://images.trvl-media.com/lodging/1000000/...


## Encode (MiniLM)

In [24]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from torch import Tensor
import torch
from tqdm import tqdm
import torch.nn.functional as F


In [25]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dim 1, weighting each token by its mask value.

    Args:
        model_output: Indexable whose first element holds all token embeddings.
            Assumed to be (batch, tokens, hidden) -- the reduction over dim 1
            relies on this; confirm against the model in use.
        attention_mask: Tensor with one value per token (presumably 1 for real
            tokens and 0 for padding); it is broadcast over the last dimension
            of the embeddings.

    Returns:
        torch.Tensor: Masked sum of the embeddings divided by the mask total.
        The divisor is clamped to at least 1e-9, so an all-zero mask yields
        zeros rather than a division by zero.
    """
    token_embeddings = model_output[0]
    # Stretch the mask to the embeddings' shape so it can zero out masked tokens.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    weighted_sum = (token_embeddings * mask).sum(dim=1)
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / mask_total


In [26]:
# The same checkpoint id is used for both the tokenizer and the model.
MODEL_CHECKPOINT = 'sentence-transformers/all-MiniLM-L12-v2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModel.from_pretrained(MODEL_CHECKPOINT)
# Display the device the loaded model lives on.
model.device

device(type='cpu')

In [27]:
def model_encode(model, tokenizer, text):
    """Embed ``text`` as mean-pooled, L2-normalised vectors.

    Args:
        model: Model that is called with the tokenizer output as keyword
            arguments; its ``device`` attribute decides where the inputs are
            placed.
        tokenizer: Callable producing PyTorch tensors (``return_tensors='pt'``)
            with padding and truncation enabled; its output must contain an
            ``'attention_mask'`` entry.
        text: Input passed straight to ``tokenizer`` (presumably a string or a
            list of strings -- confirm against the tokenizer in use).

    Returns:
        torch.Tensor: Pooled embeddings, normalised to unit L2 norm along
        dim 1.
    """
    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # The tokenizer builds its tensors on the CPU; move them next to the model
    # weights so the forward pass also works once the model is on an accelerator.
    encoded_input = {
        name: tensor.to(model.device) for name, tensor in encoded_input.items()
    }
    # Inference only: skip gradient tracking for the forward pass.
    with torch.no_grad():
        model_output = model(**encoded_input)
    # Perform pooling
    text_emb = mean_pooling(model_output, encoded_input['attention_mask'])
    text_emb = F.normalize(text_emb, p=2, dim=1)
    return text_emb

In [28]:
def encode_input_text(model, tokenizer, input_texts, batch_size=128):
    """Embed ``input_texts`` in consecutive batches and stack the results.

    Args:
        model: Forwarded unchanged to ``model_encode``.
        tokenizer: Forwarded unchanged to ``model_encode``.
        input_texts: Sliceable sequence of texts with a length.
        batch_size: Maximum number of texts handed to ``model_encode`` per call.

    Returns:
        torch.Tensor: The per-batch embeddings concatenated along dim 0, in
        input order.
    """
    total = len(input_texts)
    with torch.no_grad():
        # Slicing past the end of the sequence is clamped automatically, so the
        # last batch simply comes out shorter. tqdm shows one tick per batch.
        batch_embeddings = [
            model_encode(model, tokenizer, input_texts[start:start + batch_size])
            for start in tqdm(range(0, total, batch_size))
        ]
    return torch.cat(batch_embeddings, dim=0)

In [29]:
# Plain Python list of the cleaned description strings, in row order.
descriptions = df['Description'].tolist()

In [30]:
# Embed every description in batches of 256 and convert the resulting tensor
# into nested Python lists (one list of floats per description).
desc_emb = encode_input_text(
    model=model,
    tokenizer=tokenizer,
    input_texts=descriptions,
    batch_size=256,
).tolist()

100%|██████████| 1/1 [00:09<00:00,  9.99s/it]


In [32]:
# Store each row's embedding (a list of floats) in a column named after the model.
df = df.assign(**{'all-MiniLM-L12-v2': desc_emb})

## Save Results

In [36]:
# Make sure the output folder exists: the parquet writer does not create
# missing parent directories, and exist_ok keeps re-runs from failing.
os.makedirs(SAVE_FOLDER, exist_ok=True)
# index=False leaves the DataFrame index out of the written file.
df.to_parquet(SAVE_PATH, index=False)

In [37]:
# Peek at the first ten components of the embedding stored for index label 0.
df.loc[0, 'all-MiniLM-L12-v2'][:10]

[0.055443741381168365,
 -0.028733868151903152,
 0.12436169385910034,
 0.06000158190727234,
 0.0953599214553833,
 0.0028567889239639044,
 0.053968723863363266,
 -0.03289089351892471,
 0.011583748273551464,
 0.0510227233171463]