In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

hotel_id_to_combat_human_trafficking_2022_fgvc9_path = kagglehub.competition_download('hotel-id-to-combat-human-trafficking-2022-fgvc9')

print('Data source import complete.')


In [3]:
!mkdir hotels-50k
!wget -P hotels-50k https://github.com/GWUvision/Hotels-50K/raw/master/input/dataset.tar.gz
!tar -xvzf hotels-50k/dataset.tar.gz -C hotels-50k
!rm hotels-50k/dataset.tar.gz


--2025-02-26 23:46:58--  https://github.com/GWUvision/Hotels-50K/raw/master/input/dataset.tar.gz
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/GWUvision/Hotels-50K/master/input/dataset.tar.gz [following]
--2025-02-26 23:46:58--  https://raw.githubusercontent.com/GWUvision/Hotels-50K/master/input/dataset.tar.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14313153 (14M) [application/octet-stream]
Saving to: ‘hotels-50k/dataset.tar.gz’


2025-02-26 23:46:58 (113 MB/s) - ‘hotels-50k/dataset.tar.gz’ saved [14313153/14313153]

._dataset
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.x

# Load data info

In [4]:
import pandas as pd
import tqdm

In [5]:
chain_df = pd.read_csv("./hotels-50k/dataset/chain_info.csv")
display(chain_df.head())

Unnamed: 0,chain_id,chain_name
0,-1,unknown
1,0,Best Western
2,1,Hyatt
3,2,Marriott
4,3,Hilton


In [6]:
hotel_df = pd.read_csv("./hotels-50k/dataset/hotel_info.csv")
display(hotel_df.head())

Unnamed: 0,hotel_id,hotel_name,chain_id,latitude,longitude
0,391,Extended Stay America - Fairbanks - Old Airpor...,72,64.83538,-147.8233
1,392,Hilton Hangzhou Qiandao Lake Resort,3,29.60819,119.0729
2,393,Taj Lands End,-1,19.04391,72.81879
3,395,Cambridge Suites Hotel Sydney,-1,46.13663,-60.19551
4,396,Tamanu Beach,14,-18.84213,-159.78794


In [7]:
train_df = pd.read_csv('./hotels-50k/dataset/train_set.csv', header=None,
                       names=['image_id', 'hotel_id', 'url', 'source', 'timestamp'])

display(train_df.head())

Unnamed: 0,image_id,hotel_id,url,source,timestamp
0,3485,18187,https://image-submissions.s3.us-west-2.amazona...,traffickcam,9/9/15 17:23
1,3486,18187,https://image-submissions.s3.us-west-2.amazona...,traffickcam,9/9/15 17:23
2,3663,73224,https://image-submissions.s3.us-west-2.amazona...,traffickcam,9/17/15 19:33
3,2586939,86350,https://image-submissions.s3.us-west-2.amazona...,traffickcam,1/25/16 19:12
4,2586950,1533,https://image-submissions.s3.us-west-2.amazona...,traffickcam,1/25/16 17:23


# Check Hotels-50k data

In [8]:
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [9]:
data_df = train_df.merge(hotel_df, on="hotel_id").merge(chain_df, on="chain_id")
data_df["image_id"] = data_df["image_id"].astype(str)
data_df["hotel_id"] = data_df["hotel_id"].astype(str)
data_df["chain_id"] = data_df["chain_id"].astype(str)

display(data_df.head())

Unnamed: 0,image_id,hotel_id,url,source,timestamp,hotel_name,chain_id,latitude,longitude,chain_name
0,3485,18187,https://image-submissions.s3.us-west-2.amazona...,traffickcam,9/9/15 17:23,Hilton Minneapolis,3,44.97338,-93.27331,Hilton
1,3486,18187,https://image-submissions.s3.us-west-2.amazona...,traffickcam,9/9/15 17:23,Hilton Minneapolis,3,44.97338,-93.27331,Hilton
2,3663,73224,https://image-submissions.s3.us-west-2.amazona...,traffickcam,9/17/15 19:33,Philadelphia Marriott Downtown,2,39.95204,-75.16071,Marriott
3,2586939,86350,https://image-submissions.s3.us-west-2.amazona...,traffickcam,1/25/16 19:12,Four Points by Sheraton Tucson Airport,76,32.12362,-110.93472,Sheraton
4,2586950,1533,https://image-submissions.s3.us-west-2.amazona...,traffickcam,1/25/16 17:23,The Lexington New York City Autograph Collection,-1,40.75508,-73.97338,unknown


### Dataset size

In [10]:
print("Image count:", len(data_df))
print("Hotel count:", len(data_df["hotel_id"].unique()))
print("Chain count:", len(data_df["chain_id"].unique()))

Image count: 1124215
Hotel count: 50000
Chain count: 93


### Hotel and image count per chain

In [11]:
chain_group_df = data_df.groupby(["chain_name"]).agg({"hotel_id": [pd.Series.nunique], "image_id" : [pd.Series.nunique]})
chain_group_df.columns = ["_".join(x) for x in chain_group_df.columns.ravel()]
chain_group_df = chain_group_df.reset_index().sort_values("hotel_id_nunique")[::-1]

In [12]:
fig = px.scatter(chain_group_df, x="chain_name", y="hotel_id_nunique",
                 size="image_id_nunique", color = "image_id_nunique",
                 hover_name = None,
                 log_y=True, size_max=75)

fig.update_yaxes(title_text="Hotel count")
fig.update_xaxes(title_text="Chain ID")
fig.update_layout(title="Hotel and image count per chain", coloraxis=dict(colorbar=dict(title="Image count")))
fig.update_traces(hovertemplate="Chain: %{x} <br>Hotel count: %{y:%d}<br>Image count: %{marker.size:%d}")
fig.show()

### Image count per hotel

In [13]:
group_df = data_df.groupby(["hotel_id"]).size().to_frame("image_count").sort_values("image_count")[::-1].reset_index()

In [14]:
fig = px.histogram(group_df, x="image_count", nbins=100, marginal="box", height=500)
fig.update_layout(title="Distribution of image count per hotel")
fig.update_traces(hovertemplate="Image count: %{x} <br>Hotel count: %{y:%d}")
fig.update_yaxes(title_text="Hotel count")
fig.update_xaxes(title_text="Image count")
fig.show()

### Image count per source
Images come from two different sources: travel_website and traffickcam

In [15]:
group_df = data_df.groupby(["source"]).size().to_frame("image_count").sort_values("image_count")[::-1].reset_index()

fig = px.bar(group_df, x="source", y="image_count", height=500)
fig.update_layout(title="Image count per source")
fig.update_traces(hovertemplate="Source: %{x:%d} <br>Image count: %{y:%d}")
fig.update_yaxes(title_text="Image count")
fig.update_xaxes(title_text="Source")
fig.show()

# Sample 50 hotels with more than 10 and less than 100 images

In [16]:
hotel_group_df = data_df.groupby(by=["hotel_id"])["image_id"].count().to_frame("image_count")

In [17]:
import os
import pandas as pd

# Load the chain information
chain_df = pd.read_csv("./hotels-50k/dataset/chain_info.csv")
display(chain_df.head())

# Create test_df with sorted image_id list and an empty hotel_id column
image_list = os.listdir("./hotels-50k/dataset/test_set")  # Ensure this is a valid directory path
test_df = pd.DataFrame({"image_id": sorted(image_list), "hotel_id": [""] * len(image_list)})

display(test_df.head())


Unnamed: 0,chain_id,chain_name
0,-1,unknown
1,0,Best Western
2,1,Hyatt
3,2,Marriott
4,3,Hilton


FileNotFoundError: [Errno 2] No such file or directory: './hotels-50k/dataset/test_set'

In [18]:
sample_hotels = hotel_group_df[(hotel_group_df["image_count"] > 10) & (hotel_group_df["image_count"] < 100)]
print("Number of hotels with more than 10 images and less than 100:", len(sample_hotels))
sample_hotels = sample_hotels.sample(50, random_state=42)

Number of hotels with more than 10 images and less than 100: 42629


In [19]:
sample_df = data_df[data_df["hotel_id"].isin(sample_hotels.index)].reset_index(drop=True)
print("Sampled images:", len(sample_df))

Sampled images: 1218


In [20]:
chain_group_df = sample_df.groupby(["chain_name"]).agg({"hotel_id": [pd.Series.nunique], "image_id" : [pd.Series.nunique]})
chain_group_df.columns = ["_".join(x) for x in chain_group_df.columns.ravel()]
chain_group_df = chain_group_df.reset_index().sort_values("hotel_id_nunique")[::-1]

fig = px.scatter(chain_group_df, x="chain_name", y="hotel_id_nunique",
                 size="image_id_nunique", color = "image_id_nunique",
                 hover_name = None,
                 size_max=75)

fig.update_yaxes(title_text="Hotel count")
fig.update_xaxes(title_text="Chain ID")
fig.update_layout(title="Sampled data <br>Hotel and image count per chain", coloraxis=dict(colorbar=dict(title="Image count")))
fig.update_traces(hovertemplate="Chain: %{x} <br>Hotel count: %{y:%d}<br>Image count: %{marker.size:%d}")
fig.show()

# Download sampled images

## Prepare to download images
The SSL certificate of the image urls is expired so we have to handle it.

In [21]:
from __future__ import print_function
import csv, multiprocessing, cv2, os
import numpy as np
import urllib
import urllib.request

import ssl

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

In [22]:
output_folder = "hotels-50k/images"
output_image_folder  = output_folder + "/train"

os.makedirs(output_image_folder)

## Download images

We will download the images without padding or resizing and we will keep the original folder structure: hotels-50k/images/train/chain_id/hotel_id/source/image_id.jpeg

In [23]:
def url_to_image(url):
    resp = urllib.request.urlopen(url, context=ctx)
    image = np.asarray(bytearray(resp.read()), dtype="uint8")
    image = cv2.imdecode(image, cv2.IMREAD_UNCHANGED)
    return image


def download_images(imList):
    # 2d list, rows are samples
    # columns: "chain_id", "hotel_id", "source", "image_id", "url"
    for im in imList:
        try:
            saveDir = os.path.join(output_image_folder, im[0], im[1], im[2])
            if not os.path.exists(saveDir):
                os.makedirs(saveDir)

            savePath = os.path.join(saveDir, str(im[3])+'.'+im[4].split('.')[-1])

            if not os.path.isfile(savePath):
                img = url_to_image(im[4])
                cv2.imwrite(savePath,img)
            else:
                print('Already exists: ' + savePath)
        except Exception as e:
            print(e, ': ' + im[4])

In [24]:
%%time

image_data_array = sample_df[["chain_id", "hotel_id", "source", "image_id", "url"]].values

pool = multiprocessing.Pool()
NUM_THREADS = multiprocessing.cpu_count()
for cpu in range(NUM_THREADS):
    pool.apply_async(download_images,[image_data_array[cpu::NUM_THREADS]])

pool.close()
pool.join()

[Errno 17] File exists: 'hotels-50k/images/train/91/30979/traffickcam' : https://image-submissions.s3.us-west-2.amazonaws.com/images/2017/7/20160626_052731_3APKHK.jpg
HTTP Error 404: Not Found : https://i.travelapi.com/hotels/3000000/2370000/2363100/2363074/0f0fa8a9_b.jpg
HTTP Error 404: Not Found : https://i.travelapi.com/hotels/3000000/2370000/2363100/2363074/1c95430c_b.jpg
HTTP Error 404: Not Found : https://i.travelapi.com/hotels/3000000/2370000/2363100/2363074/484454e9_b.jpg
HTTP Error 404: Not Found : https://i.travelapi.com/hotels/3000000/2370000/2363100/2363074/ca653a42_b.jpg
HTTP Error 404: Not Found : https://i.travelapi.com/hotels/3000000/2370000/2363100/2363074/f91e251a_b.jpg
HTTP Error 404: Not Found : https://i.travelapi.com/hotels/3000000/2370000/2363100/2363074/2287f936_b.jpg
HTTP Error 404: Not Found : https://i.travelapi.com/hotels/3000000/2370000/2363100/2363074/fb43fd41_b.jpg
HTTP Error 404: Not Found : https://i.travelapi.com/hotels/3000000/2370000/2363100/2363074/

Not every image is available, lets check how many images were successfully downloaded

In [25]:
!find {output_image_folder} -type f | wc -l

861


## Check downloaded data

In [26]:
# update the sample data frame with path, image name and whether it was downloaded
sample_df["downloaded"] = False
sample_df["image_name"] = None
sample_df["image_folder"] = None

for index, row in sample_df.iterrows():
    image_folder = os.path.join(output_image_folder, row["chain_id"], row["hotel_id"], row["source"])
    image_name   = row["image_id"] + '.'+ row["url"].split('.')[-1]
    image_path   = os.path.join(image_folder, image_name)
    if os.path.exists(image_path):
        sample_df.loc[index, "downloaded"] = True
        sample_df.loc[index, "image_name"] = image_name
        sample_df.loc[index, "image_folder"] = image_folder

In [27]:
display(sample_df.head())

Unnamed: 0,image_id,hotel_id,url,source,timestamp,hotel_name,chain_id,latitude,longitude,chain_name,downloaded,image_name,image_folder
0,2595674,30979,https://image-submissions.s3.us-west-2.amazona...,traffickcam,6/26/16 5:27,Hampton Inn & Suites Phoenix/Gilbert,91,33.29222,-111.75175,Hampton,True,2595674.jpg,hotels-50k/images/train/91/30979/traffickcam
1,2595676,30979,https://image-submissions.s3.us-west-2.amazona...,traffickcam,6/26/16 5:27,Hampton Inn & Suites Phoenix/Gilbert,91,33.29222,-111.75175,Hampton,False,,
2,2601652,30090,https://image-submissions.s3.us-west-2.amazona...,traffickcam,6/29/16 2:18,Courtyard by Marriott Santa Ana Orange County,90,33.69921,-117.86625,Courtyard by Marriott,True,2601652.jpg,hotels-50k/images/train/90/30090/traffickcam
3,2623736,6992,https://image-submissions.s3.us-west-2.amazona...,traffickcam,7/19/16 14:27,Hilton Garden Inn New Braunfels,87,29.70936,-98.09014,Hilton Garden Inn,True,2623736.jpg,hotels-50k/images/train/87/6992/traffickcam
4,2623738,6992,https://image-submissions.s3.us-west-2.amazona...,traffickcam,7/19/16 14:27,Hilton Garden Inn New Braunfels,87,29.70936,-98.09014,Hilton Garden Inn,True,2623738.jpg,hotels-50k/images/train/87/6992/traffickcam


In [28]:
# number of downloaded images should be the same as number of images in the output_image_folder
print("Number of downloaded images:", sample_df["downloaded"].sum())

Number of downloaded images: 861


In [29]:
# save sample df to csv
sample_df.to_csv("hotels-50k/sample.csv", index=False)

In [30]:
!zip -r -qq hotels-50K-sample.zip hotels-50k
!rm -rf hotels-50k

In [31]:
# 5 images equate to \
"""
Establish a performance benchmark
Extract full list of hotel IDs from dataset
For each test image, randomly assign 5 hotel IDs as predictions
Submit random assignment predictions to Kaggle to be evaluated against the leaderboard.
Random model will likely have an extremely low MAP@5 score

1.4 million images across thousands of hotels.
Hotel images labeled with hotel ids.
Unlabeled images to test dataset.
Significant variation in hotel room designs.
Class imbalance as some hotels have more images than others.
Need for data augmentation and better feature extraction.
Expect MAP@5 score expected to be low
This serves more as a lower-bound benchmark for model improvement.

"""



'\nEstablish a performance benchmark\nExtract full list of hotel IDs from dataset\nFor each test image, randomly assign 5 hotel IDs as predictions\nSubmit random assignment predictions to Kaggle to be evaluated against the leaderboard.\nRandom model will likely have an extremely low MAP@5 score\n\n1.4 million images across thousands of hotels. \nHotel images labeled with hotel ids.\nUnlabeled images to test dataset.\nSignificant variation in hotel room designs.\nClass imbalance as some hotels have more images than others.\nNeed for data augmentation and better feature extraction.\nExpect MAP@5 score expected to be low\nThis serves more as a lower-bound benchmark for model improvement.\n\n'

In [32]:
import random

In [33]:
all_hotel_ids = set(train_df['hotel_id'].to_list())
all_hotel_ids = sorted(list(all_hotel_ids))

In [34]:
image_urls_predictions = {}
for i, image in enumerate(train_df['url']):
    # get the first 20 images
    images_limit = 20
    if i < images_limit:
        image_urls_predictions[image] = random.sample(all_hotel_ids, 5)

print(f"All urls: {image_urls_predictions}")

All urls: {'https://image-submissions.s3.us-west-2.amazonaws.com/images/2016/10/20150909_172340_SDFCNB.jpg': [78685, 19702, 52258, 43624, 97247], 'https://image-submissions.s3.us-west-2.amazonaws.com/images/2016/10/20150909_172340_5ZJ44Z.jpg': [207007, 40369, 9737, 29731, 22541], 'https://image-submissions.s3.us-west-2.amazonaws.com/images/2016/10/20150917_193357_QPYZTV.jpg': [33604, 38468, 19375, 5683, 48975], 'https://image-submissions.s3.us-west-2.amazonaws.com/images/2017/2/20160125_191211_6TA3VG.jpg': [31869, 6275, 2617, 20359, 307734], 'https://image-submissions.s3.us-west-2.amazonaws.com/images/2017/2/20160125_172326_L5XHD4.jpg': [23256, 39028, 31877, 7271, 39347], 'https://image-submissions.s3.us-west-2.amazonaws.com/images/2017/2/20160125_172326_AHE4K7.jpg': [31587, 113343, 42031, 3304, 52165], 'https://image-submissions.s3.us-west-2.amazonaws.com/images/2017/2/20160125_172326_TFYKGB.jpg': [18700, 8970, 85994, 47733, 557], 'https://image-submissions.s3.us-west-2.amazonaws.com/

In [None]:
import random