This file makes it easier to filter the data manually. It flips through the images and lets me type a number for each one depending on whether the image shows a room or not (if the image can't be opened, it is saved with a different value).

The output is stored and used in other files in the form of the "arr" variable

In [None]:
# Mount Google Drive at /content/drive so the hotel data under
# /content/drive/MyDrive/ can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from os import listdir

# Columns kept from every raw JSON export.
COLUMNS = ["name", "city", "description", "stars", "price", "rating", "reviews", "image", "images",
           "categoryReviews", "userReviews"]
# Review categories; entry i of each row's "categoryReviews" list supplies the score for CATEGORIES[i].
CATEGORIES = ["staff", "facilities", "cleanliness", "comfort", "valueForMoney", "location"]

# Folder whose files are each read as JSON and tagged with a city taken from the file name.
DATA_DIR = "/content/drive/MyDrive/Hotel_data_final/"
# Raw prices below the threshold are treated as buggy and multiplied by the factor.
# NOTE(review): the original statement compared against 5 on the assigned side and 10 on the
# value side; 5 is kept because that is the set of rows that was actually written. Confirm.
PRICE_FIX_THRESHOLD = 5
PRICE_FIX_FACTOR = 1000

# reads the files and selects columns
# NOTE(review): listdir() returns files in arbitrary order, and the manual labels in `arr`
# are matched to rows by position, so a different listing order would misalign them.
dataframes = []
for file in listdir(DATA_DIR):
    temp_df = pd.read_json(f"{DATA_DIR}{file}")
    # the city is the file name without its last 12 characters
    # (presumably a fixed suffix plus extension -- TODO confirm)
    temp_df["city"] = file[:-12]
    temp_df = temp_df[COLUMNS]
    dataframes.append(temp_df)
# The per-file indexes are kept, so index labels repeat across cities.
df = pd.concat(dataframes)

# create columns based on review categories
for i, var_name in enumerate(CATEGORIES):
    df[var_name] = df["categoryReviews"].str[i].str['score']

# if duplicate or no reviews are available, drop row
df = df[df["reviews"] > 0]
df = df.dropna()
df = df.drop_duplicates(subset=["name", "city"])

# adjust buggy prices: one mask is used on both sides, and the values are passed as a
# plain array so pandas does not try to align on the (duplicated) index labels
low_price = df["price"] < PRICE_FIX_THRESHOLD
df.loc[low_price, "price"] = PRICE_FIX_FACTOR * df.loc[low_price, "price"].to_numpy()

# normalize the values of numerical attributes (zero mean, unit variance)
for column in ["stars", "reviews", "rating"] + CATEGORIES:
    df[column] = (df[column] - np.mean(df[column])) / np.std(df[column])

# add the one-hot-encoding for cities
df = pd.concat([df, pd.get_dummies(df["city"])], axis=1)

# standardize the price the same way as the other numerical attributes
df["price"] = (df["price"]-np.mean(df["price"]))/(np.sqrt(np.var(df["price"])))

# squash the standardized price into (0, 1) with the logistic (sigmoid) function
# (this is not a log transform)
df["price"] = 1/(1 + np.exp(-df["price"]))

# save it as a csv to be used in other places
#df.to_csv("data.csv", index=False)

# Two-column table (main image URL, transformed price) used by the labelling cells.
df_images = pd.DataFrame().assign(Images=df['image'], Price=df['price'])


In [None]:
# Preview the image-URL / price table (the notebook display truncates long frames).
df_images

Unnamed: 0,Images,Price
0,https://cf.bstatic.com/images/hotel/max1024x76...,0.356912
1,https://cf.bstatic.com/images/hotel/max1024x76...,0.521611
2,https://cf.bstatic.com/images/hotel/max1024x76...,0.462112
3,https://cf.bstatic.com/images/hotel/max1024x76...,0.413272
4,https://cf.bstatic.com/images/hotel/max1024x76...,0.302573
...,...,...
99,https://cf.bstatic.com/images/hotel/max1024x76...,0.515661
101,https://cf.bstatic.com/images/hotel/max1024x76...,0.448322
102,https://cf.bstatic.com/images/hotel/max1024x76...,0.328295
103,https://cf.bstatic.com/images/hotel/max1024x76...,0.366080


In [None]:
# Find which hotel a given image URL belongs to.
link = "https://cf.bstatic.com/images/hotel/max1024x768/287/287696292.jpg"

matches = df[df["image"] == link]
print(matches.index.values)

# Read the name through the same mask instead of a hard-coded index label:
# index labels repeat across cities, so a label lookup returns one hotel per
# city rather than the hotel that owns this image.
print(matches["name"])

[179]
179    Holiday Inn & Suites Orlando International Dri...
179                            The Annex at Hayes Valley
179    Holiday Inn Express - Times Square South, an I...
179                                    La Valencia Hotel
179    Alexis Hotel & Banquets Dallas Park Central Ga...
179                             L'Ermitage Beverly Hills
179                                  Archer Hotel Austin
179     Sleep Inn & Suites Highway 290/Northwest Freeway
179                              Comfort Inn San Antonio
179    Extended Stay America Premier Suites - Charlot...
179                             Comfort Suites Nashville
Name: name, dtype: object


In [None]:
# Inspect a single row. `df[159]` would look for a *column* named 159, which
# does not exist; `.iloc` selects the row at that position instead.
# NOTE(review): positional access is assumed here because index labels are not
# unique in `df` -- confirm whether a label lookup was intended.
print(df.iloc[159])

NameError: ignored

In [None]:
# Flatten df_images into plain (image URL, price) tuples, one per row.
data_1 = [*df_images.itertuples(index=False, name=None)]

# One label slot per image, initialised to -1 until a label is written.
arr = [-1 for _ in data_1]

#arr[:296] = [2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 3, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 3, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 3, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 3, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 1, 2, 3, 1, 2, 1, 2, 1, 1, 2, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 3, 2, 2, 1, 2, 3, 3, 1, 2, 1, 1, 3, 2, 1, 2, 2, 1, 2, 3, 2, 2, 1, 2, 3, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 3, 1, 2, 1, 3, 3, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 3, 3, 2, 1, 1, 1]

In [None]:
from google.colab import output

import time

from PIL import Image
import requests
from io import BytesIO


from torchvision.transforms import Resize

# Interactive labelling loop: shows one image at a time and stores the typed label in `arr`.
#   1 or 2 (prompted as "bad" / "good") -> stored in arr[i], then the next image is shown
#   6                                   -> step back one image
#   0                                   -> stop labelling
#   anything else                       -> the same image is shown again
# Images that cannot be downloaded or opened are stored as 3.

# Seconds to wait for an image download before giving up on it.
REQUEST_TIMEOUT_S = 5

data_1 = list(df_images.itertuples(index=False, name=None))

# Position in data_1 at which this labelling session resumes.
start = 1319
i = start

while i < len(data_1):
    # Second element is the price column of df_images; only the URL is used here.
    (img, label) = data_1[i]

    try:
        # A timeout keeps one unresponsive URL from hanging the whole session.
        response = requests.get(img, timeout=REQUEST_TIMEOUT_S)
        image = Image.open(BytesIO(response.content))
    except Exception:
        # `Exception` rather than a bare except, so interrupting the kernel stops the
        # loop instead of being recorded as an unreadable image.
        arr[i] = 3
        print("except")
        i += 1
        continue

    image.thumbnail((200, 200))
    image.show()

    # Short pause before prompting (presumably lets the image render first -- TODO confirm).
    time.sleep(0.1)

    try:
        num1 = int(input("1 for bad or 2 for good: "))
    except ValueError:
        # Non-numeric or empty input: treat it like any unrecognised number and ask again.
        num1 = -1

    output.clear()

    if num1 == 0:
        print("DONE")
        break
    elif num1 == 6:
        # Never step below the first image; a negative i would wrap to the end of the lists.
        i = max(i - 1, 0)
        continue
    elif num1 in (1, 2):
        arr[i] = num1
        i += 1



In [None]:
# Show the labels recorded since `start`, followed by the position reached.
print(arr[start:i], i, sep="\n")

[1, 2, 2, 1, 2, 2, 2, 2, 1, 1, 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2]
1360


In [None]:
# Print every label from the first image up to (not including) position i.
print(arr[:i])

[2, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 1, 2, 2, 2, 3, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 

In [None]:
arr2 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 3, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 3, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 3, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 3, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 1, 2, 3, 1, 2, 1, 2, 1, 1, 2, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 3, 2, 2, 1, 2, 3, 3, 1, 2, 1, 1, 3, 2, 1, 2, 2, 1, 2, 3, 2, 2, 1, 2, 3, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 3, 1, 2, 1, 3, 3, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 3, 3, 2, 1, 1, 1]

In [None]:
# Number of labels stored in arr2.
print(len(arr2))

296


In [None]:
arr = [2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 3, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 3, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 3, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 3, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 1, 2, 3, 1, 2, 1, 2, 1, 1, 2, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 3, 2, 2, 1, 2, 3, 3, 1, 2, 1, 1, 3, 2, 1, 2, 2, 1, 2, 3, 2, 2, 1, 2, 3, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 3, 1, 2, 1, 3, 3, 2, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 3, 3, 2, 1, 1, 1, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 1, 1, 3, 2, 1, 2, 3, 2, 1, 2, 3, 2, 2, 2, 1, 3, 2, 2, 1, 2, 3, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 3, 1, 3, 1, 2, 2, 1, 1, 2, 1, 2, 3, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 3, 1, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 2, 3, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 3, 1, 1, 1, 2, 2, 1, 3, 2, 1, 2, 3, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 3, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 1, 1, 2, 3, 1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 3, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 3, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 3, 2, 1, 2, 2, 2, 2, 3, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 3, 3, 2, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 3, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 3, 1, 2, 1, 2, 3, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1, 1, 3, 2, 2, 2, 2, 2, 3]

In [None]:
# Number of entries labelled 2 (the value the labelling prompt uses for "good").
arr.count(2)

698

In [None]:
# Total number of labels stored in arr.
print(len(arr))

1003
