# Data Preprocessing

In [2]:
import sys

import csv
import numpy as np
from tqdm import tqdm_notebook as tqdm

In [3]:
# Action types whose reference column (column 5) names an item when it is numeric.
ITEM_ACTION_TYPES = frozenset([
    'clickout item',
    'interaction item deals',
    'interaction item image',
    'interaction item info',
    'interaction item rating',
    'search for item',
])

# Collect every item id that appears anywhere in the data set.
item_ids = set()

# 1) Items listed in the metadata file (column 0 is parsed as the item id).
with open('../Dataset/item_metadata.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row

    for line in tqdm(reader):
        item_ids.add(int(line[0]))

# 2) Items referenced from the session logs. train.csv and test.csv are read
#    with the same column positions, so a single loop replaces the two
#    copy-pasted blocks.
for path in ('../Dataset/train.csv', '../Dataset/test.csv'):
    with open(path, 'r') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row

        for line in tqdm(reader):
            # Column 4 is the action type; column 5 is its reference, which is
            # only an item id when it consists solely of digits.
            if line[4] in ITEM_ACTION_TYPES and line[5].isdigit():
                item_ids.add(int(line[5]))
            # Column 10, when non-empty, is a '|'-separated list of integer ids
            # that are also treated as item ids.
            if line[10] != '':
                item_ids.update(map(int, line[10].split('|')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [4]:
# Give each distinct property name a column index, numbered 0, 1, 2, ... in
# order of first appearance in item_metadata.csv.
property_to_index = {}
with open('../Dataset/item_metadata.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row

    for line in tqdm(reader):
        # Column 1 holds the item's properties joined with '|'.
        for prop in line[1].split('|'):
            if prop in property_to_index:
                continue
            # The next free index always equals the current size of the mapping.
            property_to_index[prop] = len(property_to_index)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [5]:
# Width of every property vector: one column per known property. Derived from
# the mapping instead of being hard-coded so the two cannot drift apart.
num_properties = len(property_to_index)

# Multi-hot property vector per item. Every known item starts as all zeros;
# items that never occur in item_metadata.csv keep the zero vector.
item_to_property = {item: np.zeros(num_properties) for item in item_ids}

with open('../Dataset/item_metadata.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row

    for line in tqdm(reader):
        # Column 0 is the item id, column 1 its '|'-separated property names.
        vector = item_to_property[int(line[0])]
        for prop in line[1].split('|'):
            vector[property_to_index[prop]] = 1

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [6]:
# Empty accumulators, one independent set per categorical session field.
# They are filled with the distinct raw values while scanning the CSV files.
user, action_type, platform, city, device, current_filters = (
    set() for _ in range(6)
)

In [7]:
# Gather the distinct values of each categorical column from both the train
# and the test logs. The two files are read with identical column positions,
# so one loop replaces the two copy-pasted blocks.
for path in ('../Dataset/train.csv', '../Dataset/test.csv'):
    with open(path) as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row

        for line in tqdm(reader):
            user.add(line[0])
            action_type.add(line[4])
            platform.add(line[6])
            city.add(line[7])
            device.add(line[8])
            # Column 9 is a '|'-separated list of active filters (may be empty).
            # set.update avoids the former loop variable named `filter`, which
            # leaked into the notebook namespace and shadowed the builtin.
            if line[9]:
                current_filters.update(line[9].split('|'))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [8]:
# Freeze each vocabulary into an ascending list so that the position of a
# value can serve as its integer index. sorted() accepts any iterable, so no
# intermediate list is needed.
# NOTE: each name is rebound from a set to a list here.
user = sorted(user)
action_type = sorted(action_type)
platform = sorted(platform)
city = sorted(city)
device = sorted(device)
current_filters = sorted(current_filters)

In [9]:
# Reverse lookups: raw value -> its position in the corresponding sorted list.
user_to_ind = {value: idx for idx, value in enumerate(user)}
action_to_ind = {value: idx for idx, value in enumerate(action_type)}
platform_to_ind = {value: idx for idx, value in enumerate(platform)}
city_to_ind = {value: idx for idx, value in enumerate(city)}
device_to_ind = {value: idx for idx, value in enumerate(device)}
current_filters_to_ind = {value: idx for idx, value in enumerate(current_filters)}

In [10]:
# reference[k]: set of distinct reference strings (column 5) observed together
# with the action type whose index is k. Sized from the action vocabulary
# instead of a hard-coded count.
reference = [set() for _ in range(len(action_to_ind))]

# train.csv and test.csv are read with the same column positions, so a single
# loop handles both.
for path in ('../Dataset/train.csv', '../Dataset/test.csv'):
    with open(path) as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row

        for line in tqdm(reader):
            reference[action_to_ind[line[4]]].add(line[5])

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [11]:
# reference_to_ind[i]: dictionary that maps each reference string seen with
# action_type[i] to an integer index.

# Action types whose reference is an item id. Selecting them by name (rather
# than by position in the sorted action list) keeps this cell correct even if
# the set of action types changes.
ITEM_ACTION_TYPES = frozenset([
    'clickout item',
    'interaction item deals',
    'interaction item image',
    'interaction item info',
    'interaction item rating',
    'search for item',
])

reference_to_ind = [{} for _ in reference]
for i, refs in enumerate(reference):
    if action_type[i] in ITEM_ACTION_TYPES:
        # Item actions: a numeric reference is an item id and maps to itself.
        # Any reference that is not an integer (empty string, place names,
        # 'unknown', ...) maps to 0 instead of relying on a fixed list of
        # known exceptions.
        for ref in refs:
            try:
                reference_to_ind[i][ref] = int(ref)
            except ValueError:
                reference_to_ind[i][ref] = 0
    else:
        # Non-item actions: references are numbered 0 .. len(refs) - 1.
        # Sorting first makes the numbering reproducible; iteration order of a
        # set of strings can differ from one kernel session to the next.
        for j, ref in enumerate(sorted(refs)):
            reference_to_ind[i][ref] = j

# Clustering

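For each platform, count how often each item property occurs among the items clicked out on that platform in the training data, then rank the properties from most to least frequent.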

In [13]:
# platform_clickouts[p]: property vectors of every item clicked out on the
# platform with index p, one entry per clickout row in train.csv. Sized from
# the platform vocabulary instead of a hard-coded count.
platform_clickouts = [[] for _ in range(len(platform_to_ind))]
with open('../Dataset/train.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row

    for line in tqdm(reader):
        # Only clickouts whose reference is a numeric item id are counted.
        if line[4] == 'clickout item' and line[5].isdigit():
            # The appended array is the same object stored in item_to_property
            # (no copy is made).
            platform_clickouts[platform_to_ind[line[6]]].append(item_to_property[int(line[5])])

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [53]:
# Display the platform code -> integer index mapping.
platform_to_ind

{'AA': 0,
 'AE': 1,
 'AR': 2,
 'AT': 3,
 'AU': 4,
 'BE': 5,
 'BG': 6,
 'BR': 7,
 'CA': 8,
 'CH': 9,
 'CL': 10,
 'CN': 11,
 'CO': 12,
 'CZ': 13,
 'DE': 14,
 'DK': 15,
 'EC': 16,
 'ES': 17,
 'FI': 18,
 'FR': 19,
 'GR': 20,
 'HK': 21,
 'HR': 22,
 'HU': 23,
 'ID': 24,
 'IE': 25,
 'IL': 26,
 'IN': 27,
 'IT': 28,
 'JP': 29,
 'KR': 30,
 'MX': 31,
 'MY': 32,
 'NL': 33,
 'NO': 34,
 'NZ': 35,
 'PE': 36,
 'PH': 37,
 'PL': 38,
 'PT': 39,
 'RO': 40,
 'RS': 41,
 'RU': 42,
 'SE': 43,
 'SG': 44,
 'SI': 45,
 'SK': 46,
 'TH': 47,
 'TR': 48,
 'TW': 49,
 'UK': 50,
 'US': 51,
 'UY': 52,
 'VN': 53,
 'ZA': 54}

In [24]:
# Number of train.csv clickout rows (with a numeric item reference) collected for platform 'KR'.
len(platform_clickouts[platform_to_ind['KR']])

7783

In [25]:
def _sum_property_vectors(vectors, width):
    """Add up a list of equal-length property vectors element-wise.

    Args:
        vectors: iterable of 1-D numpy arrays, each of length `width`.
        width: number of property columns; also the length of the result.

    Returns:
        A list of `width` totals. An empty `vectors` yields all zeros.
    """
    total = np.zeros(width)
    for vec in vectors:
        total += vec  # one vectorised add per clickout instead of `width` Python-level sums
    return list(total)


# platform_clickout_property[p][j]: how many clickouts on platform p went to an
# item that has property j. The loop variable is deliberately not called
# `platform`, which is the name of the notebook's platform vocabulary.
platform_clickout_property = [
    _sum_property_vectors(clicked_vectors, len(property_to_index))
    for clicked_vectors in platform_clickouts
]

In [30]:
# Display the property name -> column index mapping.
property_to_index

{'Satellite TV': 0,
 'Golf Course': 1,
 'Airport Shuttle': 2,
 'Cosmetic Mirror': 3,
 'Safe (Hotel)': 4,
 'Telephone': 5,
 'Hotel': 6,
 'Sitting Area (Rooms)': 7,
 'Reception (24/7)': 8,
 'Air Conditioning': 9,
 'Hypoallergenic Rooms': 10,
 'Cable TV': 11,
 'Hotel Bar': 12,
 'Pool Table': 13,
 'Bathtub': 14,
 'Satisfactory Rating': 15,
 'Room Service': 16,
 'Luxury Hotel': 17,
 'Terrace (Hotel)': 18,
 'Television': 19,
 'Minigolf': 20,
 'Business Hotel': 21,
 'Shower': 22,
 'Cot': 23,
 'Gym': 24,
 'Hairdryer': 25,
 'Hypoallergenic Bedding': 26,
 'Accessible Parking': 27,
 'From 3 Stars': 28,
 'Good Rating': 29,
 'Radio': 30,
 '4 Star': 31,
 'From 4 Stars': 32,
 'Family Friendly': 33,
 'Desk': 34,
 'Tennis Court (Indoor)': 35,
 'Balcony': 36,
 'WiFi (Public Areas)': 37,
 'Openable Windows': 38,
 'Express Check-In / Check-Out': 39,
 'Restaurant': 40,
 'Laundry Service': 41,
 'Ironing Board': 42,
 'Tennis Court': 43,
 'From 2 Stars': 44,
 'Business Centre': 45,
 'Bowling': 46,
 'Conferenc

In [28]:
# Per-property clickout totals for platform 'KR', ordered by property column index.
platform_clickout_property[platform_to_ind['KR']]

[2700.0,
 554.0,
 1284.0,
 1970.0,
 3150.0,
 4347.0,
 5744.0,
 1769.0,
 4862.0,
 5170.0,
 141.0,
 3807.0,
 3170.0,
 653.0,
 3626.0,
 6919.0,
 2951.0,
 5559.0,
 1914.0,
 4975.0,
 235.0,
 4926.0,
 5269.0,
 2274.0,
 2537.0,
 4976.0,
 201.0,
 1040.0,
 4308.0,
 6295.0,
 1479.0,
 1757.0,
 2821.0,
 4134.0,
 4438.0,
 163.0,
 608.0,
 5489.0,
 2882.0,
 1887.0,
 4508.0,
 3861.0,
 2745.0,
 744.0,
 4645.0,
 3245.0,
 234.0,
 3343.0,
 4152.0,
 997.0,
 1800.0,
 907.0,
 4480.0,
 5129.0,
 2903.0,
 1050.0,
 505.0,
 1305.0,
 4257.0,
 5234.0,
 4601.0,
 2889.0,
 2457.0,
 4819.0,
 1573.0,
 4792.0,
 1949.0,
 4255.0,
 1831.0,
 2086.0,
 4235.0,
 956.0,
 2927.0,
 510.0,
 828.0,
 2389.0,
 2162.0,
 404.0,
 130.0,
 3243.0,
 294.0,
 2735.0,
 1000.0,
 1005.0,
 910.0,
 2005.0,
 1251.0,
 1215.0,
 257.0,
 243.0,
 225.0,
 2262.0,
 87.0,
 853.0,
 927.0,
 1263.0,
 2159.0,
 890.0,
 256.0,
 2182.0,
 121.0,
 1458.0,
 1598.0,
 1117.0,
 3769.0,
 291.0,
 1241.0,
 723.0,
 1295.0,
 454.0,
 71.0,
 1397.0,
 24.0,
 866.0,
 1053.0,
 5

In [33]:
# platform_top_properties[p]: all property names ordered from the highest to
# the lowest clickout total on platform p. The sort is stable, so properties
# with equal totals keep their order from property_to_index.
platform_top_properties = []
for property_totals in platform_clickout_property:
    ranked_names = sorted(
        property_to_index,
        key=lambda name: property_totals[property_to_index[name]],
        reverse=True,
    )
    platform_top_properties.append(ranked_names)

In [72]:
# Property names for platform 'ZA', most frequently clicked first.
platform_top_properties[platform_to_ind['ZA']]

['Satisfactory Rating',
 'Car Park',
 'Shower',
 'Good Rating',
 'Television',
 'Non-Smoking Rooms',
 'Electric Kettle',
 'WiFi (Rooms)',
 'WiFi (Public Areas)',
 'Business Hotel',
 'Air Conditioning',
 'Satellite TV',
 'Openable Windows',
 'Laundry Service',
 'Desk',
 'Hotel',
 'Restaurant',
 'Hairdryer',
 'Free WiFi (Combined)',
 'Luxury Hotel',
 'From 2 Stars',
 'Swimming Pool (Outdoor)',
 'Hotel Bar',
 'From 3 Stars',
 'Swimming Pool (Combined Filter)',
 'Family Friendly',
 'Reception (24/7)',
 'Conference Rooms',
 'Terrace (Hotel)',
 'Free WiFi (Rooms)',
 'Safe (Rooms)',
 'Free WiFi (Public Areas)',
 'Very Good Rating',
 'Beach',
 'Room Service',
 'Telephone',
 'Safe (Hotel)',
 'Bathtub',
 'Computer with Internet',
 'Business Centre',
 'Deck Chairs',
 'Cot',
 'Concierge',
 'Wheelchair Accessible',
 'Fridge',
 'Lift',
 'Romantic',
 'Central Heating',
 'Sitting Area (Rooms)',
 'Airport Shuttle',
 'From 4 Stars',
 'Ironing Board',
 'Childcare',
 'Self Catering',
 '3 Star',
 'Microwav