# Data Preprocessing

In [2]:
import sys

import csv
import numpy as np
from tqdm import tqdm_notebook as tqdm

In [3]:
# Collect every item id that occurs anywhere in the dataset: in the item
# metadata, as the reference of an item-related action, or in column 10 of the
# session files.
item_ids = set()

# Action types whose reference (column 5) is treated as an item id.
item_action_types = (
    'clickout item',
    'interaction item deals',
    'interaction item image',
    'interaction item info',
    'interaction item rating',
    'search for item',
)

with open('../Dataset/item_metadata.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row

    for line in tqdm(reader):
        item_ids.add(int(line[0]))

# train.csv and test.csv are scanned in exactly the same way, so one loop
# handles both instead of two copy-pasted blocks.
for session_path in ('../Dataset/train.csv', '../Dataset/test.csv'):
    with open(session_path, 'r') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row

        for line in tqdm(reader):
            # Only numeric references of item-related actions are item ids;
            # anything else in column 5 is ignored here.
            if line[4] in item_action_types and line[5].isdigit():
                item_ids.add(int(line[5]))
            # Column 10 is a '|'-separated list of integer ids, or empty
            # (presumably the impressions list -- confirm against the CSV header).
            if line[10] != '':
                item_ids.update(map(int, line[10].split('|')))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [4]:
# Assign each distinct property name a column index, in order of first
# appearance in item_metadata.csv.
property_to_index = {}
index = 0  # next unused column index

with open('../Dataset/item_metadata.csv', 'r') as metadata_file:
    rows = csv.reader(metadata_file)
    next(rows)  # skip the header row

    for row in tqdm(rows):
        # Column 1 holds the item's properties joined with '|'.
        for name in row[1].split('|'):
            if name in property_to_index:
                continue
            property_to_index[name] = index
            index += 1

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [5]:
# Build a multi-hot property vector for every known item id.
# The vector length is taken from property_to_index rather than a hard-coded
# 157, so it always matches the indices written below.
n_properties = len(property_to_index)

# Items that never appear in item_metadata.csv keep an all-zero vector.
item_to_property = {item: np.zeros(n_properties) for item in item_ids}

with open('../Dataset/item_metadata.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row

    for line in tqdm(reader):
        # Look the item's vector up once per row instead of once per property.
        property_vector = item_to_property[int(line[0])]
        for prop in line[1].split('|'):
            property_vector[property_to_index[prop]] = 1

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [6]:
# Six independent, initially empty sets that the next cell fills with the
# distinct values seen in the session files.
user, action_type, platform, city, device, current_filters = (set() for _ in range(6))

In [7]:
# Fill the vocabulary sets with every distinct value seen in train.csv and
# test.csv. Both files are read the same way, so one loop covers both.
for session_path in ('../Dataset/train.csv', '../Dataset/test.csv'):
    with open(session_path) as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row

        for line in tqdm(reader):
            user.add(line[0])
            action_type.add(line[4])
            platform.add(line[6])
            city.add(line[7])
            device.add(line[8])
            # Column 9 is a '|'-separated list of filter names (empty when no
            # filter is active). set.update replaces the former
            # `for filter in ...` loop, whose loop variable rebound the builtin
            # `filter` for every later cell in the kernel.
            if line[9]:
                current_filters.update(line[9].split('|'))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [8]:
# Replace each vocabulary set with a sorted list so that every value gets a
# stable position (used as its integer index in the next cell).
user, action_type, platform, city, device, current_filters = (
    sorted(values)
    for values in (user, action_type, platform, city, device, current_filters)
)

In [9]:
# Map every vocabulary value to its position in the corresponding sorted list.
user_to_ind = {name: position for position, name in enumerate(user)}
action_to_ind = {name: position for position, name in enumerate(action_type)}
platform_to_ind = {name: position for position, name in enumerate(platform)}
city_to_ind = {name: position for position, name in enumerate(city)}
device_to_ind = {name: position for position, name in enumerate(device)}
current_filters_to_ind = {name: position for position, name in enumerate(current_filters)}

In [10]:
# reference[i] collects every distinct reference string (column 5) that occurs
# together with action_type[i], across both train.csv and test.csv.
# One set is created per known action type instead of a hard-coded 10.
reference = [set() for _ in range(len(action_to_ind))]

# Both session files are processed identically, so a single loop handles them.
for session_path in ('../Dataset/train.csv', '../Dataset/test.csv'):
    with open(session_path) as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row

        for line in tqdm(reader):
            reference[action_to_ind[line[4]]].add(line[5])

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [11]:
# reference_to_ind[i]: dictionary object that maps reference string of action_type[i] to integer index

# Action types whose reference is an item id. Selecting them by name replaces
# the former hard-coded positions [1, 3, 4, 5, 6, 8], which were only valid for
# one particular sorted action_type list.
# NOTE(review): assumed to be the same six actions those positions referred to
# -- confirm against the printed action_type list.
item_action_types = {
    'clickout item',
    'interaction item deals',
    'interaction item image',
    'interaction item info',
    'interaction item rating',
    'search for item',
}

reference_to_ind = []
for i, references_i in enumerate(reference):
    if action_type[i] in item_action_types:
        # For items, numeric references map to themselves. Every non-numeric
        # reference ('', 'unknown', place names, ...) maps to zero instead of
        # relying on a hand-maintained list of such strings.
        mapping = {ref: int(ref) if ref.isdigit() else 0 for ref in references_i}
    else:
        # For non-items, references are indexed from 0 to number_of_references-1.
        # Sorting first makes the indices reproducible: plain set iteration
        # order of strings changes between kernel restarts.
        mapping = {ref: j for j, ref in enumerate(sorted(references_i))}
    reference_to_ind.append(mapping)

# Clustering

For each platform, rank the item properties by how often they occur among the items clicked out on that platform, to find out which properties are the most popular.

In [13]:
# platform_clickouts[p] lists the property vector of every item clicked out on
# platform p in train.csv. One list is created per known platform instead of a
# hard-coded 55.
platform_clickouts = [[] for _ in range(len(platform_to_ind))]

with open('../Dataset/train.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row

    for line in tqdm(reader):
        # Only clickouts whose reference is a numeric item id are counted.
        if line[4] == 'clickout item' and line[5].isdigit():
            # The appended vector is the shared array stored in
            # item_to_property (no copy), so the lists must not be mutated.
            platform_clickouts[platform_to_ind[line[6]]].append(item_to_property[int(line[5])])

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [53]:
# Inspect the platform code -> integer index mapping.
platform_to_ind

{'AA': 0,
 'AE': 1,
 'AR': 2,
 'AT': 3,
 'AU': 4,
 'BE': 5,
 'BG': 6,
 'BR': 7,
 'CA': 8,
 'CH': 9,
 'CL': 10,
 'CN': 11,
 'CO': 12,
 'CZ': 13,
 'DE': 14,
 'DK': 15,
 'EC': 16,
 'ES': 17,
 'FI': 18,
 'FR': 19,
 'GR': 20,
 'HK': 21,
 'HR': 22,
 'HU': 23,
 'ID': 24,
 'IE': 25,
 'IL': 26,
 'IN': 27,
 'IT': 28,
 'JP': 29,
 'KR': 30,
 'MX': 31,
 'MY': 32,
 'NL': 33,
 'NO': 34,
 'NZ': 35,
 'PE': 36,
 'PH': 37,
 'PL': 38,
 'PT': 39,
 'RO': 40,
 'RS': 41,
 'RU': 42,
 'SE': 43,
 'SG': 44,
 'SI': 45,
 'SK': 46,
 'TH': 47,
 'TR': 48,
 'TW': 49,
 'UK': 50,
 'US': 51,
 'UY': 52,
 'VN': 53,
 'ZA': 54}

In [24]:
# Sanity check: number of property vectors collected for platform 'KR', i.e.
# how many 'clickout item' rows with a numeric reference it has in train.csv.
len(platform_clickouts[platform_to_ind['KR']])

7783

In [25]:
# platform_clickout_property[p][j]: how many clickouts on platform p went to an
# item that has property j.
# Each platform's vectors are added as whole arrays instead of indexing every
# element from Python, and the vector length comes from property_to_index
# rather than a hard-coded 157.
n_properties = len(property_to_index)
platform_clickout_property = [
    # The zero start vector also makes a platform without clickouts yield an
    # all-zero row; .tolist() keeps the result a plain list of floats.
    sum(clicked_vectors, np.zeros(n_properties)).tolist()
    for clicked_vectors in platform_clickouts
]

In [30]:
# Inspect the property name -> vector position mapping.
property_to_index

{'Satellite TV': 0,
 'Golf Course': 1,
 'Airport Shuttle': 2,
 'Cosmetic Mirror': 3,
 'Safe (Hotel)': 4,
 'Telephone': 5,
 'Hotel': 6,
 'Sitting Area (Rooms)': 7,
 'Reception (24/7)': 8,
 'Air Conditioning': 9,
 'Hypoallergenic Rooms': 10,
 'Cable TV': 11,
 'Hotel Bar': 12,
 'Pool Table': 13,
 'Bathtub': 14,
 'Satisfactory Rating': 15,
 'Room Service': 16,
 'Luxury Hotel': 17,
 'Terrace (Hotel)': 18,
 'Television': 19,
 'Minigolf': 20,
 'Business Hotel': 21,
 'Shower': 22,
 'Cot': 23,
 'Gym': 24,
 'Hairdryer': 25,
 'Hypoallergenic Bedding': 26,
 'Accessible Parking': 27,
 'From 3 Stars': 28,
 'Good Rating': 29,
 'Radio': 30,
 '4 Star': 31,
 'From 4 Stars': 32,
 'Family Friendly': 33,
 'Desk': 34,
 'Tennis Court (Indoor)': 35,
 'Balcony': 36,
 'WiFi (Public Areas)': 37,
 'Openable Windows': 38,
 'Express Check-In / Check-Out': 39,
 'Restaurant': 40,
 'Laundry Service': 41,
 'Ironing Board': 42,
 'Tennis Court': 43,
 'From 2 Stars': 44,
 'Business Centre': 45,
 'Bowling': 46,
 'Conferenc

In [28]:
# Per-property clickout totals for platform 'KR'; position j corresponds to the
# property whose index in property_to_index is j.
platform_clickout_property[platform_to_ind['KR']]

[2700.0,
 554.0,
 1284.0,
 1970.0,
 3150.0,
 4347.0,
 5744.0,
 1769.0,
 4862.0,
 5170.0,
 141.0,
 3807.0,
 3170.0,
 653.0,
 3626.0,
 6919.0,
 2951.0,
 5559.0,
 1914.0,
 4975.0,
 235.0,
 4926.0,
 5269.0,
 2274.0,
 2537.0,
 4976.0,
 201.0,
 1040.0,
 4308.0,
 6295.0,
 1479.0,
 1757.0,
 2821.0,
 4134.0,
 4438.0,
 163.0,
 608.0,
 5489.0,
 2882.0,
 1887.0,
 4508.0,
 3861.0,
 2745.0,
 744.0,
 4645.0,
 3245.0,
 234.0,
 3343.0,
 4152.0,
 997.0,
 1800.0,
 907.0,
 4480.0,
 5129.0,
 2903.0,
 1050.0,
 505.0,
 1305.0,
 4257.0,
 5234.0,
 4601.0,
 2889.0,
 2457.0,
 4819.0,
 1573.0,
 4792.0,
 1949.0,
 4255.0,
 1831.0,
 2086.0,
 4235.0,
 956.0,
 2927.0,
 510.0,
 828.0,
 2389.0,
 2162.0,
 404.0,
 130.0,
 3243.0,
 294.0,
 2735.0,
 1000.0,
 1005.0,
 910.0,
 2005.0,
 1251.0,
 1215.0,
 257.0,
 243.0,
 225.0,
 2262.0,
 87.0,
 853.0,
 927.0,
 1263.0,
 2159.0,
 890.0,
 256.0,
 2182.0,
 121.0,
 1458.0,
 1598.0,
 1117.0,
 3769.0,
 291.0,
 1241.0,
 723.0,
 1295.0,
 454.0,
 71.0,
 1397.0,
 24.0,
 866.0,
 1053.0,
 5

In [33]:
# For every platform, order all property names by that platform's clickout
# count, highest first.
platform_top_properties = []
for property_counts in platform_clickout_property:
    ranked_names = sorted(
        property_to_index,
        key=lambda name: property_counts[property_to_index[name]],
        reverse=True,
    )
    platform_top_properties.append(ranked_names)

In [78]:
# Property names for platform 'KR', ordered from most to least frequent among
# its clickouts.
platform_top_properties[platform_to_ind['KR']]

['Satisfactory Rating',
 'Good Rating',
 'Hotel',
 'Luxury Hotel',
 'WiFi (Public Areas)',
 'Shower',
 'WiFi (Rooms)',
 'Air Conditioning',
 'Car Park',
 'Hairdryer',
 'Television',
 'Business Hotel',
 'Reception (24/7)',
 'Very Good Rating',
 'Free WiFi (Combined)',
 'From 2 Stars',
 'Lift',
 'Restaurant',
 'Non-Smoking Rooms',
 'Desk',
 'Telephone',
 'From 3 Stars',
 'Computer with Internet',
 'Free WiFi (Rooms)',
 'Free WiFi (Public Areas)',
 'Electric Kettle',
 'Family Friendly',
 'Laundry Service',
 'Cable TV',
 'Fridge',
 'Bathtub',
 'Conference Rooms',
 'Business Centre',
 'Concierge',
 'Hotel Bar',
 'Safe (Hotel)',
 'Room Service',
 'Beach',
 'Safe (Rooms)',
 'Central Heating',
 'Openable Windows',
 'From 4 Stars',
 'Ironing Board',
 'Massage',
 'Satellite TV',
 'Gym',
 'Wheelchair Accessible',
 '3 Star',
 'Cot',
 'Romantic',
 'Large Groups',
 'Swimming Pool (Combined Filter)',
 'Singles',
 'Honeymoon',
 'Pet Friendly',
 'Swimming Pool (Outdoor)',
 'Washing Machine',
 'Cosmetic

In [73]:
# overall_clickout_property[i]: clickout count of property i summed over all
# platforms. The number of properties comes from property_to_index instead of a
# hard-coded 157, and the comprehension variable no longer reuses the name of
# the global `platform` list.
n_properties = len(property_to_index)
overall_clickout_property = [
    sum(property_counts[i] for property_counts in platform_clickout_property)
    for i in range(n_properties)
]

[696935.0,
 221509.0,
 277203.0,
 352432.0,
 726032.0,
 1002671.0,
 1129595.0,
 503841.0,
 1064493.0,
 1049743.0,
 99668.0,
 736135.0,
 826335.0,
 288534.0,
 688588.0,
 1351484.0,
 751957.0,
 1100796.0,
 664640.0,
 1198885.0,
 91681.0,
 1097348.0,
 1238154.0,
 726850.0,
 552499.0,
 1021798.0,
 78960.0,
 329413.0,
 962151.0,
 1177496.0,
 464350.0,
 359335.0,
 552744.0,
 971063.0,
 1013691.0,
 102449.0,
 198238.0,
 1233120.0,
 992370.0,
 440723.0,
 922610.0,
 890277.0,
 607806.0,
 325925.0,
 1073652.0,
 574789.0,
 125063.0,
 785404.0,
 682776.0,
 332136.0,
 382845.0,
 344802.0,
 1074169.0,
 1186648.0,
 685439.0,
 336943.0,
 207608.0,
 384955.0,
 846440.0,
 1185959.0,
 887852.0,
 893426.0,
 684154.0,
 834452.0,
 324155.0,
 1026853.0,
 384399.0,
 950399.0,
 342717.0,
 554363.0,
 927698.0,
 351093.0,
 531727.0,
 193040.0,
 229683.0,
 468382.0,
 386272.0,
 154984.0,
 74553.0,
 561783.0,
 136493.0,
 450225.0,
 246684.0,
 189270.0,
 174989.0,
 503509.0,
 231507.0,
 252766.0,
 150778.0,
 139747

In [77]:
# Order all property names by their clickout count summed over every platform,
# highest first, and display the ranking.
def _overall_count(property_name):
    """Return the all-platform clickout count of `property_name`."""
    return overall_clickout_property[property_to_index[property_name]]


overall_top_property = sorted(property_to_index, key=_overall_count, reverse=True)
overall_top_property

['Satisfactory Rating',
 'Shower',
 'WiFi (Public Areas)',
 'Television',
 'Car Park',
 'WiFi (Rooms)',
 'Good Rating',
 'Hotel',
 'Luxury Hotel',
 'Business Hotel',
 'Non-Smoking Rooms',
 'From 2 Stars',
 'Reception (24/7)',
 'Air Conditioning',
 'Free WiFi (Combined)',
 'Hairdryer',
 'Desk',
 'Telephone',
 'Openable Windows',
 'Family Friendly',
 'From 3 Stars',
 'Free WiFi (Rooms)',
 'Free WiFi (Public Areas)',
 'Restaurant',
 'Central Heating',
 'Laundry Service',
 'Lift',
 'Computer with Internet',
 'Very Good Rating',
 'Hotel Bar',
 'Conference Rooms',
 'Room Service',
 'Cable TV',
 'Cot',
 'Safe (Hotel)',
 'Satellite TV',
 'Bathtub',
 'Safe (Rooms)',
 'Wheelchair Accessible',
 'Electric Kettle',
 'Terrace (Hotel)',
 'Ironing Board',
 'Swimming Pool (Combined Filter)',
 'Business Centre',
 'Concierge',
 'Pet Friendly',
 'From 4 Stars',
 'Gym',
 'Beach',
 'Fridge',
 'Romantic',
 'Sitting Area (Rooms)',
 'Swimming Pool (Outdoor)',
 'Large Groups',
 'Deck Chairs',
 '3 Star',
 'Radio

In [96]:
import numpy as np

# Persist both rankings to 'Platform-Analysis.npz' under the keys
# 'overall_top_property' and 'platform_top_properties'; the cells further down
# read them back with np.load.
np.savez('Platform-Analysis.npz', overall_top_property=overall_top_property, platform_top_properties=platform_top_properties)

# For the instructor: Try it yourself!

Run the cells below.  
Try changing the country code.

In [89]:
# Platform codes in index order. They are spelled out here so this section can
# be run without rebuilding the mapping from the dataset.
platform_codes = (
    'AA', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG', 'BR', 'CA', 'CH', 'CL',
    'CN', 'CO', 'CZ', 'DE', 'DK', 'EC', 'ES', 'FI', 'FR', 'GR', 'HK',
    'HR', 'HU', 'ID', 'IE', 'IL', 'IN', 'IT', 'JP', 'KR', 'MX', 'MY',
    'NL', 'NO', 'NZ', 'PE', 'PH', 'PL', 'PT', 'RO', 'RS', 'RU', 'SE',
    'SG', 'SI', 'SK', 'TH', 'TR', 'TW', 'UK', 'US', 'UY', 'VN', 'ZA',
)
# Each code maps to its position in the tuple above (0..54).
platform_to_ind = {code: position for position, code in enumerate(platform_codes)}

In [None]:
import numpy as np

In [97]:
# Reload the saved rankings. np.load returns an NpzFile that keeps the archive
# open, so it is used as a context manager to close the file once both arrays
# have been read into memory.
with np.load('Platform-Analysis.npz') as analysis:
    overall_top_property = analysis['overall_top_property']
    platform_top_properties = analysis['platform_top_properties']

In [92]:
# Overall property ranking as loaded from the .npz file (a NumPy array of
# property names, most frequent first).
overall_top_property

array(['Satisfactory Rating', 'Shower', 'WiFi (Public Areas)',
       'Television', 'Car Park', 'WiFi (Rooms)', 'Good Rating', 'Hotel',
       'Luxury Hotel', 'Business Hotel', 'Non-Smoking Rooms',
       'From 2 Stars', 'Reception (24/7)', 'Air Conditioning',
       'Free WiFi (Combined)', 'Hairdryer', 'Desk', 'Telephone',
       'Openable Windows', 'Family Friendly', 'From 3 Stars',
       'Free WiFi (Rooms)', 'Free WiFi (Public Areas)', 'Restaurant',
       'Central Heating', 'Laundry Service', 'Lift',
       'Computer with Internet', 'Very Good Rating', 'Hotel Bar',
       'Conference Rooms', 'Room Service', 'Cable TV', 'Cot',
       'Safe (Hotel)', 'Satellite TV', 'Bathtub', 'Safe (Rooms)',
       'Wheelchair Accessible', 'Electric Kettle', 'Terrace (Hotel)',
       'Ironing Board', 'Swimming Pool (Combined Filter)',
       'Business Centre', 'Concierge', 'Pet Friendly', 'From 4 Stars',
       'Gym', 'Beach', 'Fridge', 'Romantic', 'Sitting Area (Rooms)',
       'Swimming Pool (Out

In [102]:
# Property ranking for a single platform; replace 'US' with any key of
# platform_to_ind to look at another country.
platform_top_properties[platform_to_ind['US']]

array(['Car Park', 'Satisfactory Rating', 'Television',
       'Air Conditioning', 'Shower', 'Business Hotel',
       'Non-Smoking Rooms', 'WiFi (Public Areas)', 'From 2 Stars',
       'Telephone', 'Family Friendly', 'WiFi (Rooms)', 'Luxury Hotel',
       'Hairdryer', 'Reception (24/7)', 'Cable TV', 'Desk', 'Good Rating',
       'Hotel', 'Computer with Internet', 'Ironing Board',
       'Central Heating', 'Wheelchair Accessible', 'Lift',
       'Laundry Service', 'From 3 Stars',
       'Swimming Pool (Combined Filter)', 'Bathtub', 'Conference Rooms',
       'Gym', 'Business Centre', 'Electric Kettle', 'Romantic',
       'Safe (Hotel)', 'Cot', 'Very Good Rating', 'Radio', 'Restaurant',
       'Free WiFi (Combined)', 'Hotel Bar', 'Swimming Pool (Outdoor)',
       'Openable Windows', 'Deck Chairs', 'Fridge',
       'Free WiFi (Public Areas)', 'Room Service', 'Free WiFi (Rooms)',
       'Concierge', 'Express Check-In / Check-Out', 'Satellite TV',
       'Terrace (Hotel)', 'Safe (Rooms)', '