# Load data

In [None]:
import csv
import copy
import pickle
import pprint
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data.dataset import random_split

from collections import Counter
from tqdm import tqdm

In [None]:
# Load the four CSV inputs from the data/ directory into DataFrames.
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
item_df = pd.read_csv('data/item_metadata.csv')
submission_df = pd.read_csv('data/submission_popular.csv')

---

### Please refer to https://recsys.trivago.cloud/challenge/dataset/ for a better understanding of <i>item metadata</i> and <i>session actions</i>

# Make 'item encoding' dictionary using <i>item metadata</i>

In [None]:
# Flatten every item's pipe-separated property string into one list, then
# count how often each distinct property occurs.
properties = [
    prop
    for i in range(len(item_df))
    for prop in item_df['properties'][i].split("|")
]
property_count = Counter(properties)
# Distinct properties, in first-seen order (used as one-hot columns later).
property_set = list(property_count)

In [None]:
# One row per item, one column per distinct property, initialised to 0.
onehot_df = pd.DataFrame(
    np.zeros([len(item_df), len(property_set)]),
    index=item_df['item_id'],
    columns=property_set,
)
for i, row in tqdm(item_df.iterrows()):
    item_id = row['item_id']
    properties = row['properties'].split("|")
    # Set the flags with ONE .loc call. The previous chained form
    # `onehot_df.loc[item_id][properties] = 1` assigns into an intermediate
    # row object, which pandas does not guarantee to be a view of the frame
    # (and never is under copy-on-write), so the table could stay all zeros.
    onehot_df.loc[item_id, properties] = 1

In [None]:
# Training hyper-parameters for the item auto-encoder.
batch_size = 1024
num_epochs = 50
learning_rate = 5e-3
# Reconstruction loss (mean absolute error); read as a global by fit().
criterion = nn.L1Loss()

# The one-hot property matrix is the training set; each batch is a slice of rows.
dataset = onehot_df.values
loader = torch.utils.data.DataLoader(dataset=torch.tensor(dataset), batch_size=batch_size, shuffle=True)

In [None]:
class SimpleAE(nn.Module):
    """Single-hidden-layer auto-encoder.

    The encoder is ``Linear -> Tanh`` and the decoder is ``Linear -> Sigmoid``.

    Args:
        input_dim: Width of the input (and of the reconstruction).
            Defaults to 157, the value previously hard-coded.
        hidden_dim: Width of the encoded representation.
            Defaults to 32, the value previously hard-coded.
    """

    def __init__(self, input_dim=157, hidden_dim=32):
        super().__init__()
        self.enc = nn.Linear(input_dim, hidden_dim)
        self.enc_act = nn.Tanh()
        self.dec = nn.Linear(hidden_dim, input_dim)
        self.dec_act = nn.Sigmoid()

    def forward(self, x):
        """Return ``(encoded, decoded)`` for the input batch ``x``."""
        encoded = self.enc_act(self.enc(x))
        decoded = self.dec_act(self.dec(encoded))
        return encoded, decoded

In [None]:
def fit(model, train_loader, learning_rate, num_epochs, loss_fn=None):
    """Train ``model`` to reconstruct the batches yielded by ``train_loader``.

    Uses Adam with a MultiStepLR schedule that multiplies the learning rate
    by 0.2 after epochs 20 and 40.

    Args:
        model: Module whose forward pass returns a sequence; element ``[1]``
            is taken as the reconstruction of the input batch.
        train_loader: Iterable yielding input tensors.
        learning_rate: Initial Adam learning rate.
        num_epochs: Number of passes over ``train_loader``.
        loss_fn: Loss called as ``loss_fn(reconstruction, batch)``. When
            omitted, the module-level ``criterion`` is used, as before.

    Returns:
        A list with the mean batch loss of every epoch (NaN for an epoch in
        which the loader yielded no batch).
    """
    if loss_fn is None:
        loss_fn = criterion
    # Run on whatever device the model lives on instead of forcing CUDA.
    device = next(model.parameters()).device

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = lr_scheduler.MultiStepLR(optimizer, [20, 40], gamma=0.2)
    epoch_losses = []
    for epoch in range(num_epochs):
        model.train()
        losses = []
        for data in train_loader:
            item_meta = data.to(device=device, dtype=torch.float32)
            recon_item_meta = model(item_meta)[1]
            loss = loss_fn(recon_item_meta, item_meta)

            losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        scheduler.step()
        epoch_losses.append(sum(losses) / len(losses) if losses else float('nan'))
    return epoch_losses

In [None]:
# Build the auto-encoder on the GPU and train it on the one-hot item properties.
simple_ae = SimpleAE().cuda()
fit(simple_ae, loader, learning_rate, num_epochs)

In [None]:
# Re-create the loader WITHOUT shuffling so that encodings come out in the
# same row order as `dataset`.
loader = torch.utils.data.DataLoader(dataset=torch.tensor(dataset), batch_size=batch_size, shuffle=False)
simple_ae.eval()
encoding_lst = []
# Inference only: disable autograd so no computation graph is built.
with torch.no_grad():
    for i, data in enumerate(loader):
        item_meta = data.type(torch.FloatTensor).cuda()
        encoding, decoding = simple_ae(item_meta)

        encoding_lst.extend(encoding.cpu().numpy().tolist())
# Rows follow the loader order: one encoded vector per input row.
encoding_lst = np.array(encoding_lst)

In [None]:
# Pair every index label of the one-hot table with the encoding at the same
# row position.
item_encoding_dict = {
    item_id: encoding_lst[i] for i, item_id in enumerate(onehot_df.index)
}

In [None]:
# Persist the item_id -> encoding mapping; it is read back later in this file.
with open("data/item_encoding_dict.pickle", "wb") as f:
    pickle.dump(item_encoding_dict, f)

# Check the general properties of clicked items and make those properties dictionaries using <i>item metadata</i>
### These properties were already used for the item encoding, but they are important features, so we also use them directly, in their un-encoded form.

In [None]:
# Rows that carry an impressions list; their reference is read as an item id.
impressions_idx = train_df.index[train_df['impressions'].notna()]
# Distinct referenced item ids, as plain ints.
clicked_items = list({int(ref) for ref in train_df.loc[impressions_idx, 'reference']})

In [None]:
# Build the item_id -> properties lookup once instead of scanning the whole
# metadata frame for every clicked item. setdefault keeps the FIRST row of a
# repeated item_id, matching the previous `[...][0]` selection.
properties_by_item = {}
for meta_item_id, meta_properties in zip(item_df['item_id'], item_df['properties']):
    properties_by_item.setdefault(meta_item_id, meta_properties)

clicked_item_properties = []
for item in tqdm(clicked_items):
    item_properties = properties_by_item.get(item)
    # Items without metadata (or with a non-string properties cell) are
    # skipped explicitly rather than through a bare `except: pass`.
    if isinstance(item_properties, str):
        clicked_item_properties += item_properties.split("|")
clicked_item_property_count = Counter(clicked_item_properties)

In [None]:
# Show only the properties that occur more than 150000 times among clicked items.
for prop_name, occurrences in clicked_item_property_count.items():
    if occurrences > 150000:
        print(prop_name, occurrences)

In [None]:
# Per-item lookup tables (item_id -> value) for a handful of properties.
item_hotel, item_star = {}, {}
item_tv, item_shower, item_window = {}, {}, {}
item_car, item_wifi, item_nosmoke = {}, {}, {}

for i in tqdm(range(len(item_df))):
    item_id = item_df['item_id'][i]
    properties = item_df['properties'][i].split("|")
    # Six-character properties containing 'Star'; the first character is
    # parsed as the star count below.
    stars = [p for p in properties if 'Star' in p and len(p)==6]

    # Stars are only recorded for items tagged 'Hotel'; everything else gets 0.
    is_hotel = 'Hotel' in properties
    item_hotel[item_id] = 1 if is_hotel else 0
    item_star[item_id] = int(stars[0][0]) if (is_hotel and len(stars) > 0) else 0

    item_tv[item_id] = int('Television' in properties)
    item_shower[item_id] = int('Shower' in properties)
    item_window[item_id] = int('Openable Windows' in properties)
    item_car[item_id] = int('Car Park' in properties)
    # WiFi counts if it is available in public areas OR in rooms.
    has_wifi = 'WiFi (Public Areas)' in properties or 'WiFi (Rooms)' in properties
    item_wifi[item_id] = 1 if has_wifi else 0
    item_nosmoke[item_id] = int('Non-Smoking Rooms' in properties)

# Make 'rating' dictionary using <i>item metadata</i>

In [None]:
# Distinct properties over all items, then keep those whose name mentions 'Rating'.
properties = list({
    prop
    for i in range(len(item_df))
    for prop in item_df['properties'][i].split("|")
})
ratings = [prop for prop in properties if 'Rating' in prop]
print(ratings)

In [None]:
# Rating score of an item = 1 + number of its properties that are rating properties.
item_rating = {}
for i in range(len(item_df)):
    item_id = item_df['item_id'][i]
    properties = item_df['properties'][i].split("|")
    item_rating[item_id] = 1 + sum(1 for p in properties if p in ratings)

---

# Make 'viewed' feature
### Binary check on items that the user has seen before (before session or before step within session)

In [None]:
def _viewed_flags(df, user_items):
    """Build the 'viewed' column for ``df`` and update ``user_items`` in place.

    For every 'clickout item' row the result holds a pipe-separated string
    with one flag per impression: '1' if the user already referenced that
    item in an earlier row, else '0'. Every other row gets NaN.

    Args:
        df: Frame with 'user_id', 'action_type', 'reference' and
            'impressions' columns, processed in row order.
        user_items: dict mapping user_id to the list of item ids the user
            has referenced so far; extended as rows are consumed.

    Returns:
        A list with one entry per row of ``df``.
    """
    flags = []
    rows = zip(df['user_id'], df['action_type'], df['reference'], df['impressions'])
    for u_id, action, ref, impressions in tqdm(rows, total=len(df)):
        seen = user_items[u_id]
        if action == "clickout item":
            flags.append("|".join("1" if int(item) in seen else "0" for item in impressions.split("|")))
        else:
            flags.append(np.nan)

        # The flags above are computed BEFORE the current reference is
        # recorded, so a row never counts its own reference as viewed.
        try:
            ref_id = int(ref)
        except (ValueError, TypeError):
            # Non-numeric or missing reference (int() cannot parse it):
            # nothing to record.
            continue
        if ref_id not in seen:
            seen.append(ref_id)
    return flags


# Every user from either split starts with an empty history; the history
# built on the train rows is carried over to the test rows.
user_lst = list(set(train_df['user_id']) | set(test_df['user_id']))
user_items = {user: [] for user in user_lst}

viewed_lst = _viewed_flags(train_df, user_items)
train_df['viewed'] = viewed_lst

viewed_lst = _viewed_flags(test_df, user_items)
test_df['viewed'] = viewed_lst

# Delete the parts of the train dataset that are meaningless for us
### delete sessions that have no 'clickout' action and delete the steps after the last 'clickout' action within a session.

In [None]:
def _rows_up_to_last_clickout(first_idx, actions):
    """Return the row indices of one session up to its last 'clickout item'.

    Args:
        first_idx: Row index of the session's first step.
        actions: The session's action types, in step order.

    Returns:
        ``list(range(first_idx, last_clickout_row + 1))``, or an empty list
        when the session contains no clickout.
    """
    clickout_idx = np.where(np.array(actions) == 'clickout item')[0]
    if len(clickout_idx) == 0:
        return []
    return list(range(first_idx, first_idx + clickout_idx[-1] + 1))


act_lst = []
selected_idx = []

for i in tqdm(range(len(train_df))):
    step = train_df['step'][i]
    # A step value of 1 marks the start of a new session: close the previous one.
    if i != 0 and step == 1:
        selected_idx += _rows_up_to_last_clickout(i - len(act_lst), act_lst)
        act_lst = []
    act_lst.append(train_df['action_type'][i])

# The last session is never followed by another "step == 1" row, so it has to
# be closed explicitly. This replaces a hard-coded row range that was only
# valid for one specific train.csv.
if act_lst:
    selected_idx += _rows_up_to_last_clickout(len(train_df) - len(act_lst), act_lst)

In [None]:
# Keep only the selected rows and renumber them 0..n-1.
new_train_df = train_df.loc[selected_idx].reset_index(drop=True)

# Delete the parts of the test dataset that are meaningless for us
### delete sessions that have no 'clickout item' action whose reference is NaN

In [None]:
selected_idx = []
# Defined up front so a frame that does not begin with step 1 cannot hit an
# unbound name below.
first_idx = 0
for i in tqdm(range(len(test_df))):
    step = test_df['step'][i]
    action_type = test_df['action_type'][i]
    ref = test_df['reference'][i]
    if step == 1:
        first_idx = i
    # pd.isna tests the VALUE; `ref is np.nan` only matched when the cell
    # happened to hold the very same NaN object.
    if action_type == 'clickout item' and pd.isna(ref):
        last_idx = i
        # Keep the session from its first step up to this clickout.
        selected_idx += list(range(first_idx, last_idx+1))

In [None]:
# Keep only the selected rows and renumber them 0..n-1.
new_test_df = test_df.loc[selected_idx].reset_index(drop=True)

---

# Add item features from dictionaries

In [None]:
def get_features(df, item_dict):
    """Look up one value per impressed item for every row that has impressions.

    Args:
        df: Frame with an 'impressions' column of pipe-separated item ids.
            Index labels are used as list positions, so the index is
            expected to be 0..n-1 (the callers in this file reset it first).
        item_dict: Mapping from int item id to the feature value.

    Returns:
        A list with one entry per row of ``df``: a pipe-separated string of
        the looked-up values for rows with impressions, NaN otherwise. Items
        that are missing from ``item_dict`` (or are not integers) contribute
        '0'.
    """
    impressions_idx = df[~df['impressions'].isna()].index
    features_lst = []
    prev_idx = -1
    for idx in tqdm(impressions_idx):
        tmp_features = []
        for impression in df['impressions'][idx].split("|"):
            try:
                value = item_dict[int(impression)]
            except (KeyError, ValueError):
                # Unknown item or non-integer id: fall back to 0.
                value = 0
            tmp_features.append(str(value))
        # NaN for every row skipped since the previous row with impressions.
        features_lst += [np.nan] * ((idx - 1) - prev_idx)
        features_lst.append("|".join(tmp_features))
        prev_idx = idx
    # Rows after the last row with impressions also need an entry; without
    # this the result was shorter than the frame.
    features_lst += [np.nan] * (len(df) - len(features_lst))
    return features_lst

In [None]:
# Column name and lookup dictionary, paired by position.
names = ['ratings', 'hotel', 'star', 'tv', 'shower', 'window', 'car', 'wifi', 'nosmoke']
features_dict = [item_rating, item_hotel, item_star, item_tv, item_shower, item_window, item_car, item_wifi, item_nosmoke]

# Add one new column per feature to both splits.
for name, feature_dict in zip(names, features_dict):
    new_train_df[name] = get_features(new_train_df, feature_dict)
    new_test_df[name] = get_features(new_test_df, feature_dict)

# Add 'resident time' feature

In [None]:
def add_resident_time(df):
    """Return, per row, the timestamp gap to the next row.

    The last row of every session gets the sentinel -9999 instead. Session
    starts are the rows whose 'step' equals 1, and their index labels are
    used as array positions.
    """
    stamps = np.array(df['timestamp'])
    # Gap to the following row; the final row has no successor.
    resident_time_lst = np.append(np.diff(stamps), [-9999])

    # The row just before each session start closes the previous session.
    # The first start has no predecessor and is dropped; the frame's final
    # row closes the last session.
    session_starts = np.array(df.index[(df['step'] == 1).values])
    session_ends = np.append(np.delete(session_starts - 1, 0), df.index[-1])

    resident_time_lst[session_ends] = -9999
    return resident_time_lst

In [None]:
# Add the per-row time gap (-9999 on each session's last row) to both splits.
new_train_df['resident_time'] = add_resident_time(new_train_df)
new_test_df['resident_time'] = add_resident_time(new_test_df)

# Add 'price difference' feature and 'rating difference' feature
### 'price difference' is the difference between the price of each accommodation on the screen and the average price of the items that the user has seen within the same session
### 'rating difference' is the binary feature. The value is 1 if the rating of the accommodations on the screen is equal to or greater than the minimum rating that the user has seen within the same session, otherwise 0.
### If user did not see any item, these features are filled with 0.

In [None]:
def copy_features(df):
    """Back-fill 'impressions', 'prices' and 'ratings' from the next row that has them.

    Every row with impressions donates its three values to itself and to all
    rows since the previous row with impressions. Index labels are used as
    positions.

    Returns:
        Three lists: (impressions, prices, ratings).
    """
    has_impressions = df['impressions'].notna()
    impressions_lst, prices_lst, ratings_lst = [], [], []
    prev_idx = -1
    for idx in tqdm(df.index[has_impressions.values]):
        # Number of rows covered by this donor row, itself included.
        repeat = idx - prev_idx
        impressions_lst.extend([df['impressions'][idx]] * repeat)
        prices_lst.extend([df['prices'][idx]] * repeat)
        ratings_lst.extend([df['ratings'][idx]] * repeat)
        prev_idx = idx
    return impressions_lst, prices_lst, ratings_lst

In [None]:
# Overwrite the three columns with their back-filled versions in both splits.
new_train_df['impressions'], new_train_df['prices'], new_train_df['ratings'] = copy_features(new_train_df)
new_test_df['impressions'], new_test_df['prices'], new_test_df['ratings'] = copy_features(new_test_df)

In [None]:
def calculate_diff(df):
    """Compute the 'diff_prices' and 'diff_ratings' columns.

    Rows whose resident_time is not -9999 get NaN in both outputs; when
    their reference is among the impressions, its price and rating are
    remembered. A row with resident_time -9999 gets, per impression, the
    price minus the average remembered price, and '1'/'0' for whether the
    rating reaches the minimum remembered rating (all '0' when nothing was
    remembered). The memory is then cleared.

    Returns:
        Two lists with one entry per row: (diff_prices, diff_ratings).
    """
    diff_price_lst, diff_rating_lst = [], []
    ref_price, ref_rating = {}, {}
    for i in tqdm(range(len(df))):
        ref = df['reference'][i]
        impressions = df['impressions'][i].split("|")
        prices = df['prices'][i].split("|")
        ratings = df['ratings'][i].split("|")
        resident_time = df['resident_time'][i]

        if resident_time != -9999:
            # Intermediate step: no output, just remember what was referenced.
            diff_price_lst.append(np.nan)
            diff_rating_lst.append(np.nan)
            if ref in impressions:
                pos = impressions.index(ref)
                ref_price[ref] = int(prices[pos])
                ref_rating[ref] = int(ratings[pos])
            continue

        if ref_price:
            avg_price = sum(ref_price.values())/len(ref_price)
            min_rating = min(ref_rating.values())
            price_parts = [str(int(p)-avg_price) for p in prices]
            rating_parts = [str(1) if int(r)>=min_rating else str(0) for r in ratings]
        else:
            price_parts = [str(0)]*len(prices)
            rating_parts = [str(0)]*len(ratings)
        diff_price_lst.append('|'.join(price_parts))
        diff_rating_lst.append('|'.join(rating_parts))
        # Start the next session with an empty memory.
        ref_price, ref_rating = {}, {}
    return diff_price_lst, diff_rating_lst

In [None]:
# Add the session-relative price and rating differences to both splits.
new_train_df['diff_prices'], new_train_df['diff_ratings'] = calculate_diff(new_train_df)
new_test_df['diff_prices'], new_test_df['diff_ratings'] = calculate_diff(new_test_df)

# Delete the steps whose on-screen items differ, or might differ, from those of the last step.
### case 1. the impression of the step is different from that of the last step
### case 2. 'change of sort, filter selection, search for item, search for destination, search for poi' actions happen

In [None]:
def filter_diff_impressions(df):
    """Select, per session, the trailing run of rows sharing the last step's screen.

    The run restarts at a row when its step is 1, when its action is a
    sort/filter/search action, or when its impressions differ from the
    previous row's. Each row with resident_time -9999 closes a run.

    Returns:
        List of the selected row indices.
    """
    reset_actions = ['change of sort order', 'filter selection', 'search for item', 'search for destination', 'search for poi']
    selected_idx = []
    prev_impressions = ""
    for i, row in tqdm(df.iterrows()):
        step = row['step']
        action_type = row['action_type']
        curr_impressions = row['impressions']
        resident_time = row['resident_time']

        restarts_run = (
            step == 1
            or action_type in reset_actions
            or prev_impressions != curr_impressions
        )
        if restarts_run:
            first_idx = i
        prev_impressions = curr_impressions

        if resident_time == -9999:
            selected_idx.extend(range(first_idx, i + 1))

    return selected_idx

In [None]:
# Keep only the rows picked by the impressions filter and renumber them.
selected_idx = filter_diff_impressions(new_train_df)
filtered_train_df = new_train_df.loc[selected_idx].reset_index(drop=True)

In [None]:
# Keep only the rows picked by the impressions filter and renumber them.
selected_idx = filter_diff_impressions(new_test_df)
filtered_test_df = new_test_df.loc[selected_idx].reset_index(drop=True)

---

# Use the average price per nation or city

In [None]:
# The nation is the second ", "-separated part of the city string.
nation_lst = [
    filtered_train_df['city'][i].split(", ")[1]
    for i in tqdm(range(len(filtered_train_df)))
]
filtered_train_df['nation'] = nation_lst

In [None]:
# The nation is the second ", "-separated part of the city string.
test_nation_lst = [
    filtered_test_df['city'][i].split(", ")[1]
    for i in tqdm(range(len(filtered_test_df)))
]
filtered_test_df['nation'] = test_nation_lst

In [None]:
nation_price_dict = {}
nations = list(set(nation_lst))
# Only the last step of each session (resident_time == -9999) contributes prices.
last_step = filtered_train_df[filtered_train_df['resident_time']==-9999]

# Bucket the displayed prices by nation in ONE pass over the last-step rows,
# instead of re-filtering the whole frame once per nation.
prices_by_nation = {nation: [] for nation in nations}
for row_nation, row_prices in zip(last_step['nation'], last_step['prices']):
    bucket = prices_by_nation.get(row_nation)
    if bucket is not None:
        bucket.extend(int(p) for p in row_prices.split("|"))

# nation -> [mean price, median price]; a nation without any last-step row
# yields NaN for both, as before.
for nation in tqdm(nations):
    prices_all = prices_by_nation[nation]
    nation_price_dict[nation] = [np.mean(prices_all), np.median(prices_all)]

In [None]:
city_price_dict = {}
cities = list(set(filtered_train_df['city']))
# Only the last step of each session (resident_time == -9999) contributes prices.
last_step = filtered_train_df[filtered_train_df['resident_time']==-9999]

# Bucket the displayed prices by city in ONE pass over the last-step rows,
# instead of re-filtering the whole frame once per city.
prices_by_city = {city: [] for city in cities}
for row_city, row_prices in zip(last_step['city'], last_step['prices']):
    bucket = prices_by_city.get(row_city)
    if bucket is not None:
        bucket.extend(int(p) for p in row_prices.split("|"))

# city -> [mean price, median price]; a city without any last-step row
# yields NaN for both, as before.
for city in tqdm(cities):
    prices_all = prices_by_city[city]
    city_price_dict[city] = [np.mean(prices_all), np.median(prices_all)]

In [None]:
def get_diff_prices(df):
    """Compute price differences against the city and nation statistics.

    For every row with resident_time -9999, each displayed price is compared
    with the mean and the median stored in the module-level
    ``city_price_dict`` and ``nation_price_dict``. All other rows get NaN.

    If the row's city or nation has no stored statistics, or a price is not
    an integer, all four outputs for that row are filled with '0' per price.

    Returns:
        Four lists with one entry per row: (diff_city_mean,
        diff_city_median, diff_nation_mean, diff_nation_median).
    """
    diff_city_mean_lst = []
    diff_city_median_lst = []
    diff_nation_mean_lst = []
    diff_nation_median_lst = []
    outputs = (diff_city_mean_lst, diff_city_median_lst, diff_nation_mean_lst, diff_nation_median_lst)

    for i, row in tqdm(df.iterrows()):
        if row['resident_time'] != -9999:
            for out in outputs:
                out.append(np.nan)
            continue

        prices = row['prices'].split("|")
        try:
            city_stats = city_price_dict[row['city']]
            nation_stats = nation_price_dict[row['nation']]
            price_values = [int(p) for p in prices]
        except (KeyError, ValueError):
            # Unknown city/nation or unparsable price: neutral zeros.
            zeros = '|'.join([str(0)]*len(prices))
            for out in outputs:
                out.append(zeros)
            continue

        # Same order as `outputs`: city mean, city median, nation mean, nation median.
        baselines = (city_stats[0], city_stats[1], nation_stats[0], nation_stats[1])
        for out, baseline in zip(outputs, baselines):
            out.append('|'.join([str(value - baseline) for value in price_values]))
    return diff_city_mean_lst, diff_city_median_lst, diff_nation_mean_lst, diff_nation_median_lst

In [None]:
# Add the four city/nation price-difference columns to both splits.
filtered_train_df['diff_city_mean'], filtered_train_df['diff_city_median'], filtered_train_df['diff_nation_mean'], filtered_train_df['diff_nation_median'] = get_diff_prices(filtered_train_df)
filtered_test_df['diff_city_mean'], filtered_test_df['diff_city_median'], filtered_test_df['diff_nation_mean'], filtered_test_df['diff_nation_median'] = get_diff_prices(filtered_test_df)

In [None]:
# Write both feature frames to disk without the index column.
filtered_train_df.to_csv('data/train_final.csv', index=False)
filtered_test_df.to_csv('data/test_final.csv', index=False)

---

# From dataframe to the data actually used

In [None]:
# Reload the saved feature files; this rebinds train_df / test_df.
train_df = pd.read_csv('data/train_final.csv')
test_df = pd.read_csv('data/test_final.csv')

In [None]:
# Load the item encodings written earlier in this file.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# it on a pickle produced by this notebook.
with open('data/item_encoding_dict.pickle', 'rb') as f:
    item_encoding_dict = pickle.load(f)

In [None]:
# Count the references of all 'filter selection' rows and show those seen
# more than 2000 times.
filter_rows = train_df[train_df['action_type'] == 'filter selection']
filter_criteria = Counter(filter_rows['reference'])
for filter_name, occurrences in filter_criteria.items():
    if occurrences > 2000:
        print(filter_name, occurrences)

In [None]:
# Keywords matched as substrings against the active filters; the list order
# fixes the positions of the 12 criteria flags built in create_train_data.
common_filters = ['Price', 'Rating', 'Distance', 'Value', 'Hotel', 'Star', 'Hostal', 'Motel', 'Apartment', 'Breakfast', 'WiFi', 'Park']  

In [None]:
# All action types, in the order that defines their one-hot position.
action_type_lst = [
    'clickout item',
    'interaction item rating',
    'interaction item info',
    'interaction item image',
    'interaction item deals',
    'change of sort order',
    'filter selection',
    'search for item',
    'search for destination',
    'search for poi',
]
# Listed separately from action_type_lst; create_train_data checks membership.
numeric_action_type = [
    'clickout item',
    'interaction item rating',
    'interaction item info',
    'interaction item image',
    'interaction item deals',
    'search for item',
]

# action type -> 10-element one-hot list.
action_type_one_hot = {
    action: [1 if slot == position else 0 for slot in range(len(action_type_lst))]
    for position, action in enumerate(action_type_lst)
}
print(action_type_one_hot)

In [None]:
def _pad_right(values, size=25):
    """Return ``values`` right-padded with zeros to ``size`` entries.

    Lists that already hold ``size`` or more entries are returned unchanged.
    """
    return values + [0] * (size - len(values))


def create_train_data():
    """Turn 'data/train_final.csv' into the pickled training inputs.

    Rows are read in order. Each row becomes a 36-value step embedding
    (10 action one-hot + 25 reference-position one-hot + 1 scaled resident
    time). Steps whose action is in ``numeric_action_type`` are only kept
    when their reference is among the impressions. When a row with
    resident_time "-9999" is reached and its reference is among the
    impressions, one sample is emitted:

    * the last 50 preceding step embeddings, left-padded with zero vectors;
    * 18 per-item feature lists padded to 25 entries;
    * the 32 x 25 item-encoding matrix (zeros for unknown or missing items);
    * 12 flags telling which ``common_filters`` keywords are active;
    * a 3-way one-hot of the device (desktop / mobile / anything else);
    * the label: position of the reference within the impressions.

    Reads the module-level ``action_type_one_hot``, ``numeric_action_type``,
    ``common_filters`` and ``item_encoding_dict``. Writes six pickle files
    under data/ and returns None.
    """
    EMBEDDING_SIZE = 10+25+1
    THRESHOLD_TIME = 30.0
    # CSV column positions of the per-item features, in output order.
    int_feature_cols = (14, 15, 16, 17, 18, 19, 20, 21, 13, 12)  # hotel..nosmoke, ratings, viewed
    float_feature_cols = (23, 24, 26, 27, 28, 29)  # diff_prices, diff_ratings, diff_city_*, diff_nation_*

    sequence_lst = []
    train_lst = []
    display_lst = []
    encoding_lst = []
    device_lst = []
    criteria_lst = []
    label_lst = []

    with open('data/train_final.csv', encoding="utf-8") as train_f:
        rdr = csv.reader(train_f)
        next(rdr)  # skip the header row

        for line in rdr:
            action_type = line[4]
            reference = line[5]
            # Kept as a plain string: splitting it into a list made the
            # comparisons with "desktop"/"mobile" below always fail.
            device = line[8]
            current_filters = line[9]
            impressions = line[10].split("|")
            resident_time = line[22]

            action_embedding = action_type_one_hot[action_type]
            ref_id_embedding = [0]*25
            ref_on_screen = reference in impressions
            if ref_on_screen:
                ref_id_embedding[impressions.index(reference)] += 1.

            time_embedding = [min([int(resident_time)+1, THRESHOLD_TIME])/THRESHOLD_TIME]
            step_embedding = action_embedding + ref_id_embedding + time_embedding

            # Item-related actions only count when the item is on screen.
            if action_type not in numeric_action_type or ref_on_screen:
                sequence_lst.append(step_embedding)

            if resident_time != "-9999" or not sequence_lst:
                continue

            # Last step of the session: drop the most recently appended
            # embedding so the sample only holds the steps before the label.
            sequence_lst.pop()
            if ref_on_screen:
                sequence_lst = sequence_lst[-50:]
                sequence_lst = [([0] * EMBEDDING_SIZE) for _ in range((50 - len(sequence_lst)))] + sequence_lst
                train_lst.append(sequence_lst)

                features = [
                    _pad_right(list(range(1, len(impressions)+1))),
                    _pad_right([float(p) for p in line[11].split("|")]),
                ]
                features += [_pad_right([int(x) for x in line[col].split("|")]) for col in int_feature_cols]
                features += [_pad_right([float(x) for x in line[col].split("|")]) for col in float_feature_cols]
                display_lst.append(features)

                item_encoding = []
                for j in range(25):
                    try:
                        item_encoding.append(item_encoding_dict[int(impressions[j])].tolist())
                    except (IndexError, KeyError, ValueError):
                        # Fewer than 25 impressions, or no encoding for the item.
                        item_encoding.append([0.]*32)
                # Stored feature-major: 32 rows of 25 item values.
                item_encoding = np.transpose(np.array(item_encoding), (1,0)).tolist()
                encoding_lst.append(item_encoding)

                active_filters = current_filters.split("|")
                filtered_criteria = [0]*12
                for pos, keyword in enumerate(common_filters):
                    if any(keyword in fil for fil in active_filters):
                        filtered_criteria[pos] = 1
                criteria_lst.append(filtered_criteria)

                if device == "desktop":
                    device_lst.append([1,0,0])
                elif device == "mobile":
                    device_lst.append([0,1,0])
                else:
                    device_lst.append([0,0,1])

                label_lst.append(impressions.index(reference))
            sequence_lst = []

    outputs = (
        ('data/train_final.pickle', train_lst),
        ('data/train_display_final.pickle', display_lst),
        ('data/train_encoding_final.pickle', encoding_lst),
        ('data/train_criteria_final.pickle', criteria_lst),
        ('data/train_device_final.pickle', device_lst),
        ('data/label_final.pickle', label_lst),
    )
    for path, payload in outputs:
        with open(path, 'wb') as f:
            pickle.dump(payload, f)

In [None]:
def create_test_data():
    """Convert ``data/test_final.csv`` into padded feature lists and pickle them.

    Every CSV row contributes one step embedding to the running session
    sequence: ``action_type_one_hot[action_type]`` followed by a 25-slot
    one-hot of the reference's position in the impressions and the value
    ``min(int(resident_time) + 1, 30) / 30``. Rows whose action type is in
    ``numeric_action_type`` are only added when the reference appears in the
    impressions.

    A row whose ``resident_time`` column equals ``"-9999"`` closes the running
    sequence. For such a row the function records:

    * the sequence, truncated to its last 50 steps and left-padded with zeros,
    * 18 per-impression feature rows, each right-padded with zeros to 25,
    * the 32-dim item encodings of the impressions, transposed to (32, 25),
    * a 12-slot flag vector of which ``common_filters`` occur in the filters,
    * a 3-slot device one-hot (desktop, mobile, anything else),
    * ``[u_id, s_id, timestamp, step, impressions]`` for later submission.

    Relies on the module-level names ``action_type_one_hot``,
    ``numeric_action_type``, ``item_encoding_dict`` and ``common_filters``.

    Side effects: writes ``test_final``, ``test_display_final``,
    ``test_encoding_final``, ``test_criteria_final``, ``test_device_final``
    and ``line_final`` pickles under ``data/``. Returns ``None``.
    """
    # presumably 10 action types + 25 impression slots + 1 time value
    EMBEDDING_SIZE = 10+25+1
    THRESHOLD_TIME = 30.0
    MAX_SEQ_LEN = 50
    NUM_IMPRESSIONS = 25
    ENCODING_SIZE = 32
    NUM_FILTERS = 12

    # CSV columns holding "|"-separated per-impression values, listed in the
    # order they appear in the feature matrix (after idx and prices).
    # hotel, star, tv, shower, window, car, wifi, nosmoke, ratings, viewed
    INT_COLUMNS = (14, 15, 16, 17, 18, 19, 20, 21, 13, 12)
    # diff_prices, diff_ratings, diff_city_mean, diff_city_median,
    # diff_nation_mean, diff_nation_median
    FLOAT_COLUMNS = (23, 24, 26, 27, 28, 29)

    def split_and_pad(raw, cast):
        """Split a "|"-joined cell, convert each token, zero-pad to 25 entries."""
        values = [cast(token) for token in raw.split("|")]
        return values + [0] * (NUM_IMPRESSIONS - len(values))

    sequence_lst = []
    test_lst = []
    display_lst = []
    encoding_lst = []
    criteria_lst = []
    device_lst = []
    line_lst = []

    with open('data/test_final.csv', encoding="utf-8") as test_f:
        rdr = csv.reader(test_f)
        next(rdr)  # skip the header row

        for line in rdr:
            u_id, s_id, timestamp, step, action_type, reference = line[:6]
            # The device column is a single token. The previous code split it
            # into a list, so the string comparisons below never matched and
            # every row was encoded as [0, 0, 1].
            # NOTE(review): create_train_data's device parsing is not visible
            # here; it must parse this column the same way or the train/test
            # device features will diverge -- confirm.
            device = line[8]
            current_filters = line[9]
            impressions = line[10].split("|")
            resident_time = line[22]

            ref_id_embedding = [0] * NUM_IMPRESSIONS
            reference_shown = reference in impressions
            if reference_shown:
                ref_id_embedding[impressions.index(reference)] += 1.

            time_embedding = [min([int(resident_time)+1, THRESHOLD_TIME])/THRESHOLD_TIME]
            step_embedding = action_type_one_hot[action_type] + ref_id_embedding + time_embedding

            # Item-referencing actions only count when the item was displayed.
            if reference_shown or action_type not in numeric_action_type:
                sequence_lst.append(step_embedding)

            if resident_time != "-9999":
                continue

            # --- this row closes the sequence: emit one sample ---
            sequence_lst = sequence_lst[-MAX_SEQ_LEN:]
            left_pad = [[0] * EMBEDDING_SIZE for _ in range(MAX_SEQ_LEN - len(sequence_lst))]
            test_lst.append(left_pad + sequence_lst)

            idx_lst = list(range(1, len(impressions)+1))
            idx_lst += [0] * (NUM_IMPRESSIONS - len(idx_lst))
            features = [idx_lst, split_and_pad(line[11], float)]
            features += [split_and_pad(line[col], int) for col in INT_COLUMNS]
            features += [split_and_pad(line[col], float) for col in FLOAT_COLUMNS]
            display_lst.append(features)

            item_encoding = []
            for j in range(NUM_IMPRESSIONS):
                try:
                    item_encoding.append(item_encoding_dict[int(impressions[j])].tolist())
                except (IndexError, ValueError, KeyError):
                    # fewer than 25 impressions, a non-numeric id, or an item
                    # without an encoding: fall back to an all-zero vector
                    item_encoding.append([0.] * ENCODING_SIZE)
            item_encoding = np.transpose(np.array(item_encoding), (1, 0)).tolist()
            encoding_lst.append(item_encoding)

            # Substring match of each common filter against the active filters.
            active_filters = current_filters.split("|")
            filtered_criteria = [0] * NUM_FILTERS
            for i in range(min(len(common_filters), NUM_FILTERS)):
                if any(common_filters[i] in fil for fil in active_filters):
                    filtered_criteria[i] = 1
            criteria_lst.append(filtered_criteria)

            if device == "desktop":
                device_lst.append([1, 0, 0])
            elif device == "mobile":
                device_lst.append([0, 1, 0])
            else:
                device_lst.append([0, 0, 1])

            line_lst.append([u_id, s_id, timestamp, step, impressions])
            sequence_lst = []

    outputs = (
        ('data/test_final.pickle', test_lst),
        ('data/test_display_final.pickle', display_lst),
        ('data/test_encoding_final.pickle', encoding_lst),
        ('data/test_criteria_final.pickle', criteria_lst),
        ('data/test_device_final.pickle', device_lst),
        ('data/line_final.pickle', line_lst),
    )
    for path, payload in outputs:
        with open(path, 'wb') as f:
            pickle.dump(payload, f)

In [None]:
# Run the training-set preprocessing; the prints bracket the call as progress markers.
print("... create train data ...")
create_train_data()
print("... end train data ...")

In [None]:
# Run the test-set preprocessing; the prints bracket the call as progress markers.
print("... create test data ...")
create_test_data()
print("... end test data ...")

---

# Feature normalization and extension (Train)

In [None]:
def _load_pickle(path):
    """Return the object deserialized from the pickle file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Reload the intermediate training lists from the pickles under data/.
sessions = _load_pickle('data/train_final.pickle')
displays = _load_pickle('data/train_display_final.pickle')
encodings = _load_pickle('data/train_encoding_final.pickle')
criteria = _load_pickle('data/train_criteria_final.pickle')
devices = _load_pickle('data/train_device_final.pickle')
clicked_item = _load_pickle('data/label_final.pickle')

In [None]:
# Drop the first 35 entries of every session (original note: max length 15).
sessions[:] = [steps[35:] for steps in sessions]

# Snapshot the raw prices before they are rescaled below; the reciprocal-price
# feature is computed from this copy.
price = np.array(displays)[:, 1, :].tolist()

# Row layout of each display:
#   0 idx, 1 price, 2 hotel, 3 star, 4 tv, 5 shower, 6 window, 7 car, 8 wifi,
#   9 nosmoke, 10 ratings, 11 viewed, 12 diff_prices, 13 diff_ratings,
#   14 diff_city_mean, 15 diff_city_median, 16 diff_nation_mean, 17 diff_nation_median

PRICE_THRESHOLD = 1000


def _scale_signed(values):
    """Divide by PRICE_THRESHOLD; magnitudes at or above it become p/|p|."""
    return [p/PRICE_THRESHOLD if np.abs(p)<PRICE_THRESHOLD else p/np.abs(p) for p in values]


for display in displays:
    # position -> reciprocal position (padding zeros stay zero)
    display[0] = [1/p if p!=0 else 0 for p in display[0]]
    # price scaled by the threshold and capped at 1
    display[1] = [p/PRICE_THRESHOLD if p<PRICE_THRESHOLD else 1 for p in display[1]]
    # star and rating are divided by 5
    for col in (3, 10):
        display[col] = [r/5 for r in display[col]]

    # price differences are scaled by the largest magnitude on this display
    max_diff = max(max(display[12]), -min(display[12]))
    if max_diff != 0:
        display[12] = [p/max_diff for p in display[12]]

    for col in (14, 15, 16, 17):
        display[col] = _scale_signed(display[col])

In [None]:
# Feature extension: rows appended to every display, in this order
#   18 idx_sqrt, 19 idx_square, 20 price_sqrt, 21 price_square,
#   22 star_sqrt, 23 star_square, 24 rating_sqrt, 25 rating_square,
#   26 price_diff_sqrt, 27 price_diff_square,
#   28 price_mean_on_display, 29 price_median_on_display,
#   30 diff_city_mean_sqrt, 31 diff_city_mean_square,
#   32 diff_city_median_sqrt, 33 diff_city_median_square,
#   34 diff_nation_mean_sqrt, 35 diff_nation_mean_square,
#   36 diff_nation_median_sqrt, 37 diff_nation_median_square,
#   38 reciprocal_price
def _signed_sqrt(values):
    """Square root of the magnitude, multiplied by +1 where value >= 0 else -1."""
    return (np.where(np.array(values)>=0,1,-1)*np.sqrt(np.abs(values))).tolist()


def _signed_square(values):
    """Square of the value, multiplied by +1 where value >= 0 else -1."""
    return (np.where(np.array(values)>=0,1,-1)*np.square(values)).tolist()


for display, raw_prices in zip(displays, price):
    extra_rows = []

    # sqrt / square of idx, price, star, rating
    for col in (0, 1, 3, 10):
        extra_rows.append(np.sqrt(display[col]).tolist())
        extra_rows.append(np.square(display[col]).tolist())

    # signed sqrt / square of the price difference
    extra_rows.append(_signed_sqrt(display[12]))
    extra_rows.append(_signed_square(display[12]))

    # scaled price relative to the display's mean and median
    scaled_prices = np.array(display[1])
    extra_rows.append((scaled_prices-np.mean(display[1])).tolist())
    extra_rows.append((scaled_prices-np.median(display[1])).tolist())

    # signed sqrt / square of the city / nation mean and median differences
    for col in (14, 15, 16, 17):
        extra_rows.append(_signed_sqrt(display[col]))
        extra_rows.append(_signed_square(display[col]))

    # reciprocal of the raw (unscaled) price, zero where the price is zero
    extra_rows.append([1/p if p!=0 else 0 for p in raw_prices])

    display.extend(extra_rows)

In [None]:
# One tuple per sample: (session, display, encoding, criteria, device, clicked item).
_train_columns = (sessions, displays, encodings, criteria, devices, clicked_item)
train_dataset = list(zip(*[np.array(column) for column in _train_columns]))

In [None]:
# Persist the zipped training samples to data/train_dataset.pickle.
with open("data/train_dataset.pickle", "wb") as f:
    pickle.dump(train_dataset, f)

# Feature normalization and extension (Test)

In [None]:
def _load_pickle(path):
    """Return the object deserialized from the pickle file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Reload the intermediate test lists from the pickles under data/.
test_sessions = _load_pickle('data/test_final.pickle')
test_displays = _load_pickle('data/test_display_final.pickle')
test_encodings = _load_pickle('data/test_encoding_final.pickle')
test_criteria = _load_pickle('data/test_criteria_final.pickle')
test_devices = _load_pickle('data/test_device_final.pickle')

In [None]:
# Drop the first 35 entries of every session (original note: max length 15).
test_sessions[:] = [steps[35:] for steps in test_sessions]

# Snapshot the raw prices before they are rescaled below; the reciprocal-price
# feature is computed from this copy.
test_price = np.array(test_displays)[:, 1, :].tolist()

# Row layout of each display:
#   0 idx, 1 price, 2 hotel, 3 star, 4 tv, 5 shower, 6 window, 7 car, 8 wifi,
#   9 nosmoke, 10 ratings, 11 viewed, 12 diff_prices, 13 diff_ratings,
#   14 diff_city_mean, 15 diff_city_median, 16 diff_nation_mean, 17 diff_nation_median

PRICE_THRESHOLD = 1000


def _scale_signed(values):
    """Divide by PRICE_THRESHOLD; magnitudes at or above it become p/|p|."""
    return [p/PRICE_THRESHOLD if np.abs(p)<PRICE_THRESHOLD else p/np.abs(p) for p in values]


for display in test_displays:
    # position -> reciprocal position (padding zeros stay zero)
    display[0] = [1/p if p!=0 else 0 for p in display[0]]
    # price scaled by the threshold and capped at 1
    display[1] = [p/PRICE_THRESHOLD if p<PRICE_THRESHOLD else 1 for p in display[1]]
    # star and rating are divided by 5
    for col in (3, 10):
        display[col] = [r/5 for r in display[col]]

    # price differences are scaled by the largest magnitude on this display
    max_diff = max(max(display[12]), -min(display[12]))
    if max_diff != 0:
        display[12] = [p/max_diff for p in display[12]]

    for col in (14, 15, 16, 17):
        display[col] = _scale_signed(display[col])

# Pass everything through np.nan_to_num and go back to nested lists.
test_displays = np.nan_to_num(test_displays).tolist()

In [None]:
# Feature extension: rows appended to every display, in this order
#   18 idx_sqrt, 19 idx_square, 20 price_sqrt, 21 price_square,
#   22 star_sqrt, 23 star_square, 24 rating_sqrt, 25 rating_square,
#   26 price_diff_sqrt, 27 price_diff_square,
#   28 price_mean_on_display, 29 price_median_on_display,
#   30 diff_city_mean_sqrt, 31 diff_city_mean_square,
#   32 diff_city_median_sqrt, 33 diff_city_median_square,
#   34 diff_nation_mean_sqrt, 35 diff_nation_mean_square,
#   36 diff_nation_median_sqrt, 37 diff_nation_median_square,
#   38 reciprocal_price
def _signed_sqrt(values):
    """Square root of the magnitude, multiplied by +1 where value >= 0 else -1."""
    return (np.where(np.array(values)>=0,1,-1)*np.sqrt(np.abs(values))).tolist()


def _signed_square(values):
    """Square of the value, multiplied by +1 where value >= 0 else -1."""
    return (np.where(np.array(values)>=0,1,-1)*np.square(values)).tolist()


for display, raw_prices in zip(test_displays, test_price):
    extra_rows = []

    # sqrt / square of idx, price, star, rating
    for col in (0, 1, 3, 10):
        extra_rows.append(np.sqrt(display[col]).tolist())
        extra_rows.append(np.square(display[col]).tolist())

    # signed sqrt / square of the price difference
    extra_rows.append(_signed_sqrt(display[12]))
    extra_rows.append(_signed_square(display[12]))

    # scaled price relative to the display's mean and median
    scaled_prices = np.array(display[1])
    extra_rows.append((scaled_prices-np.mean(display[1])).tolist())
    extra_rows.append((scaled_prices-np.median(display[1])).tolist())

    # signed sqrt / square of the city / nation mean and median differences
    for col in (14, 15, 16, 17):
        extra_rows.append(_signed_sqrt(display[col]))
        extra_rows.append(_signed_square(display[col]))

    # reciprocal of the raw (unscaled) price, zero where the price is zero
    extra_rows.append([1/p if p!=0 else 0 for p in raw_prices])

    display.extend(extra_rows)

In [None]:
# One tuple per sample: (session, display, encoding, criteria, device).
_test_columns = (test_sessions, test_displays, test_encodings, test_criteria, test_devices)
test_dataset = list(zip(*[np.array(column) for column in _test_columns]))

In [None]:
# Persist the zipped test samples to data/test_dataset.pickle.
with open("data/test_dataset.pickle", "wb") as f:
    pickle.dump(test_dataset, f)