# Challenge MeLi

In [66]:
import pandas as pd
from tqdm import tqdm, tqdm_notebook
import numpy as np
import time
import math
from random import sample, shuffle, seed, randint
import gc


SEED=42

#del variable
#gc.collect()

In [2]:
df_train = pd.read_csv('./data/03_primary/train_dataset.csv')
#df_test = pd.read_csv('./data/03_primary/test_dataset.csv')
dtype ={"item_id": "int", "title": "str", "domain_id": "str", "price": "float", "category_id": "str", "condition": "str", "site": "str"}
df_item = pd.read_csv('./data/02_intermediate/item_data.csv', dtype=dtype)
df_item.set_index('item_id')

Unnamed: 0_level_0,title,domain_id,price,category_id,condition,site
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
111260,Casa Sola En Venta Con Gran Patio Solo Pago De...,MLM-INDIVIDUAL_HOUSES_FOR_SALE,1150000.00,MLM170527,new,MLM
871377,Resident Evil Origins Collection Nintendo Swit...,MLM-VIDEO_GAMES,1392.83,MLM151595,new,MLM
490232,Falda De Imitación Piel Negra,MLM-SKIRTS,350.00,MLM7697,new,MLM
1150706,Powercolor Red Devil Radeon Rx 580 8gb Gddr5,MLM-GRAPHICS_CARDS,3200.00,MLM9761,used,MLM
934912,Laptop Hp Nx6320 Core Duo Con Puerto Db9 Windo...,MLM-NOTEBOOKS,1599.00,MLM1652,used,MLM
...,...,...,...,...,...,...
1099649,Carrinho De Bebê Stoke,MLB-BABY_STROLLERS,1600.00,MLB1386,used,MLB
1482187,Grelha Para Hambúrguer Preta Com Cabo Em Madei...,MLB-KITCHEN_SUPPLIES,69.90,MLB193425,new,MLB
1118904,Meia Tam 7/8 Anti Embolia Trombose Antitrombo,MLB-SOCKS,118.00,MLB108791,new,MLB
237229,Pano De Boca Cremer Menina Luxo Bordado C/3 Und,MLB-DISPOSABLE_BABY_DIAPERS,26.90,MLB40629,new,MLB


In [12]:
list_of_domains = df_item.domain_id.unique().tolist()
list_of_domains.sort()

In [13]:
items_view_by_session = {}
for name, group in tqdm_notebook(df_train.groupby('session_id')):
    list_of_items = group[group.event_type == "view"].event_info.unique().tolist()
    items_view_by_session[name] = list_of_items

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5000.0), HTML(value='')))




In [14]:
temp = df_train[['session_id', 'event_info', 'event_type']]
temp = temp[df_train.event_type == 'view']
temp.drop(columns='event_type', inplace=True)
temp.event_info = pd.to_numeric(temp.event_info)

In [15]:
temp_merge = temp.merge(df_item[['item_id', 'domain_id']], left_on='event_info', right_on='item_id')[['session_id', 'domain_id']]


In [16]:
items_view_by_session = {}
for name, group in tqdm_notebook(temp_merge.groupby('session_id')):
    items_view_by_session[name] = group.domain_id.unique().tolist()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4655.0), HTML(value='')))




In [None]:
set_of_values = set()
for elem in items_view_by_session.values():
    for el in elem:
        set_of_values.add(el)

In [None]:
len(set_of_values)

In [None]:
list(items_view_by_session.values())[0]

### Base line tomando algun elemento de la lista que vio

In [53]:
def dcg(predicted, effective_item):
    dcg = 0
    for pos, (item, domain) in enumerate(predicted.items()):
        dcg += relevance((item, domain), effective_item) / (math.log10(2 + pos))
    return dcg


def relevance(predicted_item, effective_item):
    if predicted_item[0] == effective_item[0]:
        return 12
    elif (predicted_item[1] == effective_item[1]):
        return 1
    else:
        return 0


def idcg(number_of_items=10):
    # 12/math.log10(1 + 1) + 1/math.log10(1 + 2) + 1/math.log10(1 + 3) .... 
    first = 12 / math.log10(1 + 1)
    all_except_first = sum(map(lambda position: 1 / math.log10(1 + position), range(2, number_of_items + 1)))
    return first + all_except_first


def ndcg(predicted, effective_item, items):
    predicted_dict = {}
    for one_item in predicted:
        predicted_dict[one_item] = items[items.item_id == one_item].domain_id.iloc[0]
    effective_item = (
        effective_item, items[items.item_id == effective_item].domain_id.iloc[0]
    )
    return (dcg(predicted_dict, effective_item) / idcg())


In [54]:
def split_train_test_ids(list_of_ids, percentage=70):
    until = int(len(list_of_ids) * (percentage / 100))
    seed(SEED)
    shuffle(list_of_ids)
    return list_of_ids[:until], list_of_ids[until:]

In [55]:
def generate_dictionary_by_session(session_history):
    # genero un diccionario por session con otro diccionario que tiene el id del item visto y la cantidad de veces que lo vio. Ordenado desde el último visto como primero del diccionario
    # {session_id: {item_id: how_many_times_saw}, ....}
    recommended_by_session = {}
    for id, group in tqdm(session_history.groupby("session_id")):
        last_items_view = group[group.event_type == "view"].event_info.tolist()
        last_items_view.reverse()
        last_items_view = list(map(lambda item: int(item), last_items_view))
        last_items_view_quantity = {}
        for an_item in last_items_view:
            if an_item in last_items_view_quantity:
                last_items_view_quantity[an_item] += 1
            else:
                last_items_view_quantity[an_item] = 1

        recommended_by_session[id] = last_items_view_quantity

    return recommended_by_session

In [56]:
def generate_dictionary_by_domain(items):
    items_by_domain = {}
    for name, group in items.groupby("domain_id"):
        items_by_domain[name] = group.item_id.tolist()

    return items_by_domain

In [57]:
def generate_submission(filaname, predictions):
    with open(filaname, "w") as file:
        write = csv.writer(file)
        write.writerows(predictions)

In [58]:
def predict(recommendations, random_item, items, items_by_domain):
    predictions = []
    for _, recommendations in tqdm(recommendations.items()):
        # first 10 to predict
        predicted = list(recommendations.keys())[:10]

        if len(predicted) < 10:
            # sort items by views
            try:
                most_viewed = list(
                    {
                        k: v
                        for k, v in sorted(
                            recommendations.items(),
                            key=lambda item: item[1],
                            reverse=True,
                        )
                    }
                )[0]
            except:
                most_viewed = random_item

            most_viewed_by_domain = items[
                items.index == most_viewed
            ].domain_id.iloc[0]

            predicted += items_by_domain[most_viewed_by_domain][
                : 10 - len(predicted)
            ]
            for i in range(10 - len(predicted)):
                seed(i)
                onther_random_item = items.loc[
                    randint(0, len(items) - 1)
                ].item_id
                predicted.append(onther_random_item)

        predictions.append(predicted)

    return predictions

In [67]:
def generate_random_item(items):
    seed(SEED)
    return items.loc[randint(0, len(items) - 1)].item_id

In [60]:
seed(SEED)

In [61]:
train_id, test_id = split_train_test_ids(df_train.session_id.unique().tolist(), 70)

In [62]:
train = df_train[df_train.session_id.isin(train_id)]

In [75]:
recommendations = generate_dictionary_by_session(train)

100%|██████████| 289214/289214 [02:04<00:00, 2320.91it/s]


In [64]:
items_by_domain = generate_dictionary_by_domain(df_item)

In [68]:
random_item = generate_random_item(df_item)

In [76]:
predictions = predict(recommendations, random_item, df_item, items_by_domain)

100%|██████████| 289214/289214 [06:20<00:00, 759.77it/s]


In [79]:
len(predictions)

289214

In [80]:
train.head()

Unnamed: 0,session_id,event_info,event_timestamp,event_type,item_bought
0,0,1786148,2019-10-19T11:25:42.444-0400,view,1748830
1,0,1786148,2019-10-19T11:25:57.487-0400,view,1748830
2,0,RELOGIO SMARTWATCH,2019-10-19T11:26:07.063-0400,search,1748830
3,0,1615991,2019-10-19T11:27:26.879-0400,view,1748830
4,0,1615991,2019-10-19T11:28:36.558-0400,view,1748830


In [100]:
ndcg_values = 0
for index, ((_, item_bought), _) in tqdm_notebook(enumerate(train.groupby(['session_id', 'item_bought']))):
    ndcg_values += ndcg(predictions[index], item_bought, df_item)

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




KeyboardInterrupt: 

In [95]:
for (a,b),c in train.groupby(['session_id', 'item_bought']):
    print(a)
    print(b)
    print(c)
    break

0
1748830
    session_id          event_info               event_timestamp event_type  \
0            0             1786148  2019-10-19T11:25:42.444-0400       view   
1            0             1786148  2019-10-19T11:25:57.487-0400       view   
2            0  RELOGIO SMARTWATCH  2019-10-19T11:26:07.063-0400     search   
3            0             1615991  2019-10-19T11:27:26.879-0400       view   
4            0             1615991  2019-10-19T11:28:36.558-0400       view   
5            0             1615991  2019-10-19T11:28:40.827-0400       view   
6            0             1615991  2019-10-19T11:30:42.089-0400       view   
7            0             1615991  2019-10-19T21:51:29.622-0400       view   
8            0             1615991  2019-10-19T21:52:09.281-0400       view   
9            0             1615991  2019-10-19T21:52:41.863-0400       view   
10           0             1615991  2019-10-19T21:54:16.119-0400       view   
11           0             1615991  2019-1