# MeLi Data Challenge 2020

#### Leaderboard score: 0.24563653006838435
#### Leaderboard position: 44 / 180

In [1]:
#Imports necesarios

import gzip
import json
import gc #garbage collector - optimiza el uso de ram
import math
import random
import csv

from collections import Counter, defaultdict
from tqdm.notebook import tqdm
from pathlib import Path

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer

from challenge_metric import ndcg_score #Descargado de la pagina del challenge

In [2]:
#Auxiliary function
def jl_to_list(fname):
    output = []
    with gzip.open(fname, 'rb') as f: 
        for line in f:
            output.append(json.loads(line)) 
    return output

In [3]:
path = Path('C:\\Users\\desar\\OneDrive\\Escritorio\\data_science\\MeLi Challenge')

In [4]:
samples = 20000 ### Only a sample
rows = jl_to_list(path/'train_dataset.jl.gz')
if samples:
    rows = rows[:samples]
    
rows_train, rows_test= train_test_split(rows, test_size=0.2, random_state=42)

In [5]:
item_data = jl_to_list(path/'item_data.jl.gz')
metadata = {x['item_id']:x for x in item_data} #We create a dictionary to access easily access the item metadata
all_items = list(metadata.keys())

# Baseline

Recomienda los ultimos items vistos por el user, y si no alcanza rellena con los top selling items de cada domain. Si aun asi no alcanza, rellena con los item mas comprados segun los items vistos por el user. Por ultimo sino rellena al azar.

In [6]:
sales_by_domain = defaultdict(lambda: defaultdict(int)) 

for row in tqdm(rows_train): #etapa de training 
    items = [row['item_bought']]
    
    for item in items:
        domain = metadata[item]['domain_id']
        sales_by_domain[domain][item] += 1 

HBox(children=(FloatProgress(value=0.0, max=16000.0), HTML(value='')))




In [7]:
views_purchases = defaultdict(lambda: defaultdict(int))
for row in tqdm(rows_train):
    for ev in row['user_history']:
        if ev['event_type']=='view':
            views_purchases[int(ev['event_info'])][int(row['item_bought'])]+=1 #para cada vista, que se compro

HBox(children=(FloatProgress(value=0.0, max=16000.0), HTML(value='')))




In [8]:
def get_item_scores(row):
    """
    Given a user history (row) returns a counter of the items purchased 
    for the items viewed by the user
    """
    item_scores = defaultdict(int)
    for ev in row['user_history']:
        if ev['event_type']=='view':
            for k,v in views_purchases[int(ev['event_info'])].items():
                item_scores[k]+=v

    return Counter(item_scores)

In [9]:
def view_purchase_recom(row):
    """
    Given a user history (row) returns the top 10 items purchased 
    for the items viewed by the user. If there are less than k, 
    it fill the remaining spots with random items.
    
    """
    reco = []
    scores = get_item_scores(row)
    most_common = scores.most_common()
    for item, score in most_common:
        reco.append(item) #solo me interesa el item, pero el for lo tengo que armar segun la est del dict
        if len(reco) == 10:
            return reco
    
    k = 10 - len(reco)    
    relleno = random.choices(all_items, k=k)   
    
    return reco + relleno

In [10]:
def domains_visited(row, max_views=15): 
    """
    For a given user story (row), returns a Counter 
    of the domains visited by the user.
    """
    
    domains = Counter()
    
    items_visited = [event['event_info'] for event in row['user_history'] if event['event_type'] == 'view']
    
    if len(items_visited) > max_views:
        items_visited[:15]
    
    for item in items_visited:
        domain = metadata[item]['domain_id']
        domains[domain] += 1
        
    return domains

In [11]:
def top_items(domain,k=10):
    """
    Given the sales_x_domain info and a certain domain, 
    returns the top k selling items in that domain.
    """
    
    top_items = sales_by_domain[domain]
    top_items = Counter(top_items) 
    top_items = top_items.most_common(k)
    
    return [top[0] for top in top_items]

In [12]:
def top_by_best_domain(row, k=10):
    """
    For a given user story (row) it returns the top k selling
    items of the most visited domain.
    """
    items = [event['event_info'] for event in row['user_history'] if event['event_type'] == 'view']
    
    if len(items) == 0:
        return random.choices(all_items, k=k)
        
    domains = domains_visited(row)
    best_domain = domains.most_common(1)[0][0]
        
    return top_items(best_domain,k=k)

In [13]:
def last_viewed_and_best_domain(row, k=10):
    """
    Given a user story (row) it extracts the last k unique items visited.
    If there are less than k, it fills the remaining spots with best selling items from 
    most seen domain.
    """
    views = [view for view in row['user_history'] if view['event_type'] == 'view']
    views = sorted(views, key = lambda x: x['event_timestamp'], reverse = True)
    item_viewed = [events['event_info'] for events in views]
    
    items = []
    for item in item_viewed:
        if item not in items:
            items.append(item)
    
    #ahora ajusto a 10 values
    
    if len(items) == k:
        return items
    if len(items) > k:
        return items[:k]
    
    if len(items) < k:
        n = k - len(items)
        items_relleno = []                              #Relleno con el baseline de best by domain
        relleno_best_dom = top_by_best_domain(row, k=k)
        for relleno in relleno_best_dom:
            if relleno not in items:
                items_relleno.append(relleno)
        if len(items + items_relleno[:n]) == k:        
            return items + items_relleno[:n]
        
        elif len(items + items_relleno[:n]) < k:        #Algunos domains solo tienen un item bought.
            items_relleno_2 = []                        #Relleno con el baseline de views_purchases
            N = k - len(items + items_relleno[:n])
            relleno_views_purchase = view_purchase_recom(row)
            for relleno in relleno_views_purchase:
                if relleno not in items + items_relleno[:n]:
                    items_relleno_2.append(relleno)
            return items + items_relleno[:n] + items_relleno_2[:N]

In [14]:
y_preds = []
for row in tqdm(rows_test):
    baseline = last_viewed_and_best_domain(row)
    y_preds.append(baseline)

HBox(children=(FloatProgress(value=0.0, max=4000.0), HTML(value='')))




In [15]:
y_true = [row['item_bought'] for row in rows_test]

score = ndcg_score(y_true, y_preds, item_data,n_predictions=10)
print(f'Your score is: {score}')

Your score is: 0.24766468938964836


# Mejoras al baseline

Custom dataset: 

MultiClass calsification: ML for only the item bought and then 9 items left taken from baseline.

- Tried on train_set sample of 20000 rows.

- Features: last_seen_item (buen performance en el baseline, es conveniente incluirlo aca), most_viewed_item, m_viewed_veces, art vistos, view/search entries, condition, price. (todo en un dict con key el user_history)
- labels: item_bought


- Para tomar en cuenta el domain, se puede hacer subsets que sean cada uno de cada domain y entrenar modelos por separado y despues unir todo.

Comenzamos con el item most viewed

In [16]:
def last_viewed(row):
    """devuelve el ultimo item visto por el user previo a la compra"""
    
    views = [view for view in row['user_history'] if view['event_type'] == 'view']
    views = sorted(views, key = lambda x: x['event_timestamp'], reverse = True)
    item_viewed = [events['event_info'] for events in views]
    try:
        last_viewed = item_viewed[0]
    except IndexError: #me salio en algunas rows.
        return str(0)
    
    return str(last_viewed) #Lo quiero como str porque no es un numero continuo! es una var categorica

In [17]:
#testeamos la funcion

last_viewed(rows_train[784])

'1356812'

In [18]:
def most_viewed(row):
    """Devuelve el item most viewed para cada row del train/test_set"""
    
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type'] == 'view']
    
    total_views = Counter(viewed)
    
    try:
        return str(total_views.most_common(1)[0][0]) 
    except IndexError: #me salio en algunas rows.
        return str(0)  

In [19]:
#Testeamos la funcion

most_viewed(rows_train[7894])

'358667'

In [20]:
def most_viewed_times(row):
    """Devuelve la cantidad de veces que el user vio el
    item most viewed para cada row del train/test_set"""
    
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type'] == 'view']
    
    total_views = Counter(viewed)
    
    try:
        return total_views.most_common(1)[0][1]
    except IndexError: #me salio en algunas rows.
        return int(0)

In [21]:
#Testeamos la funcion

for row in rows_train:
    most_viewed_times(row)

Seguimos con cantidad de articulos vistos

In [22]:
def seen_items(row):
    """Devuelve cantidad de items seen por row"""
    
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type'] == 'view']
    
    return len(set(viewed))

In [23]:
#y testeamos

seen_items(rows_train[7360])

3

Ahora defino las views/searchs entries

In [24]:
def views_to_searchs(row):
    """Devuelve relacion entre views y searchs de cada user"""
    
    searched = [ev['event_info'] for ev in row['user_history'] if ev['event_type'] == 'search']
    views = seen_items(row)
    searchs = len(set(searched))
    
    if searchs != 0:
        return views / searchs
    if searchs == 0:
        return views / 0.1 #para evitar division sobre 0

In [25]:
#Testeamos

views_to_searchs(rows_train[7360])

1.5

#### Para crear features de la metadata no podemos utilizar el item bought como key por obvias razones
#### utilizamos el item mas visto

vamos con condition.
Con pandas hariamos un merge entre ambos df (user history y metadata) con el item bought como key, pero aca
no hace falta el merge porque trabajamos entre ambos dicts.

In [26]:
def item_condition(row):
    """devuelve la condition del most viewed item (new/used)"""
    
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type'] == 'view']
    total_views = Counter(viewed)
    try:
        most_viewed = total_views.most_common(1)[0][0]
    except IndexError: #me salio en algunas rows.
        return int(0)  
    
    condition = metadata[most_viewed]['condition']#la metadata ya la definimos como un dict con el item bought como key
    
    #Hago mapping de los valores para que sean numericos
    
    if condition == 'new':
        return int(1)
    elif condition == 'used':
        return int(0)
    else: 
        return int(0) #por las dudas
    

In [27]:
#Testeamos

item_condition(rows_train[7360])

1

Ahora ubicamos el Price

In [28]:
def item_price(row):
    """devuelve el price del item mas visto (new/used)"""
    
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type'] == 'view']
    total_views = Counter(viewed)
    try:
        most_viewed = total_views.most_common(1)[0][0]
    except IndexError: #me salio en algunas rows.
        return 0
    
    price = metadata[most_viewed]['price']
    
    try:
        if float(price) > 10000:
            return float(10000) #acotamos aca para evitar outliers
                          #utilizamos 10000 por la distribucion analizada en la notebook DataSetAnalysis con Pandas
        else:
            return float(price)
    except TypeError: #En caso de Nans, cuando no hay most_viewed porque no hay views!
        return 0

In [29]:
#Testeamos

item_price(rows_train[7360])

169.9

Antes de complicarnos con el time buy, avanzamos creando el dict final para el entrenamiento del modelo.
##### Si el modelo resulta ser competitivo con el baseline se puede crear la funcion para el time buy y agregarla sin problemas.

## Entrenamos un modelo en diferentes subsets de los datos segun el domain al cual pertenezcan.

### Creamos un train set y un valid set diferente para cada grupo de items bought en el domain

Empezamos contruyendo un custom set para toda la data y luego lo asignamos a cada domain

In [30]:
train_set = []

for row in tqdm(rows_train):
    
    user_info = {'most_viewed_times': most_viewed_times(row),'seen_items': seen_items(row), 
                 'views_to_searchs': views_to_searchs(row), 'item_condition': item_condition(row),
                 'item_price': item_price(row), 'most_viewed': most_viewed(row), 'last_viewed': last_viewed(row), 
                 'item_bought': row['item_bought']}
    
    train_set.append(user_info)

HBox(children=(FloatProgress(value=0.0, max=16000.0), HTML(value='')))




In [31]:
valid_set = []

for row in tqdm(rows_test):
    
    user_info = {'most_viewed_times': most_viewed_times(row),'seen_items': seen_items(row), 
                 'views_to_searchs': views_to_searchs(row), 'item_condition': item_condition(row),
                 'item_price': item_price(row), 'most_viewed': most_viewed(row), 'last_viewed': last_viewed(row),
                 'item_bought': row['item_bought']}
    
    valid_set.append(user_info)

HBox(children=(FloatProgress(value=0.0, max=4000.0), HTML(value='')))




In [32]:
def subset_assigner(dataset):
    """Asigna cada row a un subset segun el domain
    del item mas visto"""
    
    domains = []               #initialize una list con los domains de los item boughts del dataset
    subset = defaultdict(list) #indico que los values son lists
    
    #Creo la lista con los domains incluidos en el user history
    
    for dic in dataset:
        most_viewed = dic['most_viewed']
        domain = metadata[int(most_viewed)]['domain_id'] #uso el int() para que pueda leer el item de la metadata
        domains.append(domain)
    
    #itero sobre cada domain para crear el subset
    
    for dom in tqdm(set(domains)):
        for dic in dataset:
            most_viewed = dic['most_viewed']
            domain = metadata[int(most_viewed)]['domain_id']
            if domain == dom:
                subset[dom].append(list(dic.values())) #con los values solamente vectorizo el dataset directamente aca
                
    return subset            

Tarda mucho en runnear

In [33]:
train_subset = subset_assigner(train_set)
valid_subset = subset_assigner(valid_set)

HBox(children=(FloatProgress(value=0.0, max=2262.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1196.0), HTML(value='')))




In [34]:
#Testeamos la funcion

train_subset['MLB-BABIES_FORMULA']

[[1, 1, 10.0, 1, 57.9, '1592848', '1592848', 649408],
 [2, 6, 6.0, 1, 249.9, '1990337', '260994', 1106871],
 [5, 3, 1.0, 1, 119.0, '1399187', '2098106', 1731041]]

Para cada domain ya nos queda un array 2D listo para alimentar el modelo

#### Entreno un modelo para cada domain.

Comienzo creando un dict que me mapea el modelo entrenado para cada dominio como key

In [35]:
models_dict = defaultdict() #Dict con los modelos entrenados para cada domain (keys del dict)

for dom in tqdm(train_subset.keys()):
    try:
        try: 
    
            if dom in valid_subset.keys():
                X_train = np.array(train_subset[dom])[:,:-1] #Todo menos el item bought
                y_train = np.array(train_subset[dom])[:,-1]  #unicamente el item bought
            
                #Rellenamos con 0 los nans

                X_train = np.nan_to_num(X_train) 

                clf = DecisionTreeClassifier(random_state=42)
                model = clf.fit(X_train, y_train) 
                
                models_dict[dom] = model

        except IndexError: 
            pass
    except ValueError: 
        pass

HBox(children=(FloatProgress(value=0.0, max=2262.0), HTML(value='')))




In [36]:
models_dict #Obtengo un dict con cada model trained

defaultdict(None,
            {'MLB-BATHROOM_SUPPLIES': DecisionTreeClassifier(random_state=42),
             'MLB-HORN_DRIVERS': DecisionTreeClassifier(random_state=42),
             'MLB-HAIRDRESSING_CAPS': DecisionTreeClassifier(random_state=42),
             'MLB-SHOWER_DOORS': DecisionTreeClassifier(random_state=42),
             'MLB-COMPLETE_SKATEBOARDS': DecisionTreeClassifier(random_state=42),
             'MLB-GARDEN_SPRAYERS': DecisionTreeClassifier(random_state=42),
             'MLB-VEHICLE_WAXES': DecisionTreeClassifier(random_state=42),
             'MLB-AUTOMOTIVE_IGNITION_COILS': DecisionTreeClassifier(random_state=42),
             'MLM-BAR_SOAPS': DecisionTreeClassifier(random_state=42),
             'MLB-EMERGENCY_LIGHTS': DecisionTreeClassifier(random_state=42),
             'MLB-SPORTSWEAR': DecisionTreeClassifier(random_state=42),
             'MLB-BEER_DISPENSERS': DecisionTreeClassifier(random_state=42),
             'MLB-MOTORCYCLE_SHOCK_ABSORBERS': DecisionTr

In [37]:
def X_creator(row):
    """Devuelve el X_valid de cada row_test para la predicciones del modelo"""
    
    valid_set = {'most_viewed_times': most_viewed_times(row),'seen_items': seen_items(row), 
                     'views_to_searchs': views_to_searchs(row), 'item_condition': item_condition(row),
                     'item_price': item_price(row), 'most_viewed': most_viewed(row), 'last_viewed': last_viewed(row)}
    
    
    #vec = DictVectorizer()
    #X_valid = vec.fit_transform(valid_set).toarray() 
    
    X_valid = []
    for values in valid_set.values():
        X_valid.append(values) #uso esta forma y no el dict vectorizer asi puedo dar los datos de items como str
                               #y no como integer. El dict vectorizer hace OH encoding automaticamente con las str vars
                               #como es un decision tree no hace falta hacerlas dummy vars
    X_valid = np.array(X_valid)
    X_valid = X_valid.reshape(1, -1) #lo hago un array 2D para que me lo lea el decision Tree
    
    return X_valid

In [38]:
#testeamos

row = rows_test[90]

X = X_creator(row)
X

array([['3', '10', '10.0', '1', '54.99', '1801870', '1801870']],
      dtype='<U32')

In [39]:
model.predict(X)

array(['916978'], dtype='<U32')

In [40]:
def final_model(row, valid_set = valid_subset, models = models_dict, k = 9):
    """Devuelve predicciones para cada row del user_history
    con first value del decision tree y 9 values restantes
    de la combinacion de baselines"""
    
    #Obtengo primero el domain al que pertenece el item mas visto, que usamos como key para la metadata
    
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type'] == 'view']
    total_views = Counter(viewed)
    try:
        most_viewed = total_views.most_common(1)[0][0]
    except IndexError: #me salio en algunas rows.
        return last_viewed_and_best_domain(row) #si no tengo most viewed no tengo dom y no puedo usar el modelo
                                                #en ese caso que me devuelva el baseline nomas.
    
    dom = metadata[most_viewed]['domain_id']    
    
    if dom in models.keys(): #si el dom esta en el train set.
        model = models[dom] 
        
        valid_set = X_creator(row)
        
        first_value = model.predict(valid_set)
        first_value = int(first_value[0])
        
    else:
        return last_viewed_and_best_domain(row)
    
    #Obtengo resultados del baseline
    
    baseline = last_viewed_and_best_domain(row, k)
    
    #Uno ambas predicciones con el first value en pos 0
    
    output = baseline
    output.insert(0, first_value)
    
    return output

In [41]:
baseline_preds = []
y_preds = []
for row in tqdm(rows_test):
    recom = final_model(row)
    y_preds.append(recom)

HBox(children=(FloatProgress(value=0.0, max=4000.0), HTML(value='')))




In [42]:
y_true = [row['item_bought'] for row in rows_test]

from challenge_metric import ndcg_score
score = ndcg_score(y_true, y_preds, item_data,n_predictions=10)
print(f'Your score is: {score}')

Your score is: 0.19833803721780285


## el baseline lo supera por 0.05 puntos (0.247 a 0.198)

# Conclusion

### Me quedo con la submission del baseline, ya que no logro implementar un modelo que lo supere.

##### La submission del baseline se encuentra en la notebook MeLiChallenge2020BaselineSubmission