In [1]:
import polars as pl

In [2]:
from glob import glob
# Glob pattern selecting every submission CSV that takes part in the ensemble.
DIR = '/home/anhphantq/ensemble/*.csv'
# The pattern contains no `**`, so a plain glob matches exactly the same files.
# glob makes no ordering promise: anything that pairs these paths with per-file
# values by position depends on the order the filesystem happens to return.
paths = glob(DIR)
paths

['/home/anhphantq/ensemble/587_submission.csv',
 '/home/anhphantq/ensemble/submission_588_21_1.csv',
 '/home/anhphantq/ensemble/submission_second_588.csv',
 '/home/anhphantq/ensemble/submission_589_20_1.csv',
 '/home/anhphantq/ensemble/submission_first_588.csv',
 '/home/anhphantq/ensemble/586_submission.csv',
 '/home/anhphantq/ensemble/public_submission.csv',
 '/home/anhphantq/ensemble/submission_588_latest.csv']

In [3]:
import numpy as np
def read_sub(path, shift=0):
    '''Load one submission CSV and expand it to one row per listed label.

    Parameters
    ----------
    path : path of a CSV file that has a `labels` column holding a
        space-separated string of ids.
    shift : number subtracted from every positional vote of this file.
        A higher shift makes the submission more important, because it
        lowers all of its vote values.

    Returns
    -------
    A polars DataFrame in which `labels` has been split and renamed to `aid`,
    `num` holds how many ids the original row listed, and `vote` holds the
    zero-based position of the id in that list minus `shift`. The frame is
    exploded so each (aid, vote) pair sits on its own row.
    '''
    def _positional_votes(count):
        # Position 0 is the first listed id; the shift moves the whole list.
        return [position - shift for position in range(count)]

    frame = pl.read_csv(path)
    frame = frame.with_column(pl.col('labels').str.split(by=' '))
    frame = frame.with_column(pl.col("labels").arr.lengths().alias("num"))
    frame = frame.rename({'labels': 'aid'})
    frame = frame.with_column(pl.col('num').apply(_positional_votes).alias('vote'))
    return frame.explode(['aid', 'vote'])


In [4]:
import os

# Shift applied to each submission (higher shift = more important, see read_sub).
# Keyed by file name instead of list position: glob returns files in whatever
# order the filesystem yields, so a positional list can silently assign a shift
# to the wrong file.
SHIFT_BY_FILE = {
    '587_submission.csv': 0.1,
    'submission_588_21_1.csv': 0.2,
    'submission_second_588.csv': 0.2,
    'submission_589_20_1.csv': 0.9,
    'submission_first_588.csv': 0.2,
    '586_submission.csv': 0.07,
    'public_submission.csv': -15,
    'submission_588_latest.csv': 0.1,
}

# Fail loudly when the folder content and the configured shifts disagree.
found_files = {os.path.basename(path) for path in paths}
assert found_files == set(SHIFT_BY_FILE), (
    f'submission files and configured shifts differ: {sorted(found_files ^ set(SHIFT_BY_FILE))}'
)

# Same order as `paths`, so downstream code that indexes `shifts`/`subs` still lines up.
shifts = [SHIFT_BY_FILE[os.path.basename(path)] for path in paths]
subs = [read_sub(path, shift) for path, shift in zip(paths, shifts)]
subs[0].head()

session_type,aid,num,vote
str,str,u32,f64
"""12899779_click...","""59625""",20,-0.1
"""12899779_click...","""737445""",20,0.9
"""12899779_click...","""1354960""",20,1.9
"""12899779_click...","""941596""",20,2.9
"""12899779_click...","""1253524""",20,3.9


In [5]:
from tqdm.notebook import tqdm
# Merge all submissions into one wide frame keyed by (session_type, aid).
# The first submission keeps its plain column names; the i-th one joined
# afterwards gets the suffix `_i` on its colliding columns.
rs = subs[0]

for idx, other in tqdm(enumerate(subs[1:], start=1), total=len(subs) - 1):
    # Outer join: an aid proposed by only some submissions is kept, with nulls
    # in the columns of the submissions that did not list it.
    rs = rs.join(other, on=['session_type', 'aid'], how='outer', suffix=f'_{idx}')

  0%|          | 0/7 [00:00<?, ?it/s]

In [6]:
# Score every (session_type, aid) row by its best (lowest) vote across all
# submissions and order the rows best-first.
# NOTE: this rebinds `subs`, which previously held the list of per-file frames,
# so the join cell can only be re-run after the loading cell has been re-run.
MISSING_VOTE = 30  # value used where a submission did not list this aid for the session

subs = (
    rs
    .fill_null(MISSING_VOTE)
    # Take the row-wise minimum over the vote columns only. The former exclude
    # list named a non-existent 'num_votes' column, so num, num_1, ... (the list
    # lengths) were part of the minimum and could override a vote above 20.
    # The column keeps its historical name `rank_sum` although it is a minimum.
    .with_column(pl.min(pl.col('^vote.*$')).alias("rank_sum"))
    .sort([pl.col('rank_sum')], reverse=[False])
)

In [7]:
# Inspect the merged frame: one row per (session_type, aid) with the num/vote
# columns of every submission plus the combined `rank_sum`.
subs

session_type,aid,num,vote,num_1,vote_1,num_2,vote_2,num_3,vote_3,num_4,vote_4,num_5,vote_5,num_6,vote_6,num_7,vote_7,rank_sum
str,str,u32,f64,u32,f64,u32,f64,u32,f64,u32,f64,u32,f64,u32,i64,u32,f64,f64
"""12899779_click...","""59625""",20,-0.1,20,-0.2,20,-0.2,20,-0.9,20,-0.2,20,-0.07,20,30,20,-0.1,-0.9
"""12899780_click...","""1142000""",20,-0.1,20,-0.2,20,-0.2,20,-0.9,20,-0.2,20,-0.07,20,31,20,-0.1,-0.9
"""12899781_click...","""199008""",20,-0.1,20,-0.2,20,-0.2,20,-0.9,20,-0.2,20,-0.07,20,32,20,-0.1,-0.9
"""12899782_click...","""1033148""",20,7.9,20,17.8,20,0.8,20,-0.9,20,2.8,20,5.93,20,20,20,5.9,-0.9
"""12899783_click...","""1817895""",20,-0.1,20,-0.2,20,-0.2,20,-0.9,20,-0.2,20,-0.07,20,30,20,-0.1,-0.9
"""12899784_click...","""1190477""",20,-0.1,20,-0.2,20,-0.2,20,-0.9,20,-0.2,20,-0.07,20,32,20,-0.1,-0.9
"""12899785_click...","""775584""",20,-0.1,20,-0.2,20,-0.2,20,-0.9,20,-0.2,20,-0.07,20,32,20,-0.1,-0.9
"""12899786_click...","""955252""",20,-0.1,20,-0.2,20,-0.2,20,-0.9,20,-0.2,20,-0.07,20,31,20,-0.1,-0.9
"""12899787_click...","""1024433""",20,-0.1,20,0.8,20,-0.2,20,-0.9,20,-0.2,20,-0.07,20,30,20,-0.1,-0.9
"""12899788_click...","""1663048""",20,-0.1,20,-0.2,20,-0.2,20,-0.9,20,-0.2,20,-0.07,20,30,20,-0.1,-0.9


In [8]:
%%time
# Keep the first 20 aids of every session; rows were sorted by `rank_sum`
# ascending beforehand, so these are the best-ranked candidates.
preds = subs.groupby('session_type').agg([
    pl.col('aid').head(20).alias('labels')
])

# Serialise each list as a space-separated string. The aids are already
# strings, so the native list join replaces the per-row Python lambda.
preds = preds.with_column(pl.col('labels').arr.join(' '))

CPU times: user 4min 42s, sys: 17.3 s, total: 4min 59s
Wall time: 2min 51s


In [9]:
# Number of labels written for each session.
# Stored under its own name: binding the result to `len` shadows the builtin,
# after which every len(...) call in this kernel (including re-running this
# cell) fails because `len` is then a Series.
label_counts = preds['labels'].apply(lambda x: len(x.split()))

In [10]:
# Inspect the final predictions: one row per session_type with its
# space-separated `labels` string.
preds

session_type,labels
str,str
"""13867994_click...","""534719 21794 1..."
"""13131889_carts...","""1459739 146535..."
"""13653466_click...","""1240818 630728..."
"""14217536_click...","""775211 493115 ..."
"""13567051_carts...","""1107341 157518..."
"""12961043_carts...","""1830854 106564..."
"""13377506_carts...","""296766 1310373..."
"""13174456_click...","""219515 533587 ..."
"""13574366_order...","""1248548 218822..."
"""14482627_order...","""216057 807417 ..."


In [11]:
# Output file for the ensembled predictions; the path is relative, so the file
# is written to the notebook's current working directory. The same name is
# interpolated into the submit command below.
FILE_NAME = 'ultimate_ensemble_submission.csv'
# Write the session_type/labels frame as CSV.
preds.write_csv(FILE_NAME)

In [12]:
# Submit the written CSV to the otto-recommender-system competition through the
# kaggle command-line tool; {FILE_NAME} is interpolated from the Python variable.
!kaggle competitions submit -c otto-recommender-system -f {FILE_NAME} -m "hehe"

100%|████████████████████████████████████████| 783M/783M [00:34<00:00, 23.5MB/s]
Successfully submitted to OTTO – Multi-Objective Recommender System