In [1]:
import polars as pl

In [2]:
from glob import glob
DIR = '/home/anhphantq/ensemble/*.csv'
paths = glob(DIR, recursive= True) 
paths

['/home/anhphantq/ensemble/submission_588_21_1.csv',
 '/home/anhphantq/ensemble/submission_second_588.csv',
 '/home/anhphantq/ensemble/submission_589_20_1.csv',
 '/home/anhphantq/ensemble/submission_first_588.csv',
 '/home/anhphantq/ensemble/submission_588_latest.csv']

In [3]:
import numpy as np
def read_sub(path, weight=1): # by default let us assing the weight of 1 to predictions from each submission, this will be akin to a standard vote ensemble
    '''a helper function for loading and preprocessing submissions'''
    return  (
        pl.read_csv(path)
            .with_column(pl.col('labels').str.split(by=' '))
            .with_column(pl.col("labels").arr.lengths().alias("num"))
            .rename({'labels': 'aid'})
            .with_column(pl.col('num').apply(lambda x: [i * weight for i  in range(x)]).alias('vote'))
            .explode(['aid', 'vote'])
            
    )


In [4]:
subs = [read_sub(path, weight) for path, weight in zip(paths, [1,1,1,1,1])]
subs[0].head()

session_type,aid,num,vote
str,str,u32,i64
"""12899779_click...","""59625""",20,0
"""12899779_click...","""941596""",20,1
"""12899779_click...","""737445""",20,2
"""12899779_click...","""1253524""",20,3
"""12899779_click...","""894169""",20,4


In [5]:
from tqdm.notebook import tqdm
rs = subs[0]

for i in tqdm(range(1, len(subs))):
    rs= rs.join(subs[i], how = 'outer', on = ['session_type', 'aid'], suffix= f'_{i}')

  0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
# rs.select([pl.col('^vote.*$')])
subs = (rs
    
    .fill_null(30)
    
    .with_column(pl.min(pl.all().exclude(["aid", 'session_type', 'num_votes'])).alias("rank_sum"))
    .sort([pl.col('rank_sum')], reverse= [False])
 )

In [7]:
subs

session_type,aid,num,vote,num_1,vote_1,num_2,vote_2,num_3,vote_3,num_4,vote_4,rank_sum
str,str,u32,i64,u32,i64,u32,i64,u32,i64,u32,i64,i64
"""12899779_click...","""59625""",20,0,20,0,20,0,20,0,20,0,0
"""12899780_click...","""1142000""",20,0,20,0,20,0,20,0,20,0,0
"""12899781_click...","""199008""",20,0,20,0,20,0,20,0,20,0,0
"""12899782_click...","""479970""",20,11,20,0,20,2,20,8,20,5,0
"""12899782_click...","""1033148""",20,18,20,1,20,0,20,3,20,6,0
"""12899782_click...","""1007613""",20,2,20,2,20,1,20,0,20,2,0
"""12899782_click...","""834354""",20,0,20,3,20,4,20,1,20,3,0
"""12899782_click...","""595994""",20,1,20,4,20,3,20,2,20,0,0
"""12899783_click...","""1817895""",20,0,20,0,20,0,20,0,20,0,0
"""12899784_click...","""1190477""",20,0,20,0,20,0,20,0,20,0,0


In [8]:
%%time
preds = subs.groupby('session_type').agg([
    pl.col('aid').head(20).alias('labels')
])

preds = preds.with_column(pl.col('labels').apply(lambda lst: ' '.join([str(aid) for aid in lst])))

CPU times: user 4min 5s, sys: 15 s, total: 4min 20s
Wall time: 2min 46s


In [9]:
preds

session_type,labels
str,str
"""13149231_click...","""942478 3435 26..."
"""13981933_carts...","""1486856 986042..."
"""13975803_order...","""1248868 119036..."
"""14479048_order...","""554660 1782334..."
"""13881888_carts...","""334651 1769732..."
"""13206272_carts...","""1223872 803544..."
"""13483794_carts...","""487594 886431 ..."
"""14477550_carts...","""707725 1749648..."
"""13403597_order...","""1114486 198793..."
"""13740592_order...","""1833890 393808..."


In [10]:
FILE_NAME = 'ultimate_ensemble_submission.csv'
preds.write_csv(FILE_NAME)

In [11]:
!kaggle competitions submit -c otto-recommender-system -f {FILE_NAME} -m "hehe"

100%|████████████████████████████████████████| 782M/782M [00:33<00:00, 24.8MB/s]
Successfully submitted to OTTO – Multi-Objective Recommender System