In [1]:
import polars as pl

In [2]:
from glob import glob
DIR = '/home/anhphantq/ensemble/*.csv'
paths = glob(DIR, recursive= True) 
paths

['/home/anhphantq/ensemble/587_submission.csv',
 '/home/anhphantq/ensemble/submission_588_21_1.csv',
 '/home/anhphantq/ensemble/submission_second_588.csv',
 '/home/anhphantq/ensemble/submission_589_20_1.csv',
 '/home/anhphantq/ensemble/submission_first_588.csv',
 '/home/anhphantq/ensemble/586_submission.csv',
 '/home/anhphantq/ensemble/submission_588_latest.csv']

In [3]:
import numpy as np
def read_sub(path, weight=1): # by default let us assing the weight of 1 to predictions from each submission, this will be akin to a standard vote ensemble
    '''a helper function for loading and preprocessing submissions'''
    return  (
        pl.read_csv(path)
            .with_column(pl.col('labels').str.split(by=' '))
            .with_column(pl.col("labels").arr.lengths().alias("num"))
            .rename({'labels': 'aid'})
            .with_column(pl.col('num').apply(lambda x: [i * weight for i  in range(x)]).alias('vote'))
            .explode(['aid', 'vote'])
            
    )


In [4]:
subs = [read_sub(path, weight) for path, weight in zip(paths, [1 for _ in paths])]
subs[0].head()

session_type,aid,num,vote
str,str,u32,i64
"""12899779_click...","""59625""",20,0
"""12899779_click...","""737445""",20,1
"""12899779_click...","""1354960""",20,2
"""12899779_click...","""941596""",20,3
"""12899779_click...","""1253524""",20,4


In [5]:
from tqdm.notebook import tqdm
rs = subs[0]

for i in tqdm(range(1, len(subs))):
    rs= rs.join(subs[i], how = 'outer', on = ['session_type', 'aid'], suffix= f'_{i}')

  0%|          | 0/6 [00:00<?, ?it/s]

In [6]:
# rs.select([pl.col('^vote.*$')])
subs = (rs
    
    .fill_null(30)
    
    .with_column(pl.min(pl.all().exclude(["aid", 'session_type', 'num_votes'])).alias("rank_sum"))
    .sort([pl.col('rank_sum')], reverse= [False])
 )

In [7]:
subs

session_type,aid,num,vote,num_1,vote_1,num_2,vote_2,num_3,vote_3,num_4,vote_4,num_5,vote_5,num_6,vote_6,rank_sum
str,str,u32,i64,u32,i64,u32,i64,u32,i64,u32,i64,u32,i64,u32,i64,i64
"""12899779_click...","""59625""",20,0,20,0,20,0,20,0,20,0,20,0,20,0,0
"""12899780_click...","""1142000""",20,0,20,0,20,0,20,0,20,0,20,0,20,0,0
"""12899781_click...","""199008""",20,0,20,0,20,0,20,0,20,0,20,0,20,0,0
"""12899782_click...","""834354""",20,1,20,0,20,3,20,4,20,1,20,2,20,3,0
"""12899782_click...","""595994""",20,2,20,1,20,4,20,3,20,2,20,1,20,0,0
"""12899782_click...","""1007613""",20,0,20,2,20,2,20,1,20,0,20,0,20,2,0
"""12899782_click...","""479970""",20,9,20,11,20,0,20,2,20,8,20,4,20,5,0
"""12899782_click...","""1033148""",20,8,20,18,20,1,20,0,20,3,20,6,20,6,0
"""12899783_click...","""1817895""",20,0,20,0,20,0,20,0,20,0,20,0,20,0,0
"""12899784_click...","""1190477""",20,0,20,0,20,0,20,0,20,0,20,0,20,0,0


In [8]:
%%time
preds = subs.groupby('session_type').agg([
    pl.col('aid').head(20).alias('labels')
])

preds = preds.with_column(pl.col('labels').apply(lambda lst: ' '.join([str(aid) for aid in lst])))

CPU times: user 4min 3s, sys: 14.5 s, total: 4min 18s
Wall time: 2min 44s


In [9]:
preds

session_type,labels
str,str
"""13601180_order...","""1816589 113907..."
"""13134559_click...","""30900 754567 1..."
"""14165949_click...","""728030 1529624..."
"""13948972_carts...","""1262735 150398..."
"""12942748_order...","""1484557 909246..."
"""14224362_carts...","""224356 1824425..."
"""13968708_click...","""1750812 413495..."
"""13579969_carts...","""378327 1814759..."
"""14112151_click...","""1128707 164602..."
"""13013181_order...","""1852660 555377..."


In [10]:
FILE_NAME = 'ultimate_ensemble_submission.csv'
preds.write_csv(FILE_NAME)

In [11]:
!kaggle competitions submit -c otto-recommender-system -f {FILE_NAME} -m "hehe"

100%|████████████████████████████████████████| 782M/782M [00:38<00:00, 21.3MB/s]
Successfully submitted to OTTO – Multi-Objective Recommender System