In [1]:
import polars as pl

In [2]:
from glob import glob
# Glob pattern matching every submission CSV that takes part in the ensemble.
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
DIR = '/home/anhphantq/ensemble/*.csv'
# `recursive=True` was dropped: it only affects how a `**` path component is
# expanded, and this pattern has none, so the matched files are unchanged.
# glob() returns matches in arbitrary (filesystem-dependent) order, so nothing
# downstream should rely on the position of a file in this list.
paths = glob(DIR)
paths

['/home/anhphantq/ensemble/submission_588_21_1.csv',
 '/home/anhphantq/ensemble/submission_second_588.csv',
 '/home/anhphantq/ensemble/submission_589_20_1.csv',
 '/home/anhphantq/ensemble/submission_first_588.csv',
 '/home/anhphantq/ensemble/submission_588_latest.csv']

In [3]:
import numpy as np
def read_sub(path, weight=1):
    '''Load one submission CSV and reshape it to one row per predicted aid.

    The space-separated `labels` string of every row is split into a list,
    its length is stored in `num`, and the list column is renamed to `aid`.
    Each list position k (0-based) receives the vote `k * weight`, stored in
    `vote`. Finally `aid` and `vote` are exploded together so that every
    predicted aid sits on its own row next to its vote.

    Parameters
    ----------
    path : path of the submission CSV; it must contain a `labels` column.
    weight : multiplier applied to the list position. The default of 1 gives
        every submission the same weight, akin to a standard vote ensemble.

    Returns
    -------
    The reshaped polars DataFrame (original columns with `labels` replaced by
    `aid`, plus `num` and `vote`).
    '''
    def weighted_positions(n_labels):
        # One vote per list position: 0, weight, 2 * weight, ...
        return [position * weight for position in range(n_labels)]

    frame = pl.read_csv(path)
    # Turn the space-separated label string into a list of aids.
    frame = frame.with_column(pl.col('labels').str.split(by=' '))
    # Remember how many aids the submission listed for this row.
    frame = frame.with_column(pl.col("labels").arr.lengths().alias("num"))
    frame = frame.rename({'labels': 'aid'})
    frame = frame.with_column(pl.col('num').apply(weighted_positions).alias('vote'))
    # Both list columns have the same length per row, so they explode in step.
    return frame.explode(['aid', 'vote'])


In [4]:
import os

# Weight applied to each submission, keyed by file name.
# Keying by name instead of by list position matters because glob() returns
# paths in arbitrary, filesystem-dependent order: zipping `paths` against a
# positional weight list could silently attach a weight to the wrong file
# (and zip() would silently drop entries if the two lengths ever differed).
# The pairing below reproduces the one from the recorded run.
SUB_WEIGHTS = {
    'submission_588_21_1.csv': 0.5,
    'submission_second_588.csv': 0.5,
    'submission_589_20_1.csv': 5,
    'submission_first_588.csv': 2,
    'submission_588_latest.csv': 3,
}

path_by_name = {os.path.basename(path): path for path in paths}

# Fail loudly if a file has no weight or a weighted file is missing.
mismatched = sorted(set(path_by_name) ^ set(SUB_WEIGHTS))
if mismatched:
    raise ValueError(f'submission files and SUB_WEIGHTS disagree on: {mismatched}')

# Iterate in SUB_WEIGHTS order so the list order (and therefore the join order
# downstream) is the same on every run.
subs = [read_sub(path_by_name[name], weight) for name, weight in SUB_WEIGHTS.items()]
subs[0].head()

session_type,aid,num,vote
str,str,u32,f64
"""12899779_click...","""59625""",20,0.0
"""12899779_click...","""941596""",20,0.5
"""12899779_click...","""737445""",20,1.0
"""12899779_click...","""1253524""",20,1.5
"""12899779_click...","""894169""",20,2.0


In [5]:
from tqdm.notebook import tqdm
# Start from the first submission and outer-join every other one onto it, so
# each (session_type, aid) pair ends up on a single row. Columns that clash
# with ones already present in the accumulated frame get the suffix `_<i>`,
# where i is the submission's index in `subs`.
rs = subs[0]

for idx, other_sub in enumerate(tqdm(subs[1:]), start=1):
    rs = rs.join(
        other_sub,
        how='outer',
        on=['session_type', 'aid'],
        suffix=f'_{idx}',
    )

  0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
# Value written into every null cell before the ranks are summed; for a vote
# column it acts as the penalty rank of a submission that did not list the aid.
MISSING_RANK = 30
# Regex matching only the vote columns (vote, vote_1, vote_2, ...).
# The folds used to run over every column except the keys, which also pulled
# in the `num*` list-length columns: each submission was counted twice in
# `num_votes`, and `rank_sum` was skewed by list lengths and by MISSING_RANK
# being added for every absent `num*` value.
VOTE_COLS = '^vote.*$'

# NOTE(review): this rebinds `subs` from the list of per-submission frames to a
# single ranked frame, so the join cell cannot be re-run after this one
# without re-creating the list first.
subs = (rs
    .with_column(
        # Number of submissions that predicted this (session_type, aid) pair.
        pl.fold(0, lambda acc, s: acc + s, pl.col(VOTE_COLS).is_not_null()).alias("num_votes")
    )
    .fill_null(MISSING_RANK)
    .with_column(
        # Sum of the weighted ranks across all submissions (lower is better).
        pl.fold(0, lambda acc, s: acc + s, pl.col(VOTE_COLS)).alias("rank_sum")
    )
    # Most votes first; ties broken by the smallest rank sum.
    .sort([pl.col('num_votes'), pl.col('rank_sum')], reverse=[True, False])
)

In [7]:
%%time
TOP_K = 20  # number of aids kept per session_type

# `subs` is already sorted best-first, so taking the head of each group keeps
# the TOP_K best-ranked aids (this relies on groupby preserving the row order
# inside each group, exactly as the previous version did).
preds = subs.groupby('session_type').agg([
    pl.col('aid').head(TOP_K).alias('labels')
])

# Join each list of aids into the space-separated string the submission format
# expects. The aids are already strings, so the native list join gives the
# same text as the former per-row Python `apply` without calling back into
# the interpreter for every row.
preds = preds.with_column(pl.col('labels').arr.join(' '))

CPU times: user 4min 19s, sys: 13.5 s, total: 4min 32s
Wall time: 2min 50s


In [8]:
# Display the ensembled predictions: one row per session_type with its
# space-separated labels.
preds

session_type,labels
str,str
"""14354571_order...","""108125 225619 ..."
"""14430702_carts...","""242879 143334 ..."
"""13684301_order...","""953170 1340813..."
"""13168516_carts...","""1459264 420771..."
"""14304014_order...","""765462 925297 ..."
"""13533377_order...","""564107 64685 1..."
"""13652247_order...","""540844 1346294..."
"""13188137_carts...","""1554258 158672..."
"""13245878_carts...","""444276 1217885..."
"""13154317_click...","""1687012 657282..."


In [9]:
# Output file for the ensembled submission; the path is relative, so it is
# written to the current working directory. The same name is interpolated
# into the shell command of the submit cell.
FILE_NAME = 'ultimate_ensemble_submission.csv'
preds.write_csv(FILE_NAME)

In [10]:
# Upload FILE_NAME to the OTTO competition through the kaggle CLI.
# NOTE(review): this performs a real submission every time the cell runs
# (including "Run All"); "hehe" is the submission message.
!kaggle competitions submit -c otto-recommender-system -f {FILE_NAME} -m "hehe"

100%|████████████████████████████████████████| 782M/782M [00:36<00:00, 22.2MB/s]
Successfully submitted to OTTO – Multi-Objective Recommender System