In [3]:
import polars as pl

In [4]:
from glob import glob
DIR = '/home/anhphantq/ensemble/*.csv'
paths = glob(DIR, recursive= True) 
paths

['/home/anhphantq/ensemble/587_submission.csv',
 '/home/anhphantq/ensemble/submission_588_21_1.csv',
 '/home/anhphantq/ensemble/submission_second_588.csv',
 '/home/anhphantq/ensemble/submission_589_20_1.csv',
 '/home/anhphantq/ensemble/submission_first_588.csv',
 '/home/anhphantq/ensemble/586_submission.csv',
 '/home/anhphantq/ensemble/submission_588_latest.csv']

In [5]:
def read_sub(path, weight=1): # by default let us assing the weight of 1 to predictions from each submission, this will be akin to a standard vote ensemble
    '''a helper function for loading and preprocessing submissions'''
    return (
        pl.read_csv(path)
            .with_column(pl.col('labels').str.split(by=' '))
            .with_column(pl.lit(weight).alias('vote'))
            .explode('labels')
            .rename({'labels': 'aid'})
            .with_column(pl.col('aid').cast(pl.UInt32)) # we are casting the `aids` to `Int32`! memory management is super important to ensure we don't run out of resources
            .with_column(pl.col('vote').cast(pl.UInt8))
    )

In [6]:
subs = [read_sub(path) for path, weight in zip(paths, [3,0.5,0.5,5,2,3,2])]
subs[0].head()

session_type,aid,vote
str,u32,u8
"""12899779_click...",59625,1
"""12899779_click...",737445,1
"""12899779_click...",1354960,1
"""12899779_click...",941596,1
"""12899779_click...",1253524,1


In [7]:
subs = subs[0].join(subs[1], how='outer', on=['session_type', 'aid']).join(subs[2], how='outer', on=['session_type', 'aid'], suffix='_right2')
subs.head()

session_type,aid,vote,vote_right,vote_right2
str,u32,u8,u8,u8
"""12899779_click...",59625,1,1,1
"""12899779_click...",941596,1,1,1
"""12899779_click...",737445,1,1,1
"""12899779_click...",1253524,1,1,1
"""12899779_click...",894169,1,1,1


In [8]:
subs = (subs
    .fill_null(0)
    .with_column((pl.col('vote') + pl.col('vote_right') + pl.col('vote_right2')).alias('vote_sum'))
    .drop(['vote', 'vote_right', 'vote_right2'])
    .sort(by='vote_sum')
    .reverse()
)

subs.head()

session_type,aid,vote_sum
str,u32,u8
"""14571581_order...",1392029,3
"""14571581_order...",555996,3
"""14571581_order...",785201,3
"""14571581_order...",1158237,3
"""14571581_order...",1678885,3


In [9]:
%%time
preds = subs.groupby('session_type').agg([
    pl.col('aid').head(20).alias('labels')
])

preds = preds.with_column(pl.col('labels').apply(lambda lst: ' '.join([str(aid) for aid in lst])))

CPU times: user 3min 20s, sys: 11.4 s, total: 3min 31s
Wall time: 2min 24s


In [10]:
preds

session_type,labels
str,str
"""13943586_order...","""87218 392090 9..."
"""12922939_carts...","""369224 1802254..."
"""14390047_order...","""289762 1658239..."
"""13596719_click...","""1272551 639239..."
"""13844915_click...","""1773311 131641..."
"""14342142_order...","""1310943 139792..."
"""14478180_carts...","""1095682 551645..."
"""13588863_order...","""68792 540316 9..."
"""13476467_carts...","""1222872 105598..."
"""13682425_carts...","""872978 1751770..."


In [11]:
FILE_NAME = 'haizz_ensemble_submission.csv'
preds.write_csv(FILE_NAME)

In [12]:
!kaggle competitions submit -c otto-recommender-system -f {FILE_NAME} -m "hehe"

100%|████████████████████████████████████████| 782M/782M [00:40<00:00, 20.4MB/s]
Successfully submitted to OTTO – Multi-Objective Recommender System