In [1]:
import polars as pl

In [2]:
from glob import glob
# Glob pattern matching every CSV submission directly inside the ensemble folder.
DIR = '/home/anhphantq/ensemble/*.csv'
# NOTE: glob() does not guarantee any particular ordering of the returned paths.
# NOTE(review): the pattern contains no '**', so recursive=True should have no effect — confirm.
paths = glob(DIR, recursive= True) 
# Last expression of the cell: shows the list of files that were found.
paths

['/home/anhphantq/ensemble/submission_588_21_1.csv',
 '/home/anhphantq/ensemble/submission_second_588.csv',
 '/home/anhphantq/ensemble/submission_589_20_1.csv',
 '/home/anhphantq/ensemble/submission_first_588.csv',
 '/home/anhphantq/ensemble/public_submission.csv',
 '/home/anhphantq/ensemble/submission_588_latest.csv']

In [3]:
import numpy as np
def read_sub(path, shift=0): # higher shift more important
    """Load one submission CSV and reshape it to one row per (session_type, aid).

    The space-separated ``labels`` string of every row is split into a list,
    renamed to ``aid``, and paired with a ``vote`` list in which the item at
    position ``i`` receives the value ``i - shift``. Both lists are then
    exploded so that each candidate gets its own row.

    Args:
        path: location of the submission CSV; it must provide the
            ``session_type`` and ``labels`` columns.
        shift: value subtracted from every position to form the vote.

    Returns:
        A polars DataFrame with the columns ``session_type``, ``aid``, ``vote``.
    """
    def _votes(count):
        # One vote per label position: the position itself minus the shift.
        return [position - shift for position in range(count)]

    frame = pl.read_csv(path)
    frame = frame.with_column(pl.col('labels').str.split(by=' '))
    # Remember how many labels each row has so a vote list of equal length can be built.
    frame = frame.with_column(pl.col("labels").arr.lengths().alias("num"))
    frame = frame.rename({'labels': 'aid'})
    frame = frame.with_column(pl.col('num').apply(_votes).alias('vote'))
    # 'aid' and 'vote' have the same length per row, so they explode in lockstep.
    exploded = frame.explode(['aid', 'vote'])
    return exploded[['session_type','aid', 'vote']]


In [4]:
import os  # local to this cell: only used to turn a path into its file name

# Shift applied to each submission, keyed by file name instead of by list
# position. glob() returns paths in arbitrary order, so a positional list can
# silently attach a shift to the wrong file when the notebook is re-run on
# another machine or after files are added/removed.
# Higher shift = more important (see read_sub).
SHIFT_BY_FILE = {
    'submission_588_21_1.csv': 0.2,
    'submission_second_588.csv': 0.2,
    'submission_589_20_1.csv': 0.9,
    'submission_first_588.csv': 0.2,
    'public_submission.csv': -18,
    'submission_588_latest.csv': 0.1,
}

file_names = [os.path.basename(path) for path in paths]

# Fail loudly if the folder content and the configured shifts have drifted apart.
missing_shift = sorted(set(file_names) - set(SHIFT_BY_FILE))
missing_file = sorted(set(SHIFT_BY_FILE) - set(file_names))
assert not missing_shift, f'no shift configured for: {missing_shift}'
assert not missing_file, f'configured submissions not found: {missing_file}'

# `shifts` stays a list aligned with `paths`, exactly as before.
shifts = [SHIFT_BY_FILE[name] for name in file_names]
assert len(paths) == len(shifts)
subs = [read_sub(path, shift) for path, shift in zip(paths, shifts)]
subs[0].head()

session_type,aid,vote
str,str,f64
"""12899779_click...","""59625""",-0.2
"""12899779_click...","""941596""",0.8
"""12899779_click...","""737445""",1.8
"""12899779_click...","""1253524""",2.8
"""12899779_click...","""894169""",3.8


In [5]:
from tqdm.notebook import tqdm

# Start from the first submission and outer-join every other one onto it, so a
# (session_type, aid) pair is kept if it appears in at least one submission.
# The vote column of the idx-th submission ends up suffixed with `_{idx}`.
rs = subs[0]
for idx, other in enumerate(tqdm(subs[1:]), start=1):
    rs = rs.join(other, on=['session_type', 'aid'], how='outer', suffix=f'_{idx}')

  0%|          | 0/5 [00:00<?, ?it/s]

In [6]:
# Every column except the join keys is treated as a vote column
# ('num_votes' is excluded as well, although `rs` shows no such column).
vote_columns = pl.all().exclude(["aid", 'session_type', 'num_votes'])

subs = (
    rs
    # A pair that a submission did not contain has a null vote after the
    # outer join; it is given the value 30 instead.
    .fill_null(30)
    # NOTE: despite its name, "rank_sum" is the row-wise MINIMUM of the votes.
    .with_column(pl.min(vote_columns).alias("rank_sum"))
    # Ascending order: the smallest value comes first.
    .sort('rank_sum', reverse=False)
)

In [7]:
# Display the combined table: one row per (session_type, aid) with every vote column and rank_sum.
subs

session_type,aid,vote,vote_1,vote_2,vote_3,vote_4,vote_5,rank_sum
str,str,f64,f64,f64,f64,i64,f64,f64
"""12899779_click...","""59625""",-0.2,-0.2,-0.9,-0.2,33,-0.1,-0.9
"""12899780_click...","""1142000""",-0.2,-0.2,-0.9,-0.2,34,-0.1,-0.9
"""12899781_click...","""199008""",-0.2,-0.2,-0.9,-0.2,35,-0.1,-0.9
"""12899782_click...","""1033148""",17.8,0.8,-0.9,2.8,23,5.9,-0.9
"""12899783_click...","""1817895""",-0.2,-0.2,-0.9,-0.2,33,-0.1,-0.9
"""12899784_click...","""1190477""",-0.2,-0.2,-0.9,-0.2,35,-0.1,-0.9
"""12899785_click...","""775584""",-0.2,-0.2,-0.9,-0.2,35,-0.1,-0.9
"""12899786_click...","""955252""",-0.2,-0.2,-0.9,-0.2,34,-0.1,-0.9
"""12899787_click...","""1024433""",0.8,-0.2,-0.9,-0.2,33,-0.1,-0.9
"""12899788_click...","""1663048""",-0.2,-0.2,-0.9,-0.2,33,-0.1,-0.9


In [8]:
%%time
preds = subs.groupby('session_type').agg([
    pl.col('aid').head(20).alias('labels')
])

preds = preds.with_column(pl.col('labels').apply(lambda lst: ' '.join([str(aid) for aid in lst])))

CPU times: user 4min 52s, sys: 16.6 s, total: 5min 8s
Wall time: 3min 12s


In [9]:
# Number of aids in each session's label string.
# Stored as `label_counts`, NOT `len`: rebinding `len` would shadow the builtin
# and make every later (or re-run) `len(...)` call in the notebook fail.
label_counts = preds['labels'].apply(lambda labels: len(labels.split()))

In [10]:
# Display the final predictions: one row per session_type with its space-separated labels.
preds

session_type,labels
str,str
"""13038765_click...","""473227 483815 ..."
"""13554023_click...","""738098 435253 ..."
"""13699836_click...","""1501417 174188..."
"""14532338_click...","""1417794 457354..."
"""14446834_click...","""409620 1052124..."
"""13120164_click...","""1815731 484794..."
"""14127633_carts...","""891071 1677400..."
"""13177085_order...","""1615944 671829..."
"""14060055_click...","""281580 485992 ..."
"""12965971_carts...","""834162 1597320..."


In [11]:
# Name of the submission file; written relative to the notebook's working directory.
FILE_NAME = 'ultimate_ensemble_submission.csv'
# Persist the (session_type, labels) table as CSV.
preds.write_csv(FILE_NAME)

In [12]:
# Upload the CSV written above through the Kaggle CLI; IPython substitutes {FILE_NAME}
# with the Python variable's value. NOTE(review): consider a more descriptive -m message.
!kaggle competitions submit -c otto-recommender-system -f {FILE_NAME} -m "hehe"

100%|████████████████████████████████████████| 783M/783M [00:33<00:00, 24.4MB/s]
Successfully submitted to OTTO – Multi-Objective Recommender System