In [2]:
import pandas as pd
import csv
import time
import numpy as np
import matplotlib.pyplot as plt
import polars as pl
import xgboost as xgb

In [3]:
# Submission files to blend. The number embedded in each file name is
# presumably that submission's leaderboard score -- TODO confirm.
paths = [
    'submission_rerank_0.575.csv',
    'submission2_5_0.578.csv',
    'submission_cvm_0.542.csv',
]

In [4]:
def read_sub(path, weight=1):
    """Load one submission CSV in long format: one row per predicted aid.

    Parameters
    ----------
    path : str
        CSV file that has a ``labels`` column holding space-separated ids.
    weight : int, default 1
        Constant written to the ``vote`` column of every row. It is cast to
        ``UInt8``, so it is expected to be a small non-negative integer.

    Returns
    -------
    polars.DataFrame
        The columns of the CSV with ``labels`` exploded and renamed to
        ``aid`` (``UInt32``), plus a ``vote`` column (``UInt8``).
    """
    wide = pl.read_csv(path)
    # Split the label string into a list and attach the vote *before*
    # exploding, so every exploded row carries the same vote value.
    wide = wide.with_column(pl.col('labels').str.split(by=' '))
    wide = wide.with_column(pl.lit(weight).alias('vote'))

    long = wide.explode('labels').rename({'labels': 'aid'})

    # Narrow the dtypes: the ids come out of the split as strings.
    long = long.with_column(pl.col('aid').cast(pl.UInt32))
    return long.with_column(pl.col('vote').cast(pl.UInt8))

In [5]:
# Load every submission with the default vote weight of 1, then preview
# the first one to check the long-format schema.
subs = list(map(read_sub, paths))
subs[0].head()

session_type,aid,vote
str,u32,u8
"""12899779_click...",59625,1
"""12899779_click...",1253524,1
"""12899779_click...",737445,1
"""12899779_click...",438191,1
"""12899779_click...",731692,1


In [7]:
# Outer-join the three long-format submissions on (session_type, aid) so a
# pair predicted by any of them is kept. The 'vote' column of the second
# frame comes out as 'vote_right', and the explicit suffix makes the third
# frame's column 'vote_right2'.
subs = (
    subs[0]
    .join(subs[1], how='outer', on=['session_type', 'aid'])
    .join(subs[2], how='outer', on=['session_type', 'aid'], suffix='_right2')
)

In [8]:
# Per-submission vote columns produced by the outer joins.
vote_cols = ['vote', 'vote_right', 'vote_right2']
total_votes = pl.col('vote') + pl.col('vote_right') + pl.col('vote_right2')

subs = (
    subs
    # A pair missing from a submission has a null vote after the outer
    # join; count it as zero votes.
    .fill_null(0)
    .with_column(total_votes.alias('vote_sum'))
    .drop(vote_cols)
    # Ascending sort followed by reverse puts the most-voted pairs first.
    .sort(by='vote_sum')
    .reverse()
)

subs.head()

session_type,aid,vote_sum
str,u32,u8
"""13574184_order...",607738,3
"""13574184_order...",1180072,3
"""13574184_order...",1607333,3
"""13574184_order...",867544,3
"""13574184_order...",942224,3


In [9]:
%%time
preds = subs.groupby('session_type').agg([
    pl.col('aid').head(20).alias('labels')
])

preds = preds.with_column(pl.col('labels').apply(lambda lst: ' '.join([str(aid) for aid in lst])))

CPU times: total: 4min 56s
Wall time: 4min 20s


In [10]:
# Write the blended predictions (columns: session_type, labels) to disk.
preds.write_csv('submission.csv')

In [11]:
# Display the blended predictions frame for a visual check.
preds

session_type,labels
str,str
"""14139658_carts...","""199409 148725 ..."
"""14019634_carts...","""1008494 131109..."
"""13179740_click...","""727928 148534 ..."
"""13859500_click...","""1789394 718964..."
"""13716013_order...","""1387079 974098..."
"""13932970_order...","""1107006 83668 ..."
"""13284098_order...","""553229 1401653..."
"""14538914_carts...","""455191 1296039..."
"""12961724_carts...","""1172853 178027..."
"""13337542_order...","""1119715 110288..."


In [13]:
# Row 0, column 1: the space-joined 'labels' string of the first row.
preds[0,1]

'199409 148725 481213 429447 10565 1508734 102005 1639829 619309 1084146 894938 1450052 198599 1303888 819288 169239 1752091 847138 1304914 857553'

In [None]:
# `preds` is a polars DataFrame, so it is written with `write_csv`; the
# pandas-style `to_csv(..., index=False)` call does not apply to it, and
# polars has no index column to suppress.
preds.write_csv('submission_sam.csv')