In [1]:
# Modified from https://www.kaggle.com/mbrown89/boost-your-score-guaranteed-leaderboard-probing/data
import numpy as np
import pandas as pd
import os

# Directory holding the input files read by LeaderBoardProbing (test.csv, sales_train.csv).
DATA_DIR = 'data'

class LeaderBoardProbing:
    """Leaderboard-probing helpers for an RMSE-scored submission file.

    The test rows are split into three groups (column ``test_group``):

    * 2 -- the (shop_id, item_id) pair occurs in the sales history,
    * 1 -- the item occurs in the sales history, but not with this shop,
    * 0 -- the item never occurs in the sales history.

    From the public RMSE of a few constant "probe" submissions the mean target
    of each group can be recovered (``probe_mean`` / ``estimate_mean``).  Those
    means are then used to shift (``mean_scale``) and rescale
    (``variance_scale`` / ``variance_scale_v2``) an existing submission.

    Every submission file is expected to have the columns ``ID`` and
    ``item_cnt_month``.  Rows are matched to their test group through ``ID``,
    so the row order of a submission file does not matter.
    """

    def __init__(self):
        """Load test.csv / sales_train.csv from DATA_DIR and label the test groups.

        Side effect: writes the annotated test frame to ``new_test.csv.gz`` in
        the current working directory.
        """
        self.test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
        sales = pd.read_csv(os.path.join(DATA_DIR, 'sales_train.csv'))

        # Routine data cleaning: shop ids 0, 1 and 10 are folded into 57, 58 and 11
        # (presumably duplicate shops -- TODO confirm against the data description).
        shop_id_map = {0: 57, 1: 58, 10: 11}

        def remap(shop_id):
            return shop_id_map.get(shop_id, shop_id)

        sales['shop_id'] = sales['shop_id'].apply(remap)
        self.test['shop_id'] = self.test['shop_id'].apply(remap)

        pairs = set(zip(sales.shop_id, sales.item_id))
        items = set(sales.item_id)
        # NOTE(review): 34 looks like the month index right after the training data -- confirm.
        self.test['date_block_num'] = 34
        self.test['test_group'] = [
            2 if (shop, item) in pairs else (1 if item in items else 0)
            for shop, item in zip(self.test.shop_id, self.test.item_id)
        ]
        self.test.sort_values('ID', inplace=True)
        self.test.to_csv('new_test.csv.gz', index=False)

        # Baseline prediction of 0 for every row; probe files are derived from it.
        self.test['item_cnt_month'] = 0.0
        self.n = len(self.test)
        self.n0 = int((self.test.test_group == 0).sum())
        self.n1 = int((self.test.test_group == 1).sum())
        self.n2 = int((self.test.test_group == 2).sum())

    @staticmethod
    def _suffixed(filename, suffix):
        """Return ``filename`` with ``suffix`` inserted before its last '.csv'.

        Only the last '.csv' is touched, so directory names containing '.csv'
        are left alone.  A name without '.csv' gets the suffix before its
        extension instead of being returned unchanged (which would make the
        caller overwrite its own input file).
        """
        head, sep, tail = filename.rpartition('.csv')
        if sep:
            return head + suffix + sep + tail
        root, ext = os.path.splitext(filename)
        return root + suffix + ext

    @staticmethod
    def _read_submission(filename):
        """Read a submission csv and return it sorted by ID with a fresh 0..n-1 index."""
        df = pd.read_csv(filename)
        return df.sort_values('ID', ascending=True).reset_index(drop=True)

    def _group_masks(self, df):
        """Return boolean masks (group 0, group 1, group 2) aligned with the rows of ``df``.

        The group of each row is looked up through its ``ID`` in ``self.test``.
        Masks taken directly from ``self.test`` would be aligned by index label
        and silently select the wrong rows whenever the submission file is not
        stored in ID order.

        Raises:
            ValueError: if ``df`` contains an ID that is not in the test set.
        """
        group_by_id = self.test.set_index('ID')['test_group']
        groups = df['ID'].map(group_by_id)
        if groups.isnull().any():
            raise ValueError("Submission contains IDs that are not in the test set")
        return groups == 0, groups == 1, groups == 2

    @staticmethod
    def _scale_factor(numerator, denominator):
        """Return numerator/denominator, or 1.0 (no scaling) when the denominator is 0."""
        if denominator == 0:
            print("WARNING>>>>> degenerate denominator, scaling factor set to 1.0")
            return 1.0
        return numerator / denominator

    def probe_mean(self):
        """Generate 4 LeaderBoardProbing files, set target to 0 for all three test groups,
        then set target to 1 for only one test group at a time.
        Manually submit the files to obtain public leaderboard scores.
        Then feed the scores to estimate_mean() to obtain mean values for all groups
        and store those means in true_means().

        Files written (directory ``leak`` is created if needed):
            Probe000.csv -- all zeros
            Probe001.csv -- group 2 set to 1
            Probe010.csv -- group 1 set to 1
            Probe100.csv -- group 0 set to 1
        """
        os.makedirs('leak', exist_ok=True)
        self.save(self.test, 'leak/Probe000.csv')

        for group, name in ((2, 'Probe001'), (1, 'Probe010'), (0, 'Probe100')):
            probe = self.test.copy()
            probe.loc[probe.test_group == group, 'item_cnt_month'] = 1.0
            self.save(probe, 'leak/%s.csv' % name)

    def estimate_mean(self, rmse000, rmse100, rmse010, rmse001):
        """Recover the group means from the public scores of the four probe files.

        Obtain public scores for Probe000, Probe100, Probe010, Probe001
        Public,Private
        Probe000,1.250111,1.236582
        Probe100,1.23528,1.221182
        Probe010,1.38637,1.373707
        Probe001,1.29326,1.279869

        Setting one group (size ni, true mean ui) from 0 to 1 changes the
        squared error by ni - 2*ni*ui, hence
            rmse_i**2 - rmse000**2 = ni*(1 - 2*ui)/n
            ui = (1 - (rmse_i**2 - rmse000**2)*n/ni)/2

        Returns:
            tuple (u0, u1, u2, u): the mean of group 0, 1, 2 and the overall
            mean.  The mean of an empty group is NaN and the group is left out
            of the overall mean.
        """

        def calc(rmse_i, ni):
            if ni == 0:
                # Nothing can be learned about an empty group (and n/ni is undefined).
                return float('nan')
            return (1 - (rmse_i**2 - rmse000**2) * self.n / ni) / 2

        sizes = (self.n0, self.n1, self.n2)
        u0, u1, u2 = (calc(r, ni) for r, ni in zip((rmse100, rmse010, rmse001), sizes))
        u = sum(ni * ui for ni, ui in zip(sizes, (u0, u1, u2)) if ni) / self.n
        return (u0, u1, u2, u)

    def true_means(self):
        """Return [u0, u1, u2, u]: the probed means of group 0, 1, 2 and the overall mean."""
        # computed by leader board probing
        # u0, u1, u2 and overall mean u
        # Kaggle public scores and Coursera scores slightly differ
        # Kaggle scores
        #return [0.7590957299173547, 0.060230457160248385, 0.39458181098366407, 0.2839717256500001]
        # use Coursera scores here
        return [0.758939742420249, 0.0601995732152425, 0.3945593622881204, 0.28393632703149974]

    def mean_scale(self, filename, tol=1e-6, max_iter=1000):
        """Compare the mean of each test group to their true public leaderboard means
        shift the prediction so that the means match

        Shifting and clipping to [0, 20] is repeated until the total absolute
        change of one pass is at most ``tol`` (clipping moves the mean again,
        so one pass is generally not enough).

        filename: your submission csv file name
        tol: stop when the summed absolute change of a pass is <= tol
        max_iter: safety cap on the number of passes
        output: the shifted submission, saved with suffix _mean.csv
        """
        df = self._read_submission(filename)
        masks = self._group_masks(df)
        U = self.true_means()
        # The shifts are fractional; make sure an integer-valued column can take them.
        df['item_cnt_month'] = df['item_cnt_month'].astype(float)
        for group, mask in enumerate(masks):
            print("Group%d: predict mean=" % group, df.loc[mask, 'item_cnt_month'].mean(),
                  "true mean=", U[group])

        change = float('inf')
        previous = df.item_cnt_month.values.copy()
        i = 0
        while change > tol and i < max_iter:
            i += 1
            # zip stops after the three group means; U[3] (overall mean) is not used here.
            for mask, target in zip(masks, U):
                df.loc[mask, 'item_cnt_month'] += target - df.loc[mask, 'item_cnt_month'].mean()
            df['item_cnt_month'] = df['item_cnt_month'].clip(0, 20)
            change = np.sum(np.abs(df.item_cnt_month.values - previous))
            previous = df.item_cnt_month.values.copy()
            print(">loop", i, "change:", change)
        if change > tol:
            print("WARNING>>>>> mean_scale did not converge after", i, "loops")
        self.save(df, self._suffixed(filename, '_mean'))

    def variance_scale(self, filename, rmse, rmse000=1.250111):
        """Rescale the predictions around the overall mean with one global factor.

        With Y the hidden target and Yp the prediction,
            sum(Y*Yp) = n*(rmse000**2 - rmse**2)/2 + sum(Yp*Yp)/2
        and the factor is
            lambda = (sum(Y*Yp) - n*u*u) / (sum(Yp*Yp) - n*u*u)

        filename: your submission csv file name
        rmse: your public leaderboard score
        rmse000: public leaderboard score of the all-zero submission (Probe000)
        output: scaled file with suffix _lambda.csv, which is then passed to mean_scale()
        """
        df = self._read_submission(filename)
        n = df.shape[0]
        u = self.true_means()[-1]
        Yp = df.item_cnt_month.values
        YpYp = np.sum(Yp * Yp)
        YYp = n * (rmse000**2 - rmse**2) / 2 + YpYp / 2
        lambda_ = self._scale_factor(YYp - u * u * n, YpYp - u * u * n)
        print(">>>>>multipler lambda=", lambda_)
        df['item_cnt_month'] = (Yp - u) * lambda_ + u
        filename2 = self._suffixed(filename, '_lambda')
        self.save(df, filename2)
        self.mean_scale(filename2)

    def save(self, df, filename):
        """Produce LeaderBoardProbing file based on dataframe

        Writes the columns ID and item_cnt_month, sorted by ID, with the
        predictions formatted to 5 decimals.  An error message is printed (the
        file is still written) when a prediction is NaN.
        """
        out = df[['ID', 'item_cnt_month']].copy()
        out.sort_values(['ID'], ascending=True, inplace=True)
        # Check before formatting: afterwards the column holds strings and NaN is just 'nan'.
        if out['item_cnt_month'].isnull().any():
            print("ERROR>>>>> There should be no nan entry in the LeaderBoardProbing file!")
        out['item_cnt_month'] = out['item_cnt_month'].apply(lambda x: "%.5f" % x)
        print("Save LeaderBoardProbing to file:", filename)
        out.to_csv(filename, index=False)

    def flip_signs(self, filename):
        """
        Produce LeaderBoardProbing file, flip the sign of prediction for each of the three groups
        filename: your submission csv file name
        output:
            three new submission files with suffix _mpp.csv, _pmp.csv, _ppm.csv
            notation in the notebook
            m: minus, p: plus
            mpp is -++, pmp is +-+, ppm is ++-
            (the positions stand for group 0, group 1, group 2)

        You need to submit these three files to obtain
            rmse_mpp, rmse_pmp, rmse_ppm
        Then you call
            LeaderBoardProbing.variance_scale_v2(filename, rmse_mpp, rmse_pmp, rmse_ppm, rmse)
            Note: rmse is the original rmse score obtained by your filename
        """
        df = self._read_submission(filename)
        masks = self._group_masks(df)
        for mask, suffix in zip(masks, ('_mpp', '_pmp', '_ppm')):
            flipped = df.copy()
            flipped.loc[mask, 'item_cnt_month'] = -flipped.loc[mask, 'item_cnt_month']
            self.save(flipped, self._suffixed(filename, suffix))

    def variance_scale_v2(self, filename, rmse_mpp, rmse_pmp, rmse_ppm, rmse):
        """Rescale each test group around its own mean with its own factor.

        filename: your submission csv file name
        You must use LeaderBoardProbing.flip_signs(filename)
            to generate three additional submission files, obtain their public scores
            and feed those scores as parameters
        Scores: rmse-++, rmse+-+, rmse++-, rmse+++

        Flipping the sign of the prediction for one group changes the squared
        error by 4*sum(Y*Yp) over that group, hence
            sum_g(Y*Yp) = n/4*(rmse_flipped**2 - rmse**2)

        output:
            New scaled submission file (suffix _labmdaV2.csv), which is then
            passed to mean_scale()
        """
        df = self._read_submission(filename)
        masks = self._group_masks(df)
        n = len(df)
        U = self.true_means()

        lambdas = []
        for mask, rmse_flipped, u in zip(masks, (rmse_mpp, rmse_pmp, rmse_ppm), U):
            n_group = int(mask.sum())
            YYp = n / 4 * (rmse_flipped**2 - rmse**2)
            Yp = df.loc[mask, 'item_cnt_month'].values
            lambdas.append(self._scale_factor(YYp - u**2 * n_group,
                                              np.sum(Yp * Yp) - u**2 * n_group))
        print("Labmda: ", lambdas[0], lambdas[1], lambdas[2])

        # The shrink/stretch is fractional; make sure an integer-valued column can take it.
        df['item_cnt_month'] = df['item_cnt_month'].astype(float)
        for mask, lam, u in zip(masks, lambdas, U):
            df.loc[mask, 'item_cnt_month'] = u + lam * (df.loc[mask, 'item_cnt_month'] - u)
        df['item_cnt_month'] = df['item_cnt_month'].clip(0, 20)
        fn = self._suffixed(filename, '_labmdaV2')
        self.save(df, fn)
        self.mean_scale(fn)

In [5]:
import os

# Step 1: shift the final submission so that each test group matches its probed mean.
# The result is written next to the input as submission-final_mean.csv.
FILE_DIR = 'probing'
lbp = LeaderBoardProbing()
lbp.mean_scale(filename=os.path.join(FILE_DIR, 'submission-final.csv'))

Group0: predict mean= 0.8240911625616667 true mean= 0.758939742420249
Group1: predict mean= 0.07654534640393983 true mean= 0.0601995732152425
Group2: predict mean= 0.372822242516336 true mean= 0.3945593622881204
>loop 1 change: 4309.989019927566
>loop 2 change: 397.4842882943934
>loop 3 change: 100.70253869346402
>loop 4 change: 27.459713901931405
>loop 5 change: 7.672664585976546
>loop 6 change: 2.1656835113777615
>loop 7 change: 0.6147213847326545
>loop 8 change: 0.17529071618127515
>loop 9 change: 0.05016009276223721
>loop 10 change: 0.014397686912328733
>loop 11 change: 0.004143973375257848
>loop 12 change: 0.0011956852826626035
>loop 13 change: 0.00034576781783551835
>loop 14 change: 0.00010018874802451838
>loop 15 change: 2.9082189744900067e-05
>loop 16 change: 8.455094461345958e-06
>loop 17 change: 2.4616113630804293e-06
>loop 18 change: 7.176337196712623e-07
Save LeaderBoardProbing to file: probing/submission-final_mean.csv


Original submission -> public and private LB scores: 0.828486 and 0.823450.

After the mean scaling -> public and private LB scores: 0.827726 and 0.822908.

In [6]:
# Step 2: global variance scaling of the mean-scaled file; rmse is its public LB score.
lbp.variance_scale(
    filename=os.path.join(FILE_DIR, 'submission-final_mean.csv'),
    rmse=0.827726,
)

>>>>>multipler lambda= 1.0068056484384023
Save LeaderBoardProbing to file: probing/submission-final_mean_lambda.csv
Group0: predict mean= 0.7621731122916819 true mean= 0.758939742420249
Group1: predict mean= 0.05867760799240407 true mean= 0.0601995732152425
Group2: predict mean= 0.39531219653542643 true mean= 0.3945593622881204
>loop 1 change: 278.36865601168125
>loop 2 change: 18.63568117389056
>loop 3 change: 4.97555400486474
>loop 4 change: 1.391606977887781
>loop 5 change: 0.3912674940753036
>loop 6 change: 0.11042405033764335
>loop 7 change: 0.031276808593900945
>loop 8 change: 0.00889061440673175
>loop 9 change: 0.002535900795186869
>loop 10 change: 0.0007259552492794308
>loop 11 change: 0.00020852648642807892
>loop 12 change: 6.00897285342461e-05
>loop 13 change: 1.7368634887895973e-05
>loop 14 change: 5.034828301095362e-06
>loop 15 change: 1.4635805384022316e-06
>loop 16 change: 4.2642215007659967e-07
Save LeaderBoardProbing to file: probing/submission-final_mean_lambda_mean.csv

Your public and private LB scores are: 0.827716 and 0.822888.

In [7]:
# Step 3: write the three sign-flipped probes (_mpp, _pmp, _ppm) of the mean-scaled file.
lbp.flip_signs(filename=os.path.join(FILE_DIR, 'submission-final_mean.csv'))

Save LeaderBoardProbing to file: probing/submission-final_mean_mpp.csv
Save LeaderBoardProbing to file: probing/submission-final_mean_pmp.csv
Save LeaderBoardProbing to file: probing/submission-final_mean_ppm.csv


Submission,Public,Private

submission-final_mean_mpp.csv, Your public and private LB scores are: 1.457908 and 1.453490.

submission-final_mean_pmp.csv, Your public and private LB scores are: 0.852850 and 0.844612.

submission-final_mean_ppm.csv, Your public and private LB scores are: 1.655513 and 1.617827.

In [8]:
# Step 4: per-group variance scaling, fed with the public LB scores of the three
# sign-flipped probes and of the unflipped mean-scaled file.
lbp.variance_scale_v2(
    filename=os.path.join(FILE_DIR, 'submission-final_mean.csv'),
    rmse_mpp=1.457908,
    rmse_pmp=0.852850,
    rmse_ppm=1.655513,
    rmse=0.827726,
)

Labmda:  0.8903581511236648 1.1060851394194742 1.1486633830454325
Save LeaderBoardProbing to file: probing/submission-final_mean_labmdaV2.csv
Group0: predict mean= 0.7589393572085058 true mean= 0.758939742420249
Group1: predict mean= 0.06227713087114098 true mean= 0.0601995732152425
Group2: predict mean= 0.39425648060930363 true mean= 0.3945593622881204
>loop 1 change: 147.64797089348082
>loop 2 change: 41.37019674807945
>loop 3 change: 15.976880555438102
>loop 4 change: 6.317577526411823
>loop 5 change: 2.5214492717507
>loop 6 change: 1.0094791939310601
>loop 7 change: 0.40468010329338117
>loop 8 change: 0.16231810535779084
>loop 9 change: 0.06514581118636899
>loop 10 change: 0.026146046450402347
>loop 11 change: 0.010493625517551147
>loop 12 change: 0.004211580440570115
>loop 13 change: 0.0016903033188069433
>loop 14 change: 0.000678397482154619
>loop 15 change: 0.0002722725126705766
>loop 16 change: 0.00010927574802814533
>loop 17 change: 4.385748717680715e-05
>loop 18 change: 1.760

Your public and private LB scores are: 0.822393 and 0.818753.

<!-- 000: Your public and private LB scores are: 1.250111 and 1.236582

001: Your public and private LB scores are: 1.293260 and 1.279869

010: Your public and private LB scores are: 1.386370 and 1.373707.

100: Your public and private LB scores are: 1.235280 and 1.221182. -->