In [1]:
# Make every top-level expression in a cell display its value, not only the
# last one, so cells that inspect several objects show all of them.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Show 10 decimal places: the normalised `time` values only differ from the
# 5th/6th decimal onwards. The fully qualified key is used because the short
# alias 'precision' is deprecated and rejected by newer pandas releases.
pd.set_option('display.precision', 10)

In [4]:
# Root folder of the KDD Cup 2020 data, relative to this notebook.
BASE_DIR = '../../data/kddcup2020'

# Original inputs. The test path keeps a '{}' placeholder that is filled with
# the phase number when the path is formatted.
TRAIN_DIR = '/'.join([BASE_DIR, 'underexpose_train'])
TEST_DIR = '/'.join([BASE_DIR, 'underexpose_test', 'underexpose_test_click-{}'])

# Outputs written by this notebook; same layout as the inputs with a 'new_' prefix.
NEW_TRAIN_DIR = '/'.join([BASE_DIR, 'new_underexpose_train'])
NEW_TEST_DIR = '/'.join([BASE_DIR, 'new_underexpose_test', 'new_underexpose_test_click-{}'])

In [5]:
# Build an offline validation split for every phase:
#   * each test user's latest click becomes the new query (written as qtime),
#   * the user's remaining test clicks become the new test prefix,
#   * training clicks made after a user's query time are dropped.
for t in range(10):
    # The raw CSV files have no header row, so column names are supplied here.
    df_train = pd.read_csv(TRAIN_DIR + '/underexpose_train_click-{}.csv'.format(t), names=['user_id', 'item_id', 'time'])
    df_test = pd.read_csv((TEST_DIR + '/underexpose_test_click-{}.csv').format(t, t), names=['user_id', 'item_id', 'time'])
    print('phase-{} train set shape: '.format(t), df_train.shape)
    print('phase-{} test set shape: '.format(t), df_test.shape)

    # Latest click time per test user (exactly one row per user) ...
    df_test_latest_ = df_test.groupby('user_id')['time'].max().reset_index()
    # ... joined back onto the log to recover the clicked item as well.
    df_test_latest = df_test.merge(df_test_latest_, on=['user_id', 'time'], how='inner')
    print('phase-{} 新的预测样本shape: '.format(t), df_test_latest.shape)

    # Prefix sequence of the test users: every test click except the held-out
    # latest one. The marker column is NaN on rows without a match in df_tmp.
    df_tmp = df_test_latest.copy()
    df_tmp['remove'] = 'remove'
    df_test_left = df_test.merge(df_tmp, on=['user_id', 'item_id', 'time'], how='left')
    df_test_left = df_test_left[df_test_left['remove'] != 'remove'].drop(columns=['remove'])
    print('phase-{} 测试样本前缀序列shape: '.format(t), df_test_left.shape)

    # Drop training clicks that happen after the same user's query time.
    # The join uses the one-row-per-user frame so a tie on the latest timestamp
    # cannot duplicate training rows. Users without a query get NaN in time_y;
    # the comparison is then False, so all of their rows are kept.
    df_train_tmp = df_train.merge(df_test_latest_, on=['user_id'], how='left')
    is_after_query = df_train_tmp['time_x'] > df_train_tmp['time_y']
    print('phase-{} 比预测数据时间大的数据shape: '.format(t), df_train_tmp[is_after_query].shape)
    df_train = (
        df_train_tmp.loc[~is_after_query, ['user_id', 'item_id', 'time_x']]
        .rename(columns={'time_x': 'time'})
    )

    # Written without header/index to mirror the format of the raw files.
    df_train.to_csv(NEW_TRAIN_DIR + '/new_underexpose_train_click-{}.csv'.format(t), header=False, index=False)
    df_test_left.to_csv((NEW_TEST_DIR + '/new_underexpose_test_click-{}.csv').format(t, t), header=False, index=False)
    df_test_latest.to_csv((NEW_TEST_DIR + '/new_underexpose_test_qtime-{}.csv').format(t, t), header=False, index=False)
    print()

phase-0 train set shape:  (241784, 3)
phase-0 test set shape:  (21216, 3)
phase-0 新的预测样本shape:  (1663, 3)
phase-0 测试样本前缀序列shape:  (19553, 3)
phase-0 比预测数据时间大的数据shape:  (0, 4)
phase-1 train set shape:  (242132, 3)
phase-1 test set shape:  (24465, 3)
phase-1 新的预测样本shape:  (1726, 3)
phase-1 测试样本前缀序列shape:  (22739, 3)
phase-1 比预测数据时间大的数据shape:  (0, 4)
phase-2 train set shape:  (243569, 3)
phase-2 test set shape:  (22745, 3)
phase-2 新的预测样本shape:  (1690, 3)
phase-2 测试样本前缀序列shape:  (21055, 3)
phase-2 比预测数据时间大的数据shape:  (0, 4)
phase-3 train set shape:  (264263, 3)
phase-3 test set shape:  (24021, 3)
phase-3 新的预测样本shape:  (1675, 3)
phase-3 测试样本前缀序列shape:  (22346, 3)
phase-3 比预测数据时间大的数据shape:  (0, 4)
phase-4 train set shape:  (266994, 3)
phase-4 test set shape:  (26386, 3)
phase-4 新的预测样本shape:  (1708, 3)
phase-4 测试样本前缀序列shape:  (24678, 3)
phase-4 比预测数据时间大的数据shape:  (0, 4)
phase-5 train set shape:  (287523, 3)
phase-5 test set shape:  (27653, 3)
phase-5 新的预测样本shape:  (1801, 3)
phase-5 测试样本前缀序列sha

(241784, 3)

(21216, 3)

In [5]:
df_train.head()

Unnamed: 0,user_id,item_id,time
0,4965,18,0.983763
1,20192,34,0.983772
2,30128,91,0.98378
3,29473,189,0.98393
4,10625,225,0.983925


In [4]:
df_test.shape

(21216, 3)

In [6]:
df_test.head()

Unnamed: 0,user_id,item_id,time
0,1133,221,0.983812
1,17864,253,0.983783
2,6941,309,0.983785
3,34089,358,0.983781
4,21659,536,0.983793


In [19]:
# Latest click timestamp of every test user: one row per user_id, with `time`
# holding that user's maximum.
df_test_latest_ = df_test.groupby('user_id')['time'].max().reset_index()

In [20]:
df_test_latest_.head()

Unnamed: 0,user_id,time
0,11,0.9838685327
1,22,0.9839563959
2,44,0.9839020247
3,55,0.9839383809
4,66,0.9838954698


In [21]:
df_test_latest_.shape

(1663, 2)

## qtime-with-answer

In [22]:
# Join the per-user maximum back onto the test log to recover the full
# (user_id, item_id, time) row of each user's latest click.
df_test_latest = df_test.merge(df_test_latest_, on=['user_id', 'time'], how='inner')

In [23]:
df_test_latest.shape

(1663, 3)

In [24]:
df_test_latest.head()

Unnamed: 0,user_id,item_id,time
0,24079,1518,0.9838959644
1,31515,9696,0.9839381235
2,7612,13702,0.9838836526
3,5709,19566,0.9838447079
4,28193,24878,0.9838817189


In [25]:
df_test_latest.sort_values('user_id')

Unnamed: 0,user_id,item_id,time
1217,11,8052,0.9838685327
91,22,101862,0.9839563959
263,44,25164,0.9839020247
995,55,51356,0.9839383809
945,66,43836,0.9838954698
...,...,...,...
422,35321,41138,0.9837885115
62,35343,29609,0.9837481347
828,35354,19713,0.9839025818
1655,35365,93864,0.9837867760


## test set

In [37]:
# Tag every latest-click row with a marker column, then left-join the tags onto
# the full test log; rows without a match keep NaN in `remove`.
df_tmp = df_test_latest.assign(remove='remove')
df_test_left = df_test.merge(
    df_tmp,
    how='left',
    on=['user_id', 'item_id', 'time'],
)

In [38]:
df_test_left.shape

(21216, 4)

In [39]:
df_test_left.head()

Unnamed: 0,user_id,item_id,time,remove
0,1133,221,0.9838116538,
1,17864,253,0.983782961,
2,6941,309,0.9837845963,
3,34089,358,0.9837808999,
4,21659,536,0.9837933069,


In [40]:
# Keep only the rows that were not tagged as a latest click (their `remove` is
# NaN); .copy() detaches the result from the merged frame.
df_test_left = df_test_left[df_test_left['remove'] != 'remove'].copy()

In [45]:
df_test_left.shape

(19553, 3)

In [42]:
21216-1663

19553

In [43]:
# The marker column has served its purpose; return to the three-column layout.
df_test_left = df_test_left.drop(columns=['remove'])

In [44]:
df_test_left.head()

Unnamed: 0,user_id,item_id,time
0,1133,221,0.9838116538
1,17864,253,0.983782961
2,6941,309,0.9837845963
3,34089,358,0.9837808999
4,21659,536,0.9837933069


## train set

In [46]:
df_train.shape

(241784, 3)

In [50]:
# Attach each user's query time to their training clicks (time_x = click time,
# time_y = query time, NaN for users without a query). The one-row-per-user
# frame is joined so a tie on the latest timestamp cannot duplicate rows.
df_train_tmp = df_train.merge(df_test_latest_, on=['user_id'], how='left')

In [51]:
df_train_tmp.shape

(241784, 4)

In [52]:
df_train_tmp.head()

Unnamed: 0,user_id,item_id,time_x,time_y
0,4965,18,0.983763476,
1,20192,34,0.9837723703,
2,30128,91,0.9837801864,
3,29473,189,0.9839301484,
4,10625,225,0.9839253997,


In [58]:
# Training clicks of users that also have a query time. Rows of all other users
# (time_y is NaN) are discarded here, so `tmp` covers test users only.
tmp = df_train_tmp.dropna(subset=['time_y'])

In [64]:
# Remove training clicks made after the same user's query time. This works on
# the full joined frame instead of `tmp`, so users without a query are kept:
# their time_y is NaN, the comparison is False, and the row survives.
is_after_query = df_train_tmp['time_x'] > df_train_tmp['time_y']
df_train = (
    df_train_tmp.loc[~is_after_query, ['user_id', 'item_id', 'time_x']]
    .rename(columns={'time_x': 'time'})
)