In [1]:
import os
import gc
import json
import copy
import time
from tqdm.auto import tqdm

import scipy
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import dataset
from torch.utils.data import dataloader
from torch.autograd import Variable
from torch.utils.data import Dataset
from deepctr_torch.layers import DNN

import torchvision
from PIL import Image
import torchvision.models as models
import torchvision.transforms as transforms

In [2]:
def random_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


random_seed(2021)

In [3]:
train = pd.read_csv('data/onlinetrainbseqex3.csv')
test = pd.read_csv('data/onlinetestbseqex3.csv')

In [4]:
del train['Unnamed: 0']
del test['Unnamed: 0']

In [7]:
train_pb = pd.read_csv('features/online_train_pb.csv').rename( columns = { 'date' : 'end_date' } )
train = pd.merge(train, train_pb, how='left')

test_pb = pd.read_csv('features/online_test_pb.csv').rename( columns = { 'date' : 'end_date' } )
test = pd.merge(test, test_pb, how='left')

In [8]:
user_trait = pd.read_csv('features/user_trait_feature.csv')

train = pd.merge(train, user_trait, how='left')
test = pd.merge(test, user_trait, how='left')

In [9]:
launch_train = pd.read_csv('features/launch_online_train.csv').rename( columns = { 'date' : 'end_date' } )
train = pd.merge(train, launch_train, how='left')

launch_test = pd.read_csv('features/launch_online_test.csv').rename( columns = { 'date' : 'end_date' } )
test = pd.merge(test, launch_test, how='left')

In [11]:
train['week'] = train['end_date'].apply(lambda x: (x-130)%7+1)
test['week'] = test['end_date'].apply(lambda x: (x-130)%7+1)

In [12]:
train = train.fillna(0)
test = test.fillna(0)

In [13]:
train["future"] = train.future.apply(lambda x: json.loads(x))
test["future"] = test.future.apply(lambda x: json.loads(x))

In [14]:
train['label'] = train['future'].apply(lambda x: sum(x))
test['label'] = test['future'].apply(lambda x: sum(x))

In [15]:
train['flag'] = train['label'].apply(lambda x: 1 if x>=6 else 0)
test['flag'] = test['label'].apply(lambda x: 1 if x>=6 else 0)

In [16]:
transform = transforms.Compose([transforms.Normalize(mean=[0.5,],std=[0.5,])])

In [17]:
feats = ['playtime_last0', 'video_count_last0', 'playtime_last1', 'video_count_last1', 'playtime_last2', 'video_count_last2',
         'playtime_last3', 'video_count_last3', 'playtime_last4', 'video_count_last4', 'playtime_last5', 'video_count_last5',
         'playtime_last6', 'video_count_last6', 'playtime_last7', 'video_count_last7', 'device_type', 'sex','age', 'education',
         'occupation_status','device_ram_new','device_rom_new','diff_near','is_launch','launch_type_new','launchNum','NumLastWeek','preds_median_30',
         'preds_mean_4','preds_mean_4_weighted','weighted_median','week']

In [18]:
sparse_features = ['device_type','sex','age','education','occupation_status','week','is_launch','launch_type_new']

In [19]:
dense_features = [x for x in feats if x not in sparse_features]

In [20]:
len(feats)

33

In [21]:
voc_size = {}
for i in sparse_features:
    voc_size[i] = train[i].nunique()+1

In [22]:
class AQYDataset(Dataset):
    def __init__(self, df, device):

        self.launch_seq_list = df.list.apply( lambda x : json.loads(x.replace('nan' , '-1')) ).values
        
        self.sparse_feature_list = df[sparse_features].values
        self.dense_feature_list = df[dense_features].values
        
        self.label_list = df['flag'].values

    def __getitem__(self, index):

        
        launch_seq = self.launch_seq_list[index]#[:128] 
        launch_seq = np.array(launch_seq).astype('float') 
        launch_seq.resize(8 , 8 , 3 )

        label = self.label_list[index]
        launch_seq = torch.tensor( launch_seq )
        launch_seq = transform(launch_seq)
        
        sparse_feats = self.sparse_feature_list[index]
        dense_feats = self.dense_feature_list[index]

        return launch_seq, sparse_feats, dense_feats, label


    def __len__(self):
        return len(self.launch_seq_list)

In [23]:
def cal_score(pred, label):
    pred = np.array(pred)
    label = np.array(label)

    diff = (pred - label) / 7
    diff = np.abs(diff)

    score = 100 * (1 - np.mean(diff))
    return score

In [24]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [25]:
device

device(type='cuda')

In [26]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN , self).__init__()
 
        self.layer1 = nn.Sequential(  
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=2), 
            nn.BatchNorm2d(16), 
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))

        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        
        emb_dict = {}
        for feat in sparse_features:
            emb_dict[feat] = nn.Embedding(voc_size[feat], 32)
        self.embedding_layer =  nn.ModuleDict(emb_dict)
        
        self.fc = nn.Linear(288, 16)
        
        self.final = DNN(16+8*32+25, (64,32,1), activation='relu', 
                         l2_reg=0.03, dropout_rate=0.03, use_bn=True, device=device)

   
    def forward(self, x, sparse_feats, dense_feats):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        
        sparse_list = []
        for i in range(len(sparse_features)):
            sparse_list.append(self.embedding_layer[sparse_features[i]](sparse_feats[:,i]))
        
        sparse_emb = torch.cat(sparse_list, 1)
        dnn_input = torch.cat([out, sparse_emb, dense_feats], 1)
        result = self.final(dnn_input)

        result = result.sigmoid()
        
        return result

In [27]:
model = CNN().to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.002)
criterion = nn.BCELoss()
best_val_score = float('-inf')
last_improve = 0

best_model = None

In [28]:
train_dataset = AQYDataset(train, device )
test_dataset = AQYDataset(test, device )

train_loader = DataLoader(train_dataset,
                      batch_size=256*16,
                      shuffle=True,
                      num_workers=4)
test_loader = DataLoader(test_dataset,
                      batch_size=256*16,
                      shuffle=False,
                      num_workers=4)

In [29]:
from sklearn.metrics import accuracy_score
def cal_acc(label_list,pred_list):
    pre_list = []
    for i in pred_list:
        if i>0.5:
            pre_list.append(1)
        else:
            pre_list.append(0)
            
    return accuracy_score(label_list,pre_list)

In [30]:
ls = []
for epoch in range(7):
    model.train()
    pred_list = []
    label_list = []
    for seq, sparse_feats, dense_feats, label in tqdm(train_loader):
        seq = seq.reshape( ( -1,3,8,8 ) ).to(device).to(torch.float32)
        sparse_feats = sparse_feats.long().to(device)
        dense_feats = dense_feats.long().to(device)
        label = label.to(device).to(torch.float32)
        
        pred = model(seq, sparse_feats, dense_feats)
        loss = criterion(pred.squeeze(), label)
        loss.backward()
        optimizer.step()
        model.zero_grad()
        
        pred_list.extend(pred.squeeze().cpu().detach().numpy())
        label_list.extend(label.squeeze().cpu().detach().numpy())
        del sparse_feats
        del dense_feats
        del pred 
        del seq
        del label
    
    total_loss = cal_acc(label_list,pred_list)
    model.eval()

    print(
        f'Epoch: {epoch} Loss: {total_loss}'
    )
    

  0%|          | 0/777 [00:00<?, ?it/s]

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Epoch: 0 Loss: 0.9092451664687292


  0%|          | 0/777 [00:00<?, ?it/s]

Epoch: 1 Loss: 0.925947394774452


  0%|          | 0/777 [00:00<?, ?it/s]

Epoch: 2 Loss: 0.9263078337929508


  0%|          | 0/777 [00:00<?, ?it/s]

Epoch: 3 Loss: 0.9264864807061334


  0%|          | 0/777 [00:00<?, ?it/s]

Epoch: 4 Loss: 0.9267204830008655


  0%|          | 0/777 [00:00<?, ?it/s]

Epoch: 5 Loss: 0.9267720641518549


  0%|          | 0/777 [00:00<?, ?it/s]

Epoch: 6 Loss: 0.9267736367479217


In [31]:
def testf(model, test_loader, device):
    model.eval()

    pred_list = []
    label_list = []

    for seq, sparse_feats, dense_feats, label in tqdm(test_loader):
        seq = seq.reshape( ( -1,3,8,8 ) ).to(device).to(torch.float32)
        sparse_feats = sparse_feats.long().to(device)
        dense_feats = dense_feats.long().to(device)
        label = torch.tensor(label).to(torch.float32).to(device)

        pred = model(seq, sparse_feats, dense_feats)

        pred_list.extend(pred.squeeze().cpu().detach().numpy())
        label_list.extend(label.squeeze().cpu().detach().numpy())

    return pred_list , label_list

In [32]:
pred , label = testf( model, test_loader , device )

  0%|          | 0/9 [00:00<?, ?it/s]

  label = torch.tensor(label).to(torch.float32).to(device)


In [33]:
test[ 'pred' ] = pred

In [34]:
test[['user_id','pred']].to_csv('res/binary_seven.csv',index=False)