In [1]:
import pandas as pd
import numpy as np
# import config
import data_process.neg_sample as ng_sample
from sklearn import metrics, preprocessing
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
import tqdm
import torch.nn.functional as F
import random
import evaluate_entity
from model_entity import EntityCat
from data_utils import CatData
from torch.utils.tensorboard import SummaryWriter
import os
from datetime import datetime
import time
# import argparse

In [2]:
# Run configuration: training batch size, number of epochs, and the cut-off
# that is passed to evaluate_entity.metrics.
BATCH_SIZE = 32_768  # == 2**15
EPOCHS = 10
TOP_K = 10

In [3]:
# Use the first CUDA GPU when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [4]:
device

device(type='cuda', index=0)

In [5]:
# Load the three interaction tables used by this notebook (two training
# sources and one test source).
# NOTE(review): df_train1 is read through the project helper
# ng_sample.read_feather while df_train2 uses pd.read_feather directly —
# confirm the helper is actually required here.
df_train1  = ng_sample.read_feather("../Data/jobs/leave_one_train")
df_train2 = pd.read_feather("../Data/jobs/pos_neg_train_uncode0")
df_test = pd.read_csv("../Data/jobs/apps_neg.csv")

In [6]:
# Stack all training and test rows into one frame so they can be encoded
# together; the 'flag' column records the origin so the frame can be split
# again afterwards (1 = training rows, -1 = test rows).
# Every row of df_train1 gets rating 1 (presumably it holds positives only — TODO confirm).
df_train1['rating'] = 1
df_train_all = pd.concat([df_train1, df_train2], axis=0)
# Note: both flag assignments mutate the frames in place.
df_train_all['flag'] = 1
df_test['flag'] = -1
df_all = pd.concat([df_train_all, df_test], axis=0)

In [7]:
# Directory holding the cleaned user table. It can be overridden through the
# JRS_DATA_DIR environment variable so the notebook is not tied to a single
# machine; the original absolute path remains the default.
data_input_path = os.environ.get(
    'JRS_DATA_DIR',
    '/home/hao/Documents/MA_thesis_win/hao_jrs/data/clean/sub',
)
# Rename the key column to 'user' so it matches the key used when merging
# with the interaction frame. rename() returns a new frame (no inplace mutation).
apps = (
    pd.read_feather(os.path.join(data_input_path, 'users_sub'))
    .rename(columns={'UserID': 'user'})
)

In [8]:
# 'UserID', 'Applies', 'Split', 'City', 'DegreeType', 'State', 'Major', 'CurrentlyEmployed',
#          'ManagedOthers', 'WorkHistoryCount', 'TotalYearsExperience',
#          'ManagedHowMany', 'JobID', 'WindowID'

In [9]:
# Attach the user attributes to every interaction row (left join on 'user').
# Re-running this cell on an already merged frame would add suffixed duplicate columns.
df_all = df_all.merge(apps, on='user', how='left')

In [10]:
df_all.columns

Index(['user', 'item', 'rating', 'flag', 'WindowID', 'Split', 'City', 'State',
       'Country', 'ZipCode', 'DegreeType', 'Major', 'GraduationDate',
       'WorkHistoryCount', 'TotalYearsExperience', 'CurrentlyEmployed',
       'ManagedOthers', 'ManagedHowMany'],
      dtype='object')

In [11]:
# Single LabelEncoder instance; the encoding loop further down re-fits it for each column.
le = preprocessing.LabelEncoder()

In [12]:
# Inspect every available column. `features` is reassigned to the actual
# model input columns further down.
features = df_all.columns
features

Index(['user', 'item', 'rating', 'flag', 'WindowID', 'Split', 'City', 'State',
       'Country', 'ZipCode', 'DegreeType', 'Major', 'GraduationDate',
       'WorkHistoryCount', 'TotalYearsExperience', 'CurrentlyEmployed',
       'ManagedOthers', 'ManagedHowMany'],
      dtype='object')

In [48]:
# User-side context columns used as model inputs in addition to the user and item ids.
context_feature = ['ManagedOthers', 'ManagedHowMany']

In [49]:
# Model input columns: the two id columns first, then the context columns.
features = ['user', 'item'] + context_feature

In [50]:
# Integer-encode every model input column in place.
# `.cat.codes` is an integer Series that already uses -1 for missing values and
# never contains NaN, so the former `.fillna(-1)` was a no-op and is dropped.
# LabelEncoder then re-indexes the codes to a contiguous 0..n-1 range (a -1
# "missing" code, when present, becomes index 0), so the values can index
# embedding tables sized by the column's number of unique values.
# NOTE: `le` is re-fitted on every column, so after the loop it only holds the
# mapping of the last column and cannot be used to decode the others.
for f in features:
    col_codes = df_all[f].astype('category').cat.codes
    df_all[f] = le.fit_transform(col_codes.values)

In [51]:
# Split the jointly encoded frame back into training rows (flag 1) and test rows (flag -1).
df_train = df_all.loc[df_all['flag'] == 1]
df_test = df_all.loc[df_all['flag'] == -1]

In [52]:
# The helper 'flag' column is no longer needed once the frames are split.
df_train = df_train.drop(columns=['flag'])
df_test = df_test.drop(columns=['flag'])

In [53]:
# Keep only the model inputs, followed by the target column, in that order.
df_train = df_train.loc[:, features + ['rating']]
df_test = df_test.loc[:, features + ['rating']]

In [54]:
# No numerical inputs are used in this run.
num_feature=[]
# Note: this is the same list object as `features`, not a copy.
cat_feature = features
# Name of the target column.
label_name = 'rating'

In [55]:
# Training rows as a plain NumPy array (column order: features, then rating).
np_train = df_train.to_numpy()

In [56]:
# Test rows as a plain NumPy array (column order: features, then rating).
np_test = df_test.to_numpy()

In [57]:
np_train.shape

(2747605, 5)

In [58]:
# Wrap the arrays in the project's dataset class and build the two loaders.
train_dataset = CatData(np_train)
test_dataset = CatData(np_test)

train_loader = data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
)
# NOTE(review): a test batch holds 100 + 1 rows and is never shuffled —
# presumably one positive plus 100 sampled negatives per user; confirm against
# how apps_neg.csv is built.
test_loader = data.DataLoader(
    test_dataset,
    batch_size=100 + 1,
    shuffle=False,
    num_workers=0,
)

In [59]:
# One [cardinality, embedding_dim] pair per categorical column; the embedding
# dimension is half the cardinality (rounded up), capped at 50.
embedding_size = []
for col in cat_feature:
    cardinality = int(df_all[col].nunique())
    embedding_size.append([cardinality, min((cardinality + 1) // 2, 50)])

In [60]:
embedding_size

[[90169, 50], [131997, 50], [2, 1], [253, 50]]

In [61]:
# Build the network, move it to the selected device, then create the loss and
# the optimizer (the optimizer is created after the model has been moved).
model = EntityCat(
    embedding_size=embedding_size,
    num_numerical_cols=len(num_feature),
    output_size=2,
)
model.to(device)

loss_function = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [62]:
print(model)

EntityCat(
  (all_embeddings): ModuleList(
    (0): Embedding(90169, 50)
    (1): Embedding(131997, 50)
    (2): Embedding(2, 1)
    (3): Embedding(253, 50)
  )
  (embedding_dropout): Dropout(p=0.4, inplace=False)
  (layers): Sequential(
    (0): Linear(in_features=151, out_features=100, bias=True)
    (1): ReLU(inplace=True)
    (2): Dropout(p=0.4, inplace=False)
    (3): Linear(in_features=100, out_features=2, bias=True)
  )
)


In [63]:
# Preview the run tag built from the context feature names; the following
# cell computes the same value again before creating the writer.
tb_cf = "-".join(context_feature) 
tb_cf

'ManagedOthers-ManagedHowMany'

In [64]:
# TensorBoard writer for this run. The log directory name combines the context
# feature names with the start time (month-day_hour-minute-second).
tb_cf = "-".join(context_feature) 
timestamp = datetime.now().strftime('%m-%d_%H-%M-%S')
writer = SummaryWriter('runs/trainer_{}_{}'.format(tb_cf, timestamp))
# NOTE(review): plot_n_batch is not referenced anywhere else in the visible code.
plot_n_batch = 10



def run_one_epoch(model, epoch_index, writer, data_loader=train_loader, is_train=True):
    """Run one full pass over ``data_loader`` and return the epoch averages.

    Relies on the notebook-level ``device``, ``loss_function``, ``optimizer``,
    ``TOP_K`` and ``evaluate_entity``. Switching the model between train and
    eval mode is left to the caller.

    Args:
        model: network called on each batch; column 1 of its output is used as
            the logit passed to ``loss_function``.
        epoch_index: index of the current epoch (currently unused, kept so
            existing callers keep working).
        writer: TensorBoard writer (currently unused, kept so existing callers
            keep working).
        data_loader: iterable yielding ``(cat_data, label)`` batches.
        is_train: when True the optimizer is stepped on every batch; when False
            the pass only evaluates and runs without gradient tracking.

    Returns:
        Tuple ``(avg_loss, avg_HR, avg_NDCG, avg_ROC)``: the batch loss and the
        three values returned by ``evaluate_entity.metrics``, each averaged
        over the batches of this pass.

    Raises:
        ValueError: if ``data_loader`` yields no batches.
    """
    running_loss = 0.
    n_batches = 0
    HR, NDCG, ROC = [], [], []

    # Gradients are only needed when the weights are updated; disabling them
    # for evaluation avoids building the autograd graph for every test batch.
    with torch.set_grad_enabled(is_train):
        for cat_data, label in data_loader:
            cat_data = cat_data.to(device)
            label = label.to(device).float()

            prediction = model(cat_data)[:, 1]
            loss = loss_function(prediction, label)
            running_loss += loss.item()
            n_batches += 1

            if is_train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Metrics never need gradients, so pass a detached tensor.
            HR_1batch, NDCG_1batch, ROC_1batch = evaluate_entity.metrics(
                cat_data, prediction.detach(), label, TOP_K, is_train)
            HR.append(HR_1batch)
            NDCG.append(NDCG_1batch)
            ROC.append(ROC_1batch)

    if n_batches == 0:
        # Previously this surfaced as an unbound `batch` variable.
        raise ValueError("data_loader yielded no batches")

    avg_loss = running_loss / n_batches
    avg_HR = np.mean(HR)
    avg_NDCG = np.mean(NDCG)
    avg_ROC = np.mean(ROC)

    return avg_loss, avg_HR, avg_NDCG, avg_ROC

 
# Train for EPOCHS epochs, evaluating on the test loader after every epoch and
# logging the epoch averages to TensorBoard. The writer is flushed and closed
# in `finally` so the event file is finalized even if training raises or is
# interrupted.
try:
    for epoch in range(EPOCHS):
        print('EPOCH {}/{}: ---------'.format(epoch, EPOCHS))
        start_time = time.time()

        # Training pass: train mode on, weights are updated.
        model.train()
        avg_loss_train, avg_HR_train, avg_NDCG_train, avg_ROC_train = run_one_epoch(
            model, epoch, writer, data_loader=train_loader, is_train=True)

        # Evaluation pass: eval mode, no weight updates.
        model.eval()
        avg_loss_test, avg_HR_test, avg_NDCG_test, avg_ROC_test = run_one_epoch(
            model, epoch, writer, data_loader=test_loader, is_train=False)

        elapsed_time = time.time() - start_time
        print("The time elapse of epoch {:03d}".format(epoch) + " is: " +
              time.strftime("%H: %M: %S", time.gmtime(elapsed_time)))

        # One metric per line. The lines are joined explicitly so that no
        # source indentation leaks into the printed output.
        print("\n".join([
            f"train_loss:{avg_loss_train}",
            f"train_HR:{avg_HR_train}",
            f"train_NDCG:{avg_NDCG_train}",
            f"train_ROC:{avg_ROC_train}",
            f"test_loss:{avg_loss_test}",
            f"test_HR:{avg_HR_test}",
            f"test_NDCG:{avg_NDCG_test}",
            f"test_ROC:{avg_ROC_test}",
        ]))

        writer.add_scalars('Loss',
                           {'Train': avg_loss_train, 'Test': avg_loss_test},
                           epoch)

        writer.add_scalars('HitRate',
                           {'Train': avg_HR_train, 'Test': avg_HR_test},
                           epoch)

        writer.add_scalars('NDCG',
                           {'Train': avg_NDCG_train, 'Test': avg_NDCG_test},
                           epoch)

        writer.add_scalars('ROC',
                           {'Train': avg_ROC_train, 'Test': avg_ROC_test},
                           epoch)

    # # model_path = 'runs/model_{}'.format(timestamp)
    # # torch.save(model.state_dict(), model_path)
finally:
    writer.flush()
    writer.close()

EPOCH 0/10: ---------
The time elapse of epoch 000 is: 00: 02: 27
train_loss:0.7218547818206605
train_HR:0.0            
train_NDCG:0.0
train_ROC:0.0            
test_loss:0.7074419527267505
test_HR:0.09912371619711674            
test_NDCG:0.04770466257251391
test_ROC:0.49968825295123226
EPOCH 1/10: ---------
The time elapse of epoch 001 is: 00: 02: 26
train_loss:0.6938206340585437
train_HR:0.0            
train_NDCG:0.0
train_ROC:0.0            
test_loss:0.7031366280252913
test_HR:0.11107671184934918            
test_NDCG:0.052872137754261014
test_ROC:0.5089856106392431
EPOCH 2/10: ---------
The time elapse of epoch 002 is: 00: 02: 28
train_loss:0.6912366918155125
train_HR:0.0            
train_NDCG:0.0
train_ROC:0.0            
test_loss:0.7258544233982954
test_HR:0.12673136719118064            
test_NDCG:0.06139332349654843
test_ROC:0.5220352398002449
EPOCH 3/10: ---------
The time elapse of epoch 003 is: 00: 02: 24
train_loss:0.6715300381183624
train_HR:0.0            
train_NDCG

In [None]:
# for epoch in tqdm.tqdm(range(EPOCHS)):
#     model.train()
#     for batch, (cat_data, label) in enumerate(train_loader):
#         cat_data = cat_data.to(device)
#         label = label.to(device)
#         model.zero_grad()
#         prediction = model(cat_data)[:,1]
#         label = label.float()
#         loss = loss_function(prediction, label)
#         loss.backward()
#         optimizer.step()
#     model.eval()
#     HR, NDCG, ROC = evaluate_entity.metrics(model, test_loader, TOP_K)
#     print("HR: {:.3f}\tNDCG: {:.3f}\tROC: {:.3f}".format(HR, NDCG, ROC))
