In [1]:
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

from sklearn.model_selection import train_test_split # train test비율에 맞게 짜르기

import argparse
import os
import time

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve

import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.autograd import Variable

import pandas as pd
import warnings
warnings.filterwarnings(action='ignore')


import easydict

In [2]:

def bind_model(model, optimizer=None):
    def save(path, *args, **kwargs):
        state = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        torch.save(state, os.path.join(path, 'model.pt'))
        print('Model saved')

    def load(path, *args, **kwargs):
        state = torch.load(os.path.join(path, 'model.pt'))
        model.load_state_dict(state['model'])
        if 'optimizer' in state and optimizer:
            optimizer.load_state_dict(state['optimizer'])
        print('Model loaded')

    # 추론
    def infer(path, **kwargs):
        return inference(path, model)

def inference(path, model, **kwargs):
    model.eval()
    
    data = Variable(preproc_data(pd.read_csv(path), train=False))
    output_pred_labels = torch.round(torch.sigmoid(model(data)))
    output_pred_labels = output_pred_labels.detach().numpy()
    output_pred_labels = output_pred_labels.astype('int').reshape(-1).tolist()

    # output format
    # [(step, label), (step, label), ..., (step, label)]
    results = [(step, label) for step, label in enumerate(output_pred_labels)]
    
    return results

def preproc_data(data, label=None, train=True, val_ratio=0.2, seed=1234):
    if train:
        dataset = dict()

        # NaN 값 0으로 채우기
        data = data.fillna(0)
        
        # 성별 ['M', 'F'] -> [0, 1]로 변환
        #data['gender_enc'] = np.where(data['gender'] == 'M', 0, 1)
        data['d_l_match_yn']=data['d_l_match_yn'].replace([True,False],[1,0])
        data['d_m_match_yn']=data['d_m_match_yn'].replace([True,False],[1,0])
        data['d_s_match_yn']=data['d_s_match_yn'].replace([True,False],[1,0])
        data['h_l_match_yn']=data['h_l_match_yn'].replace([True,False],[1,0])
        data['h_m_match_yn']=data['h_m_match_yn'].replace([True,False],[1,0])
        data['h_s_match_yn']=data['h_s_match_yn'].replace([True,False],[1,0])
        # 날짜 datetime으로 변환
        # df.loc[:, 'date'] = pd.to_datetime(df['date'], format='%Y%m%d')

        #DROP_COLS = ['CDMID', 'gender', 'date', 'date_E','Ht','Wt','LDL','Cr','AST']
        DROP_COLS = ['id', 'contents_open_dt']
        X = data.drop(columns=DROP_COLS).copy()
        y = label

        # bsmote=BorderlineSMOTE(radnom_state=42, k_neighbors=5, m_neighbor=10)
        # X_bsmote, y_bsmote = bsmote.fit_resample(X,y)

        X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                          stratify=y,
                                                          test_size=val_ratio,
                                                          random_state=seed,
                                                          )

        # print(X_train.values)

        X_train = torch.as_tensor(X_train.values).float()
        y_train = np.array(y_train)
        y_train = torch.as_tensor(y_train.reshape(-1, 1)).float()
        X_val = torch.as_tensor(X_val.values).float()
        y_val = np.array(y_val)
        y_val = torch.as_tensor(y_val.reshape(-1, 1)).float()

        dataset['train'] = TensorDataset(X_train, y_train)
        dataset['val'] = TensorDataset(X_val, y_val)

        return dataset
    
    else:
        # NaN 값 0으로 채우기
        data = data.fillna(0)

        # 성별 ['M', 'F'] -> [0, 1]로 변환
        # data['gender_enc'] = np.where(data['gender'] == 'M', 0, 1)

        # 날짜 datetime으로 변환
        # df.loc[:, 'date'] = pd.to_datetime(df['date'], format='%Y%m%d')
        DROP_COLS = ['id', 'contents_open_dt']
        data = data.drop(columns=DROP_COLS).copy()

        X_test = torch.as_tensor(data.values).float()

        return X_test

In [3]:
class LogisticRegression(nn.Module):
    def __init__(self, input_size, output_size):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Sequential(
            nn.Linear(input_size,16),
            nn.BatchNorm1d(16),
            nn.ReLU(),
            nn.Linear(16, output_size)
        )        

    def forward(self, x):
        return self.linear(x)

In [4]:

# args = argparse.ArgumentParser()

# # DONOTCHANGE: They are reserved for nsml
# args.add_argument('--mode', type=str, default='train', help='submit일때 해당값이 test로 설정됩니다.')
# args.add_argument('--iteration', type=str, default='0',
#                     help='fork 명령어를 입력할때의 체크포인트로 설정됩니다. 체크포인트 옵션을 안주면 마지막 wall time 의 model 을 가져옵니다.')
# args.add_argument('--pause', type=int, default=0, help='model 을 load 할때 1로 설정됩니다.')

# args.add_argument('--seed', type=int, default=42)
# args.add_argument('--batch_size', type=int, default=128)
# args.add_argument('--val_ratio', type=int, default=0.2)
# args.add_argument('--lr', type=float, default=0.1)
# args.add_argument('--input_size', type=int, default=17)
# args.add_argument('--epochs', type=int, default=50)
# config = args.parse_args()


config = easydict.EasyDict({
 
        "mode": 'train',
 
        "iteration": '0',
 
        "pause": 0,
 
        "seed": 42,
 
        "batch_size": 64,
 
        "val_ratio": 0.2,
    
        "lr": 0.1,
 
        "input_size": 32,
 
        "epochs": 50,
 
})

print(config.epochs)
time_init = time.time()

torch.manual_seed(config.seed)
np.random.seed(config.seed)

model = LogisticRegression(config.input_size, 1)
loss_fn = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=config.lr)

# nsml.bind() should be called before nsml.paused()
bind_model(model, optimizer=optimizer)

50


In [15]:
train = pd.read_csv('C:\\Users\\user\\Desktop\\computing\\outside\\dacon\\jobcare_recommend\\JobCare_data\\train.csv')
test = pd.read_csv('C:\\Users\\user\\Desktop\\computing\\outside\\dacon\\jobcare_recommend\\JobCare_data\\test.csv')

x = train.iloc[:, :-1]
y = train.iloc[:, -1]

raw_data = x
raw_labels = y
dataset = preproc_data(raw_data, raw_labels, train=True, val_ratio=0.2, seed=1234) #train 데이터 분할
train_dl = DataLoader(dataset['train'], config.batch_size, shuffle=True) # train 데이터
val_dl = DataLoader(dataset['val'], config.batch_size, shuffle=False) # val데이터
#config.batch_size=128
print(dataset) #train 데이터 분할
print(train_dl)
print(val_dl)

time_dl_init = time.time()
# print('Time to dataloader initialization: ', time_dl_init - time_init)


model = LogisticRegression(config.input_size, 1)
loss_fn = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=config.lr)


min_val_loss = np.inf
for epoch in range(config.epochs): #epochs=50
    # train model
    running_loss = 0.
    num_runs = 0
    model.train() #모델을 학습시키겠다는 신호
    
    
    # def train(self, mode=True):
    #     r"""Sets the module in training mode."""      
    #     self.training = mode
    #     for module in self.children():
    #         module.train(mode)
    #     return self


    total_length = len(train_dl)
    auc = 0
    for iter_idx, (data, labels) in enumerate(train_dl):
        data = Variable(data) #데이터
        labels = Variable(labels) #정답지

        output_pred = model(data) #모델이 학습해놓은 예측값
        print(output_pred)
        auc_labels = np.array(labels)
        auc_pred = np.array([])

        for i in range(len(labels)):
            if labels[i][0] == 0.:
                auc_pred = np.append(auc_pred, 1-output_pred[i][0].detach())
            else:
                auc_pred = np.append(auc_pred, output_pred[i][0].detach())

        #auc_pred = np.array([1 if i >=0.5 else 0 for i in output_pred])

        if 1. not in auc_labels:
            auc_labels = np.append(auc_labels, [[1]], axis =0)
            auc_pred = np.append(auc_pred, [0.5], axis=0)

        auc += roc_auc_score(auc_labels, auc_pred)

        loss = loss_fn(output_pred, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_runs += 1


        # get current lr
        opt_params = optimizer.state_dict()['param_groups'][0]
        step = epoch * total_length + iter_idx

        # nsml.report(
        #     epoch=epoch + int(config.iteration),
        #     epoch_total=config.epochs,
        #     iter=iter_idx,
        #     iter_total=total_length,
        #     batch_size=config.batch_size,
        #     train__loss=running_loss / num_runs,
        #     train_auc=auc / num_runs,
        #     step=step,
        #     lr=opt_params['lr'],
        #     scope=locals()
        # )

    print(f"[Epoch {epoch}] Loss: {running_loss / num_runs}")
    print(f"[Epoch {epoch}] auc: {auc / num_runs}")

    # test model with validation data
    model.eval() #모델을 평가하겠다는 신호
    running_loss = 0.
    auc = 0.
    num_runs = 0
    for data, labels in val_dl:
        data = Variable(data)
        labels = Variable(labels)

        output_pred = model(data)
        loss = loss_fn(output_pred, labels)

        auc_labels = np.array(labels)
        auc_pred = np.array([])

        for i in range(len(labels)):
            if labels[i][0] == 0.:
                auc_pred = np.append(auc_pred, 1-output_pred[i][0].detach())
            else:
                auc_pred = np.append(auc_pred, output_pred[i][0].detach())


        if 1. not in auc_labels:
            auc_labels = np.append(auc_labels, [[1]], axis =0)
            auc_pred = np.append(auc_pred, [0.5], axis=0)

        auc += roc_auc_score(auc_labels, auc_pred)

        running_loss += loss.item()
        num_runs += 1

    print(f"[Validation] Loss: {running_loss / num_runs}")
    print(f"[Validation] auc: {auc}")

    # nsml.report(
    #     summary=True,
    #     epoch=epoch,
    #     epoch_total=config.epochs,
    #     val__loss=running_loss / num_runs,
    #     val_auc=auc,
    #     step=(epoch + 1) * total_length,
    #     lr=opt_params['lr']
    # )

    # if (running_loss < min_val_loss) or (epoch % 10 == 0):
    #     nsml.save(epoch)

final_time = time.time()
print("Time to dataloader initialization: ", time_dl_init - time_init)
print("Time spent on training :", final_time - time_dl_init)
print("Total time: ", final_time - time_init)

print("Done")

{'train': <torch.utils.data.dataset.TensorDataset object at 0x0000021F9D85BD00>, 'val': <torch.utils.data.dataset.TensorDataset object at 0x0000021F9D85BCD0>}
<torch.utils.data.dataloader.DataLoader object at 0x0000021F9D85BE80>
<torch.utils.data.dataloader.DataLoader object at 0x0000021F8B6EB850>


SyntaxError: 'break' outside loop (<ipython-input-15-e76cd742c7e8>, line 21)