In [1]:

import numpy as np

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.autograd import Variable

import pandas as pd
import warnings
warnings.filterwarnings(action='ignore')


import easydict

In [2]:
#평가 지표

def f1_loss(y_true:torch.Tensor, y_pred:torch.Tensor, is_training=False) -> torch.Tensor:

    assert y_true.ndim == 1
    assert y_pred.ndim == 1 or y_pred.ndim == 2
    
    if y_pred.ndim == 2:
        y_pred = y_pred.argmax(dim=1)
        
    
    tp = (y_true * y_pred).sum().to(torch.float32)
    tn = ((1 - y_true) * (1 - y_pred)).sum().to(torch.float32)
    fp = ((1 - y_true) * y_pred).sum().to(torch.float32)
    fn = (y_true * (1 - y_pred)).sum().to(torch.float32)
    
    epsilon = 1e-7
    
    precision = tp / (tp + fp + epsilon)
    recall = tp / (tp + fn + epsilon)
    
    f1 = 2* (precision*recall) / (precision + recall + epsilon)
    f1.requires_grad = is_training
    return f1

In [163]:
#데이터 불러오기
train = pd.read_csv('C:\\Users\\user\\Desktop\\computing\\outside\\dacon\\jobcare_recommend\\JobCare_data\\train.csv')
test = pd.read_csv('C:\\Users\\user\\Desktop\\computing\\outside\\dacon\\jobcare_recommend\\JobCare_data\\test.csv')
d_code=pd.read_csv('C:\\Users\\user\\Desktop\\computing\\outside\\dacon\\jobcare_recommend\\Jobcare_data\\속성_D_코드.csv',index_col=0).T.to_dict()
h_code=pd.read_csv('C:\\Users\\user\\Desktop\\computing\\outside\\dacon\\jobcare_recommend\\Jobcare_data\\속성_H_코드.csv',index_col=0).T.to_dict()
l_code=pd.read_csv('C:\\Users\\user\\Desktop\\computing\\outside\\dacon\\jobcare_recommend\\Jobcare_data\\속성_L_코드.csv',index_col=0).T.to_dict()

In [164]:
#데이터 재가공
def add_code(df, d_code, h_code, l_code):
    df = df.copy()   

    # D Code
    df['person_prefer_d_1_n'] = df['person_prefer_d_1'].apply(lambda x: d_code[x]['속성 D 세분류코드'])
    df['person_prefer_d_1_s'] = df['person_prefer_d_1'].apply(lambda x: d_code[x]['속성 D 소분류코드'])
    df['person_prefer_d_1_m'] = df['person_prefer_d_1'].apply(lambda x: d_code[x]['속성 D 중분류코드'])
    df['person_prefer_d_1_l'] = df['person_prefer_d_1'].apply(lambda x: d_code[x]['속성 D 대분류코드'])

    df['person_prefer_d_2_n'] = df['person_prefer_d_2'].apply(lambda x: d_code[x]['속성 D 세분류코드'])
    df['person_prefer_d_2_s'] = df['person_prefer_d_2'].apply(lambda x: d_code[x]['속성 D 소분류코드'])
    df['person_prefer_d_2_m'] = df['person_prefer_d_2'].apply(lambda x: d_code[x]['속성 D 중분류코드'])
    df['person_prefer_d_2_l'] = df['person_prefer_d_2'].apply(lambda x: d_code[x]['속성 D 대분류코드'])

    df['person_prefer_d_3_n'] = df['person_prefer_d_3'].apply(lambda x: d_code[x]['속성 D 세분류코드'])
    df['person_prefer_d_3_s'] = df['person_prefer_d_3'].apply(lambda x: d_code[x]['속성 D 소분류코드'])
    df['person_prefer_d_3_m'] = df['person_prefer_d_3'].apply(lambda x: d_code[x]['속성 D 중분류코드'])
    df['person_prefer_d_3_l'] = df['person_prefer_d_3'].apply(lambda x: d_code[x]['속성 D 대분류코드'])

    df['contents_attribute_d_n'] = df['contents_attribute_d'].apply(lambda x: d_code[x]['속성 D 세분류코드'])
    df['contents_attribute_d_s'] = df['contents_attribute_d'].apply(lambda x: d_code[x]['속성 D 소분류코드'])
    df['contents_attribute_d_m'] = df['contents_attribute_d'].apply(lambda x: d_code[x]['속성 D 중분류코드'])
    df['contents_attribute_d_l'] = df['contents_attribute_d'].apply(lambda x: d_code[x]['속성 D 대분류코드'])

    # H Code
    df['person_prefer_h_1_m'] = df['person_prefer_h_1'].apply(lambda x: h_code[x]['속성 H 중분류코드'])
    df['person_prefer_h_2_m'] = df['person_prefer_h_2'].apply(lambda x: h_code[x]['속성 H 중분류코드'])
    df['person_prefer_h_3_m'] = df['person_prefer_h_3'].apply(lambda x: h_code[x]['속성 H 중분류코드'])
    df['contents_attribute_h_m'] = df['contents_attribute_h'].apply(lambda x: h_code[x]['속성 H 중분류코드'])

    df['person_prefer_h_1_l'] = df['person_prefer_h_1'].apply(lambda x: h_code[x]['속성 H 대분류코드'])
    df['person_prefer_h_2_l'] = df['person_prefer_h_2'].apply(lambda x: h_code[x]['속성 H 대분류코드'])
    df['person_prefer_h_3_l'] = df['person_prefer_h_3'].apply(lambda x: h_code[x]['속성 H 대분류코드'])
    df['contents_attribute_h_l'] = df['contents_attribute_h'].apply(lambda x: h_code[x]['속성 H 대분류코드'])

    # L Code
    df['contents_attribute_l_n'] = df['contents_attribute_l'].apply(lambda x: l_code[x]['속성 L 세분류코드'])
    df['contents_attribute_l_s'] = df['contents_attribute_l'].apply(lambda x: l_code[x]['속성 L 소분류코드'])
    df['contents_attribute_l_m'] = df['contents_attribute_l'].apply(lambda x: l_code[x]['속성 L 중분류코드'])
    df['contents_attribute_l_l'] = df['contents_attribute_l'].apply(lambda x: l_code[x]['속성 L 대분류코드'])
    
    return df


train = add_code(train, d_code, h_code, l_code)
test = add_code(test, d_code, h_code, l_code)

In [165]:
class LogisticRegression(nn.Module):

    def __init__(self):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Sequential(
            nn.Linear(57,16),
            nn.BatchNorm1d(16),
            nn.ReLU(),
            nn.Linear(16, 1)
        )        

    def forward(self, x):
        return self.linear(x)

In [166]:
model=LogisticRegression()


In [167]:
train = train.drop(['id', 'contents_open_dt','person_rn', 'contents_rn'], axis=1) 
test = test.drop(['id', 'contents_open_dt','person_rn', 'contents_rn'], axis=1)

# print(train.head())
train['d_l_match_yn']=train['d_l_match_yn'].replace([True,False],[1,0])
train['d_m_match_yn']=train['d_m_match_yn'].replace([True,False],[1,0])
train['d_s_match_yn']=train['d_s_match_yn'].replace([True,False],[1,0])
train['h_l_match_yn']=train['h_l_match_yn'].replace([True,False],[1,0])
train['h_m_match_yn']=train['h_m_match_yn'].replace([True,False],[1,0])
train['h_s_match_yn']=train['h_s_match_yn'].replace([True,False],[1,0])

test['d_l_match_yn']=test['d_l_match_yn'].replace([True,False],[1,0])
test['d_m_match_yn']=test['d_m_match_yn'].replace([True,False],[1,0])
test['d_s_match_yn']=test['d_s_match_yn'].replace([True,False],[1,0])
test['h_l_match_yn']=test['h_l_match_yn'].replace([True,False],[1,0])
test['h_m_match_yn']=test['h_m_match_yn'].replace([True,False],[1,0])
test['h_s_match_yn']=test['h_s_match_yn'].replace([True,False],[1,0])

In [168]:
#컬럼 순서 변경
train.columns
train=train[['d_l_match_yn', 'd_m_match_yn', 'd_s_match_yn', 'h_l_match_yn',
       'h_m_match_yn', 'h_s_match_yn', 'person_attribute_a',
       'person_attribute_a_1', 'person_attribute_b', 'person_prefer_c',
       'person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3',
       'person_prefer_e', 'person_prefer_f', 'person_prefer_g',
       'person_prefer_h_1', 'person_prefer_h_2', 'person_prefer_h_3',
       'contents_attribute_i', 'contents_attribute_a',
       'contents_attribute_j_1', 'contents_attribute_j',
       'contents_attribute_c', 'contents_attribute_k', 'contents_attribute_l',
       'contents_attribute_d', 'contents_attribute_m', 'contents_attribute_e',
       'contents_attribute_h', 'person_prefer_d_1_n',
       'person_prefer_d_1_s', 'person_prefer_d_1_m', 'person_prefer_d_1_l',
       'person_prefer_d_2_n', 'person_prefer_d_2_s', 'person_prefer_d_2_m',
       'person_prefer_d_2_l', 'person_prefer_d_3_n', 'person_prefer_d_3_s',
       'person_prefer_d_3_m', 'person_prefer_d_3_l', 'contents_attribute_d_n',
       'contents_attribute_d_s', 'contents_attribute_d_m',
       'contents_attribute_d_l', 'person_prefer_h_1_m', 'person_prefer_h_2_m',
       'person_prefer_h_3_m', 'contents_attribute_h_m', 'person_prefer_h_1_l',
       'person_prefer_h_2_l', 'person_prefer_h_3_l', 'contents_attribute_h_l',
       'contents_attribute_l_n', 'contents_attribute_l_s',
       'contents_attribute_l_m', 'contents_attribute_l_l','target']]

test=test[['d_l_match_yn', 'd_m_match_yn', 'd_s_match_yn', 'h_l_match_yn',
       'h_m_match_yn', 'h_s_match_yn', 'person_attribute_a',
       'person_attribute_a_1', 'person_attribute_b', 'person_prefer_c',
       'person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3',
       'person_prefer_e', 'person_prefer_f', 'person_prefer_g',
       'person_prefer_h_1', 'person_prefer_h_2', 'person_prefer_h_3',
       'contents_attribute_i', 'contents_attribute_a',
       'contents_attribute_j_1', 'contents_attribute_j',
       'contents_attribute_c', 'contents_attribute_k', 'contents_attribute_l',
       'contents_attribute_d', 'contents_attribute_m', 'contents_attribute_e',
       'contents_attribute_h', 'person_prefer_d_1_n',
       'person_prefer_d_1_s', 'person_prefer_d_1_m', 'person_prefer_d_1_l',
       'person_prefer_d_2_n', 'person_prefer_d_2_s', 'person_prefer_d_2_m',
       'person_prefer_d_2_l', 'person_prefer_d_3_n', 'person_prefer_d_3_s',
       'person_prefer_d_3_m', 'person_prefer_d_3_l', 'contents_attribute_d_n',
       'contents_attribute_d_s', 'contents_attribute_d_m',
       'contents_attribute_d_l', 'person_prefer_h_1_m', 'person_prefer_h_2_m',
       'person_prefer_h_3_m', 'contents_attribute_h_m', 'person_prefer_h_1_l',
       'person_prefer_h_2_l', 'person_prefer_h_3_l', 'contents_attribute_h_l',
       'contents_attribute_l_n', 'contents_attribute_l_s',
       'contents_attribute_l_m', 'contents_attribute_l_l']]


In [204]:
torch.manual_seed(42)


train.shape
test.shape

(46404, 58)

In [201]:

x = train.iloc[0:1000, :-1]
y = train.iloc[0:1000, -1]




x= x.to_numpy()
print(x[0])
print(y[0])
x=np.where(x > 1000, x/1000, x)
x=np.where(x > 100, x/100, x)
x=np.where(x > 10, x/10, x)
x=x/100
print(x)

x_train = torch.FloatTensor(x)
y_train = torch.FloatTensor(y)



x_test=test.iloc[0:1000]
x_test=x_test.to_numpy()
x_test=np.where(x_test > 1000, x_test/1000, x_test)
x_test=np.where(x_test > 100, x_test/100, x_test)
x_test=np.where(x_test > 10, x_test/10, x_test)
x_test=x_test/100


x_test=torch.FloatTensor(x_test)

[   1    1    1    0    0    0    1    4    3    5  275  370  369    8
    1    1    4   95   59    3    3   10    2    1    2 1608  275    1
    4  139  275  274  274  216  369  368  297  216  369  368  297  216
  275  274  274  216  316  398  368  422    3   94   58   94 1607 1606
 1605 2016]
1
[[0.01    0.01    0.01    ... 0.01606 0.01605 0.02016]
 [0.      0.      0.      ... 0.01606 0.01605 0.02016]
 [0.      0.      0.      ... 0.01595 0.01572 0.02016]
 ...
 [0.      0.      0.      ... 0.0973  0.0954  0.02009]
 [0.      0.      0.      ... 0.0973  0.0954  0.02009]
 [0.01    0.      0.      ... 0.099   0.099   0.02006]]


AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [182]:



print(x_train.shape)
print(y_train.shape)



torch.Size([1000, 58])
torch.Size([1000])


In [183]:
W = torch.rand((58, 1), requires_grad=True) # 크기는 2 x 1
b = torch.rand(1, requires_grad=True)
x_train

tensor([[0.0100, 0.0100, 0.0100,  ..., 0.0161, 0.0160, 0.0202],
        [0.0000, 0.0000, 0.0000,  ..., 0.0161, 0.0160, 0.0202],
        [0.0000, 0.0000, 0.0000,  ..., 0.0159, 0.0157, 0.0202],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0973, 0.0954, 0.0201],
        [0.0000, 0.0000, 0.0000,  ..., 0.0973, 0.0954, 0.0201],
        [0.0100, 0.0000, 0.0000,  ..., 0.0990, 0.0990, 0.0201]])

In [184]:
# hypothesis = 1 / (1 + torch.exp(-(x_train.matmul(W) + b)))
# print(x_train.matmul(W)+b)


print(x_train.matmul(W) + b)
torch.sigmoid(x_train.matmul(W) + b)


tensor([[0.9572],
        [0.8762],
        [1.1170],
        [1.2399],
        [0.9132],
        [0.8714],
        [1.1992],
        [0.9310],
        [1.1980],
        [1.1480],
        [1.0006],
        [1.1765],
        [1.2865],
        [1.1089],
        [1.0401],
        [1.0921],
        [1.4107],
        [1.2262],
        [1.5264],
        [0.8738],
        [0.9814],
        [0.9345],
        [0.8951],
        [0.7838],
        [1.1006],
        [1.0155],
        [1.1288],
        [0.8154],
        [1.1915],
        [0.9074],
        [1.0296],
        [1.3976],
        [0.9757],
        [0.8504],
        [0.9305],
        [1.2719],
        [0.8986],
        [1.0592],
        [1.6408],
        [1.2990],
        [1.2355],
        [1.2788],
        [1.2437],
        [1.6044],
        [0.8252],
        [0.8839],
        [1.0289],
        [0.9457],
        [0.8230],
        [0.8452],
        [0.7863],
        [1.2980],
        [0.8020],
        [1.1967],
        [1.0626],
        [0

tensor([[0.7226],
        [0.7060],
        [0.7534],
        [0.7755],
        [0.7137],
        [0.7050],
        [0.7684],
        [0.7173],
        [0.7682],
        [0.7591],
        [0.7312],
        [0.7643],
        [0.7836],
        [0.7519],
        [0.7389],
        [0.7488],
        [0.8039],
        [0.7732],
        [0.8215],
        [0.7055],
        [0.7274],
        [0.7180],
        [0.7099],
        [0.6865],
        [0.7504],
        [0.7341],
        [0.7556],
        [0.6933],
        [0.7670],
        [0.7125],
        [0.7368],
        [0.8018],
        [0.7263],
        [0.7006],
        [0.7172],
        [0.7811],
        [0.7107],
        [0.7425],
        [0.8376],
        [0.7857],
        [0.7748],
        [0.7822],
        [0.7762],
        [0.8326],
        [0.6953],
        [0.7076],
        [0.7367],
        [0.7203],
        [0.6949],
        [0.6996],
        [0.6870],
        [0.7855],
        [0.6904],
        [0.7679],
        [0.7432],
        [0

In [185]:
# optimizer 설정
optimizer = optim.SGD([W, b], lr=0.001)

nb_epochs = 1000
for epoch in range(nb_epochs + 1):

    # Cost 계산
    hypothesis = torch.sigmoid(x_train.matmul(W) + b)
    cost = -(y_train * torch.log(hypothesis) + 
             (1 - y_train) * torch.log(1 - hypothesis)).mean()

    # cost로 H(x) 개선
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()

    # 100번마다 로그 출력
    if epoch % 100 == 0:
        print('Epoch {:4d}/{} Cost: {:.6f}'.format(
            epoch, nb_epochs, cost.item()
        ))

Epoch    0/1000 Cost: 0.949955
Epoch  100/1000 Cost: 0.937104
Epoch  200/1000 Cost: 0.924774
Epoch  300/1000 Cost: 0.912951
Epoch  400/1000 Cost: 0.901622
Epoch  500/1000 Cost: 0.890774
Epoch  600/1000 Cost: 0.880392
Epoch  700/1000 Cost: 0.870464
Epoch  800/1000 Cost: 0.860974
Epoch  900/1000 Cost: 0.851909
Epoch 1000/1000 Cost: 0.843253


In [186]:
hypothesis = torch.sigmoid(x_train.matmul(W) + b)
print(hypothesis)

tensor([[0.6503],
        [0.6323],
        [0.6855],
        [0.7107],
        [0.6412],
        [0.6318],
        [0.7027],
        [0.6445],
        [0.7011],
        [0.6907],
        [0.6603],
        [0.6967],
        [0.7190],
        [0.6834],
        [0.6683],
        [0.6789],
        [0.7427],
        [0.7068],
        [0.7636],
        [0.6321],
        [0.6566],
        [0.6452],
        [0.6363],
        [0.6114],
        [0.6819],
        [0.6627],
        [0.6865],
        [0.6180],
        [0.7012],
        [0.6402],
        [0.6660],
        [0.7404],
        [0.6550],
        [0.6267],
        [0.6448],
        [0.7168],
        [0.6378],
        [0.6728],
        [0.7838],
        [0.7227],
        [0.7086],
        [0.7187],
        [0.7110],
        [0.7781],
        [0.6206],
        [0.6343],
        [0.6655],
        [0.6482],
        [0.6203],
        [0.6260],
        [0.6119],
        [0.7211],
        [0.6145],
        [0.7013],
        [0.6732],
        [0

In [187]:
prediction = hypothesis >= torch.FloatTensor([0.5])
print(prediction)

tensor([[True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        [True],
        

In [191]:
prediction=pd.DataFrame(prediction)
prediction=prediction.replace([True,False],[1,0])
prediction

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
...,...
995,1
996,1
997,1
998,1


In [203]:
hypothesis2 = torch.sigmoid(x_test.matmul(W) + b)
print(hypothesis2)

tensor([[0.7117],
        [0.6638],
        [0.6497],
        [0.6189],
        [0.6085],
        [0.6206],
        [0.7385],
        [0.6414],
        [0.6605],
        [0.7508],
        [0.6485],
        [0.7712],
        [0.6543],
        [0.6632],
        [0.6357],
        [0.6130],
        [0.6533],
        [0.6229],
        [0.6282],
        [0.6496],
        [0.6451],
        [0.7149],
        [0.6769],
        [0.7030],
        [0.6205],
        [0.6871],
        [0.6521],
        [0.6412],
        [0.6882],
        [0.6632],
        [0.6466],
        [0.6290],
        [0.6302],
        [0.6985],
        [0.7007],
        [0.6262],
        [0.6527],
        [0.6621],
        [0.7332],
        [0.6838],
        [0.6337],
        [0.6773],
        [0.6660],
        [0.6788],
        [0.7306],
        [0.6803],
        [0.6486],
        [0.6133],
        [0.6479],
        [0.7316],
        [0.7215],
        [0.6476],
        [0.6264],
        [0.6648],
        [0.6645],
        [0

In [None]:
prediction2 = hypothesis2 >= torch.FloatTensor([0.5])
print(prediction2)

prediction2=pd.DataFrame(prediction2)
prediction2=prediction2.replace([True,False],[1,0])
prediction2