In [None]:
!pip install -U deepctr-torch
!git clone https://github.com/jaeyoung-kang/career_recommendation.git

# import

In [None]:
import sys

# Make the cloned repository importable so the `src.*` imports used later in
# this notebook resolve. The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path.
REPO_DIR = './career_recommendation'
if REPO_DIR not in sys.path:
    sys.path.append(REPO_DIR)

In [None]:
import numpy as np
import pandas as pd
import torch

from sklearn.metrics import confusion_matrix

from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from deepctr_torch.inputs import SparseFeat, VarLenSparseFeat, get_feature_names
from deepctr_torch.models import DeepFM

In [None]:
from src.utils.major_cleaner import MajorCleaner

In [None]:
from src.utils.label_encoder import MultiFeatureLabelEncoder
from src.utils.label_encoder import VariableLenghthLabelEncoder
from src.dataset.augmentation import make_binary_target

# data load

In [None]:
# Load the school records from CSV and order the rows by their `id` column.
data = (
    pd.read_csv('./data/school.csv')
    .sort_values(by='id')
)

In [None]:
# Run the raw major names through the project's MajorCleaner and write the
# result back into the same column.
# NOTE(review): the exact effect of `split_section=True` is defined in
# src.utils.major_cleaner — confirm there.
major_cleaner = MajorCleaner(split_section=True)
cleaned_major_names = major_cleaner.transform(data['school_major_name'])
data['school_major_name'] = cleaned_major_names


# binary feature 변환

In [None]:
# Split the comma-separated `field` column into lists, emit one row per list
# element, and hand the exploded frame to the project helper.
# NOTE(review): make_binary_target presumably adds the `target` column used
# for training — confirm in src.dataset.augmentation.
# NOTE(review): explode() keeps the original index, so index labels repeat
# after this step.
data = make_binary_target(
    data.assign(field=data['field'].str.split(',')).explode('field'),
    'field',
)

# feature 설정

In [None]:
# Feature configuration for the school dataset.

# Column whose cells hold a comma-separated, variable-length list of values.
variable_length_sparse_feature = 'school_major_name'

# Single-valued categorical columns fed to the model as sparse features.
sparse_features = [
    "id",
    "school_name",
    "school_major_state",
    "school_major_level",
    "school_start",
    'school_end',
    'school_state',
    'field',
]

# Label column(s) used for fitting and evaluation.
target = ['target']


# train-test 나누기

In [None]:
# Positional split without shuffling: the first 80% of the rows form the
# training set, the remaining rows form the test set.
train_len = int(0.8 * len(data))
train_data, test_data = data.iloc[:train_len], data.iloc[train_len:]

# LabelEncoder 적용

## MultiFeatureLabelEncoder

In [None]:
# Fit the project's multi-column label encoder on the training rows only,
# then encode those same rows.
# NOTE(review): presumably maps every column listed in `sparse_features` to
# integer ids — confirm in src.utils.label_encoder.
multi_encoder = MultiFeatureLabelEncoder()

multi_encoder.fit(train_data, sparse_features)
train = multi_encoder.transform(train_data)

## VariableLenghthLabelEncoder

In [None]:
# Encode the variable-length major column: every cell is split on ',' and the
# resulting token lists go through the project's variable-length label
# encoder, which is fitted on the training rows only.
variable_encoder = VariableLenghthLabelEncoder()
variable_encoder.fit(
    train[variable_length_sparse_feature].str.split(',')
)
train[variable_length_sparse_feature] = variable_encoder.transform(
    train[variable_length_sparse_feature].str.split(',')
)

# Length of each encoded sequence; the longest one fixes the padded width.
# (`genres_length` holds the lengths of the encoded major lists.)
genres_length = np.array(
    [len(seq) for seq in train[variable_length_sparse_feature]]
)
max_len = genres_length.max()

# Pad every encoded sequence at the end ('post') up to `max_len`.
# NOTE(review): pad_sequences fills with 0 by default — confirm the encoder
# does not also use 0 for a real category.
train_major_list = pad_sequences(
    train[variable_length_sparse_feature],
    maxlen=max_len,
    padding='post',
)

# Feature 생성

In [None]:
# Fixed-length categorical features: one 4-dimensional embedding per column.
# The vocabulary size is the largest encoded id seen in training plus 2.
# NOTE(review): the extra slot presumably leaves room for an "unseen value"
# id produced by MultiFeatureLabelEncoder — confirm.
fixlen_feature_columns = [
    SparseFeat(feat, train[feat].max() + 2, embedding_dim=4)
    for feat in sparse_features
]

# Variable-length feature: the padded major sequences, combined with 'mean'.
# The vocabulary size is taken from `train_major_list`, the padded training
# array (the name `train_genres_list` does not exist in this notebook).
# NOTE(review): if the variable-length encoder can emit an id larger than the
# training maximum for majors unseen in training, `+ 1` is too small — confirm.
varlen_feature_columns = [
    VarLenSparseFeat(
        SparseFeat(
            variable_length_sparse_feature,
            vocabulary_size=train_major_list.max() + 1,
            embedding_dim=4,
        ),
        maxlen=max_len,
        combiner='mean',
    ),
]

# The same columns are used for both the linear and the DNN inputs.
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)


# 학습

In [None]:
# Training inputs keyed by feature name: one encoded column per sparse
# feature, plus the padded major matrix for the variable-length feature.
model_input = dict(
    (name, train[name]) for name in sparse_features
)
model_input.update({variable_length_sparse_feature: train_major_list})

In [None]:
# Choose the compute device: the first CUDA device when it is requested and
# available, otherwise the CPU.
use_cuda = True
device = 'cuda:0' if use_cuda and torch.cuda.is_available() else 'cpu'
if device == 'cuda:0':
    print('cuda ready...')

# DeepFM configured for a binary task on the selected device.
model = DeepFM(
    linear_feature_columns,
    dnn_feature_columns,
    task='binary',
    device=device,
)

# Adam optimizer, binary cross-entropy loss, accuracy reported as metric.
model.compile("adam", "binary_crossentropy", metrics=['accuracy'])

# Single epoch with batches of 256 rows and validation_split=0.2.
history = model.fit(
    model_input,
    train[target].values,
    batch_size=256,
    epochs=1,
    verbose=2,
    validation_split=0.2,
)


# 추론

In [None]:
# Encode the held-out rows with the encoder that was fitted on the training
# rows (no refitting on test data).
test = multi_encoder.transform(test_data)

In [None]:
# Encode the variable-length major column of the test rows with the encoder
# fitted on the training rows.
test_major_tokens = test[variable_length_sparse_feature].str.split(',')
test[variable_length_sparse_feature] = variable_encoder.transform(test_major_tokens)

# Pad at the end ('post') to the width determined on the training data.
test_major_list = pad_sequences(
    test[variable_length_sparse_feature],
    maxlen=max_len,
    padding='post',
)

In [None]:
# Test inputs keyed by feature name. This rebinds `model_input`, so from here
# on the name refers to the test inputs.
model_input = dict(
    (name, test[name]) for name in sparse_features
)
model_input.update({variable_length_sparse_feature: test_major_list})

In [None]:
# Run the fitted model on the current `model_input`.
# NOTE(review): presumably returns one probability per row — the 0.5
# threshold applied afterwards treats the values that way; confirm.
predict = model.predict(model_input)

In [None]:
# Binarise the predictions in place at a 0.5 threshold:
# values below 0.5 become 0, values of 0.5 or more become 1.
below_threshold = predict < 0.5
at_or_above_threshold = predict >= 0.5
predict[below_threshold] = 0
predict[at_or_above_threshold] = 1

In [None]:
# Confusion-matrix counts for the thresholded predictions.
# `labels=[0, 1]` pins the matrix to 2x2 even when only one class occurs in
# the targets and predictions; without it the four-way unpack fails.
# NOTE(review): assumes the target column is encoded as 0/1 — confirm.
tn, fp, fn, tp = confusion_matrix(
    test[target].values, predict, labels=[0, 1]
).ravel()

In [None]:
# Accuracy: correct predictions (true negatives + true positives) divided by
# the total number of evaluated rows.
(tn + tp) / (tn + fp + fn + tp)