In [None]:
# !pip install -U deepctr-torch
# !git clone https://github.com/jaeyoung-kang/career_recommendation.git

# import sys
# sys.path.append('./career_recommendation')

In [None]:
import sys
sys.path.append('..')

In [None]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics import confusion_matrix

from src.model import DeepFMTrainer
from src.utils import MajorCleaner
from src.dataset.augmentation import make_binary_target

In [None]:
# Location of the merged dataset, relative to the notebook's working directory.
# The repository root is one level up (same assumption as sys.path.append('..')),
# so the notebook is no longer tied to a single machine's home directory.
data_path = '../data/merge_data.csv'
epochs = 3  # number of training epochs handed to deepfm.fit
target_col = 'career_task'  # column cleaned in preproc and passed to make_binary_target

# Data Load

In [None]:
# Read the merged CSV and order its rows by the `id` column.
data = pd.read_csv(data_path).sort_values('id')

# Data Split

In [None]:
# Shuffle the row order before the train/test split.
# A dedicated, seeded RNG makes the permutation (and therefore the split)
# identical on every Restart & Run All, without touching the global `random` state.
shuffle_seed = 42
index_list = list(range(len(data)))
random.Random(shuffle_seed).shuffle(index_list)

data = data.iloc[index_list]

In [None]:
# Row-wise 80/20 split of the shuffled frame: the leading 80% of rows are used
# for training, the remainder (put back into index order) is held out.
# NOTE(review): the split is per row, so rows sharing an `id` may end up on
# both sides — confirm that is acceptable for the per-id evaluation below.
train_ratio = 0.8
train_len = int(len(data) * train_ratio)
train_data, test_data = (
    data.iloc[:train_len],
    data.iloc[train_len:].sort_index(),
)

# Data Preprocessing

In [None]:
def preproc(data, target_col, positive_ratio=0.5):
    """Prepare a raw frame for the DeepFM trainer.

    The input is copied first, so the assignments made here do not touch the
    caller's frame.

    Steps:
      1. Replace missing values in ``target_col`` with ``''`` and strip
         surrounding whitespace (naive cleaning).
      2. Hand the frame to the project helper ``make_binary_target``
         (presumably this adds the binary ``label`` column — TODO confirm).
      3. Split the comma-separated ``school_major_name`` and ``skill`` strings
         into lists.

    Args:
        data: DataFrame containing ``target_col``, ``school_major_name`` and
            ``skill`` columns.
        target_col: Name of the column to clean and pass to
            ``make_binary_target``.
        positive_ratio: Forwarded unchanged to ``make_binary_target``.

    Returns:
        The frame returned by ``make_binary_target`` with the two
        comma-separated columns converted to lists.
    """
    frame = data.copy()
    frame[target_col] = frame[target_col].fillna('').str.strip()

    labelled = make_binary_target(
        frame,
        target_col,
        positive_ratio=positive_ratio,
    )

    # Variable-length features are stored as comma-separated strings.
    for list_col in ('school_major_name', 'skill'):
        labelled[list_col] = labelled[list_col].str.split(',')

    return labelled

In [None]:
# Training rows use preproc's default positive_ratio (0.5).
train_data = preproc(train_data, target_col=target_col)

# Held-out rows are preprocessed with positive_ratio=0.
# NOTE(review): the exact effect of positive_ratio is defined by
# make_binary_target — confirm 0 yields the intended evaluation candidates.
test_data = preproc(test_data, target_col=target_col, positive_ratio=0)

In [None]:
# List the columns present in the training frame after preprocessing.
train_data.columns

# Trainer

In [None]:
# Column(s) handed to DeepFMTrainer as the prediction target.
deepfm_target = ['label']
# Columns handed to the trainer as sparse features.
# Korean column names translate to: 가입자수 = number of subscribers,
# 당월고지금액 = amount billed this month; 기타 / 독일어 / 러시아어 / 베트남어 /
# 에스파냐어 / 영어 / 인도네시아어 / 일본어 / 중국어 / 프랑스어 = other / German /
# Russian / Vietnamese / Spanish / English / Indonesian / Japanese / Chinese / French.
# NOTE(review): several entries look numeric (e.g. career_period,
# career_sum_period) — confirm treating them as sparse features is intended.
# NOTE(review): 'career_task' is also `target_col`; presumably it is the
# candidate item being scored — confirm this is not label leakage.
sparse_features = [
    'career_name', 'career_start', 'career_end', 'career_period',
    'career_turn', 'career_sum_period', 'career_가입자수',
    'career_당월고지금액', 'certificate_name', 'certificate_year',
    'school_name', 'school_major_state',
    'school_major_level', 'school_start', 'school_end', 'school_state',
    '기타', '독일어', '러시아어', '베트남어', '에스파냐어', '영어', '인도네시아어', '일본어', '중국어',
    '프랑스어', 'award', 'accum_count', 'career_task',
]
# Columns that preproc splits on ',' into lists; passed to the trainer as
# variable-length features.
variable_length_sparse_features = [
    "school_major_name", 'skill'
]


In [None]:
# Instantiate the project's DeepFM trainer with the target column and the two
# feature lists.
deepfm = DeepFMTrainer(
    target=deepfm_target,
    sparse_features=sparse_features,
    variable_length_features=variable_length_sparse_features,
)

# Fit

In [None]:
# Train on the preprocessed training frame for the configured number of epochs.
deepfm.fit(train_data, epochs=epochs)

# Predict

In [None]:
# Run the fitted trainer on the preprocessed held-out frame.
predict = deepfm.predict(test_data)

In [None]:
# Store the model output next to the rows it was computed for
# (adds a 'predict' column to test_data in place).
test_data['predict'] = predict

## Evaluate

In [None]:
# For each id, find the index label of the row with the largest predict value
# and keep only those rows.
# NOTE(review): rows are selected by index label, so this assumes test_data's
# index is unique after preprocessing — confirm.
top_row_labels = test_data.groupby('id')['predict'].idxmax()
predict_data = test_data.loc[top_row_labels.tolist()]

In [None]:
# Display the top-scored row selected for each id.
predict_data

In [None]:
# Mean of `label` over the per-id top-scored rows; if label is 0/1 this is the
# share of ids whose best-scored row is a positive — TODO confirm label encoding.
predict_data['label'].mean()