In [None]:
# !pip install -U deepctr-torch
# !git clone https://github.com/jaeyoung-kang/career_recommendation.git

# import sys
# sys.path.append('./career_recommendation')

In [None]:
import sys
sys.path.append('..')

In [None]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics import confusion_matrix

from src.model import DeepFMTrainer
from src.dataset.augmentation import make_binary_target

# Notebook display settings: show every column and allow long cell text (up to 500 chars).
pd.options.display.max_columns = None
pd.set_option('display.max_colwidth', 500)

In [None]:
import os

# Run configuration.
# data_path: CSV with the merged career data. The location can be overridden with the
# CAREER_DATA_PATH environment variable so the notebook is not tied to one machine;
# when the variable is unset the original local path is used.
data_path = os.environ.get(
    'CAREER_DATA_PATH',
    '/Users/m/localspace/career_recommendation/data/merge_data.csv',
)
# epochs: configured epoch count. NOTE(review): the fit cell in this notebook passes a
# literal epoch count instead of this value — confirm which one is intended.
epochs = 3
# target_col: column that preproc() cleans and turns into a binary target.
target_col = 'career_task'

In [None]:
# Name of the binary target column handed to the trainer.
deepfm_target = ['label']

# Comma-separated, multi-valued columns (split into lists inside preproc()).
variable_length_sparse_features = ['school_major_name', 'skill']

# Single-valued categorical columns. 'school_name' is intentionally not included
# (it was commented out of the original list).
sparse_features = (
    [
        'career_turn',
        'career_sum_period',
        'school_major_state',
        'school_major_level',
        'school_state',
    ]
    # One column per language.
    + [
        '기타',
        '독일어',
        '러시아어',
        '베트남어',
        '에스파냐어',
        '영어',
        '인도네시아어',
        '일본어',
        '중국어',
        '프랑스어',
    ]
    + [
        'accum_count',
        'career_task',
        'enterprise_size',
    ]
)


# Data Load

In [None]:
# Read the merged CSV, then order the rows by id.
data = pd.read_csv(data_path)
data = data.sort_values('id')

# 추가 전처리

## enterprise_size

In [None]:
# Bucket each career row into an enterprise size based on `career_가입자수`
# (presumably the employer's subscriber/head count — TODO confirm the exact meaning).
subscriber_count = data['career_가입자수']
has_employer = data['career_name'].notna()

# Employer is named but no count is known -> startup.
data.loc[subscriber_count.isna() & has_employer, 'enterprise_size'] = '스타트업'
# More than 1000 -> large enterprise (this rule does not check career_name).
data.loc[subscriber_count > 1000, 'enterprise_size'] = '대기업'
# At most 1000 with a named employer -> small/medium enterprise.
data.loc[(subscriber_count <= 1000) & has_employer, 'enterprise_size'] = '중소기업'


## accum_count : 누적 award 수

In [None]:
# accum_count is the cumulative award count; a missing value is treated as zero awards.
data['accum_count'] = data['accum_count'].fillna(0)

In [None]:
# Collapse rows that repeat the same (id, career_name, field, career_task) combination,
# keeping the last occurrence of each.
data = data.drop_duplicates(subset=['id', 'career_name', 'field', 'career_task'], keep='last')

## 빈 career task <- field 대체

In [None]:
# Where career_task is missing, fall back to the value of `field` on the same row.
data['career_task'] = data['career_task'].fillna(data['field'])

## Drop duplicated feature rows

In [None]:
# Drop rows whose model features are all identical, keeping the first occurrence.
# `id` is not part of the subset, so identical feature rows from different ids are
# collapsed into one. NOTE(review): this runs before the train/test split — confirm
# that removing rows across ids is intended.
data = data.drop_duplicates(subset=sparse_features + variable_length_sparse_features)

# Data Split

In [None]:
import os

# Location of the pickled collection of held-out ids. It can be overridden with the
# CAREER_TEST_ID_PATH environment variable; when unset the original local path is used.
test_id_path = os.environ.get('CAREER_TEST_ID_PATH', '/Users/m/Downloads/test_id.pkl')

# NOTE: unpickling can execute arbitrary code — only load a file from a trusted source.
test_id = pd.read_pickle(test_id_path)

In [None]:
# Rows whose id appears in test_id form the test split; all other rows are training data.
# Both splits are put back into index order.
is_test_row = data['id'].isin(test_id)
train_data = data.loc[~is_test_row].sort_index()
test_data = data.loc[is_test_row].sort_index()

# Data Preprocessing

In [None]:
def preproc(
    data,
    target_col,
    positive_ratio=0.5
):
    """Build the model-ready frame with a combined binary ``label`` column.

    Steps, in order:
      1. Work on a copy; fill missing ``target_col`` values with '' and strip whitespace.
      2. Call ``make_binary_target`` for ``target_col`` and then for ``enterprise_size``
         (candidate values 대기업 / 중소기업 / 스타트업), then drop fully duplicated rows.
         NOTE(review): ``make_binary_target`` is defined in src.dataset.augmentation and
         is not visible here; it presumably adds a ``<column>_label`` column and uses
         ``positive_ratio`` to control how many positive rows are kept — confirm there.
      3. ``label`` = ``<target_col>_label`` * ``enterprise_size_label``, i.e. 1 only when
         both labels are 1 (assuming 0/1 labels). The two intermediate label columns
         are kept in the result.
      4. Split every column listed in the notebook-level
         ``variable_length_sparse_features`` on ',' into lists.
      5. Reset the index to 0..n-1.

    Args:
        data: DataFrame containing ``target_col``, ``enterprise_size`` and the
            variable-length feature columns. It is not modified.
        target_col: name of the first target column.
        positive_ratio: forwarded unchanged to both ``make_binary_target`` calls.

    Returns:
        A new DataFrame with the added label columns.
    """
    data = data.copy()
    data[target_col] = data[target_col].fillna('').str.strip()  # naive preprocessing

    data = make_binary_target(
        data,
        target_col,
        positive_ratio=positive_ratio,
    )
    data = make_binary_target(
        data,
        target_col='enterprise_size',
        target_lst=['대기업', '중소기업', '스타트업'],
        positive_ratio=positive_ratio,
    ).drop_duplicates()

    # Derive the label column name from target_col instead of hard-coding
    # 'career_task_label', so the function also works for another target column.
    data['label'] = data[f'{target_col}_label'] * data['enterprise_size_label']

    # Multi-valued features are stored as comma-separated strings; turn them into lists.
    for col in variable_length_sparse_features:
        data[col] = data[col].str.split(',')

    return data.reset_index(drop=True)

In [None]:
# Training split: preproc() with its default positive_ratio (0.5).
train_data = preproc(
    train_data,
    target_col=target_col,
)

# Test split: same pipeline but with positive_ratio=0.
# NOTE(review): the effect of positive_ratio=0 is decided inside make_binary_target,
# which is not visible in this notebook — confirm it yields the intended candidate rows.
test_data = preproc(
    test_data,
    target_col=target_col,
    positive_ratio=0,
)

In [None]:
# Inspect the single-valued categorical feature columns of the prepared training frame.
train_data.loc[:, sparse_features]

In [None]:
# Inspect the skill column after preproc() has split it on commas.
train_data['skill']

## balanced train data

In [None]:
# Size of the training frame before class balancing.
train_data.shape

In [None]:
# Balance the classes: keep every positive row and draw the same number of negatives.
positive_index = train_data.index[train_data['label'] == 1]
negative_pool = train_data.index[train_data['label'] == 0]
n_count = len(positive_index)

# np.random.choice samples WITH replacement by default, which would duplicate some
# negatives even when enough distinct ones exist. Sample without replacement and only
# fall back to replacement when there are fewer negatives than positives.
# No random seed is set, so the selected negatives differ between runs.
negative_index = np.random.choice(
    negative_pool,
    n_count,
    replace=len(negative_pool) < n_count,
)
train_data = train_data.loc[np.concatenate([positive_index, negative_index])]


In [None]:
# Size of the training frame after class balancing.
train_data.shape

# Trainer

In [None]:
# Create the project's DeepFM trainer with the target column and the two feature groups
# defined in the feature-list cell.
deepfm = DeepFMTrainer(
    target=deepfm_target,
    sparse_features=sparse_features,
    variable_length_features=variable_length_sparse_features,
)

# Fit

In [None]:
# Train on the balanced training frame.
# NOTE(review): the epoch count is hard-coded to 10; the `epochs` variable from the
# configuration cell (set to 3) is commented out here and therefore unused —
# confirm which value is intended.
deepfm.fit(
    train_data, 
    epochs=10#epochs,
)

# Predict

In [None]:
# Score every row of the preprocessed test frame with the trained trainer.
predict = deepfm.predict(
    test_data,
)

In [None]:
# Store the scores next to the rows they were computed for
# (assumes predict has one value per test_data row, in row order — TODO confirm).
test_data['predict'] = predict

In [None]:
# test_data.iloc[:50].loc[:, sparse_features+variable_length_sparse_features+['predict', 'label']]

## Evaluate

In [None]:
# Keep one row per (id, career_task, enterprise_size) candidate — the last occurrence.
predict_data = test_data.drop_duplicates(subset=['id', 'career_task', 'enterprise_size'], keep='last')


In [None]:
# id별로 predict 값이 가장 큰 index 
predict_data = predict_data.loc[
    predict_data.groupby('id')['predict'].nlargest(5).reset_index()['level_1'].tolist(),
#     sparse_features + variable_length_sparse_features+['id', 'label', 'predict'],
]

In [None]:
# Shuffle the kept rows and renumber them 0..n-1.
# (Removed a stray trailing `x` that made this cell a SyntaxError.)
predict_data = predict_data.sample(frac=1).reset_index(drop=True)

In [None]:
# Reduce to one row per id: the first row holding that id's maximum label. With 0/1
# labels this is a label-1 row whenever any of the id's remaining rows has label 1.
predict_data = predict_data.loc[predict_data.groupby('id')['label'].idxmax().tolist()]

In [None]:
# Mean label over the one-row-per-id frame, i.e. the share of ids whose kept row has
# label 1 (assuming 0/1 labels).
predict_data['label'].mean()

In [None]:
# Sum of the labels over the one-row-per-id frame (the number of ids with label 1,
# assuming 0/1 labels).
predict_data['label'].sum()

In [None]:
# Inspect the skill column of the full (pre-split) frame, still as comma-separated strings.
data['skill']

In [None]:
# dill is imported under the name `pickle`, so the calls below use dill, not the
# standard-library pickle module.
import dill as pickle

# Serialize the trainer object to model.pkl in the current working directory.
with open('model.pkl', 'wb') as f:
    pickle.dump(deepfm, f)

# Test

In [None]:
# dill is imported under the name `pickle` (same alias as in the save cell).
import dill as pickle
# NOTE(review): this reads 'deepfm_model.pkl', while the save cell in this notebook
# writes 'model.pkl' — confirm that loading a different file is intended.
# Unpickling can execute arbitrary code; only load files from a trusted source.
with open('deepfm_model.pkl', 'rb') as f:
    deepfm = pickle.load(f)

In [None]:
# Run the trainer's test() helper for a single hand-written profile.
deepfm.test(
    career_turn=0, # number of job changes
    career_sum_period=10.0, # total employment period
    certificate_name='컴퓨터활용능력1급', # most recently obtained certificate
    school_name='숙명여자대학교', # school name
    school_major_name='통계학과', # major name
    school_major_state='전공', # one of ['전공', '부전공', '복수전공', '연합전공']
    school_major_level='학사', # one of ['학사', '석사', '전문학사', '박사', '수료']
    school_state='졸업', # one of ['졸업', '재학', '중퇴', '휴학', '교환학생', '수료']
    skill='AI,Data Science,Machine Learning', # the person's skills, joined with commas
    기타=None, # language proficiency, one of ['중상급(업무상 원활한 의사소통)', '고급(자유자재의 의사소통)', '초급', '중급(업무상 의사소통 가능)', '원어민 수준']
    독일어=None, 
    러시아어=None, 
    베트남어=None, 
    에스파냐어=None, 
    영어='중급(업무상 의사소통 가능)', 
    인도네시아어=None, 
    일본어=None, 
    중국어=None,
    프랑스어=None, 
    accum_count=0, # number of awards
)

# Lime

In [None]:
import lime
import lime.lime_tabular

In [None]:
def to_np_data(deepfm, data, feature_names):
    """Encode the selected feature columns and return them as a float array.

    Args:
        deepfm: object exposing ``multi_label_encoder`` with an ``_encoders`` mapping
            (feature name -> fitted encoder with ``classes_``) and a
            ``uni_feature_transform(values, feature)`` method.
        data: DataFrame holding at least the columns in ``feature_names``; it is
            copied first, so the caller's frame is left untouched.
        feature_names: ordered list of columns to encode.

    Returns:
        numpy float array with one column per entry of ``feature_names``, each
        replaced by the output of ``uni_feature_transform`` for that feature.
    """
    matrix = data.copy().loc[:, feature_names].to_numpy()

    multi_encoder = deepfm.multi_label_encoder
    fitted_encoders = multi_encoder._encoders

    # Class labels per column position; collected but not returned to the caller.
    class_lookup = {}
    for position, name in enumerate(feature_names):
        encoder = fitted_encoders[name]
        matrix[:, position] = multi_encoder.uni_feature_transform(
            matrix[:, position], name)
        class_lookup[position] = encoder.classes_

    return matrix.astype(float)

In [None]:
# Encode the training features into a float array for LIME.
# NOTE: this rebinds `data`, which held the loaded DataFrame earlier in the notebook,
# so cells above that read `data` as a DataFrame cannot be re-run after this one.
data = to_np_data(deepfm, train_data, sparse_features)

In [None]:
# feature_names / categorical_features / categorical_names are only assigned as locals
# inside to_np_data(), so they do not exist at notebook level. Build them here from the
# same sources to_np_data() uses: the sparse feature list and the fitted encoders.
feature_names = list(sparse_features)
categorical_features = list(range(len(feature_names)))  # every column is categorical
categorical_names = {
    idx: deepfm.multi_label_encoder._encoders[feature].classes_
    for idx, feature in zip(categorical_features, feature_names)
}

# Tabular LIME explainer over the encoded training array.
explainer = lime.lime_tabular.LimeTabularExplainer(
    data,
    class_names=[0, 1],
    feature_names=feature_names,
    categorical_features=categorical_features,
    categorical_names=categorical_names,
    verbose=False,
    mode='classification',
)

In [None]:
# Single hand-written profile to explain. `columns=` restricts and orders the frame to
# the sparse feature columns, so dict keys that are not sparse features
# (certificate_name, school_name, school_major_name) are left out of the frame.
# The original referenced `feature_names`, which is not defined at notebook level;
# `sparse_features` is the list the encoding step below is called with.
test = pd.DataFrame(
    {
        'career_turn': [0],
        'career_sum_period': [10.0],
        'certificate_name': ['컴퓨터활용능력,ADsP'],
        'school_name': ['아무대학교'],
        'school_major_name': ['통계학과'],
        'school_major_state': ['전공'],
        'school_major_level': ['학사'],
        'school_state': ['졸업'],
        '기타': [None],
        '독일어': [None],
        '러시아어': [None],
        '베트남어': [None],
        '에스파냐어': [None],
        '영어': ['중급(업무상 의사소통 가능)'],
        '인도네시아어': [None],
        '일본어': [None],
        '중국어': [None],
        '프랑스어': [None],
        'accum_count': [1],
        'career_task': ['연구원'],
        'enterprise_size': ['대기업'],
    },
    columns=sparse_features,
)

In [None]:
# Encode the hand-written profile with the same encoders as the training data.
# NOTE: this rebinds `test` from a DataFrame to a numpy array, so the cell cannot be
# re-run without first re-running the cell that builds the DataFrame.
test = to_np_data(deepfm, test, sparse_features)

In [None]:
# Explain the single encoded profile, using the trainer's predict_encoded_data as the
# prediction function and reporting the 5 most influential features.
# NOTE(review): labels=(0,) and as_list(0) explain class index 0, the first entry of
# class_names ([0, 1]) — confirm the explanation should not target class 1 instead.
exp = explainer.explain_instance(
    test[0], 
    deepfm.predict_encoded_data, 
    num_features=5,
    labels=(0,),
)
a = exp.as_list(0)

In [None]:
# Display the explanation list produced by exp.as_list(0).
a