In [1]:
# !pip install -U deepctr-torch
# !git clone https://github.com/jaeyoung-kang/career_recommendation.git

# import sys
# sys.path.append('./career_recommendation')

In [2]:
import sys

# Put the parent directory on the import path (presumably where the `src`
# package imported by this notebook lives -- confirm against the repo layout).
# The membership check keeps the cell idempotent: re-running it does not keep
# appending duplicate '..' entries to sys.path.
if '..' not in sys.path:
    sys.path.append('..')

In [3]:
import os
import random

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

from src.dataset.augmentation import make_binary_target
from src.model import DeepFMTrainer
from src.utils import MajorCleaner



In [4]:
# Location of the input CSV.
# The default is the original author's machine-specific absolute path; set the
# CAREER_DATA_PATH environment variable to run the notebook elsewhere without
# editing this cell.
data_path = os.environ.get(
    'CAREER_DATA_PATH',
    '/Users/m/localspace/career_recommendation/data/school_variable_major.csv',
)

# Number of training epochs passed to `deepfm.fit`.
epochs = 3

# Name of the multi-valued (comma-separated) column that is exploded into
# candidate rows and turned into a binary label during preprocessing.
target_col = 'field'

# Data Load

In [5]:
# Read the raw CSV and order its rows by the `id` column.
data = pd.read_csv(data_path).sort_values(by='id')

# Data Split

In [6]:
# Shuffle the row order before the positional train/test split.
# A dedicated, seeded generator makes the permutation (and therefore the split)
# identical on every Restart & Run All. Other sources of randomness further
# down the notebook are not affected by this seed.
SHUFFLE_SEED = 42
index_list = list(range(len(data)))
random.Random(SHUFFLE_SEED).shuffle(index_list)

# NOTE(review): the shuffle works on individual rows. If one `id` can span
# several rows, the same id may end up in both train and test -- confirm
# whether a group-wise split by `id` is wanted instead.
data = data.iloc[index_list]

In [7]:
# 80/20 positional hold-out: the leading 80% of the rows are used for training,
# the remaining rows form the test set and are put back into index order.
train_len = int(0.8 * len(data))
train_data, test_data = data.iloc[:train_len], data.iloc[train_len:].sort_index()

# Data Preprocessing

In [8]:
def preproc(
    data,
    target_col,
    positive_ratio=0.5,
    variable_length_col='school_major_name',
):
    """Turn raw rows into per-target samples ready for the trainer.

    Processing steps, in order:

    1. Work on a copy so the caller's frame is not modified.
    2. Split the comma-separated strings in ``target_col`` and explode them so
       that each row carries a single target value, then strip surrounding
       whitespace from that value.
    3. Pass the exploded frame to ``make_binary_target`` together with
       ``positive_ratio``. The labelling/sampling rules are defined by that
       helper and are not visible here.
    4. Split the comma-separated strings in ``variable_length_col`` into lists
       (missing values stay missing).

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame containing ``target_col`` and ``variable_length_col`` as
        string columns.
    target_col : str
        Name of the multi-valued target column.
    positive_ratio : float, default 0.5
        Forwarded unchanged to ``make_binary_target``.
    variable_length_col : str, default 'school_major_name'
        Name of the column to convert from comma-separated strings to lists.
        The default reproduces the previously hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``make_binary_target`` with
        ``variable_length_col`` converted to lists.
    """
    data = data.copy()
    data[target_col] = data[target_col].str.split(',')
    data = data.explode(target_col)
    data[target_col] = data[target_col].str.strip()  # naive preprocessing
    data = make_binary_target(
        data,
        target_col,
        positive_ratio=positive_ratio,
    )

    # Only split on commas; individual tokens are not stripped or de-duplicated.
    data[variable_length_col] = data[variable_length_col].str.split(',')
    return data

In [9]:
# NOTE: both names are overwritten with their preprocessed versions, so this
# cell is meant to run exactly once after the split cell.

# Training samples rely on preproc's default positive_ratio (0.5).
train_data = preproc(train_data, target_col=target_col)

# Test samples are built with positive_ratio=0.
test_data = preproc(test_data, target_col=target_col, positive_ratio=0)

In [10]:
# Display the preprocessed training frame (rich display of the last expression).
train_data

Unnamed: 0,id,school_name,school_major_name,school_major_state,school_major_level,school_start,school_end,school_state,field,label
0,10884,서울대학교,,전공,학사,1988.0,1992.0,졸업,경영지원,1.0
1,10884,서울대학교,,전공,학사,1988.0,1992.0,졸업,SW 개발,1.0
2,10884,서울대학교,,전공,학사,1988.0,1992.0,졸업,운영,1.0
3,10884,서울대학교,,전공,학사,1988.0,1992.0,졸업,비즈니스,0.0
4,21673,안산대학교,[경영],전공,학사,2012.0,2018.0,졸업,운영,0.0
...,...,...,...,...,...,...,...,...,...,...
131609,73168,서울대학교,,전공,학사,2012.0,2017.0,졸업,마케팅,0.0
131610,73168,서울대학교,,전공,학사,2012.0,2017.0,졸업,기획/PM,1.0
131611,7163,동서대학교,"[디자인, 시각]",전공,학사,2012.0,2017.0,졸업,디자인,1.0
131612,7163,동서대학교,"[디자인, 시각]",전공,학사,2012.0,2017.0,졸업,투자,0.0


# Trainer

In [11]:
# Column the model is trained to predict.
deepfm_target = ['label']

# Single-valued categorical inputs, one per line.
# "school_name" is intentionally left disabled, exactly as in the original list.
sparse_features = [
    "school_major_state",
    # "school_name",
    "school_major_level",
    "school_start",
    "school_end",
    "school_state",
    "field",
]

# Column that holds a list of tokens per row (produced by `preproc`).
variable_length_sparse_feature = "school_major_name"


In [12]:
# Build the trainer from the target / feature names configured above.
deepfm = DeepFMTrainer(
    target=deepfm_target, sparse_features=sparse_features,
    variable_length_feature=variable_length_sparse_feature,
)

# Fit

In [13]:
# Quick look at the first rows of the frame that is passed to `fit` below.
train_data.head()

Unnamed: 0,id,school_name,school_major_name,school_major_state,school_major_level,school_start,school_end,school_state,field,label
0,10884,서울대학교,,전공,학사,1988.0,1992.0,졸업,경영지원,1.0
1,10884,서울대학교,,전공,학사,1988.0,1992.0,졸업,SW 개발,1.0
2,10884,서울대학교,,전공,학사,1988.0,1992.0,졸업,운영,1.0
3,10884,서울대학교,,전공,학사,1988.0,1992.0,졸업,비즈니스,0.0
4,21673,안산대학교,[경영],전공,학사,2012.0,2018.0,졸업,운영,0.0


In [14]:
# Train on the preprocessed training frame for the configured number of epochs.
deepfm.fit(train_data, epochs=epochs)

Label Encoding ...

Build Model ...

Model Input ...
	exmple)
	 school_major_state :  6
	 school_major_level :  7
	 school_start :  31
	 school_end :  25
	 school_state :  5
	 field :  5
	 school_major_name :  [2 0 0 0]

cpu
Train on 105291 samples, validate on 26323 samples, 412 steps per epoch
Epoch 1/3
6s - loss:  0.6325 - accuracy:  0.6549 - val_accuracy:  0.6738
Please check the latest version manually on https://pypi.org/project/deepctr-torch/#history
Epoch 2/3
6s - loss:  0.5895 - accuracy:  0.6763 - val_accuracy:  0.6819
Epoch 3/3
6s - loss:  0.5820 - accuracy:  0.6787 - val_accuracy:  0.6790


# Predict

In [15]:
# Score every row of the preprocessed test frame with the fitted trainer.
predict = deepfm.predict(test_data)

In [16]:
# Attach the model scores as a new `predict` column (mutates `test_data` in place).
# Assumes `predict` has one value per row of `test_data`, in the same order -- TODO confirm.
test_data['predict'] = predict

In [17]:
# Display the test frame, now including the `predict` column.
test_data

Unnamed: 0,id,school_name,school_major_name,school_major_state,school_major_level,school_start,school_end,school_state,field,label,predict
0,0,Yokohama University,[경영],전공,,2016.0,2017.0,졸업,경영지원,1.0,0.554357
1,0,Yokohama University,[경영],전공,,2016.0,2017.0,졸업,SW 개발,0.0,0.398351
2,0,Yokohama University,[경영],전공,,2016.0,2017.0,졸업,비즈니스,0.0,0.590907
3,0,Yokohama University,[경영],전공,,2016.0,2017.0,졸업,기획/PM,0.0,0.698997
4,0,Yokohama University,[경영],전공,,2016.0,2017.0,졸업,운영,0.0,0.626919
...,...,...,...,...,...,...,...,...,...,...,...
83225,74133,한성대학교,[경영],전공,학사,2004.0,2011.0,졸업,디자인,0.0,0.587032
83226,74133,한성대학교,[경영],전공,학사,2004.0,2011.0,졸업,마케팅,0.0,0.767365
83227,74133,한성대학교,[경영],전공,학사,2004.0,2011.0,졸업,투자,0.0,0.409641
83228,74133,한성대학교,[경영],전공,학사,2004.0,2011.0,졸업,HW 개발,0.0,0.140750


## Evaluate

In [18]:
# For every id, find the index label of the row with the largest `predict`
# value, then pull exactly those rows out of the test frame.
# NOTE(review): `.loc` returns one row per id only if the index labels of
# `test_data` are unique -- confirm.
best_rows = test_data.groupby('id')['predict'].idxmax()
predict_data = test_data.loc[best_rows.tolist()]

In [19]:
# Display the selected rows: for each id, the row with the highest `predict` value.
predict_data

Unnamed: 0,id,school_name,school_major_name,school_major_state,school_major_level,school_start,school_end,school_state,field,label,predict
16,0,동아대학교,[국제],전공,학사,2014.0,2018.0,졸업,마케팅,0.0,0.742879
21,1,대구경북과학기술원(DGIST),,전공,학사,2017.0,,재학,SW 개발,1.0,0.674574
31,11,성균관대학교 정보통신대학원,[컴퓨터],전공,석사,2010.0,2019.0,졸업,SW 개발,1.0,0.960054
42,14,국민대학교,[경영],전공,학사,2014.0,2019.0,재학,기획/PM,0.0,0.748338
51,21,고려대학교,"[정보, 컴퓨터]",복수전공,학사,2006.0,2013.0,졸업,SW 개발,1.0,0.937069
...,...,...,...,...,...,...,...,...,...,...,...
83183,74066,아주대학교,,전공,학사,2014.0,2018.0,졸업,기획/PM,0.0,0.641275
83191,74086,경희대학교,,전공,학사,2004.0,2013.0,졸업,기획/PM,0.0,0.687726
83201,74111,선문대학교,[컴퓨터],전공,학사,2007.0,2014.0,졸업,SW 개발,1.0,0.961754
83211,74126,충북대학교,[소프트웨어],전공,학사,2013.0,2019.0,재학,SW 개발,1.0,0.954027


In [20]:
# Mean of `label` over the top-scored row of each id.
# With 0/1 labels this is the share of ids whose highest-scored row is a
# positive -- presumably a top-1 hit rate; confirm the label semantics in
# make_binary_target.
predict_data['label'].mean()

0.4770071138211382