In [7]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
import random
from scipy.sparse import coo_matrix
import torch

import warnings
warnings.filterwarnings(action='ignore')

In [8]:
# Notebook-wide settings; SEED is the value handed to seed_everything.
CFG = dict(SEED=42)

In [9]:
def seed_everything(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and CUDA), and configures cuDNN for deterministic behaviour so that
    repeated runs with the same seed can reproduce the same results.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # processes); the hash seed of the running interpreter is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not just the current device. Both CUDA calls
    # are silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and possibly pick a different
    # convolution algorithm on each run, which defeats reproducibility.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Fix the random seeds using the configured SEED

In [10]:
# Data preparation: load the apply_train table.
apply_train = pd.read_csv('Data/apply_train.csv')

# From the resume table keep only the key and its update date, then turn the
# date into an integer POSIX timestamp (whole seconds).
resume_update_date = pd.read_csv('Data/resume.csv')[['resume_seq', 'updated_date']].assign(
    updated_date=lambda frame: pd.to_datetime(frame['updated_date']).apply(
        lambda ts: int(ts.timestamp())
    )
)

In [11]:
def save_index_mappings(apply_train, output_dir='.'):
    """Build integer index mappings for resumes and recruitments and pickle them.

    Resume ids are numbered from 0 in order of first appearance. Recruitment
    ids are numbered in order of first appearance starting right after the
    last resume index, so the two index ranges never overlap.

    Both dictionaries are written to ``output_dir`` as
    ``resume_seq_to_index.pkl`` and ``recruitment_seq_to_index.pkl``.

    Args:
        apply_train: DataFrame with ``resume_seq`` and ``recruitment_seq``
            columns.
        output_dir: Directory that receives the two pickle files. It is
            created if it does not exist. Defaults to the current working
            directory.

    Returns:
        Tuple ``(resume_seq_to_index, recruitment_seq_to_index)`` of dicts
        mapping each original id to its integer index.
    """
    # Distinct ids, in order of first appearance.
    unique_resume_seq = apply_train['resume_seq'].unique()
    unique_recruitment_seq = apply_train['recruitment_seq'].unique()

    resume_seq_to_index = {seq: idx for idx, seq in enumerate(unique_resume_seq)}

    # Continue counting where the resume indices stop.
    recruitment_seq_to_index = {
        seq: idx
        for idx, seq in enumerate(unique_recruitment_seq, start=len(unique_resume_seq))
    }

    # Persist both mappings so they can be reloaded later.
    os.makedirs(output_dir, exist_ok=True)
    pd.to_pickle(resume_seq_to_index, os.path.join(output_dir, 'resume_seq_to_index.pkl'))
    pd.to_pickle(recruitment_seq_to_index, os.path.join(output_dir, 'recruitment_seq_to_index.pkl'))

    return resume_seq_to_index, recruitment_seq_to_index

# Build both index mappings from apply_train; the call also writes them to
# pickle files.
resume_seq_to_index, recruitment_seq_to_index = save_index_mappings(apply_train)

# Show the two mappings, one per line.
# NOTE(review): this prints the complete dictionaries, which produces a very
# long output; consider showing only their sizes or a small sample.
print(resume_seq_to_index, recruitment_seq_to_index, sep='\n')

{'U05833': 0, 'U06456': 1, 'U07807': 2, 'U04842': 3, 'U08336': 4, 'U03753': 5, 'U01045': 6, 'U00825': 7, 'U07438': 8, 'U01917': 9, 'U00357': 10, 'U05806': 11, 'U05450': 12, 'U06130': 13, 'U01346': 14, 'U00385': 15, 'U05824': 16, 'U06246': 17, 'U00948': 18, 'U06687': 19, 'U04199': 20, 'U07934': 21, 'U04227': 22, 'U06206': 23, 'U01224': 24, 'U06389': 25, 'U07292': 26, 'U00671': 27, 'U03148': 28, 'U08015': 29, 'U00310': 30, 'U04319': 31, 'U06367': 32, 'U07627': 33, 'U05776': 34, 'U02907': 35, 'U05153': 36, 'U01456': 37, 'U05780': 38, 'U01128': 39, 'U08050': 40, 'U07467': 41, 'U03000': 42, 'U02220': 43, 'U00073': 44, 'U03806': 45, 'U03858': 46, 'U07038': 47, 'U00038': 48, 'U00725': 49, 'U00776': 50, 'U05438': 51, 'U02164': 52, 'U06477': 53, 'U06548': 54, 'U02225': 55, 'U01790': 56, 'U07653': 57, 'U02624': 58, 'U04972': 59, 'U07469': 60, 'U01432': 61, 'U03564': 62, 'U01944': 63, 'U06463': 64, 'U06173': 65, 'U04073': 66, 'U04661': 67, 'U05239': 68, 'U01558': 69, 'U03586': 70, 'U03001': 71, '

In [12]:
# Constant rating of 1 for every row (added to apply_train itself).
apply_train['rating:float'] = 1

# Attach each resume's update timestamp, switch to "name:type" column headers,
# and replace the raw sequence ids with their integer indices.
train = (
    apply_train
    .merge(resume_update_date, on='resume_seq', how='left')
    .rename(columns={
        'resume_seq': 'user_id:token',
        'recruitment_seq': 'item_id:token',
        'updated_date': 'timestamp:float',
    })
    .assign(**{
        'user_id:token': lambda frame: frame['user_id:token'].map(resume_seq_to_index),
        'item_id:token': lambda frame: frame['item_id:token'].map(recruitment_seq_to_index),
    })
)
train

Unnamed: 0,user_id:token,item_id:token,rating:float,timestamp:float
0,0,8482,1,1599436800
1,1,8483,1,1596672000
2,2,8484,1,1580428800
3,3,8485,1,1591315200
4,4,8486,1,1604966400
...,...,...,...,...
57941,864,12173,1,1605916800
57942,4358,14658,1,1599782400
57943,8370,12943,1,1578268800
57944,1157,12370,1,1587772800


In [13]:
# Write the interaction table as a tab-separated "Dacon.inter" file inside the
# RecBole dataset directory. The directory can be overridden through the
# RECBOLE_DATASET_DIR environment variable so the notebook is not tied to one
# machine; the default keeps the original location.
inter_path = os.path.join(
    os.environ.get('RECBOLE_DATASET_DIR', '/Users/seunghoonchoi/Downloads/RecBole/dataset'),
    'Dacon',
    'Dacon.inter',
)
# Create the target directory if needed instead of failing inside to_csv.
os.makedirs(os.path.dirname(inter_path), exist_ok=True)
train.to_csv(inter_path, sep='\t', index=False)