In [None]:
import re
import os
import random
import sys
sys.path.append('/Users/seunghoonchoi/Downloads/Dacon_recommend_system')
import requests

import warnings
warnings.filterwarnings(action='ignore') 

import ast

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from gensim.models import FastText

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import IterableDataset

import dgl
import dgl.nn as dglnn
import dgl.function as fn
from dgl.nn import HeteroGraphConv, SAGEConv

In [None]:
# Notebook-wide configuration; 'SEED' is the value handed to seed_everything and to KMeans.
CFG = {'SEED' : 42}

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook draws from.

    Parameters:
    - seed: integer seed applied to Python's `random`, NumPy, PyTorch and,
      when it is installed, DGL.
    """
    random.seed(seed)
    # Exported for child processes; the hash seed of the running interpreter
    # was fixed at start-up and is not changed by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    # The batch sampler draws node IDs with torch.randint, so torch must be seeded too.
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # DGL keeps its own generator for random walks / neighbour sampling.
    try:
        import dgl
    except ImportError:
        # Nothing to seed when DGL is not available in this environment.
        pass
    else:
        dgl.seed(seed)

seed_everything(CFG['SEED']) # Fix the random seeds using the configured value

In [None]:
def recall5(answer_df, submission_df):
    """
    Compute the mean recall@5 of a submission against the ground truth.

    The first column of `answer_df` is the key (one group per key) and the
    second column holds the items; `submission_df` is read through the same
    two column names.

    Parameters:
    - answer_df: DataFrame containing the ground truth
    - submission_df: DataFrame containing the predictions

    Returns:
    - recall: mean of the per-key recall values (keys of the answer that have
      no prediction are left out of the mean)

    Raises:
    - ValueError: if a key does not have exactly 5 predictions, if a predicted
      item is null, or if a key has duplicated predicted items
    """
    primary_col, secondary_col = answer_df.columns[0], answer_df.columns[1]

    # Validation 1: exactly five predictions per key.
    predictions_per_key = submission_df.groupby(primary_col).size()
    if (predictions_per_key != 5).any():
        raise ValueError(f"Each {primary_col} should have exactly 5 {secondary_col} predictions.")

    # Validation 2: no missing predicted items.
    if submission_df[secondary_col].isnull().any():
        raise ValueError(f"Predicted {secondary_col} contains NULL values.")

    # Validation 3: no repeated item inside one key's predictions.
    has_repeats = submission_df.groupby(primary_col).apply(
        lambda grp: grp[secondary_col].duplicated().any())
    if has_repeats.any():
        raise ValueError(f"Predicted {secondary_col} contains duplicates for some {primary_col}.")

    # Only keys that occur in the answer are scored.
    scored_mask = submission_df[primary_col].isin(answer_df[primary_col])
    submission_df = submission_df[scored_mask]

    # key -> first five predicted items
    top_5_preds = submission_df.groupby(primary_col).apply(
        lambda grp: grp[secondary_col].head(5).tolist()).to_dict()

    # key -> list of true items
    true_dict = answer_df.groupby(primary_col).apply(
        lambda grp: grp[secondary_col].tolist()).to_dict()

    # The denominator is min(number of true items, 5) so that keys with fewer
    # than five true items can still reach a recall of 1.
    individual_recalls = [
        len(set(true_items) & set(top_5_preds[key])) / min(len(true_items), 5)
        for key, true_items in true_dict.items()
        if key in top_5_preds
    ]

    return np.mean(individual_recalls)

In [None]:
# 이력서 관련
resume = pd.read_csv('/Users/seunghoonchoi/Downloads/Dacon_recommend_system/Data/resume.csv')
resume_edu = pd.read_csv('/Users/seunghoonchoi/Downloads/Dacon_recommend_system/Data/resume_education.csv')
resume_cert = pd.read_csv('/Users/seunghoonchoi/Downloads/Dacon_recommend_system/Data/resume_certificate.csv')
resume_lang = pd.read_csv('/Users/seunghoonchoi/Downloads/Dacon_recommend_system/Data/resume_language.csv')

In [None]:
# 공고 관련
recruitment = pd.read_csv('/Users/seunghoonchoi/Downloads/Dacon_recommend_system/Data/recruitment.csv')
company = pd.read_csv('/Users/seunghoonchoi/Downloads/Dacon_recommend_system/Data/company.csv')

In [None]:
# 이력서 - 공고 매칭
apply_train = pd.read_csv('/Users/seunghoonchoi/Downloads/Dacon_recommend_system/Data/apply_train.csv')

In [None]:
# Column dtypes and non-null counts of the main resume table.
resume.info()

In [None]:
# Same overview for the education table.
resume_edu.info()

In [None]:
# Same overview for the certificate table.
resume_cert.info()

In [None]:
# Same overview for the language table.
resume_lang.info()

In [None]:
# Distinct values of the 'language' column.
resume_lang['language'].unique()

In [None]:
# Distinct values of the 'exam_name' column.
resume_lang['exam_name'].unique()

In [None]:
# Check whether the resume-related tables other than `resume` hold more than
# one row per 'resume_seq'.
resume_edu_duplicates = resume_edu['resume_seq'].value_counts()
resume_cert_duplicates = resume_cert['resume_seq'].value_counts()
resume_lang_duplicates = resume_lang['resume_seq'].value_counts()

# Number of resume_seq values that occur more than once in each table.
resume_edu_duplicates_count = (resume_edu_duplicates > 1).sum()
resume_cert_duplicates_count = (resume_cert_duplicates > 1).sum()
resume_lang_duplicates_count = (resume_lang_duplicates > 1).sum()

resume_edu_duplicates_count, resume_cert_duplicates_count, resume_lang_duplicates_count

In [None]:
# Number of distinct certificate names.
unique_certificate_contents = resume_cert['certificate_contents'].nunique()
unique_certificate_contents

In [None]:
# Largest number of certificate rows attached to a single resume_seq.
max_certificates = resume_cert['resume_seq'].value_counts().max()
max_certificates

In [None]:
# Collect the certificates of each resume_seq into a list and count them
# (rows whose certificate name is null are removed first).
resume_cert_cleaned = resume_cert.dropna(subset=['certificate_contents'])

cert_grouped = resume_cert_cleaned.groupby('resume_seq').agg(
    certificate_list=('certificate_contents', list),
    certificate_count=('certificate_contents', 'size')
).reset_index()

# Left-join onto every resume so resumes without any certificate are kept.
all_resume_seq = resume[['resume_seq']]
cert_grouped_complete = all_resume_seq.merge(cert_grouped, on='resume_seq', how='left')
# Resumes missing from cert_grouped come back as NaN: use an empty list instead.
cert_grouped_complete['certificate_list'] = cert_grouped_complete['certificate_list'].apply(lambda x: x if isinstance(x, list) else [])
# Assign the filled column back rather than calling fillna(inplace=True) on a
# column selection: under pandas copy-on-write that call only changes a
# temporary, the NaN values stay, and the integer cast below would fail.
cert_grouped_complete['certificate_count'] = cert_grouped_complete['certificate_count'].fillna(0).astype(int)

cert_grouped_complete.head()

In [None]:
# One-hot encode which languages each resume lists; exam name and score are discarded.
lang_encoded = pd.get_dummies(resume_lang, columns=['language'], prefix="", prefix_sep="")
# Drop the text/score columns BEFORE aggregating so the group-by only ever
# reduces indicator columns; max() is 1 as soon as one row of the resume has
# the language, which replaces the former sum-then-clip.
lang_grouped = lang_encoded.drop(columns=['exam_name', 'score']).groupby('resume_seq').max().reset_index()
lang_grouped_cleaned = lang_grouped.astype({col: int for col in lang_grouped.columns if col != 'resume_seq'})
# NOTE(review): the names below assume the indicator columns come out as
# languages 2, 3, 4, 8, 9 in this order — confirm against resume_lang['language'].unique().
lang_grouped_cleaned.columns = ['resume_seq', 'lang_2', 'lang_3', 'lang_4', 'lang_8', 'lang_9']
lang_grouped_cleaned

In [None]:
# Attach the per-resume certificate, language and education tables to `resume`.
# NOTE(review): resume_edu is merged as-is; this presumes one row per resume_seq.
final_resume = (
    resume
    .merge(cert_grouped_complete, on='resume_seq', how='left')
    .merge(lang_grouped_cleaned, on='resume_seq', how='left')
    .merge(resume_edu, on='resume_seq', how='left')
)

final_resume

In [None]:
# Column dtypes and non-null counts of the job-posting table.
recruitment.info()

In [None]:
# Distinct values of the first-level address code.
recruitment['address_seq1'].unique()

In [None]:
# Postings whose address_seq1 is missing.
recruitment[recruitment['address_seq1'].isnull()]

In [None]:
# To guess the address of posting 'R01512', look at the other postings that
# its applicants applied to and collect their address_seq1 values.
is_target_posting = apply_train['recruitment_seq'] == 'R01512'
applied_resumes_check = apply_train.loc[is_target_posting]
resume_seqs_for_R01512 = applied_resumes_check['resume_seq'].tolist()

same_applicants = apply_train['resume_seq'].isin(resume_seqs_for_R01512)
other_applications = apply_train.loc[same_applicants]
other_recruitments_seq = other_applications['recruitment_seq'].unique()

addresses_for_other_recruitments = recruitment.loc[
    recruitment['recruitment_seq'].isin(other_recruitments_seq), 'address_seq1']

addresses_for_other_recruitments.unique()

In [None]:
# The candidate addresses are 3, 5 and 20; look up how often each occurs among all postings.
address_counts_recruitment = recruitment['address_seq1'].value_counts()
address_counts_recruitment.loc[[3.0, 5.0, 20.0]]

In [None]:
# Per the author, 3 is the most frequent candidate, so missing address_seq1
# values are filled with 3 and the column is cast to int.
recruitment['address_seq1'] = recruitment['address_seq1'].fillna(3.0).astype(int)

In [None]:
# Author's note: the other address columns contain far more NaN values, so NaN
# is treated as "no information".
# NOTE(review): a later cell drops address_seq2 / address_seq3 altogether.
recruitment['address_seq2'].unique()

In [None]:
# Distinct values of the third-level address code.
recruitment['address_seq3'].unique()

In [None]:
# Count the distinct check_box_keyword tokens (the column is ';'-separated).
all_keywords = recruitment['check_box_keyword'].str.split(';').explode().dropna().unique()

num_unique_keywords = len(all_keywords)
num_unique_keywords

In [None]:
# 1 when the posting has any text_keyword at all, 0 otherwise.
recruitment['has_text_keyword'] = recruitment['text_keyword'].notna().astype(int)

# One 0/1 flag per pattern: 1 when text_keyword matches it, 0 otherwise
# (postings without a text_keyword get 0).
text_keyword_flags = {
    'part_time': '아르바이트',
    'intern': '인턴',
    'entry_level': '신입',
    'experienced': '경력|경력직',
    'team_leader': '팀장|팀장급',
}
for flag_column, pattern in text_keyword_flags.items():
    recruitment[flag_column] = recruitment['text_keyword'].str.contains(pattern, na=False).astype(int)

# Split check_box_keyword on ';' and one-hot encode the tokens.
check_box_encoded = recruitment['check_box_keyword'].str.get_dummies(sep=';')
recruitment = pd.concat([recruitment, check_box_encoded], axis=1)

# Remove the raw columns that have been encoded or are not used.
columns_to_remove = ['address_seq2', 'address_seq3', 'text_keyword', 'check_box_keyword']
recruitment_cleaned = recruitment.drop(columns=columns_to_remove)
recruitment_cleaned

In [None]:
# Inspect the company table before merging it with the postings.
unique_company_type_seq = company['company_type_seq'].unique()
unique_supply_kind = company['supply_kind'].unique()
unique_employee = company['employee'].unique()

unique_company_type_seq, unique_supply_kind, unique_employee

In [None]:
# Smallest and largest employee count.
min_employee = company['employee'].min()
max_employee = company['employee'].max()

min_employee, max_employee

In [None]:
# Left-join the company information onto the cleaned postings; postings
# without a company row keep NaN in the company columns for now.
final_recruitment = recruitment_cleaned.merge(company, on='recruitment_seq', how='left')
final_recruitment

In [None]:
'''
employee nan값 채우기
'정보없음' : 0 - nan값인 경우
'영세기업' : 1 - employee가 5인 미만인 경우
'중소기업' : 2 - employee가 1000명 미만인 경우
'중견기업' : 3 - 'employee가 1000명 이상인 경우
다른 열도 nan값 채우기, '정보없음'이라는 의미로 0
'''
# Bucket the employee count: 0 = missing, 1 = fewer than 5, 2 = fewer than 1000, 3 = 1000 or more.
# np.select picks the first matching condition, mirroring a nested if/elif chain.
employee_counts = final_recruitment['employee']
final_recruitment['employee_category'] = np.select(
    [employee_counts.isna(), employee_counts < 5, employee_counts < 1000],
    [0, 1, 2],
    default=3,
)

# The raw count is no longer needed; missing company attributes become 0 ("no information").
final_recruitment = final_recruitment.drop(columns=['employee'])
company_columns = ['company_type_seq', 'supply_kind']
final_recruitment[company_columns] = final_recruitment[company_columns].fillna(0)
final_recruitment

In [None]:
# Count missing and present values of job_code_seq2 and job_code_seq3.
job_code_missing = final_resume[['job_code_seq2', 'job_code_seq3']].isna()

job_code_seq2_nan_count, job_code_seq3_nan_count = job_code_missing.sum()
job_code_seq2_non_nan_count, job_code_seq3_non_nan_count = (~job_code_missing).sum()

job_code_seq2_nan_count, job_code_seq2_non_nan_count, job_code_seq3_nan_count, job_code_seq3_non_nan_count

In [None]:
# Replace missing job_code_seq2 / job_code_seq3 with the placeholder '정보없음' ("no information").
# The filled column is assigned back: fillna(inplace=True) on a column
# selection only changes a temporary under pandas copy-on-write, which would
# leave NaN in the frame for the encoder below.
final_resume['job_code_seq2'] = final_resume['job_code_seq2'].fillna('정보없음')
final_resume['job_code_seq3'] = final_resume['job_code_seq3'].fillna('정보없음')

# Label-encode each job code column, keeping the fitted encoder per column.
label_encoders = {}

for column in ['job_code_seq1', 'job_code_seq2', 'job_code_seq3']:
    le = LabelEncoder()
    final_resume[column] = le.fit_transform(final_resume[column])
    label_encoders[column] = le

final_resume[['job_code_seq1', 'job_code_seq2', 'job_code_seq3']].head()

In [None]:
# A missing language indicator means the resume lists no such language, so it becomes 0.
lang_columns = ['lang_2', 'lang_3', 'lang_4', 'lang_8', 'lang_9']
final_resume[lang_columns] = final_resume[lang_columns].fillna(0)

In [None]:
# career_job_code: missing values become the placeholder '정보없음' ("no information"),
# then the column is label-encoded.
# The filled column is assigned back because fillna(inplace=True) on a column
# selection does not reach the frame under pandas copy-on-write.
final_resume['career_job_code'] = final_resume['career_job_code'].fillna('정보없음')

le_career = LabelEncoder()

final_resume['career_job_code'] = le_career.fit_transform(final_resume['career_job_code'])

final_resume['career_job_code'].head()

In [None]:
# The major is represented by univ_major_type; univ_major and univ_sub_major are removed.
final_resume = final_resume.drop(columns=['univ_major', 'univ_sub_major'])
final_resume

In [None]:
# Split text_keyword on ';' into one token list per resume (resumes without keywords are skipped).
keywords = final_resume['text_keyword'].str.split(';').dropna().tolist()

# Train skip-gram FastText embeddings on the keyword lists.
# The seed comes from CFG and a single worker thread is used: with several
# workers the order of updates depends on thread scheduling, so the embeddings
# (and the keyword clusters derived from them) would change between runs.
model = FastText(sentences=keywords, vector_size=100, window=5, min_count=2, workers=1, sg=1, epochs=200,
                 seed=CFG['SEED'])

# Sanity check of the embedding: nearest neighbours of a sample keyword.
sample_keyword = "디자이너"
model.wv.most_similar(sample_keyword)

In [None]:
# Cluster the keyword embeddings.
words = list(model.wv.index_to_key)
vectors = [model.wv[word] for word in words]

# Per the author, chosen to equal the number of distinct check_box_keyword values in recruitment.
n_clusters = 61
kmeans = KMeans(n_clusters=n_clusters, random_state=CFG['SEED'])
kmeans.fit(vectors)

# Group the words by assigned label in one pass, then print every cluster in order.
members_by_cluster = {cluster_num: [] for cluster_num in range(n_clusters)}
for word, label in zip(words, kmeans.labels_):
    members_by_cluster[int(label)].append(word)

for cluster_num, words_in_cluster in members_by_cluster.items():
    print(f"Cluster {cluster_num+1}: {', '.join(words_in_cluster)}")

In [None]:
# Look up which cluster each keyword belongs to.
def get_clusters_for_keywords(keywords_list, model, kmeans):
    """Return the cluster index of every keyword, or -1 for unknown keywords.

    Parameters:
    - keywords_list: iterable of keyword strings (an empty string yields an empty result)
    - model: trained embedding model exposing `wv.key_to_index` and `wv[keyword]`
    - kmeans: fitted clusterer exposing `predict`

    Returns:
    - list with one entry per keyword, in input order: the predicted cluster
      index as an int, or -1 when the keyword is not in the model vocabulary
    """
    # key_to_index is a dict, so membership is O(1) instead of a scan of index_to_key.
    vocab = model.wv.key_to_index
    known = [k for k in keywords_list if k in vocab]
    if not known:
        return [-1] * len(keywords_list)
    # One predict call for all known keywords rather than one call per keyword.
    predicted = iter(kmeans.predict([model.wv[k] for k in known]))
    return [int(next(predicted)) if k in vocab else -1 for k in keywords_list]

# Split the keywords of every resume and look up the cluster of each one.
# Resumes without keywords get an empty string, which yields an empty cluster list.
keyword_lists = final_resume['text_keyword'].str.split(';').fillna('')
all_clusters = keyword_lists.apply(lambda kws: get_clusters_for_keywords(kws, model, kmeans))

# One 0/1 column per cluster: 1 when any keyword of the resume falls into that cluster.
for cluster_idx in range(n_clusters):
    final_resume[f'keyword_cluster_{cluster_idx+1}'] = all_clusters.apply(
        lambda clusters: int(cluster_idx in clusters))

# The raw text_keyword column is no longer needed.
final_resume = final_resume.drop(columns=['text_keyword'])

In [None]:
# Author's note: spell-correcting and clustering the certificate names was
# attempted but did not cluster well. Assuming that only applicants who meet a
# posting's qualifications applied, certificate_list is removed;
# certificate_count is kept.

final_resume = final_resume.drop(columns='certificate_list')
final_resume

In [None]:
# Clean up the columns that are still of object dtype.
# 1. Label-encode the high-school columns that came from resume_edu.
columns_to_encode = ['hischool_special_type', 'hischool_nation', 'hischool_gender']

for col in columns_to_encode:
    final_resume[col] = LabelEncoder().fit_transform(final_resume[col])

In [None]:
# Parse the timestamp columns into datetimes.
for date_col in ('updated_date', 'reg_date'):
    final_resume[date_col] = pd.to_datetime(final_resume[date_col])

final_resume

In [None]:
# Author's note: many rows have graduate_date == 0; this cannot be verified
# from `degree`, so the value is left as 0.
final_resume[final_resume['graduate_date'] == 0]

In [None]:
# Build the input of the GraphSAGE model.
# 1. Construct the bipartite resume <-> recruitment graph.

# Map every external ID to a consecutive integer node ID (order of first appearance).
resume_ids = {v: k for k, v in enumerate(final_resume['resume_seq'].unique())}
recruitment_ids = {v: k for k, v in enumerate(final_recruitment['recruitment_seq'].unique())}

# Endpoints of every application edge (resume -> recruitment), looked up
# column-wise instead of with a row-by-row apply.
src_lookup = apply_train['resume_seq'].map(resume_ids)
dst_lookup = apply_train['recruitment_seq'].map(recruitment_ids)
if src_lookup.isna().any() or dst_lookup.isna().any():
    raise KeyError("apply_train contains a resume_seq or recruitment_seq that has no feature row.")

# Source and destination node lists
src_nodes = src_lookup.astype('int64').tolist()
dst_nodes = dst_lookup.astype('int64').tolist()
edges = list(zip(src_nodes, dst_nodes))

# Reverse edges so random walks can travel in both directions. They are derived
# from `edges`, so this has to run after the forward endpoints exist.
edges_reversed = [(dst, src) for src, dst in edges]

# Source and destination node lists (reverse direction)
src_nodes_reversed = [edge[0] for edge in edges_reversed]
dst_nodes_reversed = [edge[1] for edge in edges_reversed]

# Create the bipartite graph
graph_data = {
    ('resume', 'applies_to', 'recruitment'): (torch.tensor(src_nodes), torch.tensor(dst_nodes)),
    ('recruitment', 'is_applied_by', 'resume'): (torch.tensor(src_nodes_reversed), torch.tensor(dst_nodes_reversed))
}
# Node counts are given explicitly; otherwise they are inferred from the
# largest node ID that has an edge, and resumes/postings without any
# application would be missing, so the feature tensors would not fit.
G = dgl.heterograph(
    graph_data,
    num_nodes_dict={'resume': len(resume_ids), 'recruitment': len(recruitment_ids)})


def _node_feature_matrix(frame, id_column):
    """Return one numeric feature row per unique ID, in node-ID order."""
    # Keep the first row per ID so row i lines up with node ID i of the mapping above.
    features = frame.drop_duplicates(subset=id_column).drop(columns=[id_column]).copy()
    # Datetime columns cannot be cast to float by the scaler; express them as
    # seconds since the earliest value of the column (NaT becomes NaN).
    for col in features.select_dtypes(include=['datetime']).columns:
        features[col] = (features[col] - features[col].min()).dt.total_seconds()
    return features.values


# Feature scaling
scaler_resume = MinMaxScaler()
scaler_recruitment = MinMaxScaler()

resume_features_scaled = scaler_resume.fit_transform(_node_feature_matrix(final_resume, 'resume_seq'))
recruitment_features_scaled = scaler_recruitment.fit_transform(_node_feature_matrix(final_recruitment, 'recruitment_seq'))

resume_features_tensor = torch.tensor(resume_features_scaled, dtype=torch.float32)
recruitment_features_tensor = torch.tensor(recruitment_features_scaled, dtype=torch.float32)

# Attach the features to the graph nodes
G.nodes['resume'].data['features'] = resume_features_tensor
G.nodes['recruitment'].data['features'] = recruitment_features_tensor

# Print graph statistics
num_resumes = G.num_nodes('resume')
num_recruitments = G.num_nodes('recruitment')
num_edges = G.num_edges('applies_to')

print("Number of resumes:", num_resumes)
print("Number of recruitments:", num_recruitments)
print("Number of edges:", num_edges)

In [None]:
class ItemToItemBatchSampler(IterableDataset):
    """Endless stream of (heads, tails, neg_tails) node-ID batches.

    Heads are random nodes of `resume_type`. Each tail is the node reached from
    its head by a two-step walk resume -> recruitment -> resume, so heads and
    tails are both nodes of `resume_type`. Negatives are drawn uniformly at
    random from that same node type.

    Parameters:
    - g: heterogeneous DGL graph holding both edge types used by the walk
    - resume_type: name of the node type the walk starts and ends on
    - recruitment_type: name of the intermediate node type (stored for callers)
    - batch_size: number of heads drawn per batch (the yielded batch can be
      smaller because unfinished walks are filtered out)
    """
    def __init__(self, g, resume_type, recruitment_type, batch_size):
        self.g = g
        self.resume_type = resume_type
        self.recruitment_type = recruitment_type
        self.resume_to_recruitment_etype = ('resume', 'applies_to', 'recruitment')
        self.recruitment_to_resume_etype = ('recruitment', 'is_applied_by', 'resume')
        self.batch_size = batch_size

    def __iter__(self):
        while True:
            heads = torch.randint(0, self.g.number_of_nodes(self.resume_type), (self.batch_size,))
            result = dgl.sampling.random_walk(
                self.g,
                heads,
                metapath=[self.resume_to_recruitment_etype, self.recruitment_to_resume_etype])
            # Column 2 of the trace is the node reached after both steps.
            tails = result[0][:, 2]
            # Negatives must be of the same node type as heads and tails;
            # drawing them from the recruitment count would pair resume heads
            # with IDs from another node type's range.
            neg_tails = torch.randint(0, self.g.number_of_nodes(self.resume_type), (self.batch_size,))
            # A tail of -1 marks a walk that could not be completed; skip those.
            mask = (tails != -1)
            yield heads[mask], tails[mask], neg_tails[mask]

In [None]:
# Sampling code adapted from https://yamalab.tistory.com/165
class NeighborSampler(object):
    """Builds the message-passing blocks for a batch of seed nodes.

    One PinSAGESampler is created per layer; `sample_blocks` applies them in
    turn, each time using the source nodes of the previous block as new seeds.
    """
    def __init__(self, g, resume_type, recruitment_type, random_walk_length, random_walk_restart_prob,
                 num_random_walks, num_neighbors, num_layers):
        self.g = g
        self.resume_type = resume_type
        self.recruitment_type = recruitment_type
        self.resume_to_recruitment_etype = ('resume', 'applies_to', 'recruitment')
        self.recruitment_to_resume_etype = ('recruitment', 'is_applied_by', 'resume')
        # One independent sampler per layer.
        self.samplers = []
        for _ in range(num_layers):
            layer_sampler = dgl.sampling.PinSAGESampler(
                g, recruitment_type, resume_type, random_walk_length,
                random_walk_restart_prob, num_random_walks, num_neighbors)
            self.samplers.append(layer_sampler)

    def sample_blocks(self, seeds, heads=None, tails=None, neg_tails=None):
        """Return the list of blocks for `seeds`, ordered from input layer to output layer."""
        # NOTE(review): heads / tails / neg_tails are accepted but never used here.
        collected = []
        current_seeds = seeds
        for layer_sampler in self.samplers:
            frontier = layer_sampler(current_seeds)
            block = compact_and_copy(frontier, current_seeds)
            # The source nodes of this block are the seeds of the next one.
            current_seeds = block.srcdata[dgl.NID]
            collected.append(block)
        # Blocks were collected output-first; callers receive them input-first.
        collected.reverse()
        return collected

    def sample_from_item_pairs(self, heads, tails, neg_tails):
        """Build the positive and negative pair graphs and the blocks for their nodes."""
        node_count = self.g.number_of_nodes(self.recruitment_type)
        pos_graph = dgl.graph((heads, tails), num_nodes=node_count)
        neg_graph = dgl.graph((heads, neg_tails), num_nodes=node_count)
        compacted = dgl.compact_graphs([pos_graph, neg_graph])
        pos_graph, neg_graph = compacted
        seeds = pos_graph.ndata[dgl.NID]
        blocks = self.sample_blocks(seeds, heads, tails, neg_tails)
        return pos_graph, neg_graph, blocks

In [None]:
class PinSAGECollator(object):
    """Turns sampled node-ID batches into graphs and feature-carrying blocks.

    Parameters:
    - sampler: object providing `sample_from_item_pairs` and `sample_blocks`
    - g: graph whose node count and features are used
    - ntype: node type passed to `number_of_nodes` and to the feature assignment
    """
    def __init__(self, sampler, g, ntype):
        self.sampler, self.ntype, self.g = sampler, ntype, g

    def collate_train(self, batches):
        """Return (pos_graph, neg_graph, blocks) for the first element of `batches`."""
        first_batch = batches[0]
        heads, tails, neg_tails = first_batch
        sampled = self.sampler.sample_from_item_pairs(heads, tails, neg_tails)
        pos_graph, neg_graph, blocks = sampled
        assign_features_to_blocks(blocks, self.g, self.ntype)
        return pos_graph, neg_graph, blocks

    def collate_valid(self, batches):
        """Return (pos_graph, blocks) where `batches` holds heads at index 0 and tails at index 1."""
        heads = batches[0]
        tails = batches[1]
        node_count = self.g.number_of_nodes(self.ntype)
        pos_graph = dgl.graph((heads, tails), num_nodes=node_count)
        compacted = dgl.compact_graphs([pos_graph])
        pos_graph = compacted[0]
        seeds = pos_graph.ndata[dgl.NID]
        blocks = self.sampler.sample_blocks(seeds)
        assign_features_to_blocks(blocks, self.g, self.ntype)
        return pos_graph, blocks