In [None]:
def recall5(answer_df, submission_df):
    """
    Calculate recall@5 for given dataframes.

    The first column of ``answer_df`` is used as the grouping key and the
    second column as the item to be predicted; ``submission_df`` is read
    through those same two column names.

    Parameters:
    - answer_df: DataFrame containing the ground truth
    - submission_df: DataFrame containing the predictions

    Returns:
    - recall: Mean per-key recall@5, taken over the keys of ``answer_df``
      that also occur in ``submission_df``. Keys without any prediction are
      skipped. Returns 0.0 when no key can be scored (instead of NaN).

    Raises:
    - ValueError: if some key in ``submission_df`` does not have exactly 5
      predictions, if a predicted value is NULL, or if a key has duplicated
      predictions.
    """
    primary_col = answer_df.columns[0]
    secondary_col = answer_df.columns[1]

    # Check if each primary_col entry has exactly 5 secondary_col predictions
    prediction_counts = submission_df.groupby(primary_col).size()
    if (prediction_counts != 5).any():
        raise ValueError(f"Each {primary_col} should have exactly 5 {secondary_col} predictions.")

    # Check for NULL values in the predicted secondary_col
    if submission_df[secondary_col].isnull().any():
        raise ValueError(f"Predicted {secondary_col} contains NULL values.")

    # Check for duplicates in the predicted secondary_col for each primary_col.
    # Iterating the column-selected groupby avoids DataFrameGroupBy.apply, which
    # hands the grouping column to the callback and misbehaves on empty frames.
    if any(preds.duplicated().any()
           for _, preds in submission_df.groupby(primary_col)[secondary_col]):
        raise ValueError(f"Predicted {secondary_col} contains duplicates for some {primary_col}.")

    # Filter the submission dataframe based on the primary_col present in the answer dataframe
    submission_df = submission_df[submission_df[primary_col].isin(answer_df[primary_col])]

    # For each primary_col, get the top 5 predicted secondary_col values
    top_5_preds = {
        key: preds.head(5).tolist()
        for key, preds in submission_df.groupby(primary_col)[secondary_col]
    }

    # Convert the answer_df to a dictionary for easier lookup
    true_dict = {
        key: truths.tolist()
        for key, truths in answer_df.groupby(primary_col)[secondary_col]
    }

    individual_recalls = []
    for key, val in true_dict.items():
        if key in top_5_preds:
            correct_matches = len(set(val) & set(top_5_preds[key]))
            # For a fair evaluation the denominator (k) is min(len(val), 5):
            # a key with fewer than 5 true items can still reach a recall of 1.
            individual_recalls.append(correct_matches / min(len(val), 5))

    # np.mean([]) would emit a RuntimeWarning and return NaN; report 0.0 instead.
    if not individual_recalls:
        return 0.0

    recall = np.mean(individual_recalls)
    return recall

In [2]:
import random
import os

import numpy as np
import pandas as pd

from multiprocessing import Pool
import itertools

from tqdm.auto import tqdm
from collections import defaultdict

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings(action='ignore')

from src.generate_neg_samples import generate_negative_samples_of_user

In [3]:
# Notebook-wide configuration; 'SEED' is the value handed to seed_everything.
CFG = dict(SEED=42)

In [4]:
def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's global generator.

    Parameters:
    - seed: value passed to ``random.seed`` and ``np.random.seed``.

    NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
    here presumably only affects processes launched afterwards - confirm.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(CFG['SEED']) # Fix the random seeds using the configured value

In [7]:
# Resume feature table; it carries a 'resume_seq' id column plus the feature columns
# that feed the cosine-similarity computation.
resume_data = pd.read_csv('./Data/resume_for_cosine.csv')

# Application records linking a 'resume_seq' to the 'recruitment_seq' it applied to.
apply_train = pd.read_csv('./Data/apply_train.csv')

In [8]:
# Everything except the 'resume_seq' id column is treated as a feature.
resume_features = resume_data.drop('resume_seq', axis=1)

# Pairwise cosine similarity between all resume feature rows.
cosine_sim = cosine_similarity(resume_features)

# Wrap the similarity matrix in a DataFrame labelled by resume_seq on both axes.
cosine_sim_df = pd.DataFrame(
    data=cosine_sim,
    index=resume_data['resume_seq'],
    columns=resume_data['resume_seq'],
)

# Display the result (optional).
cosine_sim_df

resume_seq,U00001,U00002,U00003,U00004,U00005,U00006,U00007,U00008,U00009,U00010,...,U08473,U08474,U08475,U08476,U08477,U08478,U08479,U08480,U08481,U08482
resume_seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U00001,1.000000,0.484995,0.648015,0.520206,0.539193,0.436084,0.367257,0.552206,0.532357,0.431512,...,0.548405,0.496440,0.589285,0.576954,0.536866,0.640190,0.434560,0.450777,0.610337,0.378312
U00002,0.484995,1.000000,0.418616,0.425232,0.308574,0.690865,0.698196,0.502984,0.435338,0.744270,...,0.404757,0.694373,0.590405,0.490450,0.490014,0.487568,0.487372,0.645143,0.514630,0.295726
U00003,0.648015,0.418616,1.000000,0.657585,0.377308,0.518786,0.302322,0.627550,0.570646,0.464603,...,0.534759,0.386414,0.676765,0.564506,0.567766,0.676061,0.367384,0.341040,0.542344,0.231229
U00004,0.520206,0.425232,0.657585,1.000000,0.569777,0.480298,0.296397,0.677025,0.579019,0.578286,...,0.581899,0.448977,0.673056,0.615703,0.527101,0.775396,0.379692,0.349301,0.691692,0.379286
U00005,0.539193,0.308574,0.377308,0.569777,1.000000,0.270664,0.287039,0.505810,0.399549,0.321392,...,0.456360,0.422399,0.447150,0.533152,0.456519,0.538655,0.309154,0.288872,0.519360,0.602154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U08478,0.640190,0.487568,0.676061,0.775396,0.538655,0.488876,0.405332,0.691890,0.641802,0.593306,...,0.596049,0.507024,0.845071,0.631868,0.641644,1.000000,0.488023,0.406924,0.708952,0.343332
U08479,0.434560,0.487372,0.367384,0.379692,0.309154,0.396491,0.509269,0.464245,0.387918,0.345776,...,0.406253,0.368173,0.589620,0.394342,0.489035,0.488023,1.000000,0.312237,0.513413,0.338471
U08480,0.450777,0.645143,0.341040,0.349301,0.288872,0.595620,0.563254,0.468059,0.499638,0.647525,...,0.333548,0.822564,0.409326,0.370689,0.457802,0.406924,0.312237,1.000000,0.434852,0.318568
U08481,0.610337,0.514630,0.542344,0.691692,0.519360,0.470188,0.339384,0.705297,0.563879,0.569289,...,0.523261,0.529512,0.659652,0.646871,0.616894,0.708952,0.513413,0.434852,1.000000,0.501862


In [18]:
# Similarity cutoff: resume pairs at or below this value are treated as dissimilar.
threshold = 0.26

# Positional (row, column) indices of every pair whose similarity is <= threshold.
low_similarity_pairs = np.column_stack(np.where(cosine_sim_df.values <= threshold))

# Map each resume_seq to the set of recruitment postings it applied to.
resume_to_recruitments = apply_train.groupby('resume_seq')['recruitment_seq'].apply(set).to_dict()

# Build, per resume, the set of negative candidates: postings applied to by
# resumes that are dissimilar to it.
#
# The earlier version created a fresh `processed_resumes` set inside every loop
# iteration, so its membership guards could only ever skip the second update of
# a self-pair (i == j), which merely repeats the first update. The set is
# therefore removed; the resulting dictionary is unchanged.
resume_ids = cosine_sim_df.index  # hoisted: avoids an attribute lookup per pair
negative_candidates_dict = {}
for i, j in low_similarity_pairs:
    resume_seq_i, resume_seq_j = resume_ids[i], resume_ids[j]

    # Postings of resume j become negative candidates for resume i ...
    negative_candidates_dict.setdefault(resume_seq_i, set()).update(resume_to_recruitments[resume_seq_j])

    # ... and postings of resume i become negative candidates for resume j.
    negative_candidates_dict.setdefault(resume_seq_j, set()).update(resume_to_recruitments[resume_seq_i])

In [19]:
# One argument tuple per resume: (resume_seq, its negative candidates, the full
# resume -> applied postings mapping).
tasks = [
    (resume_seq, candidates, resume_to_recruitments)
    for resume_seq, candidates in negative_candidates_dict.items()
]

# Run the sampling in a pool of 4 worker processes; starmap unpacks each tuple
# into the positional arguments of generate_negative_samples_of_user.
with Pool(processes=4) as pool:
    results = pool.starmap(generate_negative_samples_of_user, tasks)

# Collect the worker results.
negative_samples = list(results)

# Turn the results into a DataFrame ordered by resume_seq with a fresh index.
# NOTE(review): assumes each result is a (resume_seq, negatives) pair - confirm
# against generate_negative_samples_of_user.
negative_samples_df = (
    pd.DataFrame(negative_samples, columns=['resume_seq', 'recruitment_seq_negatives'])
    .sort_values(by=['resume_seq'])
    .reset_index(drop=True)
)

# Inspect the result.
negative_samples_df

Unnamed: 0,resume_seq,recruitment_seq_negatives
0,U00001,"[R00827, R05482, R00328, R01201]"
1,U00002,"[R04733, R00862, R03700, R05702, R04002, R0619..."
2,U00003,"[R02763, R06150, R04416]"
3,U00004,"[R05995, R01971, R01922, R02401, R05234, R0108..."
4,U00005,"[R00376, R04672, R01643]"
...,...,...
8477,U08478,"[R01795, R06082]"
8478,U08479,"[R03632, R06512, R01655, R06386, R02857, R01301]"
8479,U08480,"[R02634, R03531]"
8480,U08481,"[R02569, R02217, R04470]"


In [None]:
# DataFrame of positives: one row per resume with the list of postings it applied to.
data = {
    'resume_seq': list(resume_to_recruitments),
    'recruitment_seq_positives': [list(applied) for applied in resume_to_recruitments.values()],
}
positive_samples_df = pd.DataFrame(data, columns=['resume_seq', 'recruitment_seq_positives'])
positive_samples_df

In [None]:
# Explode the per-resume lists into one (resume_seq, recruitment_seq) row each
# and attach the label: 1 for applied postings, 0 for sampled negatives.
positive_pairs_df = (
    positive_samples_df
    .explode('recruitment_seq_positives')
    .rename(columns={'recruitment_seq_positives': 'recruitment_seq'})
    .assign(label=1)
)
negative_pairs_df = (
    negative_samples_df
    .explode('recruitment_seq_negatives')
    .rename(columns={'recruitment_seq_negatives': 'recruitment_seq'})
    .assign(label=0)
)

# Stack positives and negatives into a single labelled pair table.
all_pairs_df = pd.concat([positive_pairs_df, negative_pairs_df], ignore_index=True)

all_pairs_df

In [None]:
# 이력서, 공고 메타데이터와 병합
combined_features_df = all_pairs_df.merge(resume_data, on='resume_seq', how='left')
combined_features_df = combined_features_df.merge(recruitment_data, on='recruitment_seq', how='left')

In [None]:
# Order the pair table by resume_seq and renumber the rows from 0.
combined_features_df = (
    combined_features_df
    .sort_values(by='resume_seq')
    .reset_index(drop=True)
)
combined_features_df

In [None]:
# Split per resume: hold out one randomly chosen positive pair and one randomly
# chosen negative pair for validation; all remaining pairs go to training.
grouped = combined_features_df.groupby('resume_seq')

# A dedicated generator seeded from CFG makes this cell produce the same split
# every time it runs, regardless of how much of NumPy's global random stream
# earlier cells (or an earlier run of this cell) have consumed.
split_rng = np.random.RandomState(CFG['SEED'])

train_data = []
validation_data = []

# Iterate over each resume.
for resume_seq, group in grouped:
    # Positives (label 1) are handled before negatives (label 0), matching the
    # order in which rows were appended previously. The temporaries are named
    # label_rows / held_out so they do not overwrite the notebook-level
    # `negative_samples` list built in the negative-sampling cell.
    for label_value in (1, 0):
        label_rows = group[group['label'] == label_value]
        if label_rows.empty:
            continue
        held_out = label_rows.sample(n=1, random_state=split_rng)
        train_data.append(label_rows.drop(held_out.index))
        validation_data.append(held_out)

# Assemble the training and validation DataFrames.
train_df = pd.concat(train_data, ignore_index=True)
validation_df = pd.concat(validation_data, ignore_index=True)

# Check the result.
(train_df.shape, validation_df.shape)

In [None]:
# def convert_to_libsvm(df, label_column):
#     libsvm_data = []
    
#     # Iterate over each row and create a string in libSVM format
#     for index, row in df.iterrows():
#         # Start with the label
#         libsvm_row = [str(int(row[label_column]))]
        
#         # Add each feature:value pair
#         for i, value in enumerate(row.drop(label_column), start=1):
#             if value != 0:  # Only non-zero values need to be included in libSVM format
#                 libsvm_row.append(f"{i}:{value}")
        
#         # Join all items in the row with a space and add to the list
#         libsvm_data.append(' '.join(libsvm_row))
    
#     return libsvm_data

In [None]:
# libsvm_train_data = convert_to_libsvm(train_df, 'label')
# libsvm_validation_data = convert_to_libsvm(validation_df, 'label')
# libsvm_test_data = convert_to_libsvm(combined_features_df, 'label')

# # Save the libSVM data to a text file
# with open('train.txt', 'w') as f:
#     for item in libsvm_train_data:
#         f.write("%s\n" % item)

# with open('validation.txt', 'w') as f:
#     for item in libsvm_validation_data:
#         f.write("%s\n" % item)

# with open('test.txt', 'w') as f:
#     for item in libsvm_test_data:
#         f.write("%s\n" % item)

In [None]:
# Paths of the train / validation / test text files, relative to the working directory.
# NOTE(review): presumably the libSVM-format files written by the commented-out
# export cell - confirm they exist before training.
train_file_path, valid_file_path, test_file_path = (
    'train.txt',
    'validation.txt',
    'test.txt',
)

In [None]:
# # Read the converted libSVM data back from the file
# with open('validation.txt', 'r') as file:
#     lines = file.readlines()

# # Inspect each line to find malformed entries
# for line_number, line in enumerate(lines, start=1):
#     elements = line.strip().split(' ')
#     # Validate the label (first element)
#     try:
#         label = int(elements[0])
#     except ValueError:
#         print(f"Line {line_number} has invalid label: {elements[0]}")
#         continue
#     # Validate the features (remaining elements)
#     for element in elements[1:]:
#         try:
#             index, value = element.split(':')
#             index = int(index)  # if this fails, the print statement below runs
#             value = float(value)  # values may be real numbers, so try a float conversion
#         except ValueError:
#             print(f"Line {line_number} has invalid feature format: {element}")
#             continue

In [None]:
# Keyword arguments for the factorization machine, passed in the same order as before.
# NOTE(review): `FactorizationMachine` is not imported in any cell visible in this
# notebook, and train.txt / validation.txt are only written by commented-out
# cells - confirm both are available on a fresh kernel.
fm_params = {
    'k': 50,
    'lr': 0.001,
    'l2_reg': True,
    'l2_lambda': 0.2,
    'epoch': 200,
    'early_stop_window': 3,
    'train_data': train_file_path,
    'valid_data': valid_file_path,
}
fm = FactorizationMachine(**fm_params)

fm.train()