In [1]:
import argparse
import pickle

import numpy as np
import pandas as pd
import os
import torch
import random
import dgl

import faiss
from sklearn.preprocessing import normalize

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy, PyTorch (CPU and all CUDA
    devices) and DGL, exports ``PYTHONHASHSEED``, and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: Integer seed applied to all libraries.
    """
    random.seed(seed)
    # Only affects hash randomization of subprocesses started after this call;
    # the current interpreter's hash seed was fixed at startup.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current device. The call is
    # silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick possibly different
    # (non-deterministic) kernels between runs, which defeats the flag above.
    torch.backends.cudnn.benchmark = False
    dgl.seed(seed)

In [None]:
# Build a submission file: for every resume in the sample submission, recommend
# the TOP_K nearest recruitment vectors (L2 distance) that the resume has not
# already applied to according to apply_train.

# Number of recommendations written per resume.
TOP_K = 5

# Load the apply_train data and collect the applied recruitment IDs per resume
apply_train_df = pd.read_csv('/path/to/apply_train.csv')
grouped_apply_train = apply_train_df.groupby('resume_seq')['recruitment_seq'].apply(list)

# Load the resume and recruitment vectors
resume_vector = np.load('/path/to/h_resume.npz')
recruitment_vector = np.load('/path/to/h_recruitment.npz')

h_resume = resume_vector['resume_vectors']
h_recruitment = recruitment_vector['recruitment_vectors']

# Create a FAISS index for the recruitment vectors.
# FAISS only accepts C-contiguous float32 input; the conversion is a no-op when
# the stored vectors already satisfy that, and h_recruitment itself is left as is.
index = faiss.IndexFlatL2(h_recruitment.shape[1])
index.add(np.ascontiguousarray(h_recruitment, dtype=np.float32))

# Load the sample_submission DataFrame
sample_submission_df = pd.read_csv('/path/to/sample_submission.csv')

# Prepare the submission DataFrame
submission_dict = {'resume_seq': [], 'recruitment_seq': []}

# Iterate over the resume sequences in the sample submission
for resume_id in sample_submission_df['resume_seq'].unique():
    # Assumes the ID follows the 'Uxxxxx' format and that row i of h_resume
    # belongs to resume number i + 1 -- TODO confirm against the vector export.
    query_vector = h_resume[int(resume_id[1:]) - 1]
    applied_jobs = grouped_apply_train.get(resume_id, [])
    applied_job_ids = set(applied_jobs)

    # Over-fetch by the number of applied jobs so TOP_K candidates remain even
    # if every applied job shows up among the nearest neighbours.
    query = np.ascontiguousarray(query_vector.reshape(1, -1), dtype=np.float32)
    _, neighbor_ids = index.search(query, TOP_K + len(applied_job_ids))

    # Filter out the applied jobs and prepare the recommendation in the required format.
    # The comparison is done on the formatted 'Rxxxxx' ID, because apply_train stores
    # recruitment_seq strings while FAISS returns integer row positions.
    recommended_jobs = []
    for idx in neighbor_ids[0]:
        if idx < 0:
            # FAISS pads with -1 when fewer than k neighbours exist.
            continue
        job_id = 'R' + str(idx + 1).zfill(5)
        if job_id in applied_job_ids:
            continue
        recommended_jobs.append(job_id)
        if len(recommended_jobs) == TOP_K:
            break

    # Add the recommendations to the submission_dict
    for job_id in recommended_jobs:
        submission_dict['resume_seq'].append(resume_id)
        submission_dict['recruitment_seq'].append(job_id)

# Create a new DataFrame for the submission
final_submission_df = pd.DataFrame(submission_dict)

# Save the final submission
final_submission_df.to_csv('/path/to/final_submission.csv', index=False)