# pip & import

In [1]:
!pip install scikit-surprise



In [2]:
!pip install category_encoders



In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import category_encoders as ce
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.dataset import DatasetAutoFolds
import os
import matplotlib.pyplot as plt
import seaborn as sns

# 데이터 가공 및 변환

In [4]:
# Load the raw tables from CSV files in the working directory.
# The i-th name on the left receives the i-th file listed on the right.
(
    apply_train_df,
    resume_train_df,
    certificate_df,
    education_train_df,
    language_train_df,
    company_train_df,
    recruitment_train_df,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'apply_train.csv',
        'resume.csv',
        'resume_certificate.csv',
        'resume_education.csv',
        'resume_language.csv',
        'company.csv',
        'recruitment.csv',
    )
)

In [5]:
# Resume x recruitment matrix: each cell is the number of rows in
# apply_train_df for that pair; pairs that never occur become 0.
all_apply_matrix = (
    apply_train_df
    .groupby(['resume_seq', 'recruitment_seq'])
    .size()
    .unstack(fill_value=0)
)

In [6]:
# Number of applications each posting received, stored in a column named 'rating'.
count_by_recruitment_seq = (
    apply_train_df
    .groupby('recruitment_seq')['resume_seq']
    .count()
    .rename('rating')
    .reset_index()
)

# Display the resulting DataFrame.
count_by_recruitment_seq

Unnamed: 0,recruitment_seq,rating
0,R00001,11
1,R00002,6
2,R00003,7
3,R00004,15
4,R00005,5
...,...,...
6690,R06691,6
6691,R06692,11
6692,R06693,4
6693,R06694,7


In [7]:
# Drop certificate rows without contents, then collapse each resume's
# certificates into a single ';'-separated string.
certificate_df = certificate_df.dropna(subset=['certificate_contents'])
tmp = (
    certificate_df
    .groupby('resume_seq')['certificate_contents']
    .apply(';'.join)
    .reset_index()
)

# Resume-side table: resumes sorted by resume_seq, left-joined with the
# certificate strings, the education table and the application log.
total_user = (
    resume_train_df
    .sort_values(by='resume_seq')
    .merge(tmp, on='resume_seq', how='left')
    .merge(education_train_df, on='resume_seq', how='left')
    .merge(apply_train_df, on='resume_seq', how='left')
)

In [8]:
# Posting-side table: postings left-joined with company attributes and the
# per-posting application count ('rating').
total_item = (
    recruitment_train_df
    .merge(company_train_df, on='recruitment_seq', how='left')
    .merge(count_by_recruitment_seq, on='recruitment_seq', how='left')
)
total_item

Unnamed: 0,recruitment_seq,address_seq1,address_seq2,address_seq3,career_end,career_start,check_box_keyword,education,major_task,qualifications,text_keyword,company_type_seq,supply_kind,employee,rating
0,R02264,3.0,,,0,0,2507;2707;2810,4,8,1,,5.0,402.0,800.0,14
1,R06317,3.0,,,0,0,2204;2205;2707,3,2,1,,,,,7
2,R04017,3.0,,,0,0,2101;2108;2201;2707,3,2,1,,,,,3
3,R02865,3.0,,,0,0,2201;2204;2205;2707,2,2,1,,,,,5
4,R04890,3.0,,,0,0,2201;2204;2205;2707,2,2,2,,,,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6690,R03678,3.0,,,0,0,2101;2108;2201;2204;2205;2707,3,2,1,,,,,34
6691,R04593,3.0,,,0,0,2201;2204;2205;2707,4,2,1,,,,,12
6692,R03252,3.0,,,0,0,2109,3,2,1,,4.0,402.0,525.0,7
6693,R05130,3.0,,,0,0,2201;2204;2205;2707,2,2,2,,2.0,402.0,40.0,6


In [9]:
# Attach the posting attributes to every row of the resume-side table.
total = total_user.merge(total_item, on='recruitment_seq', how='left')
total

Unnamed: 0,resume_seq,reg_date,updated_date,degree,graduate_date,hope_salary,last_salary,text_keyword_x,job_code_seq1,job_code_seq2,...,career_start,check_box_keyword,education,major_task,qualifications,text_keyword_y,company_type_seq,supply_kind,employee,rating
0,U00001,2019-12-27,2020-02-01,4,2009,1500.0,1500.0,디자이너,재료·화학·섬유·의복,,...,0,2507,4,7,1,,,,,12
1,U00001,2019-12-27,2020-02-01,4,2009,1500.0,1500.0,디자이너,재료·화학·섬유·의복,,...,0,2507;2707,3,2,2,,,,,6
2,U00001,2019-12-27,2020-02-01,4,2009,1500.0,1500.0,디자이너,재료·화학·섬유·의복,,...,0,2507,4,7,1,,,,,8
3,U00001,2019-12-27,2020-02-01,4,2009,1500.0,1500.0,디자이너,재료·화학·섬유·의복,,...,0,2507;2707;2799,3,2,2,,,,,16
4,U00002,2020-04-24,2020-04-29,4,2005,0.0,0.0,디자이너,재료·화학·섬유·의복,,...,0,2707;2507,4,2,1,의류;의상;여성복,,,,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57941,U08481,2020-07-09,2020-11-23,4,2015,0.0,2700.0,상품기획;머천다이저;기획MD,재료·화학·섬유·의복,,...,0,2101;2108;2201;2707,3,2,2,,2.0,100.0,200.0,6
57942,U08481,2020-07-09,2020-11-23,4,2015,0.0,2700.0,상품기획;머천다이저;기획MD,재료·화학·섬유·의복,,...,0,2101;2108;2201;2707,2,2,2,,,,,7
57943,U08482,2020-11-29,2020-11-29,4,0,0.0,0.0,상품기획;영업MD;머천다이저;기획MD;마케팅;PR,재료·화학·섬유·의복,,...,0,2101;2108;2201;2707,3,2,2,,,,,13
57944,U08482,2020-11-29,2020-11-29,4,0,0.0,0.0,상품기획;영업MD;머천다이저;기획MD;마케팅;PR,재료·화학·섬유·의복,,...,0,2101;2108;2109;2201;2203;2204;2205;2302;2507;2707,2,2,2,,,,,19


In [10]:
# Per-resume mean of address_seq1 over that resume's rows in `total`,
# exposed as a column named 'location'.
average_address_seq1 = (
    total
    .groupby('resume_seq')['address_seq1']
    .mean()
    .rename('location')
    .reset_index()
)
average_address_seq1

Unnamed: 0,resume_seq,location
0,U00001,3.000000
1,U00002,3.000000
2,U00003,3.000000
3,U00004,3.117647
4,U00005,3.000000
...,...,...
8477,U08478,3.000000
8478,U08479,3.000000
8479,U08480,3.000000
8480,U08481,3.000000


In [11]:
# Add each resume's 'location' to every one of its rows.
# NOTE(review): re-running this cell merges 'location' a second time; run once per kernel.
total = total.merge(average_address_seq1, on='resume_seq', how='left')
total

Unnamed: 0,resume_seq,reg_date,updated_date,degree,graduate_date,hope_salary,last_salary,text_keyword_x,job_code_seq1,job_code_seq2,...,check_box_keyword,education,major_task,qualifications,text_keyword_y,company_type_seq,supply_kind,employee,rating,location
0,U00001,2019-12-27,2020-02-01,4,2009,1500.0,1500.0,디자이너,재료·화학·섬유·의복,,...,2507,4,7,1,,,,,12,3.0
1,U00001,2019-12-27,2020-02-01,4,2009,1500.0,1500.0,디자이너,재료·화학·섬유·의복,,...,2507;2707,3,2,2,,,,,6,3.0
2,U00001,2019-12-27,2020-02-01,4,2009,1500.0,1500.0,디자이너,재료·화학·섬유·의복,,...,2507,4,7,1,,,,,8,3.0
3,U00001,2019-12-27,2020-02-01,4,2009,1500.0,1500.0,디자이너,재료·화학·섬유·의복,,...,2507;2707;2799,3,2,2,,,,,16,3.0
4,U00002,2020-04-24,2020-04-29,4,2005,0.0,0.0,디자이너,재료·화학·섬유·의복,,...,2707;2507,4,2,1,의류;의상;여성복,,,,7,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57941,U08481,2020-07-09,2020-11-23,4,2015,0.0,2700.0,상품기획;머천다이저;기획MD,재료·화학·섬유·의복,,...,2101;2108;2201;2707,3,2,2,,2.0,100.0,200.0,6,3.0
57942,U08481,2020-07-09,2020-11-23,4,2015,0.0,2700.0,상품기획;머천다이저;기획MD,재료·화학·섬유·의복,,...,2101;2108;2201;2707,2,2,2,,,,,7,3.0
57943,U08482,2020-11-29,2020-11-29,4,0,0.0,0.0,상품기획;영업MD;머천다이저;기획MD;마케팅;PR,재료·화학·섬유·의복,,...,2101;2108;2201;2707,3,2,2,,,,,13,3.0
57944,U08482,2020-11-29,2020-11-29,4,0,0.0,0.0,상품기획;영업MD;머천다이저;기획MD;마케팅;PR,재료·화학·섬유·의복,,...,2101;2108;2109;2201;2203;2204;2205;2302;2507;2707,2,2,2,,,,,19,3.0


In [12]:
# Keep rows whose address_seq1 differs from the resume's 'location' by
# strictly less than 10 (rows with a missing value on either side are dropped).
total_train = total[(total['address_seq1'] - total['location']).abs() < 10]
total_train

Unnamed: 0,resume_seq,reg_date,updated_date,degree,graduate_date,hope_salary,last_salary,text_keyword_x,job_code_seq1,job_code_seq2,...,check_box_keyword,education,major_task,qualifications,text_keyword_y,company_type_seq,supply_kind,employee,rating,location
0,U00001,2019-12-27,2020-02-01,4,2009,1500.0,1500.0,디자이너,재료·화학·섬유·의복,,...,2507,4,7,1,,,,,12,3.0
1,U00001,2019-12-27,2020-02-01,4,2009,1500.0,1500.0,디자이너,재료·화학·섬유·의복,,...,2507;2707,3,2,2,,,,,6,3.0
2,U00001,2019-12-27,2020-02-01,4,2009,1500.0,1500.0,디자이너,재료·화학·섬유·의복,,...,2507,4,7,1,,,,,8,3.0
3,U00001,2019-12-27,2020-02-01,4,2009,1500.0,1500.0,디자이너,재료·화학·섬유·의복,,...,2507;2707;2799,3,2,2,,,,,16,3.0
4,U00002,2020-04-24,2020-04-29,4,2005,0.0,0.0,디자이너,재료·화학·섬유·의복,,...,2707;2507,4,2,1,의류;의상;여성복,,,,7,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57941,U08481,2020-07-09,2020-11-23,4,2015,0.0,2700.0,상품기획;머천다이저;기획MD,재료·화학·섬유·의복,,...,2101;2108;2201;2707,3,2,2,,2.0,100.0,200.0,6,3.0
57942,U08481,2020-07-09,2020-11-23,4,2015,0.0,2700.0,상품기획;머천다이저;기획MD,재료·화학·섬유·의복,,...,2101;2108;2201;2707,2,2,2,,,,,7,3.0
57943,U08482,2020-11-29,2020-11-29,4,0,0.0,0.0,상품기획;영업MD;머천다이저;기획MD;마케팅;PR,재료·화학·섬유·의복,,...,2101;2108;2201;2707,3,2,2,,,,,13,3.0
57944,U08482,2020-11-29,2020-11-29,4,0,0.0,0.0,상품기획;영업MD;머천다이저;기획MD;마케팅;PR,재료·화학·섬유·의복,,...,2101;2108;2109;2201;2203;2204;2205;2302;2507;2707,2,2,2,,,,,19,3.0


In [13]:
# Rows where the posting's required education does not exceed the resume's degree.
total_train2 = total[total['education'].le(total['degree'])]
# Rows where the required education is above the resume's degree.
total_train3 = total[total['education'].gt(total['degree'])]
# Of those, keep only resumes that have no row at all in total_train2.
not_in_total_train2 = total_train3.loc[
    ~total_train3['resume_seq'].isin(total_train2['resume_seq'])
]

In [14]:
# Stack the qualifying rows and the fallback rows into one frame with a fresh index.
merged_total = pd.concat(
    [total_train2, not_in_total_train2],
    ignore_index=True,
)
merged_total

Unnamed: 0,resume_seq,reg_date,updated_date,degree,graduate_date,hope_salary,last_salary,text_keyword_x,job_code_seq1,job_code_seq2,...,check_box_keyword,education,major_task,qualifications,text_keyword_y,company_type_seq,supply_kind,employee,rating,location
0,U00001,2019-12-27,2020-02-01,4,2009,1500.0,1500.0,디자이너,재료·화학·섬유·의복,,...,2507,4,7,1,,,,,12,3.0
1,U00001,2019-12-27,2020-02-01,4,2009,1500.0,1500.0,디자이너,재료·화학·섬유·의복,,...,2507;2707,3,2,2,,,,,6,3.0
2,U00001,2019-12-27,2020-02-01,4,2009,1500.0,1500.0,디자이너,재료·화학·섬유·의복,,...,2507,4,7,1,,,,,8,3.0
3,U00001,2019-12-27,2020-02-01,4,2009,1500.0,1500.0,디자이너,재료·화학·섬유·의복,,...,2507;2707;2799,3,2,2,,,,,16,3.0
4,U00002,2020-04-24,2020-04-29,4,2005,0.0,0.0,디자이너,재료·화학·섬유·의복,,...,2707;2507,4,2,1,의류;의상;여성복,,,,7,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56044,U08217,2020-03-29,2020-04-08,2,2007,0.0,3500.0,상품기획;MD;영업MD;마케팅,재료·화학·섬유·의복,,...,2101;2108;2201;2707,3,2,2,,,,,9,3.0
56045,U08217,2020-03-29,2020-04-08,2,2007,0.0,3500.0,상품기획;MD;영업MD;마케팅,재료·화학·섬유·의복,,...,2201;2204;2205;2707,4,2,2,,2.0,100.0,270.0,5,3.0
56046,U08348,2020-03-28,2020-06-05,3,2012,0.0,0.0,패션디자이너,재료·화학·섬유·의복,,...,2501;2507,4,2,1,,,,,6,3.0
56047,U08348,2020-03-28,2020-06-05,3,2012,0.0,0.0,패션디자이너,재료·화학·섬유·의복,,...,2507;2707,4,2,1,,2.0,100.0,270.0,6,3.0


# 함수 정의

In [15]:
def recall5(answer_df, submission_df):
    """Compute mean Recall@5 of a submission against the ground truth.

    The first column of ``answer_df`` is the key (one group per key) and the
    second column holds the true items; ``submission_df`` must use the same
    two column names.

    Args:
        answer_df: Ground-truth (key, item) pairs.
        submission_df: Predicted (key, item) pairs, exactly five rows per key.

    Returns:
        The mean, over answer keys that also appear in the submission, of
        ``hits / min(number of true items, 5)``.

    Raises:
        ValueError: If some key does not have exactly five predictions, a
            predicted item is null, or a key has duplicated predictions.
    """
    primary_col, secondary_col = answer_df.columns[0], answer_df.columns[1]

    # Every key in the submission must carry exactly five predictions.
    per_key_counts = submission_df.groupby(primary_col).size()
    if (per_key_counts != 5).any():
        raise ValueError(f"Each {primary_col} should have exactly 5 {secondary_col} predictions.")

    # Predicted items may not be null.
    if submission_df[secondary_col].isnull().any():
        raise ValueError(f"Predicted {secondary_col} contains NULL values.")

    # Predicted items may not repeat within a key.
    has_repeats = submission_df.groupby(primary_col)[secondary_col].apply(
        lambda items: items.duplicated().any()
    )
    if has_repeats.any():
        raise ValueError(f"Predicted {secondary_col} contains duplicates for some {primary_col}.")

    # Only keys that also exist in the answer are scored.
    scored = submission_df.loc[submission_df[primary_col].isin(answer_df[primary_col])]

    # key -> first five predicted items / key -> all true items
    top_5_preds = (
        scored.groupby(primary_col)[secondary_col]
        .apply(lambda items: items.head(5).tolist())
        .to_dict()
    )
    true_dict = (
        answer_df.groupby(primary_col)[secondary_col]
        .apply(lambda items: items.tolist())
        .to_dict()
    )

    # The denominator is min(len(truth), 5) so that keys with fewer than five
    # true items can still reach a recall of 1.
    individual_recalls = [
        len(set(truth) & set(top_5_preds[key])) / min(len(truth), 5)
        for key, truth in true_dict.items()
        if key in top_5_preds
    ]

    return np.mean(individual_recalls)

In [16]:
def minimum_condition(resume, company):
    """Check whether a resume meets a posting's education requirement.

    Args:
        resume: Mapping-like object with a 'degree' entry.
        company: Mapping-like object with an 'education' entry.

    Returns:
        False when ``resume['degree']`` is below ``company['education']``,
        True otherwise.
    """
    # A career-range check against career_start / career_end was left disabled
    # here; the author's note says those values are all 0.
    return not (resume['degree'] < company['education'])


In [17]:
def check_salary(salary):
    """Bucket a salary into a level from 0 to 3.

    Returns 3 above 3000, 2 above 2000, 1 above 1000, and 0 otherwise
    (all thresholds are exclusive).
    """
    for level, floor in ((3, 3000), (2, 2000), (1, 1000)):
        if salary > floor:
            return level
    return 0

In [18]:
def check_career_month(career_month):
    """Bucket career length in months into a level from 0 to 3.

    Returns 3 above 120, 2 above 60, 1 above 0, and 0 otherwise
    (all thresholds are exclusive).
    """
    for level, floor in ((3, 120), (2, 60), (1, 0)):
        if career_month > floor:
            return level
    return 0

In [19]:
def check_univ_score(univ_score):
    """Round a score down to its decade, limited to the range 0..90.

    Scores of 90 or more return 90; scores below 10 (and values that compare
    false against every threshold) return 0.
    """
    for floor in range(90, 0, -10):  # 90, 80, ..., 10
        if univ_score >= floor:
            return floor
    return 0

# apply_matrix

In [20]:
# (resume, recruitment) pairs: from the education-filtered frame and from the full frame.
apply_train_m = merged_total.loc[:, ['resume_seq', 'recruitment_seq']]
apply_pred_m = total.loc[:, ['resume_seq', 'recruitment_seq']]

In [21]:
# For each resume, the list of recruitment_seq values in row order.
apply_train_groupby = (
    apply_train_m
    .groupby('resume_seq')['recruitment_seq']
    .apply(list)
)
apply_pred_groupby = (
    apply_pred_m
    .groupby('resume_seq')['recruitment_seq']
    .apply(list)
)

In [22]:
#학습, 검증 분리
train, test = [], []
for uid, iids in zip(apply_train_groupby.index.tolist(), apply_train_groupby.values.tolist()):
    if(len(iids)>1):
      for iid in iids[:-1]:
        train.append([uid,iid])
      test.append([uid, iids[-1]])
    else:
        train.append([uid,iid])
        test.append([uid, iid])

In [23]:
# Turn the split pair lists into DataFrames with the shared two-column layout.
apply_train, apply_test = (
    pd.DataFrame(pairs, columns=['resume_seq', 'recruitment_seq'])
    for pairs in (train, test)
)
# NOTE(review): apply_pred is copied from apply_train_m (education-filtered pairs),
# not from apply_pred_m — confirm this is intended.
apply_pred = apply_train_m.copy()
# Number of training pairs per resume.
apply_train['resume_seq'].value_counts()

U06543    76
U07490    68
U06166    64
U06125    62
U06206    62
          ..
U04975     1
U07131     1
U06198     1
U01326     1
U00975     1
Name: resume_seq, Length: 8482, dtype: int64

# 거리 조건

In [24]:
# Distinct (resume, location) and (recruitment, address_seq1) rows taken from `total`.
resume_d = total[['resume_seq', 'location']].drop_duplicates()
recruitment_d = total[['recruitment_seq', 'address_seq1']].drop_duplicates()

In [25]:
# Index both lookup tables by their identifier column.
resume_d, recruitment_d = (
    resume_d.set_index('resume_seq'),
    recruitment_d.set_index('recruitment_seq'),
)

In [26]:
# Cast the averaged location to int (the fractional part is discarded),
# then show how many resumes fall on each value.
resume_d = resume_d.assign(location=resume_d['location'].astype(int))
resume_d['location'].value_counts()

3     8291
4      118
5       31
2       16
8        9
6        8
7        6
11       2
14       1
Name: location, dtype: int64

In [212]:
# Distance eligibility mask (resume x recruitment): True where
# |resume location - posting address_seq1| <= DISTANCE_LIMIT.
#
# Changes from the copy-pasted per-location loops:
#   * every location value gets its own row (location 14 used to fall into the
#     `else` branch and reuse the location-2 row);
#   * no hard-coded "representative" resume IDs are needed;
#   * rows and columns are aligned to all_apply_matrix by label instead of by
#     position, so the result does not depend on the row order of resume_d /
#     recruitment_d.
DISTANCE_LIMIT = 10

resume_location = (
    resume_d['location'].reindex(all_apply_matrix.index).to_numpy(dtype=float)
)
recruitment_address = (
    recruitment_d['address_seq1'].reindex(all_apply_matrix.columns).to_numpy(dtype=float)
)

# Build one mask row per distinct location, then fan the rows out to the
# resumes; this keeps the floating-point intermediate small.
distinct_locations, location_row = np.unique(resume_location, return_inverse=True)
mask_per_location = (
    np.abs(distinct_locations[:, None] - recruitment_address[None, :]) <= DISTANCE_LIMIT
)

# A missing location or address compares False, i.e. not eligible.
distance_matrix_new = pd.DataFrame(
    mask_per_location[location_row],
    index=all_apply_matrix.index,
    columns=all_apply_matrix.columns,
)
distance_matrix_new

recruitment_seq,R00001,R00002,R00003,R00004,R00005,R00006,R00007,R00008,R00009,R00010,...,R06686,R06687,R06688,R06689,R06690,R06691,R06692,R06693,R06694,R06695
resume_seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U00001,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
U00002,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
U00003,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
U00004,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
U00005,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U08478,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
U08479,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
U08480,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
U08481,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True


# 회사 최소 조건

In [28]:
# Work on copies so the raw resume / recruitment tables stay untouched.
resume_c, recruitment_c = resume_train_df.copy(), recruitment_train_df.copy()

In [29]:
# 'resume_seq' 열을 기준으로 정렬
resume_c = resume_c.sort_values(by='resume_seq')
# 인덱스를 순차적으로 변경
resume_c['resume_seq'] = [f'U{i:05}' for i in range(1, len(resume_c) + 1)]
resume_c = resume_c.set_index('resume_seq')

# 'resume_seq' 열을 기준으로 정렬
recruitment_c = recruitment_c.sort_values(by='recruitment_seq')
# 인덱스를 순차적으로 변경
recruitment_c['recruitment_seq'] = [f'R{i:05}' for i in range(1, len(recruitment_c) + 1)]
recruitment_c = recruitment_c.set_index('recruitment_seq')

In [30]:
# Only the columns compared by the education check are kept.
resume_c = resume_c.loc[:, ['degree']]
recruitment_c = recruitment_c.loc[:, ['education']]

In [164]:
# Education eligibility mask (resume x recruitment): False where the resume's
# degree is below the posting's required education, True otherwise.
#
# Changes from the copy-pasted per-degree loops:
#   * every resume is compared using its own degree (previously any degree
#     other than 3, 4, 5 or 6 was treated like the degree of U04220);
#   * no hard-coded "representative" resume IDs are needed;
#   * one broadcast comparison replaces the per-cell .iloc loops.
# Rows and columns are matched to all_apply_matrix by position, as before:
# resume_c and recruitment_c are sorted by their identifiers in the preceding cell.
resume_degree = resume_c['degree'].to_numpy()
required_education = recruitment_c['education'].to_numpy()

apply_item_matrix_new = pd.DataFrame(
    # Written as "not below" so that a missing value on either side stays True,
    # matching the original if/else.
    ~(resume_degree[:, None] < required_education[None, :]),
    index=all_apply_matrix.index,
    columns=all_apply_matrix.columns,
)
apply_item_matrix_new

recruitment_seq,R00001,R00002,R00003,R00004,R00005,R00006,R00007,R00008,R00009,R00010,...,R06686,R06687,R06688,R06689,R06690,R06691,R06692,R06693,R06694,R06695
resume_seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U00001,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
U00002,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
U00003,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
U00004,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
U00005,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U08478,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
U08479,True,True,True,True,True,False,True,True,True,True,...,True,True,True,True,True,True,True,False,False,True
U08480,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
U08481,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True


# 학위별 분류

In [32]:
# One (resume_seq, degree) row per resume: the first occurrence in `total`.
check_degree = total[['resume_seq', 'degree']].drop_duplicates(subset='resume_seq')

In [33]:
# Attach each resume's degree to the training pairs and to the prediction pairs.
# NOTE(review): re-running this cell merges 'degree' into apply_pred a second time.
apply_train_c = apply_train.merge(check_degree, on='resume_seq', how='left')
apply_pred = apply_pred.merge(check_degree, on='resume_seq', how='left')

In [34]:
# Aliases (same objects, not copies) used by the cells below.
apply_train_2, apply_pred_2 = apply_train, apply_pred

In [35]:
# 사용자-아이템 행렬 생성: 구직자가 해당 채용 공고에 지원했으면 1, 아니면 0으로 설정
apply_train_item_matrix_2 = apply_train_2.groupby(['resume_seq', 'recruitment_seq']).size().unstack(fill_value=0)

apply_train_similarity_2 = cosine_similarity(apply_train_item_matrix_2) # 사용자 간의 유사성 계산


apply_pred_item_matrix_2 = apply_pred_2.groupby(['resume_seq', 'recruitment_seq']).size().unstack(fill_value=0)

apply_pred_similarity_2 = cosine_similarity(apply_pred_item_matrix_2) # 사용자 간의 유사성 계산

# 이미 지원한거 모음

In [36]:
# train_사용자 간의 유사성 계산
user_train_similarity_2 = apply_train_similarity_2
user_predicted_scores_2 = user_train_similarity_2.dot(apply_train_item_matrix_2) / np.array([np.abs(user_train_similarity_2).sum(axis=1)]).T

item_train_similarity_2 = cosine_similarity(apply_train_item_matrix_2.T)
item_train_predicted_scores_2 = apply_train_item_matrix_2.dot(item_train_similarity_2)

sort_user_list_2 = []
applied_jobs_2 = []


for idx, user in enumerate(apply_train_item_matrix_2.index):
    # 해당 사용자가 지원한 채용 공고
    applied_job = apply_train[apply_train['resume_seq'] == user]['recruitment_seq'].values
    applied_jobs_2.append([user, applied_job])

# alpha

In [269]:
# Weight applied to the item-based scores before they are added to the
# user-based scores when ranking postings in the recommendation cells.
alpha = 0.98

# train 결과 도출

## 2학위 train 결과 도출

In [270]:
# 이미 지원한 채용 공고 제외하고 추천
recommendations_2 = []


for idx, user in enumerate(apply_train_item_matrix_2.index):
  re_applied_jobs = []
  # 해당 사용자의 추천 점수 (높은 점수부터 정렬)
  sorted_job_indices = (user_predicted_scores_2[idx] + item_train_predicted_scores_2.loc[user].values*alpha).argsort()[::-1]
  for idxx, j in enumerate(sorted_job_indices):
    if(len(re_applied_jobs) >= 5):
      break
    else:
      job = apply_train_item_matrix_2.columns[j]
      if (job not in applied_jobs_2[idx][1] and apply_item_matrix_new.loc[user].loc[job] and (job not in re_applied_jobs) and distance_matrix_new.loc[user].loc[job]):
        re_applied_jobs.append(job)
        recommendations_2.append([user, job])

In [271]:
# Validation recommendations as a two-column frame.
val_prediction_2 = pd.DataFrame(
    data=recommendations_2,
    columns=['resume_seq', 'recruitment_seq'],
)

# 예측 점수

In [272]:
# Alias (same object, not a copy) of the validation recommendations.
val_prediction = val_prediction_2

In [273]:
# Sort by resume_seq in place; val_prediction_2 refers to the same object,
# so it is reordered as well.
val_prediction.sort_values(by='resume_seq', inplace=True)

In [274]:
# Recall@5 of the validation recommendations against the held-out applications.
recall5(answer_df=apply_test, submission_df=val_prediction)

0.11353454373968404

# pred 이미 지원한 거

In [229]:
# train_사용자 간의 유사성 계산
user_similarity_2 = apply_pred_similarity_2
pred_user_predicted_scores_2 = apply_pred_similarity_2.dot(apply_pred_item_matrix_2) / np.array([np.abs(user_similarity_2).sum(axis=1)]).T

pred_item_train_similarity_2 = cosine_similarity(apply_pred_item_matrix_2.T)
pred_item_train_predicted_scores_2 = apply_pred_item_matrix_2.dot(pred_item_train_similarity_2)

sort_user_list_2 = []
applied_jobs_2 = []


for idx, user in enumerate(apply_pred_item_matrix_2.index):
    # 해당 사용자가 지원한 채용 공고
    applied_job = apply_train[apply_train['resume_seq'] == user]['recruitment_seq'].values
    applied_jobs_2.append([user, applied_job])

# pred 결과 도출

## 2학위 pred 결과 도출

In [275]:
# 이미 지원한 채용 공고 제외하고 추천
recommendations_2 = []


for idx, user in enumerate(apply_pred_item_matrix_2.index):
  re_applied_jobs = []
  # 해당 사용자의 추천 점수 (높은 점수부터 정렬)
  sorted_job_indices = (pred_user_predicted_scores_2[idx] + pred_item_train_predicted_scores_2.loc[user].values*alpha).argsort()[::-1]
  for idxx, j in enumerate(sorted_job_indices):
    if(len(re_applied_jobs) >= 5):
      break
    else:
      job = apply_pred_item_matrix_2.columns[j]
      if (job not in applied_jobs_2[idx][1] and apply_item_matrix_new.loc[user].loc[job] and (job not in re_applied_jobs) and distance_matrix_new.loc[user].loc[job]):
        re_applied_jobs.append(job)
        recommendations_2.append([user, job])

# 총 결과

In [276]:
# Final recommendations; both names refer to the same frame, which is then
# sorted by resume_seq in place.
top_recommendations = top_recommendations_2 = pd.DataFrame(
    data=recommendations_2,
    columns=['resume_seq', 'recruitment_seq'],
)
top_recommendations.sort_values(by='resume_seq', inplace=True)

In [277]:
# Write the submission file without the DataFrame index.
top_recommendations.to_csv(path_or_buf='./H_submit.csv', index=False)