In [1]:
import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from collections import defaultdict
from sklearn.decomposition import TruncatedSVD, NMF, SparsePCA
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def recall5(answer_df, submission_df): #

    primary_col = answer_df.columns[0]
    secondary_col = answer_df.columns[1]

    # submission의 예측이 각각 5개인
    prediction_counts = submission_df.groupby(primary_col).size()
    if not all(prediction_counts == 5):
        raise ValueError(f"Each {primary_col} should have exactly 5 {secondary_col} predictions.")


    # Check for NULL values in the predicted secondary_col
    if submission_df[secondary_col].isnull().any():
        raise ValueError(f"Predicted {secondary_col} contains NULL values.")

    # Check for duplicates in the predicted secondary_col for each primary_col
    duplicated_preds = submission_df.groupby(primary_col).apply(lambda x: x[secondary_col].duplicated().any())
    if duplicated_preds.any():
        raise ValueError(f"Predicted {secondary_col} contains duplicates for some {primary_col}.")


    # Filter the submission dataframe based on the primary_col present in the answer dataframe
    submission_df = submission_df[submission_df[primary_col].isin(answer_df[primary_col])]

    # For each primary_col, get the top 5 predicted secondary_col values
    top_5_preds = submission_df.groupby(primary_col).apply(lambda x: x[secondary_col].head(5).tolist()).to_dict()

    # Convert the answer_df to a dictionary for easier lookup
    true_dict = answer_df.groupby(primary_col).apply(lambda x: x[secondary_col].tolist()).to_dict()


    individual_recalls = []
    for key, val in true_dict.items():
        if key in top_5_preds:
            correct_matches = len(set(true_dict[key]) & set(top_5_preds[key]))
            individual_recall = correct_matches / min(len(val), 5) # 공정한 평가를 가능하게 위하여 분모(k)를 'min(len(val), 5)' 로 설정함
            individual_recalls.append(individual_recall)


    recall = np.mean(individual_recalls)
    return recall

In [3]:
apply_train = pd.read_csv('./apply_train.csv')

In [4]:
apply_train

Unnamed: 0,resume_seq,recruitment_seq
0,U05833,R03838
1,U06456,R02144
2,U07807,R01877
3,U04842,R02463
4,U08336,R00112
...,...,...
57941,U02270,R03430
57942,U02640,R04987
57943,U08238,R01342
57944,U01296,R06363


In [5]:
#학습, 검증 분리
train, val = [], []
apply_train_groupby = apply_train.groupby('resume_seq')['recruitment_seq'].apply(list)
for uid, iids in zip(apply_train_groupby.index.tolist(), apply_train_groupby.values.tolist()):
    for iid in iids[:-1]:
        train.append([uid,iid])
    val.append([uid, iids[-1]])

In [6]:
train = pd.DataFrame(train, columns=['resume_seq', 'recruitment_seq'])
val = pd.DataFrame(val, columns=['resume_seq', 'recruitment_seq'])
pred = apply_train.copy()

In [7]:
train_user_item_matrix = train.groupby(['resume_seq', 'recruitment_seq']).size().unstack(fill_value=0)
pred_user_item_matrix = pred.groupby(['resume_seq', 'recruitment_seq']).size().unstack(fill_value=0)

In [8]:
train_user_item_matrix2 = train_user_item_matrix
pred_user_item_matrix2 = pred_user_item_matrix

# 수정한곳

In [None]:
resume = pd.read_csv('./resume.csv')
resume = resume.sort_values(by='resume_seq')
resume

Unnamed: 0,resume_seq,reg_date,updated_date,degree,graduate_date,hope_salary,last_salary,text_keyword,job_code_seq1,job_code_seq2,job_code_seq3,career_month,career_job_code
1690,U00001,2019-12-27,2020-02-01,4,2009,1500.0,1500.0,디자이너,재료·화학·섬유·의복,,,0,
3979,U00002,2020-04-24,2020-04-29,4,2005,0.0,0.0,디자이너,재료·화학·섬유·의복,,,73,기타 공공;개인서비스
5648,U00003,2018-02-14,2020-07-08,4,2004,0.0,0.0,남성복디자이너;TD캐주얼,재료·화학·섬유·의복,,,113,섬유;봉제;가방;의류
1367,U00004,2017-10-26,2020-04-27,4,2011,0.0,0.0,상품기획;영업기획,재료·화학·섬유·의복,,,27,섬유;봉제;가방;의류
5242,U00005,2020-03-18,2020-04-08,4,2011,0.0,0.0,인사;총무;경영;MD;상품기획,재료·화학·섬유·의복,,,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3811,U08478,2020-02-05,2020-03-31,4,2003,0.0,2700.0,상품기획,재료·화학·섬유·의복,,,60,섬유;봉제;가방;의류
2227,U08479,2016-02-17,2020-07-15,3,0,2500.0,2300.0,디자이너,재료·화학·섬유·의복,,,59,디자인;CAD;설계
2657,U08480,2019-11-13,2020-06-24,4,0,0.0,0.0,VMD;텍스타일디자인;섬유디자인;니트디자인;여성복디자인,디자인,,,0,
6619,U08481,2020-07-09,2020-11-23,4,2015,0.0,2700.0,상품기획;머천다이저;기획MD,재료·화학·섬유·의복,,,8,섬유;봉제;가방;의류


In [None]:
resume.set_index('resume_seq', inplace=True)
resume

Unnamed: 0_level_0,reg_date,updated_date,degree,graduate_date,hope_salary,last_salary,text_keyword,job_code_seq1,job_code_seq2,job_code_seq3,career_month,career_job_code
resume_seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
U00001,2019-12-27,2020-02-01,4,2009,1500.0,1500.0,디자이너,재료·화학·섬유·의복,,,0,
U00002,2020-04-24,2020-04-29,4,2005,0.0,0.0,디자이너,재료·화학·섬유·의복,,,73,기타 공공;개인서비스
U00003,2018-02-14,2020-07-08,4,2004,0.0,0.0,남성복디자이너;TD캐주얼,재료·화학·섬유·의복,,,113,섬유;봉제;가방;의류
U00004,2017-10-26,2020-04-27,4,2011,0.0,0.0,상품기획;영업기획,재료·화학·섬유·의복,,,27,섬유;봉제;가방;의류
U00005,2020-03-18,2020-04-08,4,2011,0.0,0.0,인사;총무;경영;MD;상품기획,재료·화학·섬유·의복,,,0,
...,...,...,...,...,...,...,...,...,...,...,...,...
U08478,2020-02-05,2020-03-31,4,2003,0.0,2700.0,상품기획,재료·화학·섬유·의복,,,60,섬유;봉제;가방;의류
U08479,2016-02-17,2020-07-15,3,0,2500.0,2300.0,디자이너,재료·화학·섬유·의복,,,59,디자인;CAD;설계
U08480,2019-11-13,2020-06-24,4,0,0.0,0.0,VMD;텍스타일디자인;섬유디자인;니트디자인;여성복디자인,디자인,,,0,
U08481,2020-07-09,2020-11-23,4,2015,0.0,2700.0,상품기획;머천다이저;기획MD,재료·화학·섬유·의복,,,8,섬유;봉제;가방;의류


In [None]:
resume_certificate = pd.read_csv('./resume_certificate.csv')
resume_certificate = resume_certificate.sort_values(by='resume_seq')
resume_certificate

NameError: ignored

In [None]:
one_hot_encoded = pd.get_dummies(resume_certificate['certificate_contents'].str.split(expand=True).stack()).sum(level=0)
resume_certificate = pd.concat([resume_certificate, one_hot_encoded], axis=1).drop(columns='certificate_contents')
resume_certificate = resume_certificate.groupby('resume_seq').sum().reset_index()
resume_certificate

  one_hot_encoded = pd.get_dummies(resume_certificate['certificate_contents'].str.split(expand=True).stack()).sum(level=0)


Unnamed: 0,resume_seq,&,(,(1종,(2종,(CISCO,(EXPERT),(ITQ),(ITQ)인증시험,(ITQ)인증시험(아래한글,...,활용능력2급,활용능력3급,회계,회계관리1급,회계관리2급,회계관리2급(국가공인),회계실무사,회계실무사(23급),훈육지도자,휴넷
0,U00002,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,U00003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,U00004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,U00006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,U00007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5971,U08476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5972,U08479,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5973,U08480,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5974,U08481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
resume_certificate = resume_certificate.drop(['&', '('], axis=1)

In [None]:
resume_certificate.columns = resume_certificate.columns.str.replace(r'\(1종', '1종')
resume_certificate.columns = resume_certificate.columns.str.replace(r'\(2종', '2종')

  resume_certificate.columns = resume_certificate.columns.str.replace(r'\(1종', '1종')
  resume_certificate.columns = resume_certificate.columns.str.replace(r'\(2종', '2종')


In [None]:
merged_resume = pd.merge(resume, resume_certificate, on='resume_seq', how='left')

In [None]:
merged_resume = merged_resume.drop(['reg_date','updated_date', 'graduate_date', 'job_code_seq2', 'job_code_seq3','career_job_code','text_keyword'], axis=1)

In [None]:
one_hot_encoded = pd.get_dummies(merged_resume['job_code_seq1'].str.split(expand=True).stack()).sum(level=0)
merged_resume = pd.concat([merged_resume, one_hot_encoded], axis=1).drop(columns='job_code_seq1')
merged_resume = merged_resume.groupby('resume_seq').sum().reset_index()
merged_resume = merged_resume.fillna(0)
merged_resume

  one_hot_encoded = pd.get_dummies(merged_resume['job_code_seq1'].str.split(expand=True).stack()).sum(level=0)


Unnamed: 0,resume_seq,degree,hope_salary,last_salary,career_month,1종,2종,(CISCO,(EXPERT),(ITQ),...,IT·게임,건설·기계·전기·전자,경영·기획·회계·사무,기타,디자인,문화·예술·신문·방송,영업·판매·TM,재료·화학·섬유·의복,전문·교육·자격,직종
0,U00001,4,1500.0,1500.0,0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
1,U00002,4,0.0,0.0,73,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
2,U00003,4,0.0,0.0,113,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
3,U00004,4,0.0,0.0,27,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
4,U00005,4,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8477,U08478,4,0.0,2700.0,60,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
8478,U08479,3,2500.0,2300.0,59,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
8479,U08480,4,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
8480,U08481,4,0.0,2700.0,8,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0


In [None]:
train_user_item_matrix = pd.merge(train_user_item_matrix, merged_resume, on='resume_seq', how='left')
pred_user_item_matrix = pd.merge(pred_user_item_matrix, merged_resume, on='resume_seq', how='left')

In [None]:
pred_user_item_matrix

Unnamed: 0,resume_seq,R00001,R00002,R00003,R00004,R00005,R00006,R00007,R00008,R00009,...,IT·게임,건설·기계·전기·전자,경영·기획·회계·사무,기타,디자인,문화·예술·신문·방송,영업·판매·TM,재료·화학·섬유·의복,전문·교육·자격,직종
0,U00001,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,U00002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,U00003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,U00004,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,U00005,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8477,U08478,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
8478,U08479,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
8479,U08480,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8480,U08481,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [None]:
train_user_item_matrix.set_index('resume_seq', inplace=True)

In [None]:
pred_user_item_matrix.set_index('resume_seq', inplace=True)

In [None]:
pred_user_item_matrix

Unnamed: 0_level_0,R00001,R00002,R00003,R00004,R00005,R00006,R00007,R00008,R00009,R00010,...,IT·게임,건설·기계·전기·전자,경영·기획·회계·사무,기타,디자인,문화·예술·신문·방송,영업·판매·TM,재료·화학·섬유·의복,전문·교육·자격,직종
resume_seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U00001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
U00002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
U00003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
U00004,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
U00005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U08478,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
U08479,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
U08480,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
U08481,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


# cosine_similarity

In [9]:
train_user_similarity = cosine_similarity(train_user_item_matrix)
train_item_similarity = cosine_similarity(train_user_item_matrix.T)

pred_user_similarity = cosine_similarity(pred_user_item_matrix)
pred_item_similarity = cosine_similarity(pred_user_item_matrix.T)

In [16]:
train_user_predicted_scores = train_user_similarity.dot(train_user_item_matrix)
train_item_predicted_scores = train_user_item_matrix.dot(train_item_similarity)

pred_user_predicted_scores = pred_user_similarity.dot(pred_user_item_matrix)
pred_item_predicted_scores = pred_user_item_matrix.dot(pred_item_similarity)

In [17]:
train_user_predicted_scores.shape

(8482, 6691)

In [None]:
alpha = 0.42
train_recommendations = []

for idx, user in tqdm(enumerate(train_user_similarity.index)):
    applied_jobs = set(train_user_similarity.loc[user][train_user_similarity.loc[user] == 1].index)

    # 해당 사용자의 추천 점수 (높은 점수부터 정렬)
    sorted_job_indices = (train_user_similarity.loc[user].values * alpha + train_user_similarity[idx]).argsort()[::-1]
    recommended_jobs = [job for job in train_user_item_matrix.columns[sorted_job_indices] if job not in applied_jobs and job in target_columns][:5]

    for job in recommended_jobs:
        train_recommendations.append([user, job])

0it [00:00, ?it/s]

In [None]:
'''alpha = 0.42
train_recommendations = []
for idx, user in tqdm(enumerate(train_user_item_matrix.index)):
    applied_jobs = set(train_user_item_matrix.loc[user][train_user_item_matrix.loc[user] == 1].index)

    # 해당 사용자의 추천 점수 (높은 점수부터 정렬)
    sorted_job_indices = (train_item_predicted_scores.loc[user].values * alpha + train_user_predicted_scores[idx]).argsort()[::-1]
    recommended_jobs = [job for job in train_user_item_matrix.columns[sorted_job_indices] if job not in applied_jobs][:5]

    for job in recommended_jobs:
        train_recommendations.append([user, job])'''

'alpha = 0.42\ntrain_recommendations = []\nfor idx, user in tqdm(enumerate(train_user_item_matrix.index)):\n    applied_jobs = set(train_user_item_matrix.loc[user][train_user_item_matrix.loc[user] == 1].index)\n\n    # 해당 사용자의 추천 점수 (높은 점수부터 정렬)\n    sorted_job_indices = (train_item_predicted_scores.loc[user].values * alpha + train_user_predicted_scores[idx]).argsort()[::-1]\n    recommended_jobs = [job for job in train_user_item_matrix.columns[sorted_job_indices] if job not in applied_jobs][:5]\n\n    for job in recommended_jobs:\n        train_recommendations.append([user, job])'

In [None]:
val_prediction = pd.DataFrame(train_recommendations, columns=['resume_seq', 'recruitment_seq'])

# recall

In [None]:
val

Unnamed: 0,resume_seq,recruitment_seq
0,U00001,R04536
1,U00002,R01103
2,U00003,R03301
3,U00004,R05341
4,U00005,R00374
...,...,...
8477,U08478,R03939
8478,U08479,R02787
8479,U08480,R03152
8480,U08481,R06253


In [None]:
val_prediction

Unnamed: 0,resume_seq,recruitment_seq
0,U00001,R00944
1,U00001,R03939
2,U00001,R04200
3,U00001,R03510
4,U00001,R00773
...,...,...
42405,U08482,R05574
42406,U08482,R01331
42407,U08482,R01214
42408,U08482,R00815


In [None]:
recall5(val,val_prediction)

0.004362178731431266

In [None]:
alpha = 0.42
pred_recommendations = []

target_columns = [col for col in train_user_item_matrix.columns if col.startswith('R0')]

for idx, user in tqdm(enumerate(pred_user_item_matrix.index)):
    applied_jobs = set(pred_user_item_matrix.loc[user][pred_user_item_matrix.loc[user] == 1].index)

    sorted_job_indices = (pred_item_predicted_scores.loc[user].values * alpha + pred_user_predicted_scores[idx]).argsort()[::-1]
    recommended_jobs = [job for job in pred_user_item_matrix.columns[sorted_job_indices] if job not in applied_jobs and job in target_columns][:5]

    for job in recommended_jobs:
        pred_recommendations.append([user, job])

0it [00:00, ?it/s]

# save

In [None]:
top_recommendations = pd.DataFrame(pred_recommendations, columns=['resume_seq', 'recruitment_seq'])
top_recommendations.to_csv('./baseline_add_item_0.42.csv', index=False)

# 밑에서 부턴 연습장

## company

In [None]:
company = pd.read_csv('./company.csv')
company = company.sort_values(by='recruitment_seq')
company

Unnamed: 0,recruitment_seq,company_type_seq,supply_kind,employee
1103,R00001,5,201,631
2343,R00002,2,201,160
1986,R00004,2,402,500
352,R00009,2,402,41
1087,R00011,2,100,200
...,...,...,...,...
1439,R06670,5,402,300
1808,R06676,2,402,11
811,R06677,9,402,20
1449,R06688,2,402,90


In [None]:
company_train = pd.read_csv('./apply_train.csv')
company_train = company_train.sort_values(by='resume_seq')
company_train

Unnamed: 0,resume_seq,recruitment_seq
11989,U00001,R05210
342,U00001,R05288
51174,U00001,R04536
28858,U00001,R06065
50591,U00002,R01103
...,...,...
13950,U08481,R00225
22895,U08481,R05500
33843,U08482,R03633
54343,U08482,R03743


In [None]:
merged_company = pd.merge(company_train, company, on='recruitment_seq', how='left')
merged_company

Unnamed: 0,resume_seq,recruitment_seq,company_type_seq,supply_kind,employee
0,U00001,R05210,,,
1,U00001,R05288,,,
2,U00001,R04536,,,
3,U00001,R06065,,,
4,U00002,R01103,,,
...,...,...,...,...,...
57941,U08481,R00225,2.0,100.0,200.0
57942,U08481,R05500,2.0,100.0,200.0
57943,U08482,R03633,,,
57944,U08482,R03743,2.0,402.0,90.0


In [None]:
recruitment = pd.read_csv('./recruitment.csv')
recruitment = recruitment.sort_values(by='recruitment_seq')
recruitment

Unnamed: 0,recruitment_seq,address_seq1,address_seq2,address_seq3,career_end,career_start,check_box_keyword,education,major_task,qualifications,text_keyword
4573,R00001,5.0,,,0,0,2101;2108;2201;2204;2205;2707;2810,2,2,1,
5915,R00002,3.0,,,0,0,2507;2703;2707,3,2,1,
6454,R00003,3.0,,,0,0,2101;2108;2201;2707,3,2,2,
5905,R00004,3.0,,,0,0,2507;2707,3,2,1,
4197,R00005,3.0,,,0,0,2507;2707,3,2,1,
...,...,...,...,...,...,...,...,...,...,...,...
5983,R06691,3.0,,,0,0,2501;2507;2707,3,2,1,
2367,R06692,3.0,,,0,0,2201;2507,3,2,2,
6235,R06693,5.0,,,0,0,2102;2707,4,2,1,
3298,R06694,3.0,,,0,0,2101;2108;2109;2110;2201;2203;2707,4,2,1,


In [None]:
recruitment['check_box_keyword']

4573    2101;2108;2201;2204;2205;2707;2810
5915                        2507;2703;2707
6454                   2101;2108;2201;2707
5905                             2507;2707
4197                             2507;2707
                       ...                
5983                        2501;2507;2707
2367                             2201;2507
6235                             2102;2707
3298    2101;2108;2109;2110;2201;2203;2707
2075                             2507;2707
Name: check_box_keyword, Length: 6695, dtype: object

In [None]:
codes_list = recruitment['check_box_keyword'].str.split(';')

In [None]:
codes_list

4573    [2101, 2108, 2201, 2204, 2205, 2707, 2810]
5915                            [2507, 2703, 2707]
6454                      [2101, 2108, 2201, 2707]
5905                                  [2507, 2707]
4197                                  [2507, 2707]
                           ...                    
5983                            [2501, 2507, 2707]
2367                                  [2201, 2507]
6235                                  [2102, 2707]
3298    [2101, 2108, 2109, 2110, 2201, 2203, 2707]
2075                                  [2507, 2707]
Name: check_box_keyword, Length: 6695, dtype: object

In [None]:
codes_list = [[int(code) for code in codes] for codes in codes_list]

In [None]:
recruitment['check_box_keyword'] = codes_list

In [None]:
recruitment['check_box_keyword']

4573    [[2101, 2108, 2201, 2204, 2205, 2707, 2810]]
5915                            [[2507, 2703, 2707]]
6454                      [[2101, 2108, 2201, 2707]]
5905                                  [[2507, 2707]]
4197                                  [[2507, 2707]]
                            ...                     
5983                            [[2501, 2507, 2707]]
2367                                  [[2201, 2507]]
6235                                  [[2102, 2707]]
3298    [[2101, 2108, 2109, 2110, 2201, 2203, 2707]]
2075                                  [[2507, 2707]]
Name: check_box_keyword, Length: 6695, dtype: object

In [None]:
recruitment['check_box_keyword'][0][0]

'[2507, 2707, 2810]'

In [None]:
recruitment['check_box_keyword'][0].max()

AttributeError: ignored

In [None]:
recruitment['check_box_keyword'] = recruitment['check_box_keyword'].astype(str)

In [None]:
import re

# check_box_keyword에서 숫자만 추출하여 리스트로 만들기
numbers = re.findall(r'\d+', str(codes_list[1])

# 숫자들 중에서 가장 큰 값과 가장 작은 값을 찾기
max_number = max(map(int, numbers))
min_number = min(map(int, numbers))

print(f"가장 큰 수: {max_number}")
print(f"가장 작은 수: {min_number}")

TypeError: ignored

In [None]:
merged_recruitment = pd.merge(recruitment, company, on='recruitment_seq', how='left')
merged_recruitment

## resume

In [None]:
resume = pd.read_csv('./resume.csv')
resume = resume.sort_values(by='resume_seq')
resume

Unnamed: 0,resume_seq,reg_date,updated_date,degree,graduate_date,hope_salary,last_salary,text_keyword,job_code_seq1,job_code_seq2,job_code_seq3,career_month,career_job_code
1690,U00001,2019-12-27,2020-02-01,4,2009,1500.0,1500.0,디자이너,재료·화학·섬유·의복,,,0,
3979,U00002,2020-04-24,2020-04-29,4,2005,0.0,0.0,디자이너,재료·화학·섬유·의복,,,73,기타 공공;개인서비스
5648,U00003,2018-02-14,2020-07-08,4,2004,0.0,0.0,남성복디자이너;TD캐주얼,재료·화학·섬유·의복,,,113,섬유;봉제;가방;의류
1367,U00004,2017-10-26,2020-04-27,4,2011,0.0,0.0,상품기획;영업기획,재료·화학·섬유·의복,,,27,섬유;봉제;가방;의류
5242,U00005,2020-03-18,2020-04-08,4,2011,0.0,0.0,인사;총무;경영;MD;상품기획,재료·화학·섬유·의복,,,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3811,U08478,2020-02-05,2020-03-31,4,2003,0.0,2700.0,상품기획,재료·화학·섬유·의복,,,60,섬유;봉제;가방;의류
2227,U08479,2016-02-17,2020-07-15,3,0,2500.0,2300.0,디자이너,재료·화학·섬유·의복,,,59,디자인;CAD;설계
2657,U08480,2019-11-13,2020-06-24,4,0,0.0,0.0,VMD;텍스타일디자인;섬유디자인;니트디자인;여성복디자인,디자인,,,0,
6619,U08481,2020-07-09,2020-11-23,4,2015,0.0,2700.0,상품기획;머천다이저;기획MD,재료·화학·섬유·의복,,,8,섬유;봉제;가방;의류


In [None]:
resume_certificate = pd.read_csv('./resume_certificate.csv')
resume_certificate = resume_certificate.sort_values(by='resume_seq')
resume_certificate

Unnamed: 0,resume_seq,certificate_contents
6855,U00002,자동차 운전면허 (2종 보통)
6458,U00003,운전면허2종보통
4424,U00003,컬러리스트 기사
7553,U00003,워드프로세서 1급
1509,U00004,정보처리기능사
...,...,...
6980,U08480,컬러리스트기사
7043,U08480,컴퓨터패션디자인운용마스터
335,U08481,컴퓨터활용능력 2급
7386,U08482,컴퓨터그래픽스운용기능사


In [None]:
one_hot_encoded = pd.get_dummies(resume_certificate['certificate_contents'].str.split(expand=True).stack()).sum(level=0)
resume_certificate = pd.concat([resume_certificate, one_hot_encoded], axis=1).drop(columns='certificate_contents')
resume_certificate = resume_certificate.groupby('resume_seq').sum().reset_index()
resume_certificate

  one_hot_encoded = pd.get_dummies(resume_certificate['certificate_contents'].str.split(expand=True).stack()).sum(level=0)


Unnamed: 0,resume_seq,&,(,(1종,(2종,(CISCO,(EXPERT),(ITQ),(ITQ)인증시험,(ITQ)인증시험(아래한글,...,활용능력2급,활용능력3급,회계,회계관리1급,회계관리2급,회계관리2급(국가공인),회계실무사,회계실무사(23급),훈육지도자,휴넷
0,U00002,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,U00003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,U00004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,U00006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,U00007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5971,U08476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5972,U08479,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5973,U08480,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5974,U08481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
resume_certificate.columns

Index(['resume_seq', '&', '(', '(1종', '(2종', '(CISCO', '(EXPERT)', '(ITQ)',
       '(ITQ)인증시험', '(ITQ)인증시험(아래한글',
       ...
       '활용능력2급', '활용능력3급', '회계', '회계관리1급', '회계관리2급', '회계관리2급(국가공인)', '회계실무사',
       '회계실무사(23급)', '훈육지도자', '휴넷'],
      dtype='object', length=1715)

In [None]:
resume_certificate = resume_certificate.drop(['&', '('], axis=1)
resume_certificate

Unnamed: 0,resume_seq,(1종,(2종,(CISCO,(EXPERT),(ITQ),(ITQ)인증시험,(ITQ)인증시험(아래한글,(JLPT),(M7),...,활용능력2급,활용능력3급,회계,회계관리1급,회계관리2급,회계관리2급(국가공인),회계실무사,회계실무사(23급),훈육지도자,휴넷
0,U00002,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,U00003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,U00004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,U00006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,U00007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5971,U08476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5972,U08479,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5973,U08480,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5974,U08481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
resume_certificate.columns = resume_certificate.columns.str.replace(r'\(1종', '1종')
resume_certificate.columns = resume_certificate.columns.str.replace(r'\(2종', '2종')
resume_certificate

  resume_certificate.columns = resume_certificate.columns.str.replace(r'\(1종', '1종')
  resume_certificate.columns = resume_certificate.columns.str.replace(r'\(2종', '2종')


Unnamed: 0,resume_seq,1종,2종,(CISCO,(EXPERT),(ITQ),(ITQ)인증시험,(ITQ)인증시험(아래한글,(JLPT),(M7),...,활용능력2급,활용능력3급,회계,회계관리1급,회계관리2급,회계관리2급(국가공인),회계실무사,회계실무사(23급),훈육지도자,휴넷
0,U00002,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,U00003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,U00004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,U00006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,U00007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5971,U08476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5972,U08479,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5973,U08480,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5974,U08481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
'''resume_certificate = resume_certificate.dropna(subset=['certificate_contents'])
resume_certificate = resume_certificate.groupby('resume_seq')['certificate_contents'].apply(', '.join).reset_index()
resume_certificate'''

In [None]:
merged_resume = pd.merge(resume, resume_certificate, on='resume_seq', how='left')
merged_resume

Unnamed: 0,resume_seq,reg_date,updated_date,degree,graduate_date,hope_salary,last_salary,text_keyword,job_code_seq1,job_code_seq2,...,활용능력2급,활용능력3급,회계,회계관리1급,회계관리2급,회계관리2급(국가공인),회계실무사,회계실무사(23급),훈육지도자,휴넷
0,U00001,2019-12-27,2020-02-01,4,2009,1500.0,1500.0,디자이너,재료·화학·섬유·의복,,...,,,,,,,,,,
1,U00002,2020-04-24,2020-04-29,4,2005,0.0,0.0,디자이너,재료·화학·섬유·의복,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,U00003,2018-02-14,2020-07-08,4,2004,0.0,0.0,남성복디자이너;TD캐주얼,재료·화학·섬유·의복,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,U00004,2017-10-26,2020-04-27,4,2011,0.0,0.0,상품기획;영업기획,재료·화학·섬유·의복,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,U00005,2020-03-18,2020-04-08,4,2011,0.0,0.0,인사;총무;경영;MD;상품기획,재료·화학·섬유·의복,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8477,U08478,2020-02-05,2020-03-31,4,2003,0.0,2700.0,상품기획,재료·화학·섬유·의복,,...,,,,,,,,,,
8478,U08479,2016-02-17,2020-07-15,3,0,2500.0,2300.0,디자이너,재료·화학·섬유·의복,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8479,U08480,2019-11-13,2020-06-24,4,0,0.0,0.0,VMD;텍스타일디자인;섬유디자인;니트디자인;여성복디자인,디자인,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8480,U08481,2020-07-09,2020-11-23,4,2015,0.0,2700.0,상품기획;머천다이저;기획MD,재료·화학·섬유·의복,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
merged_resume = merged_resume.drop(['reg_date','updated_date', 'graduate_date', 'job_code_seq2', 'job_code_seq3','career_job_code'], axis=1)
merged_resume

Unnamed: 0,resume_seq,degree,hope_salary,last_salary,text_keyword,job_code_seq1,career_month,1종,2종,(CISCO,...,활용능력2급,활용능력3급,회계,회계관리1급,회계관리2급,회계관리2급(국가공인),회계실무사,회계실무사(23급),훈육지도자,휴넷
0,U00001,4,1500.0,1500.0,디자이너,재료·화학·섬유·의복,0,,,,...,,,,,,,,,,
1,U00002,4,0.0,0.0,디자이너,재료·화학·섬유·의복,73,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,U00003,4,0.0,0.0,남성복디자이너;TD캐주얼,재료·화학·섬유·의복,113,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,U00004,4,0.0,0.0,상품기획;영업기획,재료·화학·섬유·의복,27,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,U00005,4,0.0,0.0,인사;총무;경영;MD;상품기획,재료·화학·섬유·의복,0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8477,U08478,4,0.0,2700.0,상품기획,재료·화학·섬유·의복,60,,,,...,,,,,,,,,,
8478,U08479,3,2500.0,2300.0,디자이너,재료·화학·섬유·의복,59,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8479,U08480,4,0.0,0.0,VMD;텍스타일디자인;섬유디자인;니트디자인;여성복디자인,디자인,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8480,U08481,4,0.0,2700.0,상품기획;머천다이저;기획MD,재료·화학·섬유·의복,8,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
merged_resume = merged_resume.drop(['text_keyword'], axis=1)
merged_resume

Unnamed: 0,resume_seq,degree,hope_salary,last_salary,job_code_seq1,career_month,1종,2종,(CISCO,(EXPERT),...,활용능력2급,활용능력3급,회계,회계관리1급,회계관리2급,회계관리2급(국가공인),회계실무사,회계실무사(23급),훈육지도자,휴넷
0,U00001,4,1500.0,1500.0,재료·화학·섬유·의복,0,,,,,...,,,,,,,,,,
1,U00002,4,0.0,0.0,재료·화학·섬유·의복,73,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,U00003,4,0.0,0.0,재료·화학·섬유·의복,113,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,U00004,4,0.0,0.0,재료·화학·섬유·의복,27,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,U00005,4,0.0,0.0,재료·화학·섬유·의복,0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8477,U08478,4,0.0,2700.0,재료·화학·섬유·의복,60,,,,,...,,,,,,,,,,
8478,U08479,3,2500.0,2300.0,재료·화학·섬유·의복,59,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8479,U08480,4,0.0,0.0,디자인,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8480,U08481,4,0.0,2700.0,재료·화학·섬유·의복,8,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
one_hot_encoded = pd.get_dummies(merged_resume['job_code_seq1'].str.split(expand=True).stack()).sum(level=0)
merged_resume = pd.concat([merged_resume, one_hot_encoded], axis=1).drop(columns='job_code_seq1')
merged_resume = merged_resume.groupby('resume_seq').sum().reset_index()
merged_resume = merged_resume.fillna(0)
merged_resume

  one_hot_encoded = pd.get_dummies(merged_resume['job_code_seq1'].str.split(expand=True).stack()).sum(level=0)


Unnamed: 0,resume_seq,degree,hope_salary,last_salary,career_month,1종,2종,(CISCO,(EXPERT),(ITQ),...,IT·게임,건설·기계·전기·전자,경영·기획·회계·사무,기타,디자인,문화·예술·신문·방송,영업·판매·TM,재료·화학·섬유·의복,전문·교육·자격,직종
0,U00001,4,1500.0,1500.0,0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
1,U00002,4,0.0,0.0,73,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
2,U00003,4,0.0,0.0,113,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
3,U00004,4,0.0,0.0,27,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
4,U00005,4,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8477,U08478,4,0.0,2700.0,60,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
8478,U08479,3,2500.0,2300.0,59,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
8479,U08480,4,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
8480,U08481,4,0.0,2700.0,8,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0


In [None]:
merged_resume = merged_resume.fillna(0)
merged_resume

Unnamed: 0,resume_seq,degree,hope_salary,last_salary,career_month,1종,2종,(CISCO,(EXPERT),(ITQ),...,IT·게임,건설·기계·전기·전자,경영·기획·회계·사무,기타,디자인,문화·예술·신문·방송,영업·판매·TM,재료·화학·섬유·의복,전문·교육·자격,직종
0,U00001,4,1500.0,1500.0,0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
1,U00002,4,0.0,0.0,73,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
2,U00003,4,0.0,0.0,113,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
3,U00004,4,0.0,0.0,27,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
4,U00005,4,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8477,U08478,4,0.0,2700.0,60,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
8478,U08479,3,2500.0,2300.0,59,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
8479,U08480,4,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
8480,U08481,4,0.0,2700.0,8,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0


## resume_education.csv

In [None]:
resume_education = pd.read_csv('./resume_education.csv')
resume_education = resume_education.sort_values(by='resume_seq')
resume_education

Unnamed: 0,resume_seq,hischool_type_seq,hischool_special_type,hischool_nation,hischool_gender,hischool_location_seq,univ_type_seq1,univ_type_seq2,univ_transfer,univ_location,univ_major,univ_sub_major,univ_major_type,univ_score
5714,U00001,21,일반고,공립,여자학교,4,5,5,0,17,,,9,20.0
3375,U00002,0,기타,기타,기타,0,5,5,0,3,,,8,90.0
201,U00003,21,일반고,공립,남여공학,6,5,5,0,17,,,4,90.0
586,U00004,21,일반고,공립,남여공학,5,5,5,0,5,,,3,70.0
7970,U00005,21,일반고,공립,여자학교,5,0,0,0,0,,,10,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6923,U08478,21,일반고,공립,남여공학,3,5,5,0,5,,,19,0.0
5091,U08479,21,자율고,사립,남여공학,3,3,6,0,3,가정과,,9,0.0
2775,U08480,0,기타,기타,기타,0,5,5,0,10,미술대학섬유패션코디네이션학과,,9,80.0
334,U08481,21,자율고,공립,여자학교,5,5,5,0,5,,,9,60.0


## resume_language.csv

In [None]:
resume_language = pd.read_csv('./resume_language.csv')
resume_language = resume_language.sort_values(by='resume_seq')
resume_language

In [None]:
merged_resume = pd.merge(merged_resume, resume_education, on='resume_seq', how='left')
merged_resume

In [None]:
merged_resume = pd.merge(merged_resume, resume_language, on='resume_seq', how='left')
merged_resume

In [None]:
merged_resume = merged_resume.fillna(0)

In [None]:
merged_resume