<font color="#CC3D3D"><p>
# Data Preprocessing for Side Information

<font color="blue"><p>
#### 데이터 전처리 절차
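1. 수치형 피처
 - 결측값처리: SimpleImputer(strategy="median")
 - 이상값처리: FunctionTransformer(remove_outlier) # 현재 파이프라인에서는 주석 처리(비활성)
 - 스케일링:   MinMaxScaler()
2. 범주형 피처
 - 결측값처리: SimpleImputer(strategy="most_frequent")
 - 인코딩:     OneHotEncoder(handle_unknown="ignore")
 - 차원축소:   MyPCATransformer() # Custom PCA, 현재 비활성
 - 스케일링:   MinMaxScaler() # 현재 비활성
3. 공통
 - 피처제거:   FunctionTransformer(drop_features) # 현재 비활성
4. 텍스트(키워드) 피처
 - 말뭉치 변환: FunctionTransformer() # 구분자를 공백으로 정리
 - 벡터화:     CountVectorizer() # Bag-of-Words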

## Imports

In [2]:
%matplotlib inline
import gc
import pickle
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

## Data Loading & Merging

#### User Features

In [3]:
# Base user table; the other resume_* files are left-joined onto it by
# resume_seq in a later cell.
resume = pd.read_csv('data/resume.csv')
resume

Unnamed: 0,resume_seq,reg_date,updated_date,degree,graduate_date,hope_salary,last_salary,text_keyword,job_code_seq1,job_code_seq2,job_code_seq3,career_month,career_job_code
0,U00606,2020-03-04,2020-05-22,4,2008,3500.0,3500.0,스타일디자이너;우븐디자이너,재료·화학·섬유·의복,,,67,
1,U00509,2019-08-25,2020-09-02,2,0,0.0,3700.0,상품기획;MD;기획;머천다이저;머천다이징,재료·화학·섬유·의복,,,84,섬유;봉제;가방;의류
2,U02012,2017-11-20,2020-01-26,5,1979,3500.0,3100.0,니트디자인,재료·화학·섬유·의복,,,121,학교;학원;직원훈련(교육서비스)
3,U04599,2020-05-13,2020-05-28,4,2012,0.0,2500.0,MD;기획MD,재료·화학·섬유·의복,,,24,섬유;봉제;가방;의류
4,U07573,2019-07-23,2020-03-08,4,2010,1900.0,0.0,디자이너;남성복;스포츠웨어;편집디자인;코디네이터;일러스트레이터;VMD;MD,재료·화학·섬유·의복,,,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8477,U04890,2018-05-27,2020-08-26,5,1994,0.0,6250.0,상품기획;생산;글로벌소싱;남성복;여성복,재료·화학·섬유·의복,,,204,섬유;봉제;가방;의류
8478,U05910,2019-09-27,2020-10-22,4,2013,2300.0,1900.0,소재;기획MD;남성복;온라인MD;디자이너,재료·화학·섬유·의복,,,12,섬유;봉제;가방;의류
8479,U01468,2019-02-26,2020-06-23,4,2008,0.0,0.0,상품기획MD;우븐디자인,디자인,,,0,섬유;봉제;가방;의류
8480,U05315,2020-03-27,2020-12-21,4,2016,0.0,0.0,여성복디자인;우븐디자인;다이마루디자인,재료·화학·섬유·의복,,,18,디자인;CAD;설계


In [4]:
# Certificates per resume. drop_duplicates keeps only the first row found for
# each resume_seq, so any additional certificates of the same resume are dropped.
resume_cer = pd.read_csv('data/resume_certificate.csv')
resume_cer = resume_cer.drop_duplicates('resume_seq')
resume_cer

Unnamed: 0,resume_seq,certificate_contents
0,U06421,손해보험사
1,U01278,2종운전면허증
2,U01008,자동차운전면허 2종 보통
3,U03248,운전면허2종보통
4,U07483,메이크업3급
...,...,...
12942,U00966,자동차운전면허 1종 보통
12944,U04399,운전면허2종보통
12962,U07008,운전면허 1종 보통
12965,U06294,금융자산관리사(FP)


In [5]:
# Language exam records per resume. Only the first row per resume_seq is kept,
# so further exam records of the same resume are dropped.
resume_lan = pd.read_csv('data/resume_language.csv')
resume_lan = resume_lan.drop_duplicates('resume_seq')
resume_lan

Unnamed: 0,resume_seq,language,exam_name,score
0,U01774,2,4,742.42
1,U04892,2,4,888.89
2,U01859,2,4,500.00
3,U02475,2,4,500.00
4,U03776,4,5,595.96
...,...,...,...,...
864,U04964,2,11,400.00
865,U02950,2,4,843.43
866,U05922,2,2,500.00
867,U03827,2,4,883.84


In [6]:
# Education records per resume, reduced to the first row per resume_seq.
resume_edu = pd.read_csv('data/resume_education.csv')
resume_edu = resume_edu.drop_duplicates('resume_seq')
resume_edu

Unnamed: 0,resume_seq,hischool_type_seq,hischool_special_type,hischool_nation,hischool_gender,hischool_location_seq,univ_type_seq1,univ_type_seq2,univ_transfer,univ_location,univ_major,univ_sub_major,univ_major_type,univ_score
0,U01419,21,일반고,사립,남자학교,3,5,5,0,3,,,9,60.0
1,U03375,21,일반고,사립,여자학교,3,5,5,0,3,,,4,80.0
2,U06523,21,일반고,사립,남여공학,3,5,5,0,3,,,8,70.0
3,U06619,21,일반고,사립,남여공학,5,5,5,0,5,,,8,80.0
4,U05015,16,특성화고,공립,남여공학,3,5,5,0,3,,,9,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8477,U06326,21,일반고,사립,여자학교,9,5,5,0,7,섬우패션학부의류패션전공,,10,70.0
8478,U07284,0,기타,기타,기타,0,5,5,0,11,,,3,70.0
8479,U04109,21,일반고,공립,남여공학,5,0,0,0,0,,,9,80.0
8480,U01863,21,일반고,사립,여자학교,15,3,5,0,15,의류학과생활과학계열,,9,80.0


In [7]:
# Merge all user (resume) data. Many users have no entry in the auxiliary files,
# so everything is left-joined onto the ids of resume.csv; users without a match
# get NaN in the columns coming from that file.
user_features = (resume
                 .merge(resume_cer, on='resume_seq', how='left')
                 .merge(resume_lan, on='resume_seq', how='left')
                 .merge(resume_edu, on='resume_seq', how='left')
                )
user_features

Unnamed: 0,resume_seq,reg_date,updated_date,degree,graduate_date,hope_salary,last_salary,text_keyword,job_code_seq1,job_code_seq2,...,hischool_gender,hischool_location_seq,univ_type_seq1,univ_type_seq2,univ_transfer,univ_location,univ_major,univ_sub_major,univ_major_type,univ_score
0,U00606,2020-03-04,2020-05-22,4,2008,3500.0,3500.0,스타일디자이너;우븐디자이너,재료·화학·섬유·의복,,...,남여공학,11,5,5,0,11,,,4,70.0
1,U00509,2019-08-25,2020-09-02,2,0,0.0,3700.0,상품기획;MD;기획;머천다이저;머천다이징,재료·화학·섬유·의복,,...,기타,0,5,5,0,5,,,4,90.0
2,U02012,2017-11-20,2020-01-26,5,1979,3500.0,3100.0,니트디자인,재료·화학·섬유·의복,,...,기타,0,5,5,0,3,,,9,90.0
3,U04599,2020-05-13,2020-05-28,4,2012,0.0,2500.0,MD;기획MD,재료·화학·섬유·의복,,...,여자학교,7,3,5,0,11,,,19,80.0
4,U07573,2019-07-23,2020-03-08,4,2010,1900.0,0.0,디자이너;남성복;스포츠웨어;편집디자인;코디네이터;일러스트레이터;VMD;MD,재료·화학·섬유·의복,,...,여자학교,4,5,5,0,13,,,19,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8477,U04890,2018-05-27,2020-08-26,5,1994,0.0,6250.0,상품기획;생산;글로벌소싱;남성복;여성복,재료·화학·섬유·의복,,...,남자학교,3,5,5,0,3,,,3,70.0
8478,U05910,2019-09-27,2020-10-22,4,2013,2300.0,1900.0,소재;기획MD;남성복;온라인MD;디자이너,재료·화학·섬유·의복,,...,여자학교,15,5,5,0,15,,,19,80.0
8479,U01468,2019-02-26,2020-06-23,4,2008,0.0,0.0,상품기획MD;우븐디자인,디자인,,...,기타,0,5,5,0,18,,,9,80.0
8480,U05315,2020-03-27,2020-12-21,4,2016,0.0,0.0,여성복디자인;우븐디자인;다이마루디자인,재료·화학·섬유·의복,,...,남여공학,5,5,5,0,3,,,9,80.0


#### Item Features

In [9]:
# Base item table; company.csv is left-joined onto it by recruitment_seq in a
# later cell.
recruit = pd.read_csv('data/recruitment.csv')
recruit

Unnamed: 0,recruitment_seq,address_seq1,address_seq2,address_seq3,career_end,career_start,check_box_keyword,education,major_task,qualifications,text_keyword
0,R02264,3.0,,,0,0,2507;2707;2810,4,8,1,
1,R06317,3.0,,,0,0,2204;2205;2707,3,2,1,
2,R04017,3.0,,,0,0,2101;2108;2201;2707,3,2,1,
3,R02865,3.0,,,0,0,2201;2204;2205;2707,2,2,1,
4,R04890,3.0,,,0,0,2201;2204;2205;2707,2,2,2,
...,...,...,...,...,...,...,...,...,...,...,...
6690,R03678,3.0,,,0,0,2101;2108;2201;2204;2205;2707,3,2,1,
6691,R04593,3.0,,,0,0,2201;2204;2205;2707,4,2,1,
6692,R03252,3.0,,,0,0,2109,3,2,1,
6693,R05130,3.0,,,0,0,2201;2204;2205;2707,2,2,2,


In [10]:
# Company attributes per posting, reduced to the first row per recruitment_seq.
company = pd.read_csv('data/company.csv')
company = company.drop_duplicates('recruitment_seq')
company

Unnamed: 0,recruitment_seq,company_type_seq,supply_kind,employee
0,R02073,2,514,20
1,R03274,2,402,90
2,R02195,2,514,20
3,R03372,4,100,60
4,R00867,2,402,590
...,...,...,...,...
2372,R01786,2,100,100
2373,R03415,2,100,270
2374,R04028,4,402,525
2375,R06508,2,402,70


In [11]:
# Merge all item (recruitment) data. Many items have no company record, so the
# company columns are left-joined onto the ids of recruitment.csv; unmatched
# items get NaN in those columns.
item_features = recruit.merge(company, on='recruitment_seq', how='left')
item_features

Unnamed: 0,recruitment_seq,address_seq1,address_seq2,address_seq3,career_end,career_start,check_box_keyword,education,major_task,qualifications,text_keyword,company_type_seq,supply_kind,employee
0,R02264,3.0,,,0,0,2507;2707;2810,4,8,1,,5.0,402.0,800.0
1,R06317,3.0,,,0,0,2204;2205;2707,3,2,1,,,,
2,R04017,3.0,,,0,0,2101;2108;2201;2707,3,2,1,,,,
3,R02865,3.0,,,0,0,2201;2204;2205;2707,2,2,1,,,,
4,R04890,3.0,,,0,0,2201;2204;2205;2707,2,2,2,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6690,R03678,3.0,,,0,0,2101;2108;2201;2204;2205;2707,3,2,1,,,,
6691,R04593,3.0,,,0,0,2201;2204;2205;2707,4,2,1,,,,
6692,R03252,3.0,,,0,0,2109,3,2,1,,4.0,402.0,525.0
6693,R05130,3.0,,,0,0,2201;2204;2205;2707,2,2,2,,2.0,402.0,40.0


In [12]:
# Remember the id order of both merged tables; the preprocessing pipelines below
# use these lists as the column labels of their output frames.
user_ids = list(user_features['resume_seq'])
item_ids = list(item_features['recruitment_seq'])

In [13]:
# Garbage collection: the per-file frames have been merged into user_features /
# item_features, so drop the originals and ask the collector to reclaim memory.
# `gc` is imported with the other modules in the import cell at the top.
del resume, resume_cer, resume_lan, resume_edu, recruit, company
gc.collect();  # trailing ';' hides the collected-object count from the cell output

2057

## Data Preprocessing

#### User Features

수치형/범주형/텍스트(키워드) 피처 분리

In [14]:
# Column groups of the user table, one list per preprocessing branch.
# NOTE: the same three names are reassigned for the item table further down,
# so re-run this cell before re-running the user pipeline cells.

# Delimiter-separated keyword text (vectorised with CountVectorizer below)
binary_features = [
    'text_keyword',
    'job_code_seq1',
]

# Columns treated as numbers (median imputation + min-max scaling below)
numeric_features = [
    'degree',
    'graduate_date',
    'hope_salary',
    'last_salary',
    'career_month',
    'univ_score',
]

# Columns treated as categories (most-frequent imputation + one-hot encoding below)
categorical_features = [
    'reg_date',
    'updated_date',
    'career_job_code',
    'certificate_contents',
    'hischool_type_seq',
    'hischool_special_type',
    'hischool_nation',
    'hischool_gender',
    'hischool_location_seq',
    'univ_type_seq1',
    'univ_type_seq2',
    'univ_transfer',
    'univ_location',
    'univ_major_type',
]

파이프라인 구축: 수치형과 범주형 피처를 다르게 처리할 수 있는 ColumnTransformer를 활용

In [15]:
# 결측값이 과도한(50% 초과) 피처 제거
def drop_features(X, threshold=0.5):
    """Return a copy of ``X`` without the columns that are mostly missing.

    Parameters
    ----------
    X : pandas.DataFrame
        Input table.
    threshold : float, default 0.5
        A column is removed when its share of missing values is strictly
        greater than this value.

    Returns
    -------
    pandas.DataFrame
        New frame with the remaining columns; ``X`` itself is not modified.

    Notes
    -----
    The decision is recomputed from whatever frame is passed in, so two
    different frames can lose different columns.
    """
    null_share = X.isna().mean()                      # fraction of NaN per column
    too_sparse = null_share.index[null_share > threshold]
    return X.drop(columns=too_sparse)

# 이상치 처리 방법 중 가장 단순한 방법:
def remove_outlier(X):
    """Clip every column of ``X`` to its own 5th-95th percentile range.

    ``X`` is wrapped in a DataFrame first, so any 2-D input accepted by
    ``pd.DataFrame`` works. Values below the 5% quantile of a column are raised
    to it and values above the 95% quantile are lowered to it. The result is
    returned as a NumPy array.
    """
    def _clip_column(col):
        # Bounds are computed per column, not over the whole table.
        lower = col.quantile(.05)
        upper = col.quantile(.95)
        return col.clip(lower, upper)

    frame = pd.DataFrame(X)
    clipped = frame.apply(_clip_column, axis=0)
    return clipped.values

In [16]:
# PCA 차원을 자동으로 결정하는 Custom PCA 전처리기 클래스
class MyPCATransformer(TransformerMixin, BaseEstimator):
    """PCA step that chooses its own number of components.

    During ``fit`` a PCA with every available component is fitted first; the
    smallest number of leading components whose cumulative explained-variance
    ratio reaches ``sum_explained_variance`` is then used for a second PCA,
    which is the one applied in ``transform``.

    Parameters
    ----------
    sum_explained_variance : float, default 0.99
        Target cumulative explained-variance ratio.

    Attributes set by ``fit``
    -------------------------
    num_d : number of components kept.
    pca   : the fitted ``PCA`` used by ``transform``.
    """

    def __init__(self, sum_explained_variance=0.99):
        # Only store the hyper-parameter here; all work happens in fit().
        self.sum_explained_variance = sum_explained_variance

    def fit(self, X, y=None):
        """Pick the number of components from ``X`` and fit the final PCA."""
        # Exploratory pass with as many components as the input shape allows.
        n_rows, n_cols = X.shape[0], X.shape[1]
        max_components = min(n_rows, n_cols)
        exploratory = PCA(n_components=max_components)
        exploratory.fit(X)

        # Position of the first cumulative ratio that reaches the target, as a count.
        cumulative_ratio = np.cumsum(exploratory.explained_variance_ratio_)
        target_reached = cumulative_ratio >= self.sum_explained_variance
        self.num_d = np.argmax(target_reached) + 1

        # NOTE(review): num_d == 1 arises both when the first component alone
        # reaches the target and when no prefix reaches it (argmax of an
        # all-False mask is 0). Either way every component is kept -- confirm
        # that this is intended for the first case.
        if self.num_d == 1:
            self.num_d = max_components

        # Second pass: the PCA that transform() will use.
        self.pca = PCA(n_components=self.num_d)
        self.pca.fit(X)
        return self

    def transform(self, X):
        """Project ``X`` onto the components selected during ``fit``."""
        return self.pca.transform(X)

In [17]:
# Numeric columns: fill gaps with the column median, then min-max scale.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        # Optional percentile clipping; wraps a plain function as a transformer.
        # ("outlier", FunctionTransformer(remove_outlier)),
        ("scaler", MinMaxScaler()),
    ]
)

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# The encoder is left at its default output format on purpose: the flag that
# forces dense output was renamed (`sparse` -> `sparse_output`) and the old
# spelling was removed in newer scikit-learn releases. Densification is done
# once, by the ColumnTransformer below (sparse_threshold=0).
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        # Optional steps. They need dense input, so make the encoder emit dense
        # output before re-enabling them.
        # ("pca", MyPCATransformer()),  # custom PCA transformer
        # ("scaler", MinMaxScaler()),
    ]
)

# Keyword-text columns: replace missing text with a placeholder, turn the
# ';', ' ' and '·' separators into spaces, then count tokens (bag of words).
binary_transformer = Pipeline(
    steps=[
        ("imputer", FunctionTransformer(lambda x: x.fillna('없음'))),
        ("corpus", FunctionTransformer(lambda x: x.str.replace(r'[; ·]',',', regex=True).str.split(',').str.join(" "))),
        ("BoW", CountVectorizer()),
    ]
)

column_transformer = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        # A single column name (not a list) hands the text branch a 1-D Series,
        # which the .str accessors and CountVectorizer require.
        ("bin1", binary_transformer, binary_features[0]),
        ("bin2", binary_transformer, binary_features[1]),
    ],
    # Always return a dense array. With the default threshold the result is
    # sparse or dense depending on the data's density, and the asDF step below
    # expects a dense array.
    sparse_threshold=0,
)

preprocessor = Pipeline(
    steps=[
        # Optional: drop columns with too many missing values before encoding.
        # ("drop", FunctionTransformer(drop_features)),
        ("column", column_transformer),
        # np.ndarray -> DataFrame, transposed so that each column is one user.
        ("asDF", FunctionTransformer(lambda x: pd.DataFrame(x.T, columns=user_ids))),
    ]
)

In [18]:
# Render estimators as an interactive diagram when displayed in the notebook.
set_config(display="diagram")  # To view the text pipeline, change to display='text'.
preprocessor

파이프라인을 통한 데이터 전처리

In [19]:
# Output format: each column of the resulting DataFrame represents one user
# (labelled with user_ids); each row is one preprocessed feature.
user_features_preprocessed = preprocessor.fit_transform(user_features)
user_features_preprocessed

Unnamed: 0,U00606,U00509,U02012,U04599,U07573,U03218,U04073,U07898,U07935,U07524,...,U06042,U00515,U01905,U00842,U08092,U04890,U05910,U01468,U05315,U01277
0,0.500000,0.000000,0.750000,0.500000,0.500000,0.250000,0.250000,0.500000,0.500000,0.250000,...,0.500000,0.500000,0.500000,0.500000,0.500000,0.750000,0.500000,0.500000,0.500000,0.500000
1,0.993076,0.000000,0.978734,0.995054,0.994065,0.989614,0.000000,0.995054,0.997527,0.990109,...,0.986647,0.992582,0.996044,0.989614,0.989614,0.986152,0.995549,0.993076,0.997033,0.993571
2,0.350000,0.000000,0.350000,0.000000,0.190000,0.250000,0.390000,0.270000,0.000000,0.525000,...,0.390000,0.310000,0.000000,0.390000,0.000000,0.000000,0.230000,0.000000,0.000000,0.210000
3,0.350000,0.370000,0.310000,0.250000,0.000000,0.230000,0.390000,0.250000,0.000000,0.490000,...,0.390000,0.290000,0.270000,0.370000,0.750000,0.625000,0.190000,0.000000,0.000000,0.100000
4,0.155093,0.194444,0.280093,0.055556,0.000000,0.138889,0.178241,0.097222,0.027778,0.263889,...,0.472222,0.062500,0.083333,0.231481,0.384259,0.472222,0.027778,0.000000,0.041667,0.006944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7405,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7406,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7407,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7408,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,0.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000,1.000000,1.000000


#### Item Features

수치형/범주형/텍스트(키워드) 피처 분리

In [20]:
# Column groups of the item table, one list per preprocessing branch.
# NOTE: these assignments overwrite the lists of the same names that were
# defined for the user table earlier in the notebook.

# Delimiter-separated keyword codes (vectorised with CountVectorizer below)
binary_features = [
    'check_box_keyword',
]

# Columns treated as numbers
numeric_features = [
    'education',
]

# Columns treated as categories
categorical_features = [
    'address_seq1',
    'major_task',
    'qualifications',
]

In [21]:
# Numeric columns: fill gaps with the column median, then min-max scale.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        # Optional percentile clipping; wraps a plain function as a transformer.
        # ("outlier", FunctionTransformer(remove_outlier)),
        ("scaler", MinMaxScaler()),
    ]
)

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# The encoder is left at its default output format on purpose: the flag that
# forces dense output was renamed (`sparse` -> `sparse_output`) and the old
# spelling was removed in newer scikit-learn releases. Densification is done
# once, by the ColumnTransformer below (sparse_threshold=0).
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        # Optional steps. They need dense input, so make the encoder emit dense
        # output before re-enabling them.
        # ("pca", MyPCATransformer()),  # custom PCA transformer
        # ("scaler", MinMaxScaler()),
    ]
)

# Keyword-code column: replace missing text with a placeholder (CountVectorizer
# rejects NaN documents), turn the ';' separators into spaces, then count
# tokens (bag of words).
binary_transformer = Pipeline(
    steps=[
        ("imputer", FunctionTransformer(lambda x: x.fillna('없음'))),
        # regex=False: ';' is a literal separator; being explicit avoids relying
        # on the pandas default, which differs between major versions.
        ("corpus", FunctionTransformer(lambda x: x.str.replace(';', ',', regex=False).str.split(',').str.join(" "))),
        ("BoW", CountVectorizer()),
    ]
)

column_transformer = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        # A single column name (not a list) hands the text branch a 1-D Series,
        # which the .str accessors and CountVectorizer require.
        ("bin1", binary_transformer, binary_features[0]),
    ],
    # Always return a dense array. With the default threshold the result is
    # sparse or dense depending on the data's density, which forced the next
    # step to guess whether .toarray() is available.
    sparse_threshold=0,
)

preprocessor = Pipeline(
    steps=[
        # Optional: drop columns with too many missing values before encoding.
        # ("drop", FunctionTransformer(drop_features)),
        ("column", column_transformer),
        # np.ndarray -> DataFrame, transposed so that each column is one item.
        ("asDF", FunctionTransformer(lambda x: pd.DataFrame(x.T, columns=item_ids))),
    ]
)

In [22]:
# Same display setting as used for the user pipeline: render estimators as a diagram.
set_config(display="diagram")  # To view the text pipeline, change to display='text'.
preprocessor

파이프라인을 통한 데이터 전처리

In [23]:
# Output format: each column of the resulting DataFrame represents one item
# (labelled with item_ids); each row is one preprocessed feature.
item_features_preprocessed = preprocessor.fit_transform(item_features)
item_features_preprocessed

Unnamed: 0,R02264,R06317,R04017,R02865,R04890,R05446,R05023,R04999,R02396,R04458,...,R03810,R05101,R04989,R01376,R03086,R03678,R04593,R03252,R05130,R04779
0,0.5,0.25,0.25,0.0,0.0,0.25,0.0,0.5,0.25,0.25,...,0.5,0.5,0.25,0.25,0.5,0.25,0.5,0.25,0.0,0.25
1,0.0,0.00,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.00,...,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.00,0.0,0.00
2,0.0,0.00,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.00,...,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.00,0.0,0.00
3,1.0,1.00,1.00,1.0,1.0,1.00,1.0,0.0,1.00,1.00,...,1.0,1.0,1.00,1.00,1.0,1.00,1.0,1.00,1.0,1.00
4,0.0,0.00,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.00,...,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.00,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,0.0,0.00,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.00,...,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.00,0.0,0.00
76,0.0,0.00,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.00,...,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.00,0.0,0.00
77,0.0,0.00,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.00,...,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.00,0.0,0.00
78,0.0,0.00,0.00,0.0,0.0,0.00,0.0,0.0,0.00,0.00,...,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.00,0.0,0.00


In [24]:
# Test
#item_features_preprocessed = (item_features_preprocessed >= .5).astype(int)

## Feature Export

In [25]:
# Save the user/item side information as a pickle file, for use by
# recommendation models that support side information.
# The payload is a 2-tuple: (user frame, item frame), in that order.

with open('features.pkl', 'wb') as file:
    pickle.dump((user_features_preprocessed, item_features_preprocessed), file)

<font color="#CC3D3D"><p>
# End