In [1]:
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

In [2]:
path = '../../../data/카드'

train = pd.read_csv(os.path.join(path, 'train.csv'), index_col=0)
test = pd.read_csv(os.path.join(path, 'test.csv'), index_col=0)
sub = pd.read_csv(os.path.join(path, 'sample_submission.csv'), index_col=0)

In [3]:
test.shape

(10000, 18)

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26457 entries, 0 to 26456
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gender         26457 non-null  object 
 1   car            26457 non-null  object 
 2   reality        26457 non-null  object 
 3   child_num      26457 non-null  int64  
 4   income_total   26457 non-null  float64
 5   income_type    26457 non-null  object 
 6   edu_type       26457 non-null  object 
 7   family_type    26457 non-null  object 
 8   house_type     26457 non-null  object 
 9   DAYS_BIRTH     26457 non-null  int64  
 10  DAYS_EMPLOYED  26457 non-null  int64  
 11  FLAG_MOBIL     26457 non-null  int64  
 12  work_phone     26457 non-null  int64  
 13  phone          26457 non-null  int64  
 14  email          26457 non-null  int64  
 15  occyp_type     18286 non-null  object 
 16  family_size    26457 non-null  float64
 17  begin_month    26457 non-null  float64
 18  credit

In [5]:
target = train['credit']
train = train.drop(['credit'], axis=1)

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26457 entries, 0 to 26456
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gender         26457 non-null  object 
 1   car            26457 non-null  object 
 2   reality        26457 non-null  object 
 3   child_num      26457 non-null  int64  
 4   income_total   26457 non-null  float64
 5   income_type    26457 non-null  object 
 6   edu_type       26457 non-null  object 
 7   family_type    26457 non-null  object 
 8   house_type     26457 non-null  object 
 9   DAYS_BIRTH     26457 non-null  int64  
 10  DAYS_EMPLOYED  26457 non-null  int64  
 11  FLAG_MOBIL     26457 non-null  int64  
 12  work_phone     26457 non-null  int64  
 13  phone          26457 non-null  int64  
 14  email          26457 non-null  int64  
 15  occyp_type     18286 non-null  object 
 16  family_size    26457 non-null  float64
 17  begin_month    26457 non-null  float64
dtypes: flo

In [7]:
numeric_columns = list(train.dtypes[train.dtypes != object].index)   # object가 아니면 numeric
label_columns = ['gender', 'car', 'reality', 'work_phone', 'phone', 'email']   # category 2개인 것 label encoding
oh_columns = ['income_type', 'edu_type', 'family_type']   # category 3개 이상인것 one hot encoding(occyp는 null이 많으니까 제거)

In [8]:
# 기본적인 전처리
# numeric 변수에는 standard scaling 적용, label 변수에는 label encoding, category 변수에는 one-hot encoding
# train 모드시, scaler와 encoder를 함께 반환
# test 모드시, output df만 반환. scaler와 encoder가 반드시 함께 전달되어야 함
# df_ : 변환할 df
# num_columns : numeric 변수의 이름을 담은 리스트
# label_columns : label encoding을 할 변수의 이름을 담은 리스트
# oh_columns : one-hot encoding을 할 변수의 이름을 담은 리스트
# train : 훈련 모드 설정. True면 훈련모드, False면 test 모드
# scaler : test 모드 시 필요한 scaler
# oh_encoder : test 모드 시 필요한 one-hot scaler
# 필요 모듈
# import pandas as pd
# from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
# 주의사항 !!!!!
# 변수로 전달된 column들을 가지고만 전처리 진행 --> 전달된 column들로만 구성된 df를 반환
def basic_preprocess(df_, num_columns=None, label_columns=None, oh_columns=None, train=True,
                     scaler=None, oh_encoder=None):
    df = df_.copy()
    final_columns = []  # 최종 df column 이름 리스트
    final_dfs = []  # 최종 df_list

    # train mode
    if train:
        # scaling
        if num_columns:
            scaler = StandardScaler()  # scaler 선언
            num_df = df[num_columns]
            num_sc = scaler.fit_transform(num_df)  # scaling
            num_sc = pd.DataFrame(num_sc, columns=num_columns)  # df로 변환
            final_columns.extend(num_columns)
            final_dfs.append(num_sc)

        # label encoding
        if label_columns:
            enc = LabelEncoder()  # label encoder 선언
            label_df = df[label_columns]
            # 각 변수에 대해 label encoding
            for col in label_columns:
                label_df[col] = enc.fit_transform(label_df[col])
            label_df.reset_index(drop=True, inplace=True)
            final_columns.extend(label_columns)
            final_dfs.append(label_df)

        # one hot encoding
        if oh_columns:
            oh_encoder = OneHotEncoder(sparse=False)  # one-hot encoder 선언. sparse matrix 생성 x
            oh_df = df[oh_columns]
            oh_df = oh_encoder.fit_transform(oh_df)

            # one-hot 변환 후 이름 생성
            names = oh_encoder.categories_  # category들
            columns = []
            for pre, name in zip(oh_columns, names):
                for i in range(len(name)):
                    temp = pre + '_' + name[i]  # 각 category에 원래 column 이름을 prefix로 붙여줌
                    columns.append(temp)
            oh_df = pd.DataFrame(oh_df, columns=columns)  # df로 만들기
            final_columns.extend(columns)
            final_dfs.append(oh_df)

        # 최종 df
        final_df = pd.concat(final_dfs, axis=1, ignore_index=True)
        final_df.columns = final_columns
        return final_df, scaler, oh_encoder
    # test 모드
    else:
        if num_columns:
            num_df = df[num_columns]
            num_sc = scaler.transform(num_df)  # 전달 받은 scaler로 변환
            num_sc = pd.DataFrame(num_sc, columns=num_columns)
            final_columns.extend(num_columns)
            final_dfs.append(num_sc)

        if label_columns:
            enc = LabelEncoder()
            label_df = df[label_columns]
            for col in label_columns:
                label_df[col] = enc.fit_transform(label_df[col])
            label_df.reset_index(drop=True, inplace=True)
            final_columns.extend(label_columns)
            final_dfs.append(label_df)

        if oh_columns:
            oh_df = df[oh_columns]
            oh_df = oh_encoder.transform(oh_df)  # 전달 받은 encoder로 변환

            names = oh_encoder.categories_
            columns = []
            for pre, name in zip(oh_columns, names):
                for i in range(len(name)):
                    temp = pre + '_' + name[i]
                    columns.append(temp)
            oh_df = pd.DataFrame(oh_df, columns=columns)
            final_columns.extend(columns)
            final_dfs.append(oh_df)

        final_df = pd.concat(final_dfs, axis=1, ignore_index=True)
        final_df.columns = final_columns
        return final_df

In [9]:
pre_train, scaler, oh_encoder = basic_preprocess(train, num_columns=numeric_columns, label_columns=label_columns, oh_columns=oh_columns)
pre_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month,...,edu_type_Academic degree,edu_type_Higher education,edu_type_Incomplete higher,edu_type_Lower secondary,edu_type_Secondary / secondary special,family_type_Civil marriage,family_type_Married,family_type_Separated,family_type_Single / not married,family_type_Widow
0,-0.573599,0.149136,0.490075,-0.463930,0.0,-0.538417,-0.645705,-0.316937,-0.214735,1.215231,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.764529,0.590848,1.089621,-0.440878,0.0,-0.538417,-0.645705,3.155199,0.876135,1.275620,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,-0.573599,2.578550,-0.744719,-0.461929,0.0,-0.538417,1.548696,-0.316937,-0.214735,0.249003,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.573599,0.149136,0.207081,-0.444893,0.0,-0.538417,1.548696,-0.316937,-0.214735,-0.656836,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,-0.573599,-0.292575,0.219220,-0.444988,0.0,-0.538417,-0.645705,-0.316937,-0.214735,0.007446,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,2.102658,0.369992,0.923252,-0.444108,0.0,-0.538417,-0.645705,-0.316937,1.967005,1.456788,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
26453,0.764529,-0.071719,0.158765,-0.447679,0.0,-0.538417,-0.645705,-0.316937,-0.214735,-1.260729,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
26454,-0.573599,1.032559,1.398558,-0.444333,0.0,-0.538417,-0.645705,-0.316937,-0.214735,0.067835,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
26455,-0.573599,-0.160062,1.383563,-0.430454,0.0,-0.538417,-0.645705,-0.316937,-1.305605,-1.985400,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [10]:
pre_test = basic_preprocess(test, num_columns=numeric_columns, label_columns=label_columns, oh_columns=oh_columns, train=False,
                       scaler=scaler, oh_encoder=oh_encoder)
pre_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month,...,edu_type_Academic degree,edu_type_Higher education,edu_type_Incomplete higher,edu_type_Lower secondary,edu_type_Secondary / secondary special,family_type_Civil marriage,family_type_Married,family_type_Separated,family_type_Single / not married,family_type_Widow
0,-0.573599,-0.734287,-1.435662,2.227162,0.0,-0.538417,1.548696,-0.316937,-0.214735,-2.045789,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,-0.573599,-0.513431,-0.715444,-0.492750,0.0,-0.538417,1.548696,-0.316937,-0.214735,-0.596447,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-0.573599,-1.157623,0.016912,-0.431254,0.0,1.857295,1.548696,-0.316937,-0.214735,-0.838004,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,-0.573599,-0.734287,-0.788275,-0.448087,0.0,1.857295,-0.645705,-0.316937,-0.214735,-0.898393,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,-0.573599,0.369992,-0.443637,-0.497944,0.0,1.857295,-0.645705,-0.316937,-0.214735,1.094452,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-0.573599,0.149136,-0.627143,-0.469204,0.0,1.857295,1.548696,-0.316937,-0.214735,0.430170,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9996,-0.573599,0.149136,1.207198,-0.439241,0.0,1.857295,-0.645705,-0.316937,-0.214735,-0.475668,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
9997,-0.573599,1.032559,-1.203840,-0.531645,0.0,-0.538417,-0.645705,-0.316937,-0.214735,-1.743843,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
9998,-0.573599,-0.071719,-0.138747,-0.437568,0.0,-0.538417,1.548696,-0.316937,-0.214735,-0.415279,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
