# 3.2 범주형 결측치 처리 방법

- 이 파일에서는 여러 결측치 처리 방법으로 결측치를 채운 후 각각 새로운 파일로 저장
- 파일 저장위치: `../data/결측치_output/`
- 이 데이터를 사용하여 `4.1_LG_고객데이터_feature_engineering+데이터생성.ipynb` 파일에서 가공변수 추가, 모델링에 사용할 데이터 생성

## 데이터 불러오기

In [57]:
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings(action='ignore')
import pandas as pd
import numpy as np
from feature_engine.imputation import CategoricalImputer
from feature_engine.imputation import RandomSampleImputer


In [58]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [59]:
# !pip install feature_engine

In [60]:
# Load the customer data and discard the 'Unnamed: 0' column in one step.
df = pd.read_csv("../data/customer.csv").drop(columns=['Unnamed: 0'])

In [61]:
# Remove four columns that the rest of this notebook does not use.
df = df.drop(columns=['케어솔루션고객여부', '거주행정동명', 'label_y', 'label_x'])

In [62]:
# Parse the sale (delivery-completed) date and derive year / month columns from it.
sale_date = pd.to_datetime(df['매출일자(배송완료일자)'], format='%Y-%m-%d')
df['매출일자(배송완료일자)'] = sale_date
df['년도'] = sale_date.dt.year
df['월'] = sale_date.dt.month

In [63]:
# Fill missing values of 보유멤버십포인트 (membership points held) with 0.
df.loc[df['보유멤버십포인트'].isnull(), '보유멤버십포인트'] = 0

In [64]:
# 멤버십가입경로 4개 그룹으로 나눔

def change_variable(x):
    """Collapse a raw membership sign-up channel into one of four groups.

    Returns '매장', '카드', '기타' or '홈페이지'. Any value that is not listed
    (a missing value included) yields None.
    """
    channel_groups = (
        ('매장', ('베스트샵', '하이마트', '백화점', '전자랜드', '홈플러스', '(주)이마트')),
        ('카드', ('신한카드', '제휴카드', '하나SK카드', '롯데카드', 'LG페이', '네이버페이', '페이코', '스마트월렛')),
        ('기타', ('디얼디어', '바른손', '모닝글로리')),
        ('홈페이지', ('LGE회원통합', 'LGE앱', 'LGE홈페이지(모바일)', '모바일그룹', '얍_KT', '얍_LGU', 'LGE홈페이지')),
    )
    for group_name, members in channel_groups:
        if x in members:
            return group_name
    return None
    
def change_variable2(x):
    """Map a recent-purchase channel name to its group code '1'-'4'.

    The code is returned as a string. Any value that is not listed
    (a missing value included) yields None.
    """
    code_groups = (
        ('1', ('백화점',)),
        ('2', ('할인점 기타', '이마트', '온라인')),
        ('3', ('홈플러스', '하이마트', '전자랜드')),
        ('4', ('B2B', '베스트샵', '인터넷/홈쇼핑')),
    )
    for group_code, members in code_groups:
        if x in members:
            return group_code
    return None

In [65]:
# Replace both raw channel columns with their grouped values, then preview one row.
df['멤버십가입경로'] = df['멤버십가입경로'].apply(change_variable)
df['최근구매채널'] = df['최근구매채널'].apply(change_variable2)
df.head(1)

Unnamed: 0,고객ID,매출일자(배송완료일자),타겟제품명,금액,멤버십고객여부,케어솔루션품목수,케어십가입여부,케어십품목수,최근3년구매금액,최근구매채널,Only&Best품목수,건강관리가전품목수,연령대,주거형태,아파트시세,아파트평형대,포인트사용빈도,보유멤버십포인트,멤버십가입경로,제휴카드보유여부,프리미엄카드보유여부,제휴가입여부,멤버십앱설치여부,거주광역명,거주시군구명,년도,월
0,34410,2022-04-29,에어컨,300만원 이상 400만원 이하,0.0,0품목,0,0품목,15KK이상20KK미만,2,1품목,1품목,30대,아파트,1~2억,20평대,미사용,0,매장,0,0,0,0.0,전북,전주시 덕진구,2022,4


In [66]:
# Convert item-count strings such as '0품목' to integers.
# The whole leading run of digits is captured, so a multi-digit count like
# '10품목' becomes 10 rather than being cut down to its first character.
item_count_columns = ['케어솔루션품목수', '케어십품목수', 'Only&Best품목수', '건강관리가전품목수']
for item_count_col in item_count_columns:
    df[item_count_col] = (
        df[item_count_col].str.extract(r'^(\d+)', expand=False).astype('int64')
    )

In [70]:
# Copy of df that serves as the input of the feature-engine imputers below.
X = df.copy()

In [71]:
# The customer ID is excluded from the imputer input.
X = X.drop(columns='고객ID')

In [72]:
# Keep the customer ID as a one-column DataFrame; it is concatenated back
# onto each imputed result.
df_customer_ID = df[['고객ID']]

## Missing Data Imputation - feature engine

### CategoricalImputer() - 최빈값

In [None]:
# Fill missing values with the most frequent value (imputation_method='frequent').
# ignore_format=True lets the imputer handle numeric columns as well.
ci = CategoricalImputer(ignore_format=True, imputation_method='frequent')
df_new = ci.fit(X).transform(X)

In [None]:
df_new.isnull().sum()

In [None]:
ci.get_params()

In [None]:
df_new['연령대'].value_counts()

In [None]:
# Put the customer ID back as the first column of the imputed frame.
df_new = pd.concat([df_customer_ID, df_new], axis='columns')
# df_new

In [None]:
# df_new.to_csv('../data/결측치_output/결측치_CategoricalImputer.csv', encoding='utf-8-sig')

### CategoricalImputer(imputation_method='missing', ignore_format=True) : impute with an arbitrary value
- missing값으로 대체

In [None]:
# Fill missing values using the imputer's 'missing' method (a placeholder
# value instead of an observed one).
# ignore_format=True lets the imputer handle numeric columns as well.
ci = CategoricalImputer(ignore_format=True, imputation_method='missing')
df_missing = ci.fit(X).transform(X)

In [None]:
df_missing['거주광역명'].value_counts()

In [None]:
# Put the customer ID back as the first column of the imputed frame.
df_missing = pd.concat([df_customer_ID, df_missing], axis='columns')

In [None]:
# df_missing.to_csv('../data/결측치_output/결측치_ArbitaryImputer.csv', encoding='utf-8-sig')

### RandomSampleImputer() - 랜덤

In [None]:
# Fill each missing value with a randomly sampled value.
# A fixed random_state makes the draw repeatable, so re-running the notebook
# regenerates the same imputed data (and the same output file).
rsi = RandomSampleImputer(random_state=42)
rsi.fit(X)
df_rsi = rsi.transform(X)

In [None]:
# Put the customer ID back as the first column of the imputed frame.
df_rsi = pd.concat([df_customer_ID, df_rsi], axis='columns')


In [None]:
# df_rsi.to_csv('../data/결측치_output/결측치_RandomSampleImputer.csv', encoding='utf-8-sig')

In [None]:
df_rsi.columns

## Scikit Learn 



### KNN Imputer

https://scikit-learn.org/stable/modules/impute.html#knnimpute

사이킷런에서 설명하고 있는 KNN 임퓨터 작동 방식

각 표본의 결측값은 학습 셋에서 찾은 가장 가까운 이웃 n_neighbors개의 평균값으로 대치된다. 두 샘플 모두에서 결측되지 않은 피처들의 값이 서로 가까우면 두 샘플은 가까운 것으로 본다. 가장 가까운 이웃을 찾을 때는 기본적으로 결측값을 지원하는 유클리드 거리 메트릭인 nan_euclidean_distances가 사용된다. 

결론 : 지정한 수(n_neighbors)의 인접 이웃 값을 균등(uniform) 평균 또는 거리 가중(distance) 평균하여 결측값을 대치한다. 

https://www.analyticsvidhya.com/blog/2020/07/knnimputer-a-robust-way-to-impute-missing-values-using-scikit-learn/

카테고리를 int로 수정

#### KNN 으로 채워줄때 범주형을 label encoding을 통해 int로 변환 필요

In [73]:
# Customer ID and sale-date columns set aside; they are concatenated back
# onto the KNN-imputed frames at the end.
df_concat = df[['고객ID', '매출일자(배송완료일자)']]

In [74]:
# Separate copy of df that is encoded to numbers for the KNN imputer.
X2 = df.copy()

In [75]:
X2.isnull().sum()

고객ID                 0
매출일자(배송완료일자)         0
타겟제품명                0
금액                 338
멤버십고객여부           6905
케어솔루션품목수             0
케어십가입여부              0
케어십품목수               0
최근3년구매금액             0
최근구매채널               4
Only&Best품목수         0
건강관리가전품목수            0
연령대              27995
주거형태                 0
아파트시세           128164
아파트평형대          128164
포인트사용빈도              0
보유멤버십포인트             0
멤버십가입경로           6905
제휴카드보유여부             0
프리미엄카드보유여부           0
제휴가입여부               0
멤버십앱설치여부          6905
거주광역명            18804
거주시군구명           18870
년도                   0
월                    0
dtype: int64

In [76]:
# The customer ID and the sale date are excluded from the KNN imputer input.
X2 = X2.drop(columns=['고객ID', '매출일자(배송완료일자)'])

In [77]:
X2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 532625 entries, 0 to 532624
Data columns (total 25 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   타겟제품명         532625 non-null  object 
 1   금액            532287 non-null  object 
 2   멤버십고객여부       525720 non-null  float64
 3   케어솔루션품목수      532625 non-null  int64  
 4   케어십가입여부       532625 non-null  int64  
 5   케어십품목수        532625 non-null  int64  
 6   최근3년구매금액      532625 non-null  object 
 7   최근구매채널        532621 non-null  object 
 8   Only&Best품목수  532625 non-null  int64  
 9   건강관리가전품목수     532625 non-null  int64  
 10  연령대           504630 non-null  object 
 11  주거형태          532625 non-null  object 
 12  아파트시세         404461 non-null  object 
 13  아파트평형대        404461 non-null  object 
 14  포인트사용빈도       532625 non-null  object 
 15  보유멤버십포인트      532625 non-null  object 
 16  멤버십가입경로       525720 non-null  object 
 17  제휴카드보유여부      532625 non-null  int64  
 18  프리미엄

In [78]:
# list(air_df['보유멤버십포인트'].unique())

# Ordered levels of 보유멤버십포인트; a level's position in the list is its integer code.
label_list1 = [
    0,
    '1만↓',
    '3만↓',
    '5만↓',
    '10만↓',
    '20만↓',
    '30만↓',
    '30만↑',
]
label_dict1 = dict(zip(label_list1, range(len(label_list1))))

In [79]:
# Ordered levels of 최근3년구매금액; a level's position in the list is its integer code.
label_list2 = [
    '0원',
    '1KK미만',
    '1KK이상3KK미만',
    '3KK이상5KK미만',
    '5KK이상10KK미만',
    '10KK이상15KK미만',
    '15KK이상20KK미만',
    '20KK이상25KK미만',
    '25KK이상30KK미만',
    '30KK↑',
]

label_dict2 = dict(zip(label_list2, range(len(label_list2))))

In [80]:
# list(air_df['포인트사용빈도'].unique())
# Ordered levels of 포인트사용빈도; a level's position in the list is its integer code.
label_list3 = [
    '미사용',
    '1~2회',
    '3~4회',
    '5~6회',
    '7~8회',
    '9~10회',
    '11~12회',
    '13~14회',
    '15~16회',
    '17~18회',
    '19~20회',
    '20회↑',
]

label_dict3 = dict(zip(label_list3, range(len(label_list3))))

In [81]:
# list(air_df['아파트평형대'].unique())
# Ordered levels of 아파트평형대; a level's position in the list is its integer code.
label_list4 = [
    '10평대↓',
    '10평대',
    '20평대',
    '30평대',
    '40평대',
    '50평대',
    '60평대',
    '70평대↑',
]

label_dict4 = dict(zip(label_list4, range(len(label_list4))))

In [82]:
# list(air_df['아파트시세'].unique())
# Ordered levels of 아파트시세; a level's position in the list is its integer code.
label_list5 = [
    '1억↓',
    '1~2억',
    '3~4억',
    '5~6억',
    '7~8억',
    '9~10억',
    '11~15억',
    '16~29억',
    '30억↑',
]

label_dict5 = dict(zip(label_list5, range(len(label_list5))))


In [83]:
# air_df.isnull().sum()

In [84]:
# list(air_df['금액'].unique())
# Ordered levels of 금액. The empty string occupies position 0, so the real
# price bands are coded from 1 upwards.
label_list6 = [
    '',
    '0만원 이상 100만원 이하',
    '100만원 이상 200만원 이하',
    '200만원 이상 300만원 이하',
    '300만원 이상 400만원 이하',
    '400만원 이상 500만원 이하',
    '500만원 이상 600만원 이하',
    '600만원 이상 700만원 이하',
    '700만원 이상 800만원 이하',
    '800만원 이상 900만원 이하',
    '900만원 이상 1000만원 이하',
    '1000만원 이상',
]

label_dict6 = dict(zip(label_list6, range(len(label_list6))))

In [85]:
# Replace each price band with its integer code; values absent from the
# mapping (the missing ones) stay NaN.
X2['금액'] = X2['금액'].map(label_dict6)

In [86]:
# list(air_df['월'].unique())
# Month levels. The empty string occupies position 0, so each month 1-12 is
# coded as its own number.
label_list7 = ['', *range(1, 13)]

label_dict7 = dict(zip(label_list7, range(len(label_list7))))

In [87]:
# Pass the month through label_dict7 (months 1-12 map to the same numbers).
X2['월'] = X2['월'].map(label_dict7)


In [88]:
# Replace the ordered text categories with their integer codes.
ordinal_mappings = (
    ('보유멤버십포인트', label_dict1),
    ('최근3년구매금액', label_dict2),
    ('포인트사용빈도', label_dict3),
    ('아파트평형대', label_dict4),
    ('아파트시세', label_dict5),
)
for ordinal_col, code_by_label in ordinal_mappings:
    X2[ordinal_col] = X2[ordinal_col].map(code_by_label)

In [89]:
X2.isnull().sum()

타겟제품명                0
금액                 338
멤버십고객여부           6905
케어솔루션품목수             0
케어십가입여부              0
케어십품목수               0
최근3년구매금액             0
최근구매채널               4
Only&Best품목수         0
건강관리가전품목수            0
연령대              27995
주거형태                 0
아파트시세           128164
아파트평형대          128164
포인트사용빈도              0
보유멤버십포인트             0
멤버십가입경로           6905
제휴카드보유여부             0
프리미엄카드보유여부           0
제휴가입여부               0
멤버십앱설치여부          6905
거주광역명            18804
거주시군구명           18870
년도                   0
월                    0
dtype: int64

In [90]:
X2['최근구매채널'].unique()

array(['2', '4', '3', '1', None], dtype=object)

In [91]:
X2[(X2['최근구매채널'] != '1') & (X2['최근구매채널'] != '2') & (X2['최근구매채널'] != '3') & (X2['최근구매채널'] != '4')]['최근구매채널']

345188    None
345189    None
345190    None
345191    None
Name: 최근구매채널, dtype: object

In [92]:
def changetoInt(x):
    """Convert a channel code to ``int``, passing missing values through.

    Parameters
    ----------
    x : str, int, float or None
        A single cell value, e.g. ``'3'``.

    Returns
    -------
    int
        ``int(x)`` for a present value.
    None or NaN
        The missing marker itself, returned unchanged.
    """
    # pd.isna covers both None and float NaN. Without this check a NaN cell
    # would reach int() and raise ValueError.
    if pd.isna(x):
        return x
    return int(x)

# Convert the channel codes to integers element by element; None entries are
# returned unchanged by changetoInt.
X2['최근구매채널'] = X2['최근구매채널'].apply(changetoInt)

In [93]:
from sklearn.preprocessing import LabelEncoder

In [94]:
# One LabelEncoder per categorical column. Each encoder keeps its own name
# because it is needed again to decode the imputed codes back to labels.
label_product, label_age, label_house = LabelEncoder(), LabelEncoder(), LabelEncoder()
label_membership, label_city, label_district = LabelEncoder(), LabelEncoder(), LabelEncoder()

encoder_by_column = {
    '타겟제품명': label_product,
    '연령대': label_age,
    '주거형태': label_house,
    '멤버십가입경로': label_membership,
    '거주광역명': label_city,
    '거주시군구명': label_district,
}
for categorical_col, column_encoder in encoder_by_column.items():
    X2[categorical_col] = column_encoder.fit_transform(X2[categorical_col])

In [95]:
# LabelEncoder gives the missing value (NaN / None) an integer code too, so
# those codes are turned back into NaN before imputation.
# The code to reset is read from each encoder's classes_ rather than assumed
# to be the highest code. A column without missing values is left untouched,
# and running this cell a second time cannot wipe out a real category.
for encoded_col, fitted_encoder in (
    ('연령대', label_age),
    ('멤버십가입경로', label_membership),
    ('거주광역명', label_city),
    ('거주시군구명', label_district),
):
    missing_codes = [
        code for code, cls in enumerate(fitted_encoder.classes_) if pd.isna(cls)
    ]
    if missing_codes:
        X2.loc[X2[encoded_col].isin(missing_codes), encoded_col] = np.nan

#### KNNImputer() 결측치 처리

In [97]:
from sklearn.impute import KNNImputer

# Impute with the 25 nearest neighbours, every neighbour weighted equally.
# fit_transform returns a NumPy array; it is wrapped in a DataFrame afterwards.
# NOTE(review): the columns are not scaled before the distance computation, so
# wide-range columns (e.g. 년도) may dominate the neighbour search — confirm
# this is intended.
imputer_uniform = KNNImputer(n_neighbors=25, weights="uniform")
df_knn_uniform = imputer_uniform.fit_transform(X2)

In [98]:
df_knn_uniform

array([[2.300e+01, 4.000e+00, 0.000e+00, ..., 1.620e+02, 2.022e+03,
        4.000e+00],
       [1.200e+01, 2.000e+00, 0.000e+00, ..., 1.620e+02, 2.020e+03,
        5.000e+00],
       [3.000e+00, 2.000e+00, 0.000e+00, ..., 1.620e+02, 2.022e+03,
        4.000e+00],
       ...,
       [4.000e+00, 2.000e+00, 0.000e+00, ..., 2.000e+00, 2.023e+03,
        2.000e+00],
       [3.000e+00, 2.000e+00, 0.000e+00, ..., 2.000e+00, 2.019e+03,
        2.000e+00],
       [4.000e+00, 1.000e+00, 0.000e+00, ..., 2.000e+00, 2.023e+03,
        2.000e+00]])

In [99]:
# Same imputation with 25 neighbours, but weights="distance" is used instead
# of uniform weighting.
imputer_distance = KNNImputer(n_neighbors=25, weights="distance")
df_knn_distance = imputer_distance.fit_transform(X2)

In [101]:
# Wrap the imputed array in a DataFrame, reusing the column names the imputer
# recorded at fit time.
to_df_knn_uniform = pd.DataFrame(df_knn_uniform, columns = imputer_uniform.feature_names_in_)
to_df_knn_uniform

Unnamed: 0,타겟제품명,금액,멤버십고객여부,케어솔루션품목수,케어십가입여부,케어십품목수,최근3년구매금액,최근구매채널,Only&Best품목수,건강관리가전품목수,연령대,주거형태,아파트시세,아파트평형대,포인트사용빈도,보유멤버십포인트,멤버십가입경로,제휴카드보유여부,프리미엄카드보유여부,제휴가입여부,멤버십앱설치여부,거주광역명,거주시군구명,년도,월
0,23.0,4.0,0.0,0.0,0.0,0.0,6.0,2.0,1.0,1.0,2.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,12.0,162.0,2022.0,4.0
1,12.0,2.0,0.0,0.0,0.0,0.0,6.0,2.0,1.0,1.0,2.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,12.0,162.0,2020.0,5.0
2,3.0,2.0,0.0,0.0,0.0,0.0,6.0,2.0,1.0,1.0,2.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,12.0,162.0,2022.0,4.0
3,11.0,2.0,0.0,0.0,0.0,0.0,6.0,2.0,1.0,1.0,2.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,12.0,162.0,2022.0,5.0
4,2.0,3.0,0.0,0.0,0.0,0.0,6.0,2.0,1.0,1.0,2.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,12.0,162.0,2022.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
532620,12.0,2.0,0.0,0.0,0.0,0.0,6.0,3.0,3.0,2.0,3.0,0.0,2.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,145.0,2023.0,3.0
532621,3.0,2.0,0.0,0.0,0.0,0.0,6.0,3.0,3.0,2.0,3.0,0.0,2.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,145.0,2023.0,3.0
532622,4.0,2.0,0.0,1.0,0.0,0.0,2.0,4.0,0.0,1.0,4.0,0.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,7.0,2.0,2023.0,2.0
532623,3.0,2.0,0.0,1.0,0.0,0.0,2.0,4.0,0.0,1.0,4.0,0.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,7.0,2.0,2019.0,2.0


In [102]:
# Wrap the imputed array in a DataFrame, reusing the column names the imputer
# recorded at fit time.
to_df_knn_distance = pd.DataFrame(df_knn_distance, columns = imputer_distance.feature_names_in_)
to_df_knn_distance

Unnamed: 0,타겟제품명,금액,멤버십고객여부,케어솔루션품목수,케어십가입여부,케어십품목수,최근3년구매금액,최근구매채널,Only&Best품목수,건강관리가전품목수,연령대,주거형태,아파트시세,아파트평형대,포인트사용빈도,보유멤버십포인트,멤버십가입경로,제휴카드보유여부,프리미엄카드보유여부,제휴가입여부,멤버십앱설치여부,거주광역명,거주시군구명,년도,월
0,23.0,4.0,0.0,0.0,0.0,0.0,6.0,2.0,1.0,1.0,2.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,12.0,162.0,2022.0,4.0
1,12.0,2.0,0.0,0.0,0.0,0.0,6.0,2.0,1.0,1.0,2.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,12.0,162.0,2020.0,5.0
2,3.0,2.0,0.0,0.0,0.0,0.0,6.0,2.0,1.0,1.0,2.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,12.0,162.0,2022.0,4.0
3,11.0,2.0,0.0,0.0,0.0,0.0,6.0,2.0,1.0,1.0,2.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,12.0,162.0,2022.0,5.0
4,2.0,3.0,0.0,0.0,0.0,0.0,6.0,2.0,1.0,1.0,2.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,12.0,162.0,2022.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
532620,12.0,2.0,0.0,0.0,0.0,0.0,6.0,3.0,3.0,2.0,3.0,0.0,2.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,145.0,2023.0,3.0
532621,3.0,2.0,0.0,0.0,0.0,0.0,6.0,3.0,3.0,2.0,3.0,0.0,2.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,145.0,2023.0,3.0
532622,4.0,2.0,0.0,1.0,0.0,0.0,2.0,4.0,0.0,1.0,4.0,0.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,7.0,2.0,2023.0,2.0
532623,3.0,2.0,0.0,1.0,0.0,0.0,2.0,4.0,0.0,1.0,4.0,0.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,7.0,2.0,2019.0,2.0


In [103]:
# Save both imputed frames while the categorical columns are still integer
# codes (they are decoded back to text only afterwards).
to_df_knn_uniform.to_csv('../data/결측치_output/결측치_KNNImputer_uniform25.csv', encoding='utf-8-sig')
to_df_knn_distance.to_csv('../data/결측치_output/결측치_KNNImputer_distance25.csv', encoding='utf-8-sig')

In [104]:
# Cast the label-encoded columns of both imputed frames to int so they can be
# passed to the encoders' inverse_transform.
# NOTE(review): int() truncates, so an imputed average such as 2.9 becomes
# code 2 — confirm truncation (rather than rounding) is intended.
label_encoded_columns = ['타겟제품명', '연령대', '주거형태', '멤버십가입경로', '거주광역명', '거주시군구명']
for imputed_frame in (to_df_knn_uniform, to_df_knn_distance):
    for code_col in label_encoded_columns:
        imputed_frame[code_col] = imputed_frame[code_col].apply(int)

In [107]:
# Decode the integer codes of the uniform-weighted result back to the original labels.
uniform_decoders = (
    ('타겟제품명', label_product),
    ('연령대', label_age),
    ('주거형태', label_house),
    ('멤버십가입경로', label_membership),
    ('거주광역명', label_city),
    ('거주시군구명', label_district),
)
for decoded_col, decoder in uniform_decoders:
    to_df_knn_uniform[decoded_col] = decoder.inverse_transform(to_df_knn_uniform[decoded_col])

In [108]:
# Decode the integer codes of the distance-weighted result back to the original labels.
distance_decoders = (
    ('타겟제품명', label_product),
    ('연령대', label_age),
    ('주거형태', label_house),
    ('멤버십가입경로', label_membership),
    ('거주광역명', label_city),
    ('거주시군구명', label_district),
)
for decoded_col, decoder in distance_decoders:
    to_df_knn_distance[decoded_col] = decoder.inverse_transform(to_df_knn_distance[decoded_col])

In [111]:
# Put the customer ID and sale date back in front of each imputed frame.
df_knn_uniform_final = pd.concat([df_concat, to_df_knn_uniform], axis='columns')
df_knn_distance_final = pd.concat([df_concat, to_df_knn_distance], axis='columns')


In [112]:
df_knn_uniform_final

Unnamed: 0,고객ID,매출일자(배송완료일자),타겟제품명,금액,멤버십고객여부,케어솔루션품목수,케어십가입여부,케어십품목수,최근3년구매금액,최근구매채널,Only&Best품목수,건강관리가전품목수,연령대,주거형태,아파트시세,아파트평형대,포인트사용빈도,보유멤버십포인트,멤버십가입경로,제휴카드보유여부,프리미엄카드보유여부,제휴가입여부,멤버십앱설치여부,거주광역명,거주시군구명,년도,월
0,34410,2022-04-29,에어컨,4.0,0.0,0.0,0.0,0.0,6.0,2.0,1.0,1.0,30대,아파트,1.0,2.0,0.0,0.0,매장,0.0,0.0,0.0,0.0,전북,전주시 덕진구,2022.0,4.0
1,34410,2020-05-25,세탁기,2.0,0.0,0.0,0.0,0.0,6.0,2.0,1.0,1.0,30대,아파트,1.0,2.0,0.0,0.0,매장,0.0,0.0,0.0,0.0,전북,전주시 덕진구,2020.0,5.0
2,34410,2022-04-30,건조기,2.0,0.0,0.0,0.0,0.0,6.0,2.0,1.0,1.0,30대,아파트,1.0,2.0,0.0,0.0,매장,0.0,0.0,0.0,0.0,전북,전주시 덕진구,2022.0,4.0
3,34410,2022-05-02,뷰티기기,2.0,0.0,0.0,0.0,0.0,6.0,2.0,1.0,1.0,30대,아파트,1.0,2.0,0.0,0.0,매장,0.0,0.0,0.0,0.0,전북,전주시 덕진구,2022.0,5.0
4,34410,2022-04-23,TV,3.0,0.0,0.0,0.0,0.0,6.0,2.0,1.0,1.0,30대,아파트,1.0,2.0,0.0,0.0,매장,0.0,0.0,0.0,0.0,전북,전주시 덕진구,2022.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
532620,102635,2023-03-17,세탁기,2.0,0.0,0.0,0.0,0.0,6.0,3.0,3.0,2.0,40대,아파트,2.0,3.0,0.0,0.0,매장,0.0,0.0,0.0,0.0,경기,용인시 처인구,2023.0,3.0
532621,102635,2023-03-17,건조기,2.0,0.0,0.0,0.0,0.0,6.0,3.0,3.0,2.0,40대,아파트,2.0,3.0,0.0,0.0,매장,0.0,0.0,0.0,0.0,경기,용인시 처인구,2023.0,3.0
532622,98135,2023-02-17,공기청정기,2.0,0.0,1.0,0.0,0.0,2.0,4.0,0.0,1.0,50대,아파트,2.0,2.0,0.0,0.0,매장,0.0,0.0,0.0,0.0,서울,강남구,2023.0,2.0
532623,98135,2019-02-21,건조기,2.0,0.0,1.0,0.0,0.0,2.0,4.0,0.0,1.0,50대,아파트,2.0,2.0,0.0,0.0,매장,0.0,0.0,0.0,0.0,서울,강남구,2019.0,2.0


In [113]:
# Save the final frames (ID and date columns plus the imputed data).
df_knn_uniform_final.to_csv('../data/결측치_output/결측치_KNNImputer_uniform25_final.csv', encoding='utf-8-sig')
df_knn_distance_final.to_csv('../data/결측치_output/결측치_KNNImputer_distance25_final.csv', encoding='utf-8-sig')

In [None]:
# df_knn2.to_csv('../data/결측치_KNNImpter_10.csv', encoding='utf-8-sig')