# Library

In [147]:
import pandas as pd
import os
import numpy as np
from datetime import datetime

In [4]:
adf = pd.DataFrame({'시도':['세종', '서울'], '시군구':[np.nan, '종로구'], '대수':[12, 100]})
bdf = pd.DataFrame({'시도':['세종', '서울'], '시군구':[np.nan, '종로구'], '대수2':[5, 10]})
adf.shape, bdf.shape

((2, 3), (2, 3))

In [5]:
tdf = adf.merge(bdf, on=['시도', '시군구'], how='left')
tdf

Unnamed: 0,시도,시군구,대수,대수2
0,세종,,12,5
1,서울,종로구,100,10


# Folders

In [9]:
fold = 'D:/data/big3'
raw_fold = 'D:/data/big3/raw'
raw4_fold = 'D:/data/processing/big3/raw/BD4'
an_fold = 'D:/data/big3/analysis'
df_fold = 'D:/data/big3/df'
folds = {'fold':fold, 'raw_fold':raw_fold, 'an_fold':an_fold, 'df_fold':df_fold}
folds

{'fold': 'D:/data/big3',
 'raw_fold': 'D:/data/big3/raw',
 'an_fold': 'D:/data/big3/analysis',
 'df_fold': 'D:/data/big3/df'}

# Load

## 전문정비(TB_ERR_RES)

In [3]:
name = 'STD_TB_ERR_RES(20230918)'
file_name = f'{name}.csv'
res = pd.read_csv(os.path.join(raw_fold, file_name))
res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54617 entries, 0 to 54616
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   DRVNG_DSTNC            54617 non-null  float64
 1   VHRNO                  54617 non-null  object 
 2   VIN                    51051 non-null  object 
 3   LOD_COX_MEVLU          53884 non-null  object 
 4   LOD_HC_MEVLU           53886 non-null  object 
 5   LOD_NOX_MEVLU          53883 non-null  object 
 6   LOD_SMO_MEVLU          53941 non-null  object 
 7   LOD_TQ_MEVLU           53819 non-null  object 
 8   OVHUL_YMD              54617 non-null  int64  
 9   SPCY_OVHUL_BIZPLC_NM   54617 non-null  object 
 10  OVHUL_RCPT_NO          54617 non-null  int64  
 11  PRCINSP_RINSP_RCPT_NO  54617 non-null  object 
 12  NOLOD_COX_MEVLU        53870 non-null  object 
 13  NOLOD_HC_MEVLU         53871 non-null  object 
 14  NOLOD_AIR_EXCRT_MEVLU  28484 non-null  object 
 15  NO

In [4]:
res.columns.to_list()

['DRVNG_DSTNC',
 'VHRNO',
 'VIN',
 'LOD_COX_MEVLU',
 'LOD_HC_MEVLU',
 'LOD_NOX_MEVLU',
 'LOD_SMO_MEVLU',
 'LOD_TQ_MEVLU',
 'OVHUL_YMD',
 'SPCY_OVHUL_BIZPLC_NM',
 'OVHUL_RCPT_NO',
 'PRCINSP_RINSP_RCPT_NO',
 'NOLOD_COX_MEVLU',
 'NOLOD_HC_MEVLU',
 'NOLOD_AIR_EXCRT_MEVLU',
 'NOLOD_SMO_MEVLU']

In [5]:
cdict = {
    'DRVNG_DSTNC':'주행거리',
    'VHRNO':'자동차등록번호',
    'VIN':'차대번호',
    'LOD_COX_MEVLU':'부하_일산화탄소_측정치',
    'LOD_HC_MEVLU':'부하_탄화수소_측정치',
    'LOD_NOX_MEVLU':'부하_질소산화물_측정치',
    'LOD_SMO_MEVLU':'부하_매연_측정치',
    'LOD_TQ_MEVLU':'부하_토크_측정치',
    'OVHUL_YMD':'정비일자',
    'SPCY_OVHUL_BIZPLC_NM':'전문정비사업소명',
    'OVHUL_RCPT_NO':'정비접수번호',
    'PRCINSP_RINSP_RCPT_NO':'정밀재검사접수번호',
    'NOLOD_COX_MEVLU':'무부하_일산화탄소_측정치',
    'NOLOD_HC_MEVLU':'무부하_탄화수소_측정치',
    'NOLOD_AIR_EXCRT_MEVLU':'무부하_공기과잉률_측정치',
    'NOLOD_SMO_MEVLU':'무부하_매연_측정치', 
}
resr = res.rename(columns=cdict)
resr.columns

Index(['주행거리', '자동차등록번호', '차대번호', '부하_일산화탄소_측정치', '부하_탄화수소_측정치',
       '부하_질소산화물_측정치', '부하_매연_측정치', '부하_토크_측정치', '정비일자', '전문정비사업소명', '정비접수번호',
       '정밀재검사접수번호', '무부하_일산화탄소_측정치', '무부하_탄화수소_측정치', '무부하_공기과잉률_측정치',
       '무부하_매연_측정치'],
      dtype='object')

## 전문정비 상세(TB_ERR_RES_DET)

In [6]:
name = 'STD_TB_ERR_RES_DET(20230918)'
file_name = f'{name}.csv'
rdt = pd.read_csv(os.path.join(raw_fold, file_name))
rdt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106819 entries, 0 to 106818
Data columns (total 8 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   VHRNO                  106818 non-null  object 
 1   VIN                    94670 non-null   object 
 2   OVHUL_CN               106817 non-null  object 
 3   OVHUL_YMD              106818 non-null  float64
 4   SPCY_OVHUL_BIZPLC_NM   106818 non-null  object 
 5   OVHUL_CMPNT_NM         106817 non-null  object 
 6   OVHUL_RCPT_NO          106818 non-null  float64
 7   PRCINSP_RINSP_RCPT_NO  0 non-null       float64
dtypes: float64(3), object(5)
memory usage: 6.5+ MB


In [7]:
rdt.columns.to_list()

['VHRNO',
 'VIN',
 'OVHUL_CN',
 'OVHUL_YMD',
 'SPCY_OVHUL_BIZPLC_NM',
 'OVHUL_CMPNT_NM',
 'OVHUL_RCPT_NO',
 'PRCINSP_RINSP_RCPT_NO']

In [8]:
cdict = {
    'VHRNO':'자동차등록번호',
    'VIN':'차대번호',
    'OVHUL_CN':'정비내용',
    'OVHUL_YMD':'정비일자',
    'SPCY_OVHUL_BIZPLC_NM':'전문정비사업소명',
    'OVHUL_CMPNT_NM':'정비부품명',
    'OVHUL_RCPT_NO':'정비접수번호',
    'PRCINSP_RINSP_RCPT_NO':'정밀재검사접수번호', 
}
rdtr = rdt.rename(columns=cdict)
rdtr.columns

Index(['자동차등록번호', '차대번호', '정비내용', '정비일자', '전문정비사업소명', '정비부품명', '정비접수번호',
       '정밀재검사접수번호'],
      dtype='object')

In [9]:
set(resr.columns) & set(rdtr.columns)

{'자동차등록번호', '전문정비사업소명', '정밀재검사접수번호', '정비일자', '정비접수번호', '차대번호'}

## 전문정비사업자_2회부적합정비결과등록_(20200101~20230731)_v10

In [10]:
err = pd.read_table(os.path.join(raw4_fold, '전문정비사업자_2회부적합정비결과등록_(20200101~20230731)_v10.txt'), encoding='cp949')
err.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26102 entries, 0 to 26101
Data columns (total 71 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   No              26102 non-null  int64  
 1   접수번호            26102 non-null  object 
 2   정비일자            26102 non-null  object 
 3   차대번호            26102 non-null  object 
 4   차량번호            26102 non-null  object 
 5   차량소유자명          26102 non-null  object 
 6   차종              26102 non-null  object 
 7   차명              26102 non-null  object 
 8   차량연료            26102 non-null  object 
 9   차량연식(년도)        26102 non-null  object 
 10  차량용도            26102 non-null  object 
 11  차령(년수)          26102 non-null  object 
 12  주행거리(km)        26102 non-null  object 
 13  차량중량(kg)        26102 non-null  object 
 14  총중량(kg)         26096 non-null  object 
 15  엔진출력(ps/rpm)    26102 non-null  object 
 16  엔진형식            26102 non-null  object 
 17  엔진배기량(cc)       26102 non-null 

  err = pd.read_table(os.path.join(raw4_fold, '전문정비사업자_2회부적합정비결과등록_(20200101~20230731)_v10.txt'), encoding='cp949')


In [30]:
err.to_csv(os.path.join(raw4_fold, '전문정비사업자_2회부적합정비결과등록_(20200101~20230731)_v10.csv'), index=False)

In [16]:
num = 0
for one in err['정비내역'].unique():
    try:
        if len(one.split(',')) >= num:
            num = len(one.split(','))
    except:
        pass
num

39

In [19]:
err['정비내역'].head()

0    점화플러그 점검/조정, 점화배선 점검/조정, 배전기(Distributor) 점검/조...
1            엔진오일 교환, 에어크리너 분해/탈착, 드로틀바디 분해/탈착, 소음기 교환
2                                               소음기 교환
3                      점화플러그 점검/조정, 점화시기(분사시기)조정 점검/조정
4           엔진오일 교환, 공기유량센서 점검/조정, 에어크리너 교환, 배기관 분해/탈착
Name: 정비내역, dtype: object

In [117]:
temp = err['정비내역'].str.split(',')
temp[0]

['점화플러그 점검/조정',
 ' 점화배선 점검/조정',
 ' 배전기(Distributor) 점검/조정',
 ' 점화시기(분사시기)조정 점검/조정',
 ' 인젝터(휘발유) 점검/조정',
 ' 기화기(믹서포함) 점검/조정',
 ' 연료압력조절기 점검/조정',
 ' 연료휠터 점검/조정',
 ' 연료분사펌프(조정기 조정_봉인) 점검/조정',
 ' 노즐(디젤인젝터포함) 점검/조정',
 ' 공연비(혼합비) 점검/조정',
 ' 엔진정비 점검/조정',
 ' 엔진오일 점검/조정',
 ' 공회전속도 점검/조정',
 ' 전자제어장치(ECU) 점검/조정',
 ' 정화용촉매(Catalytic Converter) 점검/조정',
 ' 디젤산화촉매(DOC) 정비불가능',
 ' 매연여과장치(DPF) 정비불가능',
 ' EGR밸브 정비불가능',
 ' 배출가스재순환장치 EGR제어용 서머밸브 정비불가능',
 ' 정화조절밸브(Purge Control Valve) 점검/조정',
 ' 연료증발가스방지장치 EGR제어용 서머밸브 정비불가능',
 ' PVC밸브 점검/조정',
 ' 블로바이가스 호스 점검/조정',
 ' 산소센서 점검/조정',
 ' 공기유량센서 점검/조정',
 ' 스로틀포지션센서 점검/조정',
 ' 냉각수온센서 점검/조정',
 ' 에어크리너 점검/조정',
 ' 드로틀바디 점검/조정',
 ' 과급기 정비불가능',
 ' 소음기 점검/조정',
 ' 배기관 점검/조정']

In [120]:
temp_set = set()
for one in temp:
    try:
        temp_set = temp_set | set(one)
    except:
        pass
temp_set

{' EGR밸브 교환',
 ' EGR밸브 분해/탈착',
 ' EGR밸브 점검/조정',
 ' EGR밸브 정비불가능',
 ' PVC밸브 교환',
 ' PVC밸브 분해/탈착',
 ' PVC밸브 점검/조정',
 ' PVC밸브 정비불가능',
 ' 공기유량센서 교환',
 ' 공기유량센서 분해/탈착',
 ' 공기유량센서 점검/조정',
 ' 공기유량센서 정비불가능',
 ' 공연비(혼합비) 교환',
 ' 공연비(혼합비) 분해/탈착',
 ' 공연비(혼합비) 점검/조정',
 ' 공연비(혼합비) 정비불가능',
 ' 공회전속도 교환',
 ' 공회전속도 분해/탈착',
 ' 공회전속도 점검/조정',
 ' 공회전속도 정비불가능',
 ' 과급기 교환',
 ' 과급기 분해/탈착',
 ' 과급기 점검/조정',
 ' 과급기 정비불가능',
 ' 기화기(믹서포함) 교환',
 ' 기화기(믹서포함) 분해/탈착',
 ' 기화기(믹서포함) 점검/조정',
 ' 기화기(믹서포함) 정비불가능',
 ' 냉각계통 교환',
 ' 냉각계통 분해/탈착',
 ' 냉각계통 점검/조정',
 ' 냉각계통 정비불가능',
 ' 냉각수온센서 교환',
 ' 냉각수온센서 분해/탈착',
 ' 냉각수온센서 점검/조정',
 ' 냉각수온센서 정비불가능',
 ' 노즐(디젤인젝터포함) 교환',
 ' 노즐(디젤인젝터포함) 분해/탈착',
 ' 노즐(디젤인젝터포함) 점검/조정',
 ' 노즐(디젤인젝터포함) 정비불가능',
 ' 드로틀바디 교환',
 ' 드로틀바디 분해/탈착',
 ' 드로틀바디 점검/조정',
 ' 드로틀바디 정비불가능',
 ' 디젤산화촉매(DOC) 교환',
 ' 디젤산화촉매(DOC) 분해/탈착',
 ' 디젤산화촉매(DOC) 점검/조정',
 ' 디젤산화촉매(DOC) 정비불가능',
 ' 매연여과장치(DPF) 교환',
 ' 매연여과장치(DPF) 분해/탈착',
 ' 매연여과장치(DPF) 점검/조정',
 ' 매연여과장치(DPF) 정비불가능',
 ' 배기관 교환',
 ' 배기관 분해/탈착',
 ' 배기관 점검/조정',
 ' 배기관 정비불가능',
 '

In [121]:
temp_list = []
for one in temp_set:
    temp_list.append(one.strip())
temp_list

['공회전속도 교환',
 '연료휠터 분해/탈착',
 '공연비(혼합비) 점검/조정',
 '블로바이가스 호스 분해/탈착',
 '산소센서 분해/탈착',
 '소음기 점검/조정',
 '점화시기(분사시기)조정 분해/탈착',
 '연료증발가스방지장치 EGR제어용 서머밸브 분해/탈착',
 '정화용촉매(Catalytic Converter) 교환',
 '드로틀바디 교환',
 '연료휠터 교환',
 '엔진오일 분해/탈착',
 '드로틀바디 점검/조정',
 '디젤산화촉매(DOC) 교환',
 '배기관 점검/조정',
 '매연여과장치(DPF) 점검/조정',
 '연료분사펌프(조정기 조정_봉인) 교환',
 '연료휠터 정비불가능',
 '엔진오일 분해/탈착',
 '노즐(디젤인젝터포함) 분해/탈착',
 '냉각계통 정비불가능',
 '공연비(혼합비) 분해/탈착',
 '연료압력조절기 교환',
 '배기관 점검/조정',
 '점화플러그 점검/조정',
 '연료증발가스방지장치 EGR제어용 서머밸브 교환',
 'PVC밸브 분해/탈착',
 '공회전속도 점검/조정',
 '에어크리너 교환',
 '배기관 분해/탈착',
 '연료분사펌프(조정기 조정_봉인) 분해/탈착',
 '배전기(Distributor) 정비불가능',
 '블로바이가스 호스 정비불가능',
 '점화시기(분사시기)조정 정비불가능',
 '매연여과장치(DPF) 분해/탈착',
 'PVC밸브 교환',
 '점화배선 분해/탈착',
 '연료분사펌프(조정기 조정_봉인) 교환',
 '공기유량센서 점검/조정',
 '소음기 교환',
 '연료증발가스방지장치 EGR제어용 서머밸브 점검/조정',
 '정화조절밸브(Purge Control Valve) 분해/탈착',
 '디젤산화촉매(DOC) 분해/탈착',
 '정화조절밸브(Purge Control Valve) 교환',
 '드로틀바디 정비불가능',
 '연료분사펌프(조정기 조정_봉인) 점검/조정',
 '정화조절밸브(Purge Control Valve) 점검/조정',
 '디젤산화촉매(DOC) 정비불가능',
 '인젝터(휘발유) 점검/조정',
 '정화조절

In [122]:
len(temp_list)

244

In [123]:
len(set(temp_list))

136

In [125]:
temp_set2 = set(temp_list)
len(temp_set2)

136

In [126]:
err['정비내역'].head()

0    점화플러그 점검/조정, 점화배선 점검/조정, 배전기(Distributor) 점검/조...
1            엔진오일 교환, 에어크리너 분해/탈착, 드로틀바디 분해/탈착, 소음기 교환
2                                               소음기 교환
3                      점화플러그 점검/조정, 점화시기(분사시기)조정 점검/조정
4           엔진오일 교환, 공기유량센서 점검/조정, 에어크리너 교환, 배기관 분해/탈착
Name: 정비내역, dtype: object

In [127]:
err_copy = err.copy()
err_copy.shape

(26102, 71)

In [129]:
for one in temp_set2:
    err_copy[one] = np.nan
err_copy.shape

(26102, 207)

In [82]:
err_copy.head()

Unnamed: 0,No,접수번호,정비일자,차대번호,차량번호,차량소유자명,차종,차명,차량연료,차량연식(년도),...,엔진오일 점검/조정,냉각계통 분해/탈착,점화배선 정비불가능,소음기 정비불가능,드로틀바디 분해/탈착,연료분사펌프(조정기 조정_봉인) 정비불가능,전자제어장치(ECU) 분해/탈착,엔진오일 정비불가능,전자제어장치(ECU) 정비불가능,정화용촉매(Catalytic Converter) 정비불가능
0,1,ZH51_20230731_00001,2023-07-31,KMHEC41LBCA364580,07조5174,오성환,승용차일반형중형,쏘나타 (SONATA),엘피지,2012,...,,,,,,,,,,
1,2,ZM10_20230731_00001,2023-07-31,KMHF641NBKA192771,189무6484,김미례,승용차일반형대형,그랜저(GRANDEUR),엘피지,2019,...,,,,,,,,,,
2,3,ZK20_20230731_00001,2023-07-31,KMHEU41MP6A120421,23가1130,김생찬,승용차일반형중형,쏘나타(SONATA),엘피지,2006,...,,,,,,,,,,
3,4,ZK20_20230731_00002,2023-07-31,KLAJF69YDBK168156,56어5370,김영민,승용차일반형중형,크루즈 2.0 디젤,경유,2011,...,,,,,,,,,,
4,5,ZP06_20230731_00001,2023-07-31,KMFZSS7JP7U304470,80다5421,김현태,화물차특수용도형소형,포터Ⅱ슈퍼캡냉동탑차 (PORTERⅡ),경유,2007,...,,,,,,,,,,


In [131]:
err_copy['정비내역'].head()

0    점화플러그 점검/조정, 점화배선 점검/조정, 배전기(Distributor) 점검/조...
1            엔진오일 교환, 에어크리너 분해/탈착, 드로틀바디 분해/탈착, 소음기 교환
2                                               소음기 교환
3                      점화플러그 점검/조정, 점화시기(분사시기)조정 점검/조정
4           엔진오일 교환, 공기유량센서 점검/조정, 에어크리너 교환, 배기관 분해/탈착
Name: 정비내역, dtype: object

In [138]:
for one in temp_set2:
    err_copy.loc[err_copy['정비내역'].str.contains(one, na=False, regex=False), one] = 'Y'
err_copy.shape

(26102, 207)

In [139]:
err_copy.head()

Unnamed: 0,No,접수번호,정비일자,차대번호,차량번호,차량소유자명,차종,차명,차량연료,차량연식(년도),...,점화배선 분해/탈착,공기유량센서 분해/탈착,노즐(디젤인젝터포함) 교환,엔진정비 점검/조정,블로바이가스 호스 점검/조정,EGR밸브 점검/조정,소음기 교환,정화조절밸브(Purge Control Valve) 정비불가능,소음기 점검/조정,배전기(Distributor) 점검/조정
0,1,ZH51_20230731_00001,2023-07-31,KMHEC41LBCA364580,07조5174,오성환,승용차일반형중형,쏘나타 (SONATA),엘피지,2012,...,,,,Y,Y,,,,Y,Y
1,2,ZM10_20230731_00001,2023-07-31,KMHF641NBKA192771,189무6484,김미례,승용차일반형대형,그랜저(GRANDEUR),엘피지,2019,...,,,,,,,Y,,,
2,3,ZK20_20230731_00001,2023-07-31,KMHEU41MP6A120421,23가1130,김생찬,승용차일반형중형,쏘나타(SONATA),엘피지,2006,...,,,,,,,Y,,,
3,4,ZK20_20230731_00002,2023-07-31,KLAJF69YDBK168156,56어5370,김영민,승용차일반형중형,크루즈 2.0 디젤,경유,2011,...,,,,,,,,,,
4,5,ZP06_20230731_00001,2023-07-31,KMFZSS7JP7U304470,80다5421,김현태,화물차특수용도형소형,포터Ⅱ슈퍼캡냉동탑차 (PORTERⅡ),경유,2007,...,,,,,,,,,,


In [143]:
err_copy.loc[0, ['점화플러그 점검/조정', '점화배선 점검/조정', '배전기(Distributor) 점검/조정', '점화시기(분사시기)조정 점검/조정', '인젝터(휘발유) 점검/조정', '기화기(믹서포함) 점검/조정', '연료압력조절기 점검/조정', '연료휠터 점검/조정', 
                 '연료분사펌프(조정기 조정_봉인) 점검/조정', '노즐(디젤인젝터포함) 점검/조정', '공연비(혼합비) 점검/조정', '엔진정비 점검/조정', '엔진오일 점검/조정', '공회전속도 점검/조정', '전자제어장치(ECU) 점검/조정', '정화용촉매(Catalytic Converter) 점검/조정', '디젤산화촉매(DOC) 정비불가능', '매연여과장치(DPF) 정비불가능', 'EGR밸브 정비불가능', '배출가스재순환장치 EGR제어용 서머밸브 정비불가능', '정화조절밸브(Purge Control Valve) 점검/조정', 
                 '연료증발가스방지장치 EGR제어용 서머밸브 정비불가능', 'PVC밸브 점검/조정', '블로바이가스 호스 점검/조정', '산소센서 점검/조정', '공기유량센서 점검/조정', '스로틀포지션센서 점검/조정', '냉각수온센서 점검/조정', '에어크리너 점검/조정', '드로틀바디 점검/조정', '과급기 정비불가능', '소음기 점검/조정', '배기관 점검/조정']].isnull().sum()

0

In [144]:
tt = err_copy.iloc[:, 71:].isnull().sum().reset_index()
tt[tt[0] != 26102]

Unnamed: 0,index,0
0,연료휠터 분해/탈착,26041
1,공연비(혼합비) 점검/조정,25001
2,블로바이가스 호스 분해/탈착,26080
3,산소센서 분해/탈착,26029
4,연료증발가스방지장치 EGR제어용 서머밸브 분해/탈착,26080
...,...,...
131,EGR밸브 점검/조정,22418
132,소음기 교환,25392
133,정화조절밸브(Purge Control Valve) 정비불가능,25851
134,소음기 점검/조정,22045


In [149]:
today_date = datetime.today().strftime('%Y%m%d')
today_date

'20231208'

### [출력] 전문정비 내역 수정

In [150]:
# 1.0s
err_copy.to_csv(os.path.join(raw4_fold, f'전문정비내역수정_{today_date}.csv'), index=False)

# EDA

## 전문정비

In [10]:
resr.head()

Unnamed: 0,주행거리,자동차등록번호,차대번호,부하_일산화탄소_측정치,부하_탄화수소_측정치,부하_질소산화물_측정치,부하_매연_측정치,부하_토크_측정치,정비일자,전문정비사업소명,정비접수번호,정밀재검사접수번호,무부하_일산화탄소_측정치,무부하_탄화수소_측정치,무부하_공기과잉률_측정치,무부하_매연_측정치
0,91971.0,서울82누4807,KN4JAV7S2YK017953,0.0,0.0,0.0,0.0,0.0,20080110,(주)삼흥자동차서비스,1,11a02008011000171,0.0,0.0,0.0,<NULL>
1,127433.0,서울82모2590,KNR1M2HEYXT009110,0.0,0.0,0.0,0.0,0.0,20080123,(주)삼흥자동차서비스,1,A0142008012300171,0.0,0.0,0.0,<NULL>
2,307964.0,서울85두1462,KMFXKS7BPWU240029,0.0,0.0,0.0,0.0,0.0,20080108,(주)삼흥자동차서비스,1,A0142008010800381,0.0,0.0,0.0,<NULL>
3,190617.0,06서3621,KPBLC3D81VP026259,0.0,0.0,0.0,42.0,54.0,20080118,(주)동신공업사-부산,1,B0432008011800051,0.0,0.0,0.0,<NULL>
4,185991.0,72노1317,KMJWWH7BPYU258482,0.0,0.0,0.0,52.0,46.0,20080107,(주)동신공업사-부산,1,B0222008010700521,0.0,0.0,0.0,<NULL>


In [11]:
resr.shape, len(resr['자동차등록번호'].unique())

((54617, 16), 46780)

In [12]:
resr.shape, len(resr['차대번호'].unique())

((54617, 16), 43771)

In [13]:
resr.shape, len(resr['정밀재검사접수번호'].unique())

((54617, 16), 54543)

In [14]:
dupl = resr[resr['정밀재검사접수번호'].duplicated(keep=False)].sort_values('정밀재검사접수번호')
dupl.shape

(121, 16)

In [15]:
dupl.head()

Unnamed: 0,주행거리,자동차등록번호,차대번호,부하_일산화탄소_측정치,부하_탄화수소_측정치,부하_질소산화물_측정치,부하_매연_측정치,부하_토크_측정치,정비일자,전문정비사업소명,정비접수번호,정밀재검사접수번호,무부하_일산화탄소_측정치,무부하_탄화수소_측정치,무부하_공기과잉률_측정치,무부하_매연_측정치
13643,81881.0,서울80머1122,KRV5KS3L4VC000023,0.0,0.0,0.0,28.0,172.0,20090513,서울특별시차량정비사업소,2,11a02009051300741,0.0,0.0,,0.00
13642,81881.0,서울80머1122,KRV5KS3L4VC000023,0.0,0.0,0.0,28.0,172.0,20090513,서울특별시차량정비사업소,1,11a02009051300741,0.0,0.0,,0.00
4393,148298.0,부산81나1038,KMFXKD7BP3U718984,0.0,0.0,0.0,21.0,50.0,20101001,삼덕정비(주),2,26302010092700851,0.0,0.0,,0.00
32445,148296.0,부산81나1038,KMFXKD7BP3U718984,0.0,0.0,0.0,57.0,0.0,20100928,삼덕정비(주),1,26302010092700851,0.0,0.0,,0.00
21179,36507.0,75루1661,KPDKFDNF14P000917,0.0,0.0,0.0,0.0,0.0,20071019,대구부란자,2,27302007100403841,0.0,0.0,24.5,<NULL>


In [16]:
dupl['정비접수번호'].unique()

array([2, 1, 3, 4, 7, 6, 5], dtype=int64)

In [19]:
# 동일한 정밀재검사접수번호에서 정비접수번호는 다른가?
for one in dupl['정밀재검사접수번호'].unique():
    temp = dupl[dupl['정밀재검사접수번호'] == one]
    if temp.shape[0] != len(temp['정비접수번호'].unique()):
        print(one, len(temp['정비접수번호'].unique()))

27302008072901661 2
27302010022601700  1
27602012083000681 2
29502007112900971 1
31302007082901261 3
31312011062200251 2
31332010070600031 3
31362011011000041 2
31362011092200091 1
41412010032500400  1
41652010030800300  1
A0062007040500321 2
C0152012012500031 1
C0282012061900131 1
E0302007082200031 2
F0032008021100361 3
H0952007062900271 2
J0042008040800231 1
J0142008081400081 2
O0172009080500111 1


In [21]:
dupl[dupl['정밀재검사접수번호'] == 'F0032008021100361']

Unnamed: 0,주행거리,자동차등록번호,차대번호,부하_일산화탄소_측정치,부하_탄화수소_측정치,부하_질소산화물_측정치,부하_매연_측정치,부하_토크_측정치,정비일자,전문정비사업소명,정비접수번호,정밀재검사접수번호,무부하_일산화탄소_측정치,무부하_탄화수소_측정치,무부하_공기과잉률_측정치,무부하_매연_측정치
9815,17628.0,88러1181,KMFLA19RPVU116617,0.0,0.0,0.0,0.0,0.0,20080214,한일브랸쟈,3,F0032008021100361,0.0,0.0,0.0,<NULL>
9813,17628.0,88러1181,KMFLA19RPVU116617,0.0,0.0,0.0,0.0,0.0,20080214,한일브랸쟈,1,F0032008021100361,0.0,0.0,0.0,<NULL>
9814,17628.0,88러1181,KMFLA19RPVU116617,0.0,0.0,0.0,0.0,0.0,20080214,한일브랸쟈,2,F0032008021100361,0.0,0.0,0.0,<NULL>
9373,17628.0,88러1181,KMFLA19RPVU116617,0.0,0.0,0.0,0.0,0.0,20080212,e-디젤시스템,1,F0032008021100361,0.0,0.0,0.0,<NULL>


In [None]:
# 정밀재검사접수번호, 정비접수번호, 전문정비사업소명, 차대번호 기준 중복 없는지 확인

In [33]:
resr.shape

(54617, 16)

In [34]:
resr.drop_duplicates(['정밀재검사접수번호', '정비접수번호', '전문정비사업소명', '차대번호']).shape

(54617, 16)

In [35]:
rdtr.shape

(106819, 8)

In [36]:
rdtr.columns

Index(['자동차등록번호', '차대번호', '정비내용', '정비일자', '전문정비사업소명', '정비부품명', '정비접수번호',
       '정밀재검사접수번호'],
      dtype='object')

In [37]:
# 정밀재검사접수번호, 정비접수번호, 전문정비사업소명, 차대번호 기준 중복 없는지 확인
rdtr.drop_duplicates(['정밀재검사접수번호', '정비접수번호', '전문정비사업소명', '차대번호']).shape

(47983, 8)

In [38]:
rdtr.head()

Unnamed: 0,자동차등록번호,차대번호,정비내용,정비일자,전문정비사업소명,정비부품명,정비접수번호,정밀재검사접수번호
0,경기87라5197,KMFXKD7BP1U464612,연료분사펌프(조속기조정봉인),20070208.0,(주)중산자동차정비검사소,연료장치,1.0,
1,경기5보7418,KMJFD37XPPU027845,연료분사펌프(조속기조정봉인),20060811.0,(주)삼광자동차중기,연료장치,1.0,
2,경기5보7418,KMJFD37XPPU027845,엔진오일,20060811.0,(주)삼광자동차중기,엔진계통,1.0,
3,경기5보7418,KMJFD37XPPU027845,공회전속도,20060811.0,(주)삼광자동차중기,엔진계통,1.0,
4,경기65러8780,KNAJA5265TA773125,연료분사펌프(조속기조정봉인),20061211.0,(주)삼광자동차중기,연료장치,1.0,


In [23]:
rdtr[rdtr['차대번호'] == 'KMFLA19RPVU116617']

Unnamed: 0,자동차등록번호,차대번호,정비내용,정비일자,전문정비사업소명,정비부품명,정비접수번호,정밀재검사접수번호
53775,88러1181,KMFLA19RPVU116617,연료분사펌프(조속기조정봉인),20080215.0,한일브랸쟈,연료장치,4.0,
56095,88러1181,KMFLA19RPVU116617,노즐(디젤인젝터포함),20080212.0,e-디젤시스템,연료장치,1.0,
56489,88러1181,KMFLA19RPVU116617,연료분사펌프(조속기조정봉인),20080215.0,한일브랸쟈,연료장치,5.0,


In [24]:
resr[resr['차대번호'] == 'KMFLA19RPVU116617']

Unnamed: 0,주행거리,자동차등록번호,차대번호,부하_일산화탄소_측정치,부하_탄화수소_측정치,부하_질소산화물_측정치,부하_매연_측정치,부하_토크_측정치,정비일자,전문정비사업소명,정비접수번호,정밀재검사접수번호,무부하_일산화탄소_측정치,무부하_탄화수소_측정치,무부하_공기과잉률_측정치,무부하_매연_측정치
9373,17628.0,88러1181,KMFLA19RPVU116617,0.0,0.0,0.0,0.0,0.0,20080212,e-디젤시스템,1,F0032008021100361,0.0,0.0,0.0,<NULL>
9678,17807.0,88러1181,KMFLA19RPVU116617,0.0,0.0,0.0,0.0,0.0,20080215,한일브랸쟈,5,F0032008021500021,0.0,0.0,0.0,<NULL>
9813,17628.0,88러1181,KMFLA19RPVU116617,0.0,0.0,0.0,0.0,0.0,20080214,한일브랸쟈,1,F0032008021100361,0.0,0.0,0.0,<NULL>
9814,17628.0,88러1181,KMFLA19RPVU116617,0.0,0.0,0.0,0.0,0.0,20080214,한일브랸쟈,2,F0032008021100361,0.0,0.0,0.0,<NULL>
9815,17628.0,88러1181,KMFLA19RPVU116617,0.0,0.0,0.0,0.0,0.0,20080214,한일브랸쟈,3,F0032008021100361,0.0,0.0,0.0,<NULL>
9816,17790.0,88러1181,KMFLA19RPVU116617,0.0,0.0,0.0,0.0,0.0,20080215,한일브랸쟈,4,F0032008021400261,0.0,0.0,0.0,<NULL>


In [26]:
rdtr['정밀재검사접수번호'].isnull().sum()

106819

In [27]:
# 어떤 열을 기준으로 병합 할 것인지 고민
# resr, rdtr : 차대번호, 정비접수번호, 전문정비사업소명
# 기준 df : resr

# Preprocessing

## 전문정비, 상세 병합
- 어떤 열을 기준으로 병합 할 것인지 고민
- resr, rdtr : 차대번호, 정비접수번호, 전문정비사업소명
- 기준 df : resr

In [25]:
resr.shape

(54617, 16)