# Library

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os, re

# fold path

In [4]:
ods_fold = 'D:/data/big2/ODS'
col_fold = 'C:/Users/kbjung/Documents/GitHub/wabotech/car_big_data_2/analysis/ODSvsSTD'

# 1\. 등록정보(ODS_CEG_CAR_MIG)

In [10]:
# about 9m 45s
name = 'CEG_CAR_MIG'
file_name = f'ODS_{name}.csv'
df = pd.read_csv(os.path.join(ods_fold, file_name), low_memory=False, index_col=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29534047 entries, 0 to 29534046
Data columns (total 32 columns):
 #   Column             Dtype  
---  ------             -----  
 0   VHMNO              object 
 1   VHRNO              object 
 2   BSPL_LEDO_CD       int64  
 3   OWNR_SE            object 
 4   VHCTY              object 
 5   PURPS              object 
 6   NOW_OWNR_NM        object 
 7   NOW_MBERNO         object 
 8   BSPL_ADRS          object 
 9   OWNR_ADRS          object 
 10  VIN                object 
 11  FST_REG_DE         int64  
 12  VHCL_YRIDNW        int64  
 13  VHCL_MNFCT_DE      object 
 14  EMIS_INSP_EFCT_DE  float64
 15  SRCMNNO            object 
 16  EMIS_CRTCNO        object 
 17  EMIS_GRD           object 
 18  LEM_YN             object 
 19  LEM_KND            float64
 20  LEM_STRCHG_YN      object 
 21  BSPL_ZIP           float64
 22  VHRNO_NUM_INFO     int64  
 23  VHCL_ERSR_YN       object 
 24  FST_OWNR_NM        object 
 25  FST_MBERNO      

In [11]:
file_name = f'[정제]{name}.xlsx'
col_df = pd.read_excel(os.path.join(col_fold, file_name), sheet_name='ODS_col')
col_df.head()

Unnamed: 0,컬럼,내용
0,UPDT_DT,수정일시
1,OWNR_SE,소유자구분
2,VHRNO,차량등록번호
3,BSPL_LEDO_CD,본거지법정동코드
4,CRTN_DT,생성일시


In [12]:
col_dict = {a:b for a,b in col_df[['컬럼', '내용']].values}
col_dict

{'UPDT_DT': '수정일시',
 'OWNR_SE': '소유자구분',
 'VHRNO': '차량등록번호',
 'BSPL_LEDO_CD': '본거지법정동코드',
 'CRTN_DT': '생성일시',
 'PURPS': '용도',
 'EMIS_INSP_EFCT_DE': '배출가스검사유효일자',
 'EMIS_CRTCNO': '배출가스인증번호',
 'NOW_MBERNO': '현재회원번호',
 'ACQS_DE': '취득일자',
 'ACQS_AMT': '취득금액',
 'FST_MBERNO': '최초회원번호',
 'FST_REG_DE': '최초등록일자',
 'VHCTY': '차종',
 'VHRNO_NUM_INFO': '차량등록번호숫자정보',
 'VHMNO': '차량관리번호',
 'VHCL_MNFCT_DE': '차량제작일자',
 'VHCL_YRIDNW': '차량연식',
 'VHCL_ERSR_YN': '차량말소YN',
 'VIN': '차대번호',
 'SRCMNNO': '제원관리번호',
 'EMIS_GRD': '배출가스등급',
 'LEM_KND': '저공해조치종류',
 'LEM_STRCHG_YN': '저공해조치구조변경YN',
 'LEM_YN': '저공해조치YN(미사용)',
 'NOW_OWNR_NM': '미정',
 'BSPL_ADRS': '미정',
 'OWNR_ADRS': '미정',
 'BSPL_ZIP': '미정',
 'FST_OWNR_NM': '미정',
 'RMK': '미정',
 'USER_ID': '미정'}

In [13]:
df.columns = [col_dict[x] for x in df.columns]
df.columns

Index(['차량관리번호', '차량등록번호', '본거지법정동코드', '소유자구분', '차종', '용도', '미정', '현재회원번호',
       '미정', '미정', '차대번호', '최초등록일자', '차량연식', '차량제작일자', '배출가스검사유효일자', '제원관리번호',
       '배출가스인증번호', '배출가스등급', '저공해조치YN(미사용)', '저공해조치종류', '저공해조치구조변경YN', '미정',
       '차량등록번호숫자정보', '차량말소YN', '미정', '최초회원번호', '취득일자', '취득금액', '미정', '생성일시',
       '수정일시', '미정'],
      dtype='object')

## 차량등록번호

In [18]:
df.shape

(29534047, 32)

In [14]:
# 결측치 확인
df['차량등록번호'].isnull().sum()

0

In [19]:
p1 = re.compile('[가-힣]{2}[0-9]{1,2}[가-힣]{1}[0-9]{4}')
p2 = re.compile('[0-9]{2}[가-힣]{1}[0-9]{4}')
wrong_vhrno_list = []
for one in tqdm(df['차량등록번호']):
    if not p1.search(str(one)) and not p2.search(str(one)):
        wrong_vhrno_list.append(one)
len(wrong_vhrno_list)

100%|██████████| 29534047/29534047 [00:36<00:00, 806311.78it/s]


1

In [20]:
wrong_vhrno_list

['서울703**575']

## 본거지 법정동 코드

In [21]:
df['본거지법정동코드'].isnull().sum()

0

In [23]:
wrong_localcode_list = []
for one in tqdm(df['본거지법정동코드']):
    if len(str(one)) != 10:
        wrong_localcode_list.append(one)
len(wrong_localcode_list)

100%|██████████| 29534047/29534047 [00:18<00:00, 1594328.87it/s]


0

In [24]:
p = re.compile('[0-9]{10}')
wrong_localcode_list = []
for one in tqdm(df['본거지법정동코드']):
    if not p.search(str(one)):
        wrong_localcode_list.append(one)
len(wrong_localcode_list)

100%|██████████| 29534047/29534047 [00:27<00:00, 1061012.15it/s]


0

## 배출가스검사유효일자

In [25]:
df['배출가스검사유효일자'].head()

0    19850225.0
1    19760706.0
2           NaN
3    20191220.0
4    19930227.0
Name: 배출가스검사유효일자, dtype: float64

In [26]:
df['배출가스검사유효일자'].isnull().sum()

635

In [27]:
null_idx = df[df['배출가스검사유효일자'].isnull() == True].index
len(null_idx)

635

In [30]:
p = re.compile('[0-9]{8}[.][0]')
wrong_gascheckdate_idx = []
for i, one in tqdm(enumerate(df['배출가스검사유효일자'].to_list())):
    if not p.search(str(one)):
        wrong_gascheckdate_idx.append(i)
len(wrong_gascheckdate_idx)

29534047it [00:35, 831502.03it/s]


637

In [31]:
set(wrong_gascheckdate_idx) - set(null_idx)

{4124, 15938}

In [33]:
df.loc[list(set(wrong_gascheckdate_idx) - set(null_idx)), '배출가스검사유효일자']

15938    1990604.0
4124     1990118.0
Name: 배출가스검사유효일자, dtype: float64

In [34]:
df.loc[list(set(wrong_gascheckdate_idx) - set(null_idx)), ['배출가스검사유효일자', '차량연식', '배출가스인증번호']]

Unnamed: 0,배출가스검사유효일자,차량연식,배출가스인증번호
15938,1990604.0,1983,
4124,1990118.0,1979,


In [36]:
df.loc[null_idx, ['차대번호', '배출가스검사유효일자', '차량연식', '배출가스인증번호']]

Unnamed: 0,차대번호,배출가스검사유효일자,차량연식,배출가스인증번호
2,10762211783,,1976,
8,10762211682,,1976,
10,10762212246,,1976,
47,10762213150,,1976,
52,03761307405,,1976,
...,...,...,...,...
8087297,KNAKU815BAA058010,,2010,9MY-KM-14-22
8251095,KMFZCY7JAAU616346,,2010,7MY-HD-14-68
20922089,KMHK4815GJU081024,,2018,HMY-HD-14-31
26101534,KNANE81BBMS034872,,2021,JMY-KM-14-29


In [45]:
len('KMHK4815GJU081024')

17

In [47]:
df.loc[null_idx, ['차대번호', '배출가스검사유효일자', '차량연식', '배출가스인증번호']].isnull().sum()

차대번호            0
배출가스검사유효일자    635
차량연식            0
배출가스인증번호      540
dtype: int64

# 정기검사

In [39]:
# about 3m 34s
file_name = '[ODS]정기검사(2022.12.28).csv'
ins = pd.read_csv(os.path.join(ods_fold, file_name), low_memory=False)
ins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22006426 entries, 0 to 22006425
Data columns (total 39 columns):
 #   Column     Dtype  
---  ------     -----  
 0   사용연료       object 
 1   주행거리       object 
 2   차명         object 
 3   차량번호       object 
 4   차대번호       object 
 5   차종         object 
 6   차량연식       int64  
 7   엔진형식       object 
 8   검사방법       object 
 9   검사일자       int64  
 10  검사종류       object 
 11  검사판정시각     object 
 12  검사판정       object 
 13  산소값        float64
 14  이산화탄소값     float64
 15  무부하매연판정1   object 
 16  무부하매연판정2   object 
 17  무부하매연판정3   object 
 18  무부하매연판정4   object 
 19  무부하매연판정5   object 
 20  무부하매연판정6   object 
 21  무부하매연허용치1  float64
 22  무부하매연허용치2  float64
 23  무부하매연허용치3  float64
 24  무부하매연허용치4  float64
 25  무부하매연허용치5  float64
 26  무부하매연허용치6  float64
 27  무부하매연측정치1  float64
 28  무부하매연측정치2  float64
 29  무부하매연측정치3  float64
 30  무부하매연측정치4  float64
 31  무부하매연측정치5  float64
 32  무부하매연측정치6  float64
 33  차량용도       object 
 34  최대출력허용치    float

In [41]:
ins['검사일자'].head()

0    20190102
1    20190102
2    20190102
3    20190102
4    20190102
Name: 검사일자, dtype: int64

In [46]:
ins.loc[ins['차대번호'] == 'KMHK4815GJU081024', '검사일자']

18083379    20220124
Name: 검사일자, dtype: int64

## 배출가스 인증번호

In [48]:
df['배출가스인증번호'].isnull().sum()

3414724

## 차종

### 등록&제원정보

In [59]:
# about 9m 24s
file_name = '[ODS]등록정보&제원정보(2022.12.28).csv'
df = pd.read_csv(os.path.join(ods_fold, file_name), encoding='cp949', low_memory=False)
df.shape

(29534047, 49)

In [60]:
df.loc[df['배출가스인증번호'].isnull() == True, ['차대번호', '배출가스인증번호', '자동차형식', '엔진형식', '차명']].head()

Unnamed: 0,차대번호,배출가스인증번호,자동차형식,엔진형식,차명
0,10762211742,,EP-01C,GTS,포니
1,KG68T122688*,,UP26LV,,퍼브리카
2,10762211783,,EP-01C,GTS,포니
3,PN71C018511000000,,,10781152968,JAM담프
4,KMHLA11FPGU060639,,FX-2,G4AGG 220982,포니엑셀


In [63]:
df.loc[df['자동차형식'] == 'FX-2', '배출가스인증번호'].value_counts()

LMY-HD-08    32
KMY-HD-02     3
KMY-HD-04     1
Name: 배출가스인증번호, dtype: int64

## 차량 제작 일자

## 차량 연식

## 차량 말소 YN

## 차대번호

## 제원관리번호

## 배출가스 등급