# Library

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os, re

# Folder paths

In [2]:
# Folder the raw ODS CSV extracts are read from.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
ods_fold = 'D:/data/big2/ODS'
# Folder the '[정제]<table>.xlsx' column-description workbooks are read from.
# NOTE(review): absolute path inside one user's home directory.
col_fold = 'C:/Users/kbjung/Documents/GitHub/wabotech/car_big_data_2/analysis/ODSvsSTD'

In [51]:
# Decimal display option: render floats with two decimal places.
pd.options.display.float_format = '{:.2f}'.format
# To restore the default formatting: pd.reset_option('display.float_format')

# 1\. 등록정보(ODS_CEG_CAR_MIG)

In [3]:
# Load the raw registration extract; the first CSV column becomes the index.
# Took about 9m 45s when last run.
name = 'CEG_CAR_MIG'
file_name = f'ODS_{name}.csv'
csv_path = os.path.join(ods_fold, file_name)
df = pd.read_csv(csv_path, index_col=0, low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29534047 entries, 0 to 29534046
Data columns (total 32 columns):
 #   Column             Dtype  
---  ------             -----  
 0   VHMNO              object 
 1   VHRNO              object 
 2   BSPL_LEDO_CD       int64  
 3   OWNR_SE            object 
 4   VHCTY              object 
 5   PURPS              object 
 6   NOW_OWNR_NM        object 
 7   NOW_MBERNO         object 
 8   BSPL_ADRS          object 
 9   OWNR_ADRS          object 
 10  VIN                object 
 11  FST_REG_DE         int64  
 12  VHCL_YRIDNW        int64  
 13  VHCL_MNFCT_DE      object 
 14  EMIS_INSP_EFCT_DE  float64
 15  SRCMNNO            object 
 16  EMIS_CRTCNO        object 
 17  EMIS_GRD           object 
 18  LEM_YN             object 
 19  LEM_KND            float64
 20  LEM_STRCHG_YN      object 
 21  BSPL_ZIP           float64
 22  VHRNO_NUM_INFO     int64  
 23  VHCL_ERSR_YN       object 
 24  FST_OWNR_NM        object 
 25  FST_MBERNO      

In [4]:
# Read the 'ODS_col' sheet of the column-description workbook for the current table.
file_name = f'[정제]{name}.xlsx'
xlsx_path = os.path.join(col_fold, file_name)
col_df = pd.read_excel(xlsx_path, sheet_name='ODS_col')
col_df.head()

Unnamed: 0,컬럼,내용
0,UPDT_DT,수정일시
1,OWNR_SE,소유자구분
2,VHRNO,차량등록번호
3,BSPL_LEDO_CD,본거지법정동코드
4,CRTN_DT,생성일시


In [5]:
# Map each column code ('컬럼') to its description ('내용').
col_dict = dict(zip(col_df['컬럼'], col_df['내용']))
col_dict

{'UPDT_DT': '수정일시',
 'OWNR_SE': '소유자구분',
 'VHRNO': '차량등록번호',
 'BSPL_LEDO_CD': '본거지법정동코드',
 'CRTN_DT': '생성일시',
 'PURPS': '용도',
 'EMIS_INSP_EFCT_DE': '배출가스검사유효일자',
 'EMIS_CRTCNO': '배출가스인증번호',
 'NOW_MBERNO': '현재회원번호',
 'ACQS_DE': '취득일자',
 'ACQS_AMT': '취득금액',
 'FST_MBERNO': '최초회원번호',
 'FST_REG_DE': '최초등록일자',
 'VHCTY': '차종',
 'VHRNO_NUM_INFO': '차량등록번호숫자정보',
 'VHMNO': '차량관리번호',
 'VHCL_MNFCT_DE': '차량제작일자',
 'VHCL_YRIDNW': '차량연식',
 'VHCL_ERSR_YN': '차량말소YN',
 'VIN': '차대번호',
 'SRCMNNO': '제원관리번호',
 'EMIS_GRD': '배출가스등급',
 'LEM_KND': '저공해조치종류',
 'LEM_STRCHG_YN': '저공해조치구조변경YN',
 'LEM_YN': '저공해조치YN(미사용)',
 'NOW_OWNR_NM': '미정',
 'BSPL_ADRS': '미정',
 'OWNR_ADRS': '미정',
 'BSPL_ZIP': '미정',
 'FST_OWNR_NM': '미정',
 'RMK': '미정',
 'USER_ID': '미정'}

In [6]:
# Relabel the columns with their descriptions from col_dict.
# .get(x, x) keeps any label that has no entry in col_dict, so re-running this
# cell after the columns were already relabelled (or meeting a column missing
# from the description sheet) leaves the label as-is instead of raising KeyError.
# NOTE(review): several codes share the placeholder description '미정', so the
# resulting labels are not unique; select those columns by position if needed.
df.columns = [col_dict.get(x, x) for x in df.columns]
df.columns

Index(['차량관리번호', '차량등록번호', '본거지법정동코드', '소유자구분', '차종', '용도', '미정', '현재회원번호',
       '미정', '미정', '차대번호', '최초등록일자', '차량연식', '차량제작일자', '배출가스검사유효일자', '제원관리번호',
       '배출가스인증번호', '배출가스등급', '저공해조치YN(미사용)', '저공해조치종류', '저공해조치구조변경YN', '미정',
       '차량등록번호숫자정보', '차량말소YN', '미정', '최초회원번호', '취득일자', '취득금액', '미정', '생성일시',
       '수정일시', '미정'],
      dtype='object')

## 차량등록번호

In [7]:
df.shape

(29534047, 32)

In [8]:
# Check for missing values
df['차량등록번호'].isnull().sum()

0

In [9]:
# Collect registration numbers that contain neither accepted layout.
# p1: two Hangul syllables, 1-2 digits, one Hangul syllable, four digits.
# p2: two digits, one Hangul syllable, four digits.
# NOTE(review): search() is unanchored, so a value passes as soon as the layout
# appears anywhere inside it — confirm that substring matching is intended.
p1 = re.compile('[가-힣]{2}[0-9]{1,2}[가-힣]{1}[0-9]{4}')
p2 = re.compile('[0-9]{2}[가-힣]{1}[0-9]{4}')
wrong_vhrno_list = []
for one in tqdm(df['차량등록번호']):
    plate = str(one)
    if p1.search(plate) or p2.search(plate):
        continue  # contains one of the accepted layouts
    wrong_vhrno_list.append(one)
len(wrong_vhrno_list)

100%|██████████| 29534047/29534047 [00:35<00:00, 831439.60it/s]


1

In [10]:
wrong_vhrno_list

['서울703**575']

## 본거지 법정동 코드

In [11]:
df['본거지법정동코드'].isnull().sum()

0

In [12]:
# Collect codes whose string form is not exactly 10 characters long.
wrong_localcode_list = []
for one in tqdm(df['본거지법정동코드']):
    if len(str(one)) == 10:
        continue  # expected length
    wrong_localcode_list.append(one)
len(wrong_localcode_list)

100%|██████████| 29534047/29534047 [00:18<00:00, 1625711.43it/s]


0

In [13]:
# Collect codes whose string form does not contain a run of ten digits.
p = re.compile('[0-9]{10}')
wrong_localcode_list = []
for one in tqdm(df['본거지법정동코드']):
    if p.search(str(one)) is not None:
        continue  # ten consecutive digits found
    wrong_localcode_list.append(one)
len(wrong_localcode_list)

100%|██████████| 29534047/29534047 [00:26<00:00, 1117989.31it/s]


0

## 배출가스검사유효일자

In [14]:
df['배출가스검사유효일자'].head()

0    19850225.0
1    19760706.0
2           NaN
3    20191220.0
4    19930227.0
Name: 배출가스검사유효일자, dtype: float64

In [15]:
df['배출가스검사유효일자'].isnull().sum()

635

In [16]:
# Index labels of the rows whose emission-inspection validity date is missing.
missing_mask = df['배출가스검사유효일자'].isnull()
null_idx = df.loc[missing_mask].index
len(null_idx)

635

In [17]:
# Record the positions of values whose string form lacks "eight digits + '.0'"
# (the column is float64, so a well-formed date prints like '19850225.0').
# NOTE(review): these are enumerate() positions, later compared with index labels;
# that only lines up if the index is 0..n-1 in order — confirm.
p = re.compile('[0-9]{8}[.][0]')
wrong_gascheckdate_idx = []
for i, one in tqdm(enumerate(df['배출가스검사유효일자'].to_list())):
    if p.search(str(one)) is None:
        wrong_gascheckdate_idx.append(i)
len(wrong_gascheckdate_idx)

29534047it [00:30, 975731.91it/s] 


637

In [18]:
set(wrong_gascheckdate_idx) - set(null_idx)

{4124, 15938}

In [19]:
df.loc[list(set(wrong_gascheckdate_idx) - set(null_idx)), '배출가스검사유효일자']

15938    1990604.0
4124     1990118.0
Name: 배출가스검사유효일자, dtype: float64

In [20]:
df.loc[list(set(wrong_gascheckdate_idx) - set(null_idx)), ['배출가스검사유효일자', '차량연식', '배출가스인증번호']]

Unnamed: 0,배출가스검사유효일자,차량연식,배출가스인증번호
15938,1990604.0,1983,
4124,1990118.0,1979,


In [21]:
df.loc[null_idx, ['차대번호', '배출가스검사유효일자', '차량연식', '배출가스인증번호']]

Unnamed: 0,차대번호,배출가스검사유효일자,차량연식,배출가스인증번호
2,10762211783,,1976,
8,10762211682,,1976,
10,10762212246,,1976,
47,10762213150,,1976,
52,03761307405,,1976,
...,...,...,...,...
8087297,KNAKU815BAA058010,,2010,9MY-KM-14-22
8251095,KMFZCY7JAAU616346,,2010,7MY-HD-14-68
20922089,KMHK4815GJU081024,,2018,HMY-HD-14-31
26101534,KNANE81BBMS034872,,2021,JMY-KM-14-29


In [22]:
len('KMHK4815GJU081024')

17

In [23]:
df.loc[null_idx, ['차대번호', '배출가스검사유효일자', '차량연식', '배출가스인증번호']].isnull().sum()

차대번호            0
배출가스검사유효일자    635
차량연식            0
배출가스인증번호      540
dtype: int64

# 정기검사

In [24]:
# Load the periodic-inspection extract (took about 3m 34s when last run).
file_name = '[ODS]정기검사(2022.12.28).csv'
ins_path = os.path.join(ods_fold, file_name)
ins = pd.read_csv(ins_path, low_memory=False)
ins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22006426 entries, 0 to 22006425
Data columns (total 39 columns):
 #   Column     Dtype  
---  ------     -----  
 0   사용연료       object 
 1   주행거리       object 
 2   차명         object 
 3   차량번호       object 
 4   차대번호       object 
 5   차종         object 
 6   차량연식       int64  
 7   엔진형식       object 
 8   검사방법       object 
 9   검사일자       int64  
 10  검사종류       object 
 11  검사판정시각     object 
 12  검사판정       object 
 13  산소값        float64
 14  이산화탄소값     float64
 15  무부하매연판정1   object 
 16  무부하매연판정2   object 
 17  무부하매연판정3   object 
 18  무부하매연판정4   object 
 19  무부하매연판정5   object 
 20  무부하매연판정6   object 
 21  무부하매연허용치1  float64
 22  무부하매연허용치2  float64
 23  무부하매연허용치3  float64
 24  무부하매연허용치4  float64
 25  무부하매연허용치5  float64
 26  무부하매연허용치6  float64
 27  무부하매연측정치1  float64
 28  무부하매연측정치2  float64
 29  무부하매연측정치3  float64
 30  무부하매연측정치4  float64
 31  무부하매연측정치5  float64
 32  무부하매연측정치6  float64
 33  차량용도       object 
 34  최대출력허용치    float

In [25]:
ins['검사일자'].head()

0    20190102
1    20190102
2    20190102
3    20190102
4    20190102
Name: 검사일자, dtype: int64

In [26]:
ins.loc[ins['차대번호'] == 'KMHK4815GJU081024', '검사일자']

18083379    20220124
Name: 검사일자, dtype: int64

## 배출가스 인증번호

In [27]:
df['배출가스인증번호'].isnull().sum()

3414724

## 차종

### 등록&제원정보

In [28]:
# Load the merged registration + specification extract (cp949-encoded CSV).
# Took about 9m 24s when last run.
# NOTE: this rebinds `df`, replacing the frame loaded from ODS_CEG_CAR_MIG.csv;
# every later cell that uses `df` works on this merged file instead.
file_name = '[ODS]등록정보&제원정보(2022.12.28).csv'
df = pd.read_csv(os.path.join(ods_fold, file_name), encoding='cp949', low_memory=False)
df.shape

(29534047, 49)

In [40]:
df.loc[df['배출가스인증번호'].isnull() == True, ['차대번호', '배출가스인증번호', '자동차형식', '엔진형식', '차명', '차량연식']].tail(10)

Unnamed: 0,차대번호,배출가스인증번호,자동차형식,엔진형식,차명,차량연식
29534031,KNMA4B2RMPP005538,,T4W13-4D,H5H,SM6,2023
29534036,KMHJE811BPU100088,,NX4H51BD-E8DF-G5A,G4FT,투싼 하이브리드(TUCSON HYBRID),2023
29534037,KMHJE811BPU100230,,NX4H51BD-E8DO-G5A,G4FT,투싼 하이브리드(TUCSON HYBRID),2023
29534038,KMHJE811BPU100378,,NX4H51BD-E9DO-G5A,G4FT,투싼 하이브리드(TUCSON HYBRID),2023
29534039,KMHJE811BPU100494,,NX4H51BD-E8DF-G5A,G4FT,투싼 하이브리드(TUCSON HYBRID),2023
29534040,KMHJE811BPU100592,,NX4H51BD-E8DF-G5A,G4FT,투싼 하이브리드(TUCSON HYBRID),2023
29534041,KMHL241JBPA060768,,DNJBH-G7-G5A,G4NR,쏘나타 하이브리드 (SONATA HYBRID),2023
29534042,KMHL241JBPA060809,,DNJBH-G7-G5A,G4NR,쏘나타 하이브리드 (SONATA HYBRID),2023
29534043,KMHL241JBPA060817,,DNJBH-G7-G5A,G4NR,쏘나타 하이브리드 (SONATA HYBRID),2023
29534046,KMHL3412BPA299998,,DN3BF-VS9-G3A,G4FP,쏘나타(SONATA),2023


In [33]:
df.loc[df['자동차형식'] == 'FX-2', ['배출가스인증번호', '차량연식', '차명']].value_counts()

배출가스인증번호   차량연식  차명   
LMY-HD-08  1988  엑셀       25
           1987  엑셀        6
KMY-HD-02  1987  엑셀        1
           1988  엑셀        1
           1989  엑셀        1
KMY-HD-04  1992  EXCEL     1
LMY-HD-08  1989  엑셀        1
dtype: int64

In [42]:
df.loc[df['엔진형식'] == 'G4FP', ['배출가스인증번호', '차량연식', '차명']].value_counts()

배출가스인증번호      차량연식  차명                      
KMY-KM-14-81  2020  K5                          24414
LMY-HD-14-62  2021  투싼(TUCSON)                  23281
MMY-KM-14-36  2022  스포티지                        21174
LMY-HD-14-62  2022  투싼(TUCSON)                  20925
KMY-KM-14-81  2021  K5                          15056
JMY-KM-14-57  2022  스포티지                        12525
KMY-HD-14-56  2021  쏘나타(SONATA)                 10923
              2022  쏘나타(SONATA)                  9652
KMY-KM-14-81  2022  K5                           9458
KMY-HD-14-56  2020  쏘나타(SONATA)                  9007
NMY-KM-14-11  2023  셀토스                          8869
HMY-HD-14-31  2021  코나(KONA)                     7682
LMY-HD-14-76  2022  코나(KONA)                     2485
              2023  코나(KONA)                     2351
LMY-HD-14-33  2021  아반떼 N 라인(AVANTE N Line)      1613
MMY-KM-14-36  2023  스포티지                         1350
JMY-HD-14-48  2021  투싼(TUCSON)                   1070
LMY-HD-14-62  2023  투싼(TUCSON)       

## 차종

In [44]:
df['차종_등록정보'].isnull().sum()

14

In [45]:
df['차종_등록정보'].unique()

array(['승용', '화물', '승합', '특수', nan], dtype=object)

In [48]:
df.loc[df['차종_등록정보'].isnull() == True, ['차명', '차종_등록정보', '차종분류']]

Unnamed: 0,차명,차종_등록정보,차종분류
6257,유니목1200,,특수용도형-특수용도형
7271,타이탄디젤14,,일반형-카고
21694,봉고킹캡,,일반형-카고
26839,슈퍼타이탄디젤14척(2.5톤),,일반형-카고
37972,로얄프린스1.5,,일반
38473,그레이스살롱,,일반
43509,봉고킹캡,,일반형-카고
47889,포니2-1400픽업컨버터블탑,,일반형-픽업
60693,베스타3밴,,밴
62028,점보타이탄2.5톤더블캡,,일반형-카고


## 차량 제작 일자

In [49]:
df['차량제작일자'].isnull().sum()

424546

In [52]:
df['차량제작일자'].describe()

count   29109501.00
mean    20133035.32
std        62241.75
min        11212.00
25%     20090924.00
50%     20140829.00
75%     20180908.00
max     20221227.00
Name: 차량제작일자, dtype: float64

In [53]:
df.loc[df['차량제작일자'].isnull() == True, ['차량제작일자', '차량연식']].head()

Unnamed: 0,차량제작일자,차량연식
0,,1976
1,,1968
2,,1976
3,,1971
4,,1986


In [55]:
# Convert the manufacture-date column to strings in place so lengths can be checked.
# After this, missing values become the literal 'nan' and dates look like
# '20221222.0' (10 characters); the following cells rely on both facts.
# NOTE: null checks on this column made after this cell will no longer see NaN.
df['차량제작일자'] = df['차량제작일자'].astype(str)

In [58]:
df['차량제작일자'].tail()

29534042    20221222.0
29534043    20221222.0
29534044    20221209.0
29534045    20221216.0
29534046    20221219.0
Name: 차량제작일자, dtype: object

In [59]:
df.loc[df['차량제작일자'].str.len() != 10].shape

(424560, 49)

In [62]:
df.loc[df['차량제작일자'].str.len() != 10, '차량제작일자'].tail()

21685676          nan
21712859          nan
22655861          nan
23415596          nan
28698925    2020101.0
Name: 차량제작일자, dtype: object

In [65]:
# Positions of manufacture-date values that are the string 'nan' (originally missing).
nan_idx = []
for i, one in tqdm(enumerate(df['차량제작일자'].to_list())):
    if str(one) != 'nan':
        continue
    nan_idx.append(i)
len(nan_idx)

29534047it [00:14, 2067135.81it/s]


424546

In [66]:
# Positions of manufacture-date values whose string form is not 10 characters
# long; this includes the 'nan' entries as well as malformed dates.
error_idx = []
for i, one in tqdm(enumerate(df['차량제작일자'].to_list())):
    if len(str(one)) == 10:
        continue
    error_idx.append(i)
len(error_idx)

29534047it [00:16, 1765138.99it/s]


424560

In [67]:
len(set(error_idx) - set(nan_idx))

14

In [70]:
df.loc[list(set(error_idx) - set(nan_idx)), ['차량제작일자', '차량연식']]

Unnamed: 0,차량제작일자,차량연식
17678839,2160221.0,2016
1492297,11212.0,2002
3291562,50226.0,2005
858730,7770707.0,1999
5554988,70814.0,2008
28698925,2020101.0,2022
12484270,2130128.0,2013
1233232,2210413.0,2001
201553,9910501.0,1991
453521,9950407.0,1995


In [71]:
df.loc[nan_idx, ['차량제작일자', '차량연식']].head()

Unnamed: 0,차량제작일자,차량연식
0,,1976
1,,1968
2,,1976
3,,1971
4,,1986


## 차량 연식

In [72]:
df['차량연식'].isnull().sum()

0

In [73]:
# Count model-year values whose string form contains no digit.
# Every value has to be visited, so this must be a `for` loop. The previous
# `if one in df['차량연식']:` tested a leftover `one` from an earlier cell against
# the Series index labels and ran the check at most once, so it validated nothing.
# NOTE(review): the pattern only requires a single digit anywhere in the value;
# a stricter test (e.g. exactly four digits) may be what is wanted — confirm.
p = re.compile('[0-9]')
count = 0
for one in df['차량연식']:
    if not p.search(str(one)):
        count += 1
count

0

## 차량 말소 YN

In [74]:
df['차량말소YN'].isnull().sum()

0

In [75]:
df['차량말소YN'].value_counts(dropna=True)

N    25693532
Y     3840515
Name: 차량말소YN, dtype: int64

## 차대번호

In [76]:
df['차대번호'].isnull().sum()

0

## 제원관리번호

In [77]:
df['제원관리번호'].isnull().sum()

0

In [78]:
df['제원관리번호'].head()

0    10004800000000
1    30220100010946
2    10004800000000
3    30321400001063
4    30381300100968
Name: 제원관리번호, dtype: object

In [80]:
temp = df['제원관리번호'].str.len()
temp.unique()

array([14, 17], dtype=int64)

In [81]:
df.loc[df['제원관리번호'].str.len() == 14, '제원관리번호'].shape

(1998450,)

In [82]:
df.loc[df['제원관리번호'].str.len() == 17, '제원관리번호'].shape

(27535597,)

In [83]:
df.loc[df['제원관리번호'].str.len() == 17, '제원관리번호'].head()

33     00034313000053303
64     00034717000033303
107    00032709000020003
178    00034113000441104
194    00034314000023303
Name: 제원관리번호, dtype: object

In [84]:
# Positions of specification-management numbers that contain a Hangul syllable.
p = re.compile('[가-힣]')
error_idx = []
for i, one in enumerate(df['제원관리번호']):
    if p.search(str(one)) is None:
        continue  # no Hangul present
    error_idx.append(i)
len(error_idx)

0

## 배출가스 등급

In [85]:
df['배출가스등급'].isnull().sum()

32

# 2\. 제원정보(ODS_CEG_CAR_SRC_MIG)

In [106]:
# Load the raw specification extract; the first CSV column becomes the index.
name = 'CEG_CAR_SRC_MIG'
file_name = f'ODS_{name}.csv'
src_path = os.path.join(ods_fold, file_name)
src = pd.read_csv(src_path, low_memory=False, index_col=0)
src.info()

<class 'pandas.core.frame.DataFrame'>
Index: 545374 entries, B0710004800013305 to A1110000800533322
Data columns (total 27 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   VHCNM        545272 non-null  object 
 1   VHCTY        545370 non-null  object 
 2   VHCTY_CL     531367 non-null  object 
 3   VHCTY_TY     531332 non-null  object 
 4   CAR_FRM      516295 non-null  object 
 5   MKR_NM       529017 non-null  object 
 6   FUEL         503254 non-null  object 
 7   EGINTY       494157 non-null  object 
 8   EMPV_WGHT    0 non-null       float64
 9   TOT_WGHT     541702 non-null  float64
 10  CRYNG_WGHT   446181 non-null  float64
 11  EGINPWR      471948 non-null  float64
 12  DSPLVL       510521 non-null  float64
 13  TKCAR_NMPR   545313 non-null  float64
 14  VHCL_LT      514786 non-null  float64
 15  VHCL_WH      514799 non-null  float64
 16  VHCL_HG      514732 non-null  float64
 17  PLOR_NM      540392 non-null  object 
 18  RE

In [107]:
# Read the column-description sheet for the current table and relabel `src`.
file_name = f'[정제]{name}.xlsx'
col_df = pd.read_excel(os.path.join(col_fold, file_name), sheet_name='ODS_col')
# Map each column code ('컬럼') to its description ('내용').
col_dict = {a:b for a,b in col_df[['컬럼', '내용']].values}
# .get(x, x) keeps any label that has no entry in col_dict, so re-running this
# cell after the columns were already relabelled (or meeting a column missing
# from the description sheet) leaves the label as-is instead of raising KeyError.
# NOTE(review): several codes share the placeholder description '미정', so the
# resulting labels are not unique.
src.columns = [col_dict.get(x, x) for x in src.columns]
src.columns

Index(['차명', '차종', '차종분류', '차종유형(소분류)', '자동차형식', '제작사명', '연료', '엔진형식', '공차중량',
       '총중량', '적재중량', '엔진출력', '배기량', '승차인원', '차량길이', '차량너비', '차량높이', '원산지명',
       '미정', '삭제YN', '승인일자', '구동형식', '변속기종류', '미정', '생성일시', '수정일시', '미정'],
      dtype='object')

## 차종분류

In [108]:
src['차종분류'].unique()

array(['특수용도형-특수용도형', '특수', '특수용도형-피견인', '일반', '다목적', '밴', '일반형-픽업',
       '일반형-카고', '기타', '견인', '특수작업형', '특수용도형-탱크로리', '덤프', '구난', '승용겸화물',
       '특수용도형', nan], dtype=object)

In [109]:
src['차종분류'].isnull().sum()

14007

In [116]:
src.loc[src['차종분류'].isnull() == True, ['차종분류', '차종', '차명']].head()

Unnamed: 0_level_0,차종분류,차종,차명
SRCMNNO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7550000100043018,,T,ELF
7550000100053018,,T,ELF
7550000100063018,,T,ELF
7550000100073018,,T,ELF
7550000100083018,,T,ELF


In [102]:
src['차종'].unique()

array(['T', 'V', 'P', 'S', nan], dtype=object)

In [86]:
df['차종분류'].unique()

array(['일반', '일반형-카고', '특수용도형-특수용도형', '특수용도형-피견인', '덤프', '구난', '특수작업형',
       '기타', '다목적', '견인', '밴', '일반형-픽업', '승용겸화물', '특수용도형-탱크로리', '특수',
       '특수용도형', nan], dtype=object)

In [87]:
df['차종분류'].isnull().sum()

21

In [90]:
df.loc[df['차종분류'].isnull() == True, ['차종분류', '차종_등록정보', '차종_제원정보']]

Unnamed: 0,차종분류,차종_등록정보,차종_제원정보
29505431,,승용,
29506203,,승용,
29506215,,승용,
29506915,,승용,
29506919,,승용,
29506921,,승용,
29507665,,승용,
29508285,,승용,
29509242,,승용,
29509300,,승용,


## 차종

In [117]:
src['차종'].unique()

array(['T', 'V', 'P', 'S', nan], dtype=object)

In [118]:
src['차종'].isnull().sum()

4

In [119]:
src.loc[src['차종'].isnull() == True, '차종분류']

SRCMNNO
30110100002329    NaN
30210100000056    NaN
30220100000001    NaN
30110100002328    NaN
Name: 차종분류, dtype: object

In [123]:
df['차종_제원정보'].isnull().sum()

21

In [124]:
df.loc[df['차종_제원정보'].isnull() == True, ['차종_제원정보', '차종_등록정보']]

Unnamed: 0,차종_제원정보,차종_등록정보
29505431,,승용
29506203,,승용
29506215,,승용
29506915,,승용
29506919,,승용
29506921,,승용
29507665,,승용
29508285,,승용
29509242,,승용
29509300,,승용


## 차명

In [125]:
src['차명'].isnull().sum()

102

In [136]:
src.loc[src['차명'].isnull() == True, ['자동차형식', '엔진형식', '차명']].head(30)

Unnamed: 0_level_0,자동차형식,엔진형식,차명
SRCMNNO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
30432000000014,,,
30451600000004,,,
30451600000007,,10831268192,
30451600000008,,XA014091,
30220100001103,,,
30481800000001,,G15CF,
30460300000009,,,
30462500000000,,,
30110100002429,,01801300465,
30112200002000,,,


In [134]:
df.loc[df['자동차형식'] == '104-941', ['자동차형식', '엔진형식', '차명']]

Unnamed: 0,자동차형식,엔진형식,차명
517100,104-941,104.941,벤즈C280
1878718,104-941,112 920,벤츠C280


In [137]:
df.loc[df['엔진형식'] == 'G15CF', ['자동차형식', '엔진형식', '차명']]

Unnamed: 0,자동차형식,엔진형식,차명
1171,TM19,G15CF,르망
1355,TM19,G15CF,르망
1972,TM19,G15CF,르망
3418,TM19,G15CF,르망
4413,TA19,G15CF,르망오토
...,...,...,...
1009310,TF19,G15CF,대우 르망
1087713,TM19,G15CF,르망
2082125,TF19E,G15CF,대우 르망
2310856,TF19,G15CF,LEMANS


In [132]:
df['차명'].isnull().sum()

23

## 제작사 명

In [138]:
src['제작사명'].isnull().sum()

16357

In [146]:
list(src['제작사명'].unique())

['(주) 두성특장차',
 '자기인증면제차량',
 'SOFA차량',
 '한국토요타자동차(주)',
 '현대자동차(주)',
 '닛산',
 'BMW AG',
 '자일대우상용차 주식회사',
 '(주)참존임포트',
 '한국특장차(주)',
 '스카니아코리아그룹(주)',
 '타타대우상용차(주)',
 '주식회사 천하',
 '아우라(AURA)',
 '정우정공 (주)',
 '원모터스(One motors)',
 '기아자동차(주)',
 '미니앤컴팩(mini&compact)',
 '와우모터스',
 '홍성일',
 '에스알(S.R)모터스',
 '이명환',
 '세하(SEHA)',
 '(주)인피니코리아',
 '오토포럼',
 '스타자동차매매상사',
 '최영훈',
 '(주)노바모터스',
 '조인수입차',
 '(주)신광테크놀러지',
 '쌍용자동차(주)',
 '한국상용트럭(주)',
 '(주)한성특장',
 '아우디폭스바겐코리아(주)',
 '(주)한국쓰리축',
 '(주)새한에어서스펜션',
 '모터프로',
 '한신특장',
 '명성정공',
 '뉴클락모터스',
 '한남상사',
 '더블유모터스컴퍼니',
 '현대',
 '한국특장기술(주)',
 nan,
 '(주)수산특장',
 '대우자동차(주)',
 '(주)스타텍',
 '케이에이치특장(주)',
 '(유)동양특장차',
 '(주)함코',
 '(유)삼능기계',
 '동우특장차(주)',
 '(주)알텍',
 '미래특장차(주)',
 '금양기전',
 '삼성상용차(주)',
 '지엠코리아(주)',
 '아시아자동차공업(주)',
 '**기타**',
 '(주)진도',
 'TOYOTA',
 'MAZDA',
 '건민특장 주식회사',
 '디와이(주) 익산공장',
 '(주)한국그린피아',
 '(주)포멕특장',
 '(주)한중특장',
 '대우중공업(주)',
 '대양중공업(주)',
 '이텍산업 주식회사',
 '메리트특장차주식회사',
 '태산냉동 주식회사',
 '(주)동해기계항공',
 '(주)유로카라반',
 '(주)한국토미',
 '도쿄모터스',
 '주식회사 에타',
 '엔에스티네트웍스(주)',
 '

In [149]:
src.loc[src['제작사명'].isnull() == True, ['차명', '자동차형식', '엔진형식', '제작사명']].head(10)

Unnamed: 0_level_0,차명,자동차형식,엔진형식,제작사명
SRCMNNO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
30111100000072,벤츠300E,124 030,103 983,
30111100000073,벤즈200,,10292010047640,
30111100000076,벤츠300E,124 030,103 983,
30111100000080,벤츠S320,140 033,104 994,
30111100000081,볼보,740GL,B200E,
30111100000084,머큐리세이블,MSV3-27C,MSV3,
30111100000085,아우디A62.0E,4A,ABK,
30111100000088,BMW,318I,184S1,
30111100000092,랜드크루사,,3F0117036,
30111100000109,BMW318i,318IA,184S1,


In [143]:
src.loc[src['제작사명'].isnull() == True, ['차명', '자동차형식', '엔진형식', '제작사명']].isnull().sum()

차명          57
자동차형식     5331
엔진형식      4164
제작사명     16357
dtype: int64

## 제원관리번호

## 자동차 형식

## 연료

## 엔진형식