# Library

In [33]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import re

In [2]:
# Folder that holds the ODS CSV extracts. The ODS_FOLD environment variable
# overrides it; when unset, the original local path is used unchanged.
ods_fold = os.environ.get('ODS_FOLD', 'D:/data/big2/ODS')

In [10]:
# Load the grade-4 registration + specification extract (cp949-encoded CSV).
# Reading took about 8m 42s when this cell was last run.
file_name = '[ODS][G4]등록정보&제원정보(말소제거)(2022.12.29).csv'
csv_path = os.path.join(ods_fold, file_name)
df = pd.read_csv(
    csv_path,
    encoding='cp949',
    low_memory=False,
)

In [11]:
# Show the column labels of the loaded frame.
df.columns

Index(['차량관리번호', '차량등록번호', '본거지법정동코드', '소유자구분', '차종_등록정보', '용도', '현재회원번호',
       '차대번호', '최초등록일자', '차량연식', '차량제작일자', '배출가스검사유효일자', '제원관리번호', '배출가스인증번호',
       '배출가스등급', '저공해조치YN(미사용)', '저공해조치종류', '저공해조치구조변경YN', '차량등록번호숫자정보',
       '차량말소YN', '최초회원번호', '취득일자', '취득금액', '생성일시_등록정보', '수정일시_등록정보', '차명',
       '차종_제원정보', '차종분류', '차종유형(소분류)', '자동차형식', '제작사명', '연료', '엔진형식', '공차중량',
       '총중량', '적재중량', '엔진출력', '배기량', '승차인원', '차량길이', '차량너비', '차량높이', '원산지명',
       '삭제YN', '승인일자', '구동형식', '변속기종류', '생성일시_제원정보', '수정일시_제원정보'],
      dtype='object')

In [12]:
# Show (row count, column count) of the loaded frame.
df.shape

(1511251, 49)

In [14]:
# Frequency of each value in the emission-grade column, missing values included.
df['배출가스등급'].value_counts(dropna=False)

4     1511214
T4         37
Name: 배출가스등급, dtype: int64

In [15]:
# Frequency of each value in the '차량말소YN' flag column, missing values included.
df['차량말소YN'].value_counts(dropna=False)


N    1511251
Name: 차량말소YN, dtype: int64

In [16]:
# Reduce to one row per '차대번호': sort descending by '최초등록일자' then
# '배출가스검사유효일자', and let drop_duplicates keep the first row it sees
# for each '차대번호'.
g4 = (
    df
    .sort_values(by=['최초등록일자', '배출가스검사유효일자'], ascending=False)
    .drop_duplicates(subset='차대번호')
    .reset_index(drop=True)
)
g4.shape

(1511211, 49)

In [17]:
# Number of rows with a missing engine type ('엔진형식').
g4['엔진형식'].isnull().sum()

2001

In [18]:
# Engine type codes whose fuel value is inspected and overwritten further down.
engine_form = [
    'G4AJ',
    'G15CF',
    'B5',
    'H20MR',
    'G16SF',
    'MSV3',
    'ZB',
    '1NZ',
    '2AZ',
]
len(engine_form)

9

In [19]:
# Print the shape of the subset of g4 matching each engine code, one per line.
for engine_code in engine_form:
    is_engine = g4['엔진형식'] == engine_code
    print(g4[is_engine].shape)

(2388, 49)
(1513, 49)
(16111, 49)
(2835, 49)
(45, 49)
(1089, 49)
(0, 49)
(1, 49)
(3, 49)


In [20]:
# Overwrite the fuel column ('연료') for rows whose engine code is listed here.
# NOTE(review): 'ZB', '1NZ' and '2AZ' are mapped to '경유' (diesel) while the
# rest are '휘발유' (gasoline) — confirm these three against the spec source.
fuel_by_engine_code = {
    'G4AJ': '휘발유',
    'G15CF': '휘발유',
    'B5': '휘발유',
    'H20MR': '휘발유',
    'G16SF': '휘발유',
    'MSV3': '휘발유',
    'ZB': '경유',
    '1NZ': '경유',
    '2AZ': '경유',
}
for engine_code, fuel_label in fuel_by_engine_code.items():
    g4.loc[g4['엔진형식'] == engine_code, '연료'] = fuel_label

In [21]:
# Fuel distribution after the engine-code overrides, missing values included.
g4['연료'].value_counts(dropna=False)

경유             1152701
휘발유             332345
LPG(액화석유가스)      26089
NaN                 69
기타연료                 5
CNG(압축천연가스)          1
알코올                  1
Name: 연료, dtype: int64

In [22]:
# Distinct vehicle names ('차명') among rows whose fuel is '기타연료'.
g4.loc[g4['연료'] == '기타연료', '차명'].unique()

array(['에어스트림탐지견수송트레일러', '스텔라오토매틱', '벤즈 230E', '벤즈230E', '도요다크라운로얄싸롱'],
      dtype=object)

In [27]:
# Vehicle names found under fuel '기타연료'; the trailing comment on each entry
# records the fuel noted for that name (the first entry has none).
car_name = [
    '에어스트림탐지견수송트레일러', # (no fuel noted)
    '스텔라오토매틱', # gasoline
    '벤즈 230E', # gasoline
    '벤즈230E', # gasoline
    '도요다크라운로얄싸롱', # gasoline
]
len(car_name)

5

In [28]:
# Set fuel to '휘발유' for every row whose vehicle name is one of these four.
gasoline_car_names = [
    '스텔라오토매틱',
    '벤즈 230E',
    '벤즈230E',
    '도요다크라운로얄싸롱',
]
is_gasoline_car = g4['차명'].isin(gasoline_car_names)
g4.loc[is_gasoline_car, '연료'] = '휘발유'

In [29]:
# Fuel distribution after the vehicle-name overrides, missing values included.
g4['연료'].value_counts(dropna=False)

경유             1152701
휘발유             332353
LPG(액화석유가스)      26088
NaN                 66
기타연료                 1
CNG(압축천연가스)          1
알코올                  1
Name: 연료, dtype: int64

In [30]:
# Count of non-null '차대번호' values per fuel; dropna=False keeps the missing-fuel group.
g4.groupby('연료', dropna=False)['차대번호'].count()

연료
CNG(압축천연가스)          1
LPG(액화석유가스)      26088
경유             1152701
기타연료                 1
알코올                  1
휘발유             332353
NaN                 66
Name: 차대번호, dtype: int64

In [31]:
# Export the per-fuel counts of '차대번호' to Excel.
# Create the output folder first so the write does not fail on a fresh run.
os.makedirs('analysis2', exist_ok=True)
g4.groupby('연료', dropna=False)['차대번호'].count().to_excel('analysis2/통계_4등급_연료별.xlsx')

In [32]:
# Export the per-'차량연식' counts of '차대번호' to Excel.
# Create the output folder first so the write does not fail on a fresh run.
os.makedirs('analysis2', exist_ok=True)
g4.groupby('차량연식', dropna=False)['차대번호'].count().to_excel('analysis2/통계_4등급_차량연식.xlsx')

# 배출가스 인증번호 정상 확인

In [34]:
# Print 'y' once for every certification number whose text form contains a space.
for cert_no in g4['배출가스인증번호']:
    cert_text = str(cert_no)
    if cert_text.find(' ') != -1:
        print('y')

In [35]:
# Accepted certification-number layouts:
#   p_before: 3 alphanumerics - 2 letters - 2 digits
#   p_after : the same, followed by one more "-" and 2 digits
_cert_no_short = '[a-zA-Z0-9]{3}[-][a-zA-Z]{2}[-][0-9]{2}'
_cert_no_long = '[a-zA-Z0-9]{3}[-][a-zA-Z]{2}[-][0-9]{2}[-][0-9]{2}'
p_before, p_after = re.compile(_cert_no_short), re.compile(_cert_no_long)

In [44]:
# Label every emission certification number:
#   NaN     - the value is missing
#   '정상'   - the whole value has one of the two accepted layouts
#   '규격외' - a value is present but has neither layout
check_list = []
for one in tqdm(g4['배출가스인증번호']):
    # Test the value itself for missingness, not its string form, so a
    # literal 'nan' text is not mistaken for a gap.
    if pd.isna(one):
        check_list.append(np.nan)
        continue
    cert_no = str(one)
    # fullmatch requires the entire value to fit a layout. search() accepted
    # any text that merely contained the short layout, which also made the
    # p_after test redundant.
    if p_before.fullmatch(cert_no) or p_after.fullmatch(cert_no):
        check_list.append('정상')
    else:
        check_list.append('규격외')
len(check_list)

100%|██████████| 1511211/1511211 [00:01<00:00, 862825.38it/s]


1511211

In [45]:
# Peek at the first certification numbers to compare against the labels.
g4['배출가스인증번호'].head()

0             NaN
1             NaN
2             NaN
3    7MY-HD-24-28
4             NaN
Name: 배출가스인증번호, dtype: object

In [46]:
# First five labels produced by the classification loop.
check_list[:5]

[nan, nan, nan, '정상', nan]

In [47]:
# Store the labels as a new column; check_list has one entry per row of g4, in row order.
g4['배인번호_정상_판단'] = check_list

In [48]:
# Number of rows with a missing certification number.
g4['배출가스인증번호'].isnull().sum()

123700

In [52]:
# Count of non-null '차대번호' values per validity label, keeping the NaN label group.
g4.groupby(['배인번호_정상_판단'], dropna=False)['차대번호'].count()

배인번호_정상_판단
규격외      21959
정상     1365552
NaN     123700
Name: 차대번호, dtype: int64

In [50]:
# Cross-tabulate fuel (rows) against validity label (columns); combinations with no rows show as NaN.
g4.groupby(['연료', '배인번호_정상_판단'], dropna=False)['차대번호'].count().unstack()

배인번호_정상_판단,규격외,정상,NaN
연료,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CNG(압축천연가스),,1.0,
LPG(액화석유가스),,23363.0,2725.0
경유,4151.0,1140926.0,7624.0
기타연료,,,1.0
알코올,,,1.0
휘발유,17808.0,201262.0,113283.0
,,,66.0


In [53]:
# Write all summary tables into one workbook, one sheet per table.
# Create the output folder first so the write does not fail on a fresh run.
os.makedirs('analysis2', exist_ok=True)
with pd.ExcelWriter('analysis2/통계_4등급_연료별_차량연식별_배인번호_정상여부별.xlsx') as writer:
    # Fuel x validity-label cross table.
    g4.groupby(['연료', '배인번호_정상_판단'], dropna=False)['차대번호'].count().unstack().to_excel(writer, sheet_name='연료별_배인번호_정상여부')
    # Counts per fuel.
    g4.groupby('연료', dropna=False)['차대번호'].count().to_excel(writer, sheet_name='연료별')
    # Counts per '차량연식'.
    g4.groupby('차량연식', dropna=False)['차대번호'].count().to_excel(writer, sheet_name='차량연식별')
    # Counts per validity label.
    g4.groupby(['배인번호_정상_판단'], dropna=False)['차대번호'].count().to_excel(writer, sheet_name='배인번호_정상여부')

# Code End