In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from glob import glob
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')
import re

In [2]:
know_train = [pd.read_csv(path, index_col=0) for path in sorted(glob('./_data/train/*.csv'))]
know_test = [pd.read_csv(path, index_col=0) for path in sorted(glob('./_data/test/*.csv'))]
submission = pd.read_csv('./_data/sample_submission.csv')

## 공백 `' '` 으로 되어있는 결측치를 np.nan으로 변경

In [6]:
for train, test in zip(know_train, know_test):
    for col in test.columns:
        train[col].replace(' ', np.nan, inplace=True)
        test[col].replace(' ', np.nan, inplace=True)

In [15]:
i=0
for train, test in zip(know_train, know_test):
    print(f'{2017+i} train 결측치\n {train.isna().sum()}')
    print(f'{2017+i} test 결측치\n: {test.isna().sum()}')
    i+=1

2017 train 결측치
 aq1_1          0
aq1_2        585
aq2_1          0
aq2_2        861
aq3_1          0
            ... 
bq40        1301
bq41_1      1332
bq41_2      8222
bq41_3      1316
knowcode       0
Length: 155, dtype: int64
2017 test 결측치
: aq1_1        0
aq1_2      598
aq2_1        0
aq2_2      862
aq3_1        0
          ... 
bq39_2       0
bq40      1333
bq41_1    1371
bq41_2    8191
bq41_3    1342
Length: 154, dtype: int64
2018 train 결측치
 cq1            0
cq2            0
cq3            0
cq4            0
cq5            0
            ... 
bq40           3
bq41_1      1270
bq41_2      1388
bq41_3      7824
knowcode       0
Length: 140, dtype: int64
2018 test 결측치
: cq1          0
cq2          0
cq3          0
cq4          0
cq5          0
          ... 
bq39      1331
bq40         0
bq41_1    1298
bq41_2    1423
bq41_3    7794
Length: 139, dtype: int64
2019 train 결측치
 sq1            0
sq2            0
sq3            0
sq4            0
sq5            0
            ... 
bq30      

### 결측치가 포함된 열 확인

In [38]:
include_na_col_train = []
for i in range(len(know_train)):
    cols =[]
    for col in know_train[i].columns:
        if know_train[i].loc[:,col].isna().sum()>0:
            cols.append(col)
    include_na_col_train.append(cols)

In [39]:
include_na_col_test = []
for i in range(len(know_test)):
    cols =[]
    for col in know_test[i].columns:
        if know_test[i].loc[:,col].isna().sum()>0:
            cols.append(col)
    include_na_col_test.append(cols)

### 2017~2020 결측치가 발생한 열의 개수 모두 다름

In [47]:
print('결측치가 포함된 열 수')
for i in range(4):
    print(f'{2017+i} train : {len(include_na_col_train[0])} 개')
    print(f'{2017+i} test : {len(include_na_col_test[0])} 개')

결측치가 포함된 열 수
2017 train : 59 개
2017 test : 60 개
2018 train : 59 개
2018 test : 60 개
2019 train : 59 개
2019 test : 60 개
2020 train : 59 개
2020 test : 60 개


In [50]:
print('결측치가 포함된 열 목록')
print('='*50)
for i in range(4):
    print(f'{2017+i} train \n {include_na_col_train[0]}')
    print('-'*50)
    print(f'{2017+i} test : {include_na_col_test[0]}')
    print('='*50)

결측치가 포함된 열 목록
2017 train 
 ['aq1_2', 'aq2_2', 'aq3_2', 'aq4_2', 'aq5_2', 'aq6_2', 'aq7_2', 'aq8_2', 'aq9_2', 'aq10_2', 'aq11_2', 'aq12_2', 'aq13_2', 'aq14_2', 'aq15_2', 'aq16_2', 'aq17_2', 'aq18_2', 'aq19_2', 'aq20_2', 'aq21_2', 'aq22_2', 'aq23_2', 'aq24_2', 'aq25_2', 'aq26_2', 'aq27_2', 'aq28_2', 'aq29_2', 'aq30_2', 'aq31_2', 'aq32_2', 'aq33_2', 'aq34_2', 'aq35_2', 'aq36_2', 'aq37_2', 'aq38_2', 'aq39_2', 'aq40_2', 'aq41_2', 'bq4_1a', 'bq4_1b', 'bq4_1c', 'bq5_1', 'bq5_2', 'bq12_2', 'bq12_3', 'bq12_4', 'bq19_1', 'bq31', 'bq32', 'bq33', 'bq34', 'bq38_1', 'bq40', 'bq41_1', 'bq41_2', 'bq41_3']
--------------------------------------------------
2017 test : ['aq1_2', 'aq2_2', 'aq3_2', 'aq4_2', 'aq5_2', 'aq6_2', 'aq7_2', 'aq8_2', 'aq9_2', 'aq10_2', 'aq11_2', 'aq12_2', 'aq13_2', 'aq14_2', 'aq15_2', 'aq16_2', 'aq17_2', 'aq18_2', 'aq19_2', 'aq20_2', 'aq21_2', 'aq22_2', 'aq23_2', 'aq24_2', 'aq25_2', 'aq26_2', 'aq27_2', 'aq28_2', 'aq29_2', 'aq30_2', 'aq31_2', 'aq32_2', 'aq33_2', 'aq34_2', 'aq35_2'

## 데이터 타입 확인
### float64 1개, int64 94개 object 60개

In [54]:
know_train[0].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9486 entries, 0 to 9485
Columns: 155 entries, aq1_1 to knowcode
dtypes: float64(1), int64(94), object(60)
memory usage: 11.3+ MB


In [57]:
know_train[0].select_dtypes('float64')

Unnamed: 0_level_0,bq23
idx,Unnamed: 1_level_1
0,10.0
1,50.0
2,40.0
3,30.0
4,40.0
...,...
9481,10.0
9482,20.0
9483,0.0
9484,20.0


In [59]:
know_train[0].select_dtypes('int64').columns

Index(['aq1_1', 'aq2_1', 'aq3_1', 'aq4_1', 'aq5_1', 'aq6_1', 'aq7_1', 'aq8_1',
       'aq9_1', 'aq10_1', 'aq11_1', 'aq12_1', 'aq13_1', 'aq14_1', 'aq15_1',
       'aq16_1', 'aq17_1', 'aq18_1', 'aq19_1', 'aq20_1', 'aq21_1', 'aq22_1',
       'aq23_1', 'aq24_1', 'aq25_1', 'aq26_1', 'aq27_1', 'aq28_1', 'aq29_1',
       'aq30_1', 'aq31_1', 'aq32_1', 'aq33_1', 'aq34_1', 'aq35_1', 'aq36_1',
       'aq37_1', 'aq38_1', 'aq39_1', 'aq40_1', 'aq41_1', 'bq1', 'bq2', 'bq3',
       'bq4', 'bq5', 'bq6', 'bq7', 'bq8_1', 'bq8_2', 'bq8_3', 'bq9', 'bq10',
       'bq11', 'bq12_1', 'bq12_5', 'bq13', 'bq14', 'bq15_1', 'bq15_2',
       'bq15_3', 'bq16', 'bq17', 'bq18_1', 'bq18_2', 'bq18_3', 'bq18_4',
       'bq18_5', 'bq18_6', 'bq18_7', 'bq19', 'bq20', 'bq21', 'bq22', 'bq24_1',
       'bq24_2', 'bq24_3', 'bq24_4', 'bq24_5', 'bq24_6', 'bq24_7', 'bq24_8',
       'bq25', 'bq26', 'bq27', 'bq28', 'bq29', 'bq35', 'bq36', 'bq37', 'bq38',
       'bq39_1', 'bq39_2', 'knowcode'],
      dtype='object')

In [60]:
know_train[0].select_dtypes('object').columns

Index(['aq1_2', 'aq2_2', 'aq3_2', 'aq4_2', 'aq5_2', 'aq6_2', 'aq7_2', 'aq8_2',
       'aq9_2', 'aq10_2', 'aq11_2', 'aq12_2', 'aq13_2', 'aq14_2', 'aq15_2',
       'aq16_2', 'aq17_2', 'aq18_2', 'aq19_2', 'aq20_2', 'aq21_2', 'aq22_2',
       'aq23_2', 'aq24_2', 'aq25_2', 'aq26_2', 'aq27_2', 'aq28_2', 'aq29_2',
       'aq30_2', 'aq31_2', 'aq32_2', 'aq33_2', 'aq34_2', 'aq35_2', 'aq36_2',
       'aq37_2', 'aq38_2', 'aq39_2', 'aq40_2', 'aq41_2', 'bq4_1a', 'bq4_1b',
       'bq4_1c', 'bq5_1', 'bq5_2', 'bq12_2', 'bq12_3', 'bq12_4', 'bq19_1',
       'bq30', 'bq31', 'bq32', 'bq33', 'bq34', 'bq38_1', 'bq40', 'bq41_1',
       'bq41_2', 'bq41_3'],
      dtype='object')