In [1]:
# 구간화(binning) : 특정 변수를 범주형 변수로 변환
# - 목적 : 강건한(robust) 모델 생성, 과적합(overfitting) 방지
# - 수치형/범주형 변수에 모두 적용 가능
# 이상치가 있으면 평균은 데이터를 대표하지 못한다. 따라서 binning 사용

In [2]:
import pandas as pd
import numpy as np
from pandas import datetime
from matplotlib import pyplot as plt
def parser(x) :
    return datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
input_file = '../AirQualityUCI_refined.csv'

df = pd.read_csv(input_file,
                index_col=[0],
                parse_dates=[0],
                date_parser=parser)
df.head()

  from pandas import datetime


Unnamed: 0_level_0,CO(GT),PT08.S1(CO),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),RH,AH,C6H6(GT)
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2004-03-10 18:00:00,2.6,1360.0,1046.0,166.0,1056.0,113.0,1692.0,1268.0,48.9,0.7578,11.9
2004-03-10 19:00:00,2.0,1292.0,955.0,103.0,1174.0,92.0,1559.0,972.0,47.7,0.7255,9.4
2004-03-10 20:00:00,2.2,1402.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,54.0,0.7502,9.0
2004-03-10 21:00:00,2.2,1376.0,948.0,172.0,1092.0,122.0,1584.0,1203.0,60.0,0.7867,9.2
2004-03-10 22:00:00,1.6,1272.0,836.0,131.0,1205.0,116.0,1490.0,1110.0,59.6,0.7888,6.5


In [3]:
# Visualization setup
%matplotlib
from matplotlib import pyplot as plt
import seaborn; seaborn.set()  # set plot styles
%config InlineBackend.figure_format = 'svg'
plt.rcParams['figure.figsize'] = [10, 5]
plt.ion() # enable the interactive mode

import seaborn as sns
sns.set()

Using matplotlib backend: Qt5Agg


In [4]:
# Interpolate the 'CO(GT)' column
#df['CO(GT)'].interpolate(inplace=True) # inplace=True를 하면 원본에 덮어쓰기를 한다.
co = df['CO(GT)'].copy()
co.interpolate(inplace=True)

# df['CO(GT)'].plot() # 결측치가 많은걸 볼 수 있다.

In [5]:
# binning
max_val = co.max()
min_val = co.min()
print(max_val, min_val)

11.9 0.0


In [6]:
# make interval values
bins = np.linspace(min_val, max_val, 6)
print(bins)

[ 0.    2.38  4.76  7.14  9.52 11.9 ]


In [7]:
# Labels for each bin
labels = ['0 <= x < 2.38', '2.38<=x<4.76', '4.76<=x<7.14',
         '7.14<=x<9.52', '9.52<=x<11.9']

In [8]:
# Convert the numerical values into the categorical values
df['bins'] = pd.cut(co, bins=bins, labels=labels, include_lowest=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9357 entries, 2004-03-10 18:00:00 to 2005-04-04 14:00:00
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   CO(GT)         7765 non-null   float64 
 1   PT08.S1(CO)    8991 non-null   float64 
 2   PT08.S2(NMHC)  8991 non-null   float64 
 3   NOx(GT)        7718 non-null   float64 
 4   PT08.S3(NOx)   8991 non-null   float64 
 5   NO2(GT)        7715 non-null   float64 
 6   PT08.S4(NO2)   8991 non-null   float64 
 7   PT08.S5(O3)    8991 non-null   float64 
 8   RH             8991 non-null   float64 
 9   AH             8991 non-null   float64 
 10  C6H6(GT)       9357 non-null   float64 
 11  bins           9357 non-null   category
dtypes: category(1), float64(11)
memory usage: 886.6 KB


In [9]:
# Print bins
print(df['bins']) # 수치형으로 변환됨

Datetime
2004-03-10 18:00:00     2.38<=x<4.76
2004-03-10 19:00:00    0 <= x < 2.38
2004-03-10 20:00:00    0 <= x < 2.38
2004-03-10 21:00:00    0 <= x < 2.38
2004-03-10 22:00:00    0 <= x < 2.38
                           ...      
2005-04-04 10:00:00     2.38<=x<4.76
2005-04-04 11:00:00     2.38<=x<4.76
2005-04-04 12:00:00     2.38<=x<4.76
2005-04-04 13:00:00    0 <= x < 2.38
2005-04-04 14:00:00    0 <= x < 2.38
Name: bins, Length: 9357, dtype: category
Categories (5, object): ['0 <= x < 2.38' < '2.38<=x<4.76' < '4.76<=x<7.14' < '7.14<=x<9.52' < '9.52<=x<11.9']


In [10]:
# Visualize the histogram of bins
plt.hist(df['bins'], bins=5)
plt.show()

In [11]:
# 로그화 : 우편향된 데이터 분포를 정규 분포에 가깝게 변환
# - 이상치에 강건한 모델 생성
# - 정규 분포에 적합한 알고리즘, 모수적 방법 적용이 용이해짐

In [12]:
# Distribution of original data
df.min() # 최솟값 출력, 0이면 로그화 불가능

sns.distplot(df['PT08.S3(NOx)'])



<AxesSubplot:xlabel='PT08.S3(NOx)', ylabel='Density'>

In [13]:
# Calculate natural Logarithm on 'CO(GT)' column
df['log'] = np.log10(df['PT08.S3(NOx)'])

In [14]:
df.min() # 변화됨

CO(GT)                       0
PT08.S1(CO)                647
PT08.S2(NMHC)              383
NOx(GT)                      2
PT08.S3(NOx)               322
NO2(GT)                      2
PT08.S4(NO2)               551
PT08.S5(O3)                221
RH                         9.2
AH                      0.1847
C6H6(GT)                     0
bins             0 <= x < 2.38
log                    2.50786
dtype: object

In [15]:
# Distribution after log transform
sns.distplot(df['log'])
# plt.xlabel()
# plt.ylabel()



<AxesSubplot:xlabel='log', ylabel='Density'>

In [16]:
# 레이블 인코딩 : 개별 범주를 특정 숫자 값으로 표현
# - 순서형 데이터에 적용할 경우 데이터 순선, 순위 등을 보존 가능

# 원핫 인코딩 : 개별 범주를 특정 이진 벡터로 표현
# - 범주 별로 인덱스를 할당
#       범주에 해당되는 인덱스의 값 : 1
#       그 외의 나머지 값 : 0
# - 명목형 데이터 인코딩에 적함

In [20]:
# Make a dataset

emp_id = pd.Series([1,2,3,4,5])
gender = pd.Series(['Male', 'Female', 'Female', 'Male', 'Female'])
remarks = pd.Series(['Nice', 'Good', 'Great', 'Great', 'Nice'])

df_emp = pd.DataFrame()
df_emp['emp_id'] = emp_id
df_emp['gender'] = gender
df_emp['remarks'] = remarks

df_emp

Unnamed: 0,emp_id,gender,remarks
0,1,Male,Nice
1,2,Female,Good
2,3,Female,Great
3,4,Male,Great
4,5,Female,Nice


In [21]:
# Print unique values for each column
print(df_emp['emp_id'].unique())
print(df_emp['gender'].unique())
print(df_emp['remarks'].unique())

[1 2 3 4 5]
['Male' 'Female']
['Nice' 'Good' 'Great']


In [22]:
# One-hot encoding the categorial values
df_emp_encoded = pd.get_dummies(df_emp, columns=['gender', 'remarks'])
df_emp_encoded

Unnamed: 0,emp_id,gender_Female,gender_Male,remarks_Good,remarks_Great,remarks_Nice
0,1,0,1,0,0,1
1,2,1,0,1,0,0
2,3,1,0,0,1,0
3,4,0,1,0,1,0
4,5,1,0,0,0,1


In [25]:
# Scaling : 일반적으로 변수들은 서로 다른 값 범위를 가짐
# 많은 머신러닝 모델/알고리즘들이 스케일링된 데이터에서 더 잘 동작한다.
# - 정규화 : 모든 변수들을 0~1 사이의 값으로 스케일링
# - 표준화(z-score 정규화) : 표준편차를 기반으로 스케일링 수행

In [23]:
# Visualize two columns of different scales
plt.plot(df['CO(GT)'], label='CO')
plt.plot(df['PT08.S2(NMHC)'], label='NMHC')
plt.legend(loc='best')

<matplotlib.legend.Legend at 0x198809e9ca0>

In [24]:
# Normalize the 'CO(GT)' column
co = df['CO(GT)'].copy()
co_max = co.max()
co_min = co.min()

df['CO_Norm'] = (co - co_min) / (co_max - co_min)
df['CO_Norm']

Datetime
2004-03-10 18:00:00    0.218487
2004-03-10 19:00:00    0.168067
2004-03-10 20:00:00    0.184874
2004-03-10 21:00:00    0.184874
2004-03-10 22:00:00    0.134454
                         ...   
2005-04-04 10:00:00    0.260504
2005-04-04 11:00:00    0.201681
2005-04-04 12:00:00    0.201681
2005-04-04 13:00:00    0.176471
2005-04-04 14:00:00    0.184874
Name: CO_Norm, Length: 9357, dtype: float64

In [25]:
# Normalize the PT08.S2(NMHC) column
nmhc = df['PT08.S2(NMHC)'].copy()
nmhc_max = nmhc.max()
nmhc_min = nmhc.min()

df['NMHC_Norm'] = (nmhc - nmhc_min) / (nmhc_max - nmhc_min)
df['NMHC_Norm']

Datetime
2004-03-10 18:00:00    0.362097
2004-03-10 19:00:00    0.312398
2004-03-10 20:00:00    0.303659
2004-03-10 21:00:00    0.308575
2004-03-10 22:00:00    0.247406
                         ...   
2005-04-04 10:00:00    0.392135
2005-04-04 11:00:00    0.351720
2005-04-04 12:00:00    0.371382
2005-04-04 13:00:00    0.315674
2005-04-04 14:00:00    0.362643
Name: NMHC_Norm, Length: 9357, dtype: float64

In [26]:
# Visualized normalizd columns
plt.plot(df['CO_Norm'], label = 'CO (normalized)')
plt.plot(df['NMHC_Norm'], label = 'NMHC (normalized)')
plt.legend(loc = 'best')

<matplotlib.legend.Legend at 0x19880a75a00>

In [27]:
# Feature Split
# Make untidy movie data
movies = pd.Series(["The Godfather, 1972, Francis Ford Coppola",
                   "Contact, 1997, Robert Zemeckis",
                   "Parasite, 2019, Joon-ho Bong"])
movies

0    The Godfather, 1972, Francis Ford Coppola
1               Contact, 1997, Robert Zemeckis
2                 Parasite, 2019, Joon-ho Bong
dtype: object

In [28]:
# Divide movie data into title, year, director columns
lst_title = []
lst_year = []
lst_director = []

for val in movies :
    title, year, director = val.split(',') # data split
    lst_title.append(title)
    lst_year.append(year)
    lst_director.append(director)
print(lst_title)
print(lst_year)
print(lst_director)

['The Godfather', 'Contact', 'Parasite']
[' 1972', ' 1997', ' 2019']
[' Francis Ford Coppola', ' Robert Zemeckis', ' Joon-ho Bong']


In [30]:
# Make a DataFrame object
df_movie = pd.DataFrame()
df_movie['title'] = lst_title
df_movie['year'] = lst_year
df_movie['director'] = lst_director

df_movie

Unnamed: 0,title,year,director
0,The Godfather,1972,Francis Ford Coppola
1,Contact,1997,Robert Zemeckis
2,Parasite,2019,Joon-ho Bong
