In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
from matplotlib import pyplot as plt

In [None]:
def parser(x):
    """Convert a 'YYYY-MM-DD HH:MM:SS' timestamp string into a ``datetime``.

    Args:
        x: Timestamp text formatted as ``%Y-%m-%d %H:%M:%S``.

    Returns:
        The ``datetime.datetime`` value parsed from ``x``.

    Raises:
        ValueError: If ``x`` does not match the expected format.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    parsed = datetime.strptime(x, timestamp_format)
    return parsed

# Location of the refined UCI Air Quality CSV, relative to the notebook.
input_file = './data/AirQualityUCI_refined.csv'

# Read the file with its first column as the index, then convert that index
# to datetimes in a single vectorised call. This avoids read_csv's
# `date_parser` argument, which is deprecated in pandas 2.0+ and relies on a
# per-value Python callback.
df = pd.read_csv(input_file, index_col=[0])
df.index = pd.to_datetime(df.index, format='%Y-%m-%d %H:%M:%S')
print(df.head())

# Visualization setup: select the matplotlib backend, apply seaborn styling,
# and set the default figure size used by the plots in this notebook.
# NOTE(review): the bare `%matplotlib` magic names no backend, while the
# `%config InlineBackend...` line below configures the inline backend —
# confirm which backend is intended.
%matplotlib
# pyplot is re-imported here although the first cell already imports it.
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()

# Request SVG output from the inline backend and a 10x5 default figure size.
%config InlineBackend.figure_format = 'svg'
plt.rcParams['figure.figsize'] = [10, 5]
plt.ion() # enable the interactive mode

In [None]:
# Fill the gaps in the 'CO(GT)' column by interpolation. The result is a new
# Series, so the column stored in df is left untouched.
co = df['CO(GT)'].interpolate()

In [None]:
"""
Binning
"""

# Smallest and largest interpolated CO values; they bound the bin edges.
min_val, max_val = co.min(), co.max()

In [None]:
# Six evenly spaced edges between min_val and max_val delimit five
# equal-width bins.
bin_edge_count = 6
bins = np.linspace(min_val, max_val, num=bin_edge_count)

In [None]:
# Labels for each bin, derived from the computed edges so they always describe
# the real intervals rather than hard-coded numbers. pd.cut's default
# intervals are open on the left and closed on the right, i.e. (lo, hi]; only
# the first bin also contains its left edge when include_lowest=True is used.
labels = [
    '{:.2f}{}x<={:.2f}'.format(lo, '<=' if i == 0 else '<', hi)
    for i, (lo, hi) in enumerate(zip(bins[:-1], bins[1:]))
]

In [None]:
# Map every interpolated CO reading to its bin label and store the resulting
# categorical column on df. include_lowest=True keeps readings equal to the
# first edge inside the first bin.
co_binned = pd.cut(co, bins=bins, labels=labels, include_lowest=True)
df['bins'] = co_binned

In [None]:
# Display the bin label assigned to each row
df.loc[:, 'bins']

In [None]:
# Visualize how many readings fall into each bin.
# plt.hist is designed for numeric data: given string labels it positions
# categories by order of first appearance and slices that axis into `bins`
# numeric intervals, so bars need not follow bin order or line up with their
# labels. Counting per category and drawing a bar chart keeps the bins in
# order and shows empty bins as zero-height bars.
bin_counts = df['bins'].value_counts().sort_index()
plt.bar(bin_counts.index.astype(str), bin_counts.values)
plt.xlabel('CO(GT) bin')
plt.ylabel('Number of readings')
plt.show()

In [None]:
"""
Log Transform
"""

# Distribution of the original data.
# Caution: a value of 0 would cause a divide-by-zero when the log is
# computed, so check the column minimum (df.min()) before transforming.

nox_raw = df['PT08.S3(NOx)']
sns.distplot(nox_raw)

In [None]:
# Base-10 logarithm of the 'PT08.S3(NOx)' column (np.log10, not the natural
# log). Values that are not strictly positive are masked to NaN first so the
# transform cannot produce -inf or emit divide-by-zero warnings.
nox = df['PT08.S3(NOx)']
df['log'] = np.log10(nox.where(nox > 0))

In [None]:
# Minimum value of every column in df
df.min(axis=0)

In [None]:
# Distribution of the log-transformed column
log_axes = sns.distplot(df['log'])
log_axes.set_xlabel('log(NOx)')
plt.show()

In [None]:
"""
One-hot Encoding
"""
# Build a small employee dataset used to demonstrate the encoding.

emp_id = pd.Series([1, 2, 3, 4, 5])
gender = pd.Series(['Male', 'Female', 'Female', 'Male', 'Female'])
remarks = pd.Series(['Nice', 'Good', 'Great','Great','Nice'])

# One column per Series, keyed by column name in insertion order.
df_emp = pd.DataFrame({
    'emp_id': emp_id,
    'gender': gender,
    'remarks': remarks,
})

In [None]:
# Print the unique values of each column. A notebook cell only displays its
# last bare expression, so every result is printed explicitly; otherwise the
# 'emp_id' and 'gender' results would be silently discarded.
for column in ['emp_id', 'gender', 'remarks']:
    print(column, df_emp[column].unique())

In [None]:
# One-hot encode the categorical columns; columns not listed here are kept
# as they are.
categorical_columns = ['gender', 'remarks']
df_emp_encoded = pd.get_dummies(df_emp, columns=categorical_columns)
df_emp_encoded

In [None]:
"""
Normalization
"""

# Visualize two columns of different scale.
# The 'NMHC' series plots 'PT08.S2(NMHC)' — the same column that is
# normalized later in this section — so the legend matches the data drawn.
plt.plot(df['CO(GT)'], label = 'CO')
plt.plot(df['PT08.S2(NMHC)'], label = 'NMHC')
plt.legend(loc = 'best')

In [None]:
# Min-max normalize the 'CO(GT)' column: subtract the column minimum and
# divide by the column's (max - min) span.

co = df['CO(GT)'].copy()
co_min, co_max = co.min(), co.max()

df['CO_Norm'] = (co - co_min).div(co_max - co_min)
df['CO_Norm']

In [None]:
# Min-max normalize the 'PT08.S2(NMHC)' column: subtract the column minimum
# and divide by the column's (max - min) span.
nmhc = df['PT08.S2(NMHC)'].copy()
nmhc_min, nmhc_max = nmhc.min(), nmhc.max()

df['NMHC_Norm'] = (nmhc - nmhc_min).div(nmhc_max - nmhc_min)
df['NMHC_Norm']

In [None]:
# Visualize the normalized columns together on one set of axes

for column, legend_label in (('CO_Norm', 'CO (normalized)'),
                             ('NMHC_Norm', 'NMHC (normalized)')):
    plt.plot(df[column], label=legend_label)
plt.legend(loc='best')