In [69]:
import pandas as pd
import numpy as np
from pandas import datetime
from matplotlib import pyplot as plt

"""
Load AirQualityUCI Data
"""

def parser(x):
    return datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

input_file = './data/AirQualityUCI_refined.csv'

df = pd.read_csv(input_file,
                 index_col=[0],
                 parse_dates=[0],
                 date_parser=parser)

df.head()

  from pandas import datetime


Unnamed: 0_level_0,CO(GT),PT08.S1(CO),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),RH,AH,C6H6(GT)
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2004-03-10 18:00:00,2.6,1360.0,1046.0,166.0,1056.0,113.0,1692.0,1268.0,48.9,0.7578,11.9
2004-03-10 19:00:00,2.0,1292.0,955.0,103.0,1174.0,92.0,1559.0,972.0,47.7,0.7255,9.4
2004-03-10 20:00:00,2.2,1402.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,54.0,0.7502,9.0
2004-03-10 21:00:00,2.2,1376.0,948.0,172.0,1092.0,122.0,1584.0,1203.0,60.0,0.7867,9.2
2004-03-10 22:00:00,1.6,1272.0,836.0,131.0,1205.0,116.0,1490.0,1110.0,59.6,0.7888,6.5


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9357 entries, 2004-03-10 18:00:00 to 2005-04-04 14:00:00
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   CO(GT)         7765 non-null   float64
 1   PT08.S1(CO)    8991 non-null   float64
 2   PT08.S2(NMHC)  8991 non-null   float64
 3   NOx(GT)        7718 non-null   float64
 4   PT08.S3(NOx)   8991 non-null   float64
 5   NO2(GT)        7715 non-null   float64
 6   PT08.S4(NO2)   8991 non-null   float64
 7   PT08.S5(O3)    8991 non-null   float64
 8   RH             8991 non-null   float64
 9   AH             8991 non-null   float64
 10  C6H6(GT)       9357 non-null   float64
dtypes: float64(11)
memory usage: 877.2 KB


In [38]:
# Visualization setup
%matplotlib
%config InlineBackend.figure_format = 'svg'

from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = [10, 5]
plt.ion() # enable the interactive mode

import seaborn as sns
sns.set()  # set plot styles

Using matplotlib backend: Qt5Agg


In [70]:
# Interpolate the 'CO(GT)' column
co = df['CO(GT)'].interpolate().copy()
co

Datetime
2004-03-10 18:00:00    2.6
2004-03-10 19:00:00    2.0
2004-03-10 20:00:00    2.2
2004-03-10 21:00:00    2.2
2004-03-10 22:00:00    1.6
                      ... 
2005-04-04 10:00:00    3.1
2005-04-04 11:00:00    2.4
2005-04-04 12:00:00    2.4
2005-04-04 13:00:00    2.1
2005-04-04 14:00:00    2.2
Name: CO(GT), Length: 9357, dtype: float64

In [40]:
"""
Binning
"""

max_val = co.max()
min_val = co.min()

print(max_val, min_val)

11.9 0.0


In [41]:
# Make interval values
bins = np.linspace(min_val, max_val, 6)
bins

array([ 0.  ,  2.38,  4.76,  7.14,  9.52, 11.9 ])

In [42]:
# Labels for each bin
labels=['0 <=x<2.38', '2.38<=x<4.76', '4.76<=x<7.14',
       '7.14<=x<9.52', '9.52<=x<11.9']

In [43]:
# Convert the numerical values into the categorical values
df['bins'] = pd.cut(co, bins=bins, labels=labels, include_lowest=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9357 entries, 2004-03-10 18:00:00 to 2005-04-04 14:00:00
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   CO(GT)         7765 non-null   float64 
 1   PT08.S1(CO)    8991 non-null   float64 
 2   PT08.S2(NMHC)  8991 non-null   float64 
 3   NOx(GT)        7718 non-null   float64 
 4   PT08.S3(NOx)   8991 non-null   float64 
 5   NO2(GT)        7715 non-null   float64 
 6   PT08.S4(NO2)   8991 non-null   float64 
 7   PT08.S5(O3)    8991 non-null   float64 
 8   RH             8991 non-null   float64 
 9   AH             8991 non-null   float64 
 10  C6H6(GT)       9357 non-null   float64 
 11  bins           9357 non-null   category
dtypes: category(1), float64(11)
memory usage: 886.6 KB


In [44]:
# Print out bins
df[['CO(GT)','bins']]

Unnamed: 0_level_0,CO(GT),bins
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-03-10 18:00:00,2.6,2.38<=x<4.76
2004-03-10 19:00:00,2.0,0 <=x<2.38
2004-03-10 20:00:00,2.2,0 <=x<2.38
2004-03-10 21:00:00,2.2,0 <=x<2.38
2004-03-10 22:00:00,1.6,0 <=x<2.38
...,...,...
2005-04-04 10:00:00,3.1,2.38<=x<4.76
2005-04-04 11:00:00,2.4,2.38<=x<4.76
2005-04-04 12:00:00,2.4,2.38<=x<4.76
2005-04-04 13:00:00,2.1,0 <=x<2.38


In [45]:
# Visualize the histogram of bins
plt.hist(df['bins'], bins=5)

(array([2.787e+03, 6.058e+03, 4.470e+02, 5.900e+01, 6.000e+00]),
 array([0. , 0.8, 1.6, 2.4, 3.2, 4. ]),
 <BarContainer object of 5 artists>)

In [46]:
import seaborn as sns

In [47]:
"""
Log Transform
"""

# Distribution of original data
sns.distplot(df['PT08.S3(NOx)'])



<AxesSubplot:xlabel='PT08.S3(NOx)', ylabel='Density'>

In [48]:
# Calculate natural logarithm on 'CO(GT)' column
df['log'] = np.log10(df['PT08.S3(NOx)'])

In [49]:
# Print out
df[['PT08.S3(NOx)','log']]

Unnamed: 0_level_0,PT08.S3(NOx),log
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-03-10 18:00:00,1056.0,3.023664
2004-03-10 19:00:00,1174.0,3.069668
2004-03-10 20:00:00,1140.0,3.056905
2004-03-10 21:00:00,1092.0,3.038223
2004-03-10 22:00:00,1205.0,3.080987
...,...,...
2005-04-04 10:00:00,539.0,2.731589
2005-04-04 11:00:00,604.0,2.781037
2005-04-04 12:00:00,603.0,2.780317
2005-04-04 13:00:00,702.0,2.846337


In [50]:
# Distribution after log transform
sns.distplot(df['log'])
plt.xlabel('log(NOx)')
plt.show()



In [51]:
"""
One-hot Encoding
"""

# Make a dataset

emp_id = pd.Series([1, 2, 3, 4, 5])
gender = pd.Series(['Male', 'Female', 'Female', 'Male', 'Female'])
remarks = pd.Series(['Nice', 'Good', 'Great', 'Great', 'Nice'])

df = pd.DataFrame({'emp_id' : emp_id, 'gender' : gender, 'remarks' : remarks})

In [52]:
# Print unique values for each column
print(df['gender'].unique())
print(df['remarks'].unique())

['Male' 'Female']
['Nice' 'Good' 'Great']


In [53]:
# One-hot encoding the categorial values
df_encoded = pd.get_dummies(df, columns=['gender', 'remarks'])
df_encoded

Unnamed: 0,emp_id,gender_Female,gender_Male,remarks_Good,remarks_Great,remarks_Nice
0,1,0,1,0,0,1
1,2,1,0,1,0,0
2,3,1,0,0,1,0
3,4,0,1,0,1,0
4,5,1,0,0,0,1


In [71]:
nmhc = df['PT08.S2(NMHC)'].interpolate().copy()

In [72]:
"""
Normalization
"""

# Visualize two columns of different scales
plt.plot(co)
plt.plot(nmhc)

[<matplotlib.lines.Line2D at 0x1b5612fe7f0>]

In [73]:
# Normalize the 'CO(GT)' column
co_max = co.max()
co_min = co.min()
df['CO_Norm'] = (co - co_min ) / (co_max - co_min)

In [74]:
df[['CO(GT)','CO_Norm']]

Unnamed: 0_level_0,CO(GT),CO_Norm
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-03-10 18:00:00,2.6,0.218487
2004-03-10 19:00:00,2.0,0.168067
2004-03-10 20:00:00,2.2,0.184874
2004-03-10 21:00:00,2.2,0.184874
2004-03-10 22:00:00,1.6,0.134454
...,...,...
2005-04-04 10:00:00,3.1,0.260504
2005-04-04 11:00:00,2.4,0.201681
2005-04-04 12:00:00,2.4,0.201681
2005-04-04 13:00:00,2.1,0.176471


In [75]:
# Normalize the 'PT08.S2(NMHC)' column
nmhc_max = nmhc.max()
nmhc_min = nmhc.min()

df['NMHC_Norm'] = (nmhc - nmhc_min) / (nmhc_max - nmhc_min)
df['NMHC_Norm']

Datetime
2004-03-10 18:00:00    0.362097
2004-03-10 19:00:00    0.312398
2004-03-10 20:00:00    0.303659
2004-03-10 21:00:00    0.308575
2004-03-10 22:00:00    0.247406
                         ...   
2005-04-04 10:00:00    0.392135
2005-04-04 11:00:00    0.351720
2005-04-04 12:00:00    0.371382
2005-04-04 13:00:00    0.315674
2005-04-04 14:00:00    0.362643
Name: NMHC_Norm, Length: 9357, dtype: float64

In [76]:
# Visualized normalized columns
plt.plot(df['CO_Norm'], label='CO (normalized)')
plt.plot(df['NMHC_Norm'], label='NMHC (normalized)')
plt.legend(loc='best')

<matplotlib.legend.Legend at 0x1b560d245e0>

In [77]:
"""
Feature Split
"""

# Make untidy movie data
movies = pd.Series(["The Godfather, 1972, Francis Ford Coppola",
                    "Contact, 1997, Robert Zemeckis",
                   "Parasite, 2019, Joon-ho Bong"])

movies

0    The Godfather, 1972, Francis Ford Coppola
1               Contact, 1997, Robert Zemeckis
2                 Parasite, 2019, Joon-ho Bong
dtype: object

In [82]:
# Divide movie data into title, year, director columns

title_lst = []
year_lst = []
director_lst = []

for val in movies:
    title, year, director = val.split(',')
    title_lst.append(title)
    year_lst.append(year)
    director_lst.append(director)

print(title_lst, year_lst, director_lst)

['The Godfather', 'Contact', 'Parasite'] [' 1972', ' 1997', ' 2019'] [' Francis Ford Coppola', ' Robert Zemeckis', ' Joon-ho Bong']


In [83]:
# Make a DataFrame object
df = pd.DataFrame({'title' : title_lst,
             'year' : year_lst,
             'director' : director_lst})
print(df)

           title   year               director
0  The Godfather   1972   Francis Ford Coppola
1        Contact   1997        Robert Zemeckis
2       Parasite   2019           Joon-ho Bong
