In [10]:
import numpy as np
import pandas as pd

In [11]:
file_path = 'data/AirQualityUCI.csv'
df = pd.read_csv(file_path, delimiter=';')

In [12]:
df.columns = df.columns.str.strip(' ;')
df['Time'] = df['Time'].str.replace('.', ':')
df['datetime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'], dayfirst=True)
df.set_index('datetime', inplace=True)

In [13]:
df['CO(GT)'] = df['CO(GT)'].str.replace(',', '.')
df['CO(GT)'] = pd.to_numeric(df['CO(GT)'], errors='coerce')
df['CO(GT)'] = df['CO(GT)'].replace([np.inf, -np.inf], np.nan)
df['CO(GT)'] = df['CO(GT)'].ffill().bfill()

In [14]:
# 1. Feature Extraction: Year, Month, Day feature
df['Year'] = df.index.year
df['Month'] = df.index.month
df['Day'] = df.index.day

print("Year, Month, Day features:")
print(df[['Year', 'Month', 'Day']].head())

Year, Month, Day features:
                       Year  Month   Day
datetime                                
2004-03-10 18:00:00  2004.0    3.0  10.0
2004-03-10 19:00:00  2004.0    3.0  10.0
2004-03-10 20:00:00  2004.0    3.0  10.0
2004-03-10 21:00:00  2004.0    3.0  10.0
2004-03-10 22:00:00  2004.0    3.0  10.0


In [15]:
# 2. Feature Extraction: Lag features (same day last week, last month, last year)
df['SameDayLastWeek'] = df['CO(GT)'].shift(7)   
df['SameDayLastMonth'] = df['CO(GT)'].shift(30)  
df['SameDayLastYear'] = df['CO(GT)'].shift(365)  

print("\nLag features:")
print(df[['CO(GT)', 'SameDayLastWeek', 'SameDayLastMonth', 'SameDayLastYear']].head(15))


Lag features:
                     CO(GT)  SameDayLastWeek  SameDayLastMonth  \
datetime                                                         
2004-03-10 18:00:00     2.6              NaN               NaN   
2004-03-10 19:00:00     2.0              NaN               NaN   
2004-03-10 20:00:00     2.2              NaN               NaN   
2004-03-10 21:00:00     2.2              NaN               NaN   
2004-03-10 22:00:00     1.6              NaN               NaN   
2004-03-10 23:00:00     1.2              NaN               NaN   
2004-03-11 00:00:00     1.2              NaN               NaN   
2004-03-11 01:00:00     1.0              2.6               NaN   
2004-03-11 02:00:00     0.9              2.0               NaN   
2004-03-11 03:00:00     0.6              2.2               NaN   
2004-03-11 04:00:00  -200.0              2.2               NaN   
2004-03-11 05:00:00     0.7              1.6               NaN   
2004-03-11 06:00:00     0.7              1.2               Na

In [16]:
# 3. Feature Extraction: 2-month rolling average
df['2MonthRollingAvg'] = df['CO(GT)'].rolling(window=60).mean()  

print("\n2-month rolling average:")
print(df[['CO(GT)', '2MonthRollingAvg']].head(15))


2-month rolling average:
                     CO(GT)  2MonthRollingAvg
datetime                                     
2004-03-10 18:00:00     2.6               NaN
2004-03-10 19:00:00     2.0               NaN
2004-03-10 20:00:00     2.2               NaN
2004-03-10 21:00:00     2.2               NaN
2004-03-10 22:00:00     1.6               NaN
2004-03-10 23:00:00     1.2               NaN
2004-03-11 00:00:00     1.2               NaN
2004-03-11 01:00:00     1.0               NaN
2004-03-11 02:00:00     0.9               NaN
2004-03-11 03:00:00     0.6               NaN
2004-03-11 04:00:00  -200.0               NaN
2004-03-11 05:00:00     0.7               NaN
2004-03-11 06:00:00     0.7               NaN
2004-03-11 07:00:00     1.1               NaN
2004-03-11 08:00:00     2.0               NaN


In [17]:
# 4. Feature Extraction: Expanding maximum value till date
df['ExpandingMax'] = df['CO(GT)'].expanding().max()

print("\nExpanding maximum value till date:")
print(df[['CO(GT)', 'ExpandingMax']].head(15))


Expanding maximum value till date:
                     CO(GT)  ExpandingMax
datetime                                 
2004-03-10 18:00:00     2.6           2.6
2004-03-10 19:00:00     2.0           2.6
2004-03-10 20:00:00     2.2           2.6
2004-03-10 21:00:00     2.2           2.6
2004-03-10 22:00:00     1.6           2.6
2004-03-10 23:00:00     1.2           2.6
2004-03-11 00:00:00     1.2           2.6
2004-03-11 01:00:00     1.0           2.6
2004-03-11 02:00:00     0.9           2.6
2004-03-11 03:00:00     0.6           2.6
2004-03-11 04:00:00  -200.0           2.6
2004-03-11 05:00:00     0.7           2.6
2004-03-11 06:00:00     0.7           2.6
2004-03-11 07:00:00     1.1           2.6
2004-03-11 08:00:00     2.0           2.6


In [18]:
# 5. Feature Extraction: Quarterly data using the resample function
df['Q'] = df['CO(GT)'].resample('Q').mean()

print("\nQuarterly mean of CO(GT):")
print(df['Q'].head())


Quarterly mean of CO(GT):
datetime
2004-03-10 18:00:00   NaN
2004-03-10 19:00:00   NaN
2004-03-10 20:00:00   NaN
2004-03-10 21:00:00   NaN
2004-03-10 22:00:00   NaN
Name: Q, dtype: float64


  df['Q'] = df['CO(GT)'].resample('Q').mean()


In [19]:
# 6. Feature Extraction: Yearly data using the resample function
df['Yearly'] = df['CO(GT)'].resample('Y').mean()

print("\nYearly mean of CO(GT):")
print(df['Yearly'].head())


Yearly mean of CO(GT):
datetime
2004-03-10 18:00:00   NaN
2004-03-10 19:00:00   NaN
2004-03-10 20:00:00   NaN
2004-03-10 21:00:00   NaN
2004-03-10 22:00:00   NaN
Name: Yearly, dtype: float64


  df['Yearly'] = df['CO(GT)'].resample('Y').mean()
