# Air Quality Index

### Import libraries

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn import metrics

%matplotlib inline

### Load dataset

In [22]:
df = pd.read_csv('../data/aqi_data.csv')

### Data inspection

In [23]:
df.head()

Unnamed: 0,date,time,pm25,temperature,pressure,humidity,wind
0,13-06-2024,12:23:14,73,31,1015,62,3
1,13-06-2024,11:56:38,76,31,1014,66,3
2,13-06-2024,12:00:50,76,31,1014,66,3
3,13-06-2024,12:15:52,76,31,1014,66,3
4,13-06-2024,12:18:49,76,31,1014,66,3


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92 entries, 0 to 91
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   date         92 non-null     object
 1   time         92 non-null     object
 2   pm25         92 non-null     int64 
 3   temperature  92 non-null     object
 4   pressure     92 non-null     object
 5   humidity     92 non-null     object
 6   wind         92 non-null     object
dtypes: int64(1), object(6)
memory usage: 5.2+ KB


In [25]:
df.isnull().sum()

date           0
time           0
pm25           0
temperature    0
pressure       0
humidity       0
wind           0
dtype: int64

### Data cleaning

Remove time column, as it is not needed

In [26]:
df.drop(columns=['date', 'time'], axis=1, inplace=True)
df

Unnamed: 0,pm25,temperature,pressure,humidity,wind
0,73,31,1015,62,3
1,76,31,1014,66,3
2,76,31,1014,66,3
3,76,31,1014,66,3
4,76,31,1014,66,3
...,...,...,...,...,...
87,34,-,-,-,-
88,49,-,-,-,-
89,71,-,-,-,-
90,65,-,-,-,-


### Duplicate values

In [27]:
df.duplicated().sum()

30

Drop duplicate values

In [28]:
df.drop_duplicates(inplace=True)
df

Unnamed: 0,pm25,temperature,pressure,humidity,wind
0,73,31,1015,62,3
1,76,31,1014,66,3
5,77,32,1013,62,4
6,80,32,1013,59,3
7,83,33,1012,55,3
...,...,...,...,...,...
86,47,-,-,-,-
87,34,-,-,-,-
88,49,-,-,-,-
89,71,-,-,-,-


In [29]:
df = df[df['temperature'] != '-' ]
df = df[df['pressure'] != '-' ]
df = df[df['humidity'] != '-' ]
df = df[df['wind'] != '-' ]
df

Unnamed: 0,pm25,temperature,pressure,humidity,wind
0,73,31,1015,62,3
1,76,31,1014,66,3
5,77,32,1013,62,4
6,80,32,1013,59,3
7,83,33,1012,55,3
19,156,29,1013,79,1
20,150,29,1013,79,1
21,152,28,1012,83,1
22,155,28,1012,83,1
24,157,27,1011,88,1


In [43]:
df.shape

(56, 5)

### Model training

In [30]:
X = df.drop(columns=['pm25'])
y = df['pm25']

In [31]:
X

Unnamed: 0,temperature,pressure,humidity,wind
0,31,1015,62,3
1,31,1014,66,3
5,32,1013,62,4
6,32,1013,59,3
7,33,1012,55,3
19,29,1013,79,1
20,29,1013,79,1
21,28,1012,83,1
22,28,1012,83,1
24,27,1011,88,1


In [32]:
y

0      73
1      76
5      77
6      80
7      83
19    156
20    150
21    152
22    155
24    157
25     55
27     73
28     92
29     87
30     94
33    112
34    115
35     82
36     69
38     66
40     68
41     63
42     65
43     73
44     75
47     82
48     31
49     42
50     43
51     40
52     43
53     43
54     49
55     56
56     56
57     58
58     57
59     55
60     52
61     50
62     51
63     52
64     50
69     57
70     62
71     78
72     70
73     61
74     58
75     53
76     38
78     60
79     30
80     39
81     37
82     46
Name: pm25, dtype: int64

### Train test split

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [34]:
lreg = LinearRegression()

In [35]:
# lreg.fit(X_train, y_train)
lreg.fit(X, y)

In [36]:
# y_pred = lreg.predict(X_test)
y_pred = lreg.predict(X)

### Getting accuracy score

In [37]:
# print(f'R2 Score: {metrics.r2_score(y_test, y_pred)}')
# print(f'RMSE: {metrics.mean_squared_error(y_test, y_pred)}')

print(f'R2 Score: {metrics.r2_score(y, y_pred)}')
print(f'RMSE: {metrics.mean_squared_error(y, y_pred)}')

R2 Score: 0.4106568127314598
RMSE: 591.3004606407985


### Trying other models

In [38]:
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=1.0)

In [39]:
ridge.fit(X, y)
ridge_pred = ridge.predict(X)

In [40]:
print(f'R2 Score: {metrics.r2_score(y, ridge_pred)}')
print(f'RMSE: {metrics.mean_squared_error(y, ridge_pred)}')

R2 Score: 0.410361715812362
RMSE: 591.596537609135


In [41]:
cross_val_score(ridge, X, y, cv=5, scoring='r2')

array([ -1.39093595,  -3.08388565,  -3.53310639, -21.24211316,
        -0.39177455])

### Saving model

In [42]:
import pickle
with open('model.pickle', mode='wb') as file:
    pickle.dump(lreg, file)