# Air Quality Index

### Import libraries

In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn import metrics

%matplotlib inline

### Load dataset

In [38]:
df = pd.read_csv('../data/aqi_data.csv')

### Data inspection

In [39]:
df.head()

Unnamed: 0,date,time,pm25,temperature,pressure,humidity,wind
0,13-06-2024,12:23:14,73,31,1015,62,3
1,13-06-2024,11:56:38,76,31,1014,66,3
2,13-06-2024,12:00:50,76,31,1014,66,3
3,13-06-2024,12:15:52,76,31,1014,66,3
4,13-06-2024,12:18:49,76,31,1014,66,3


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   date         31 non-null     object
 1   time         31 non-null     object
 2   pm25         31 non-null     int64 
 3   temperature  31 non-null     int64 
 4   pressure     31 non-null     int64 
 5   humidity     31 non-null     int64 
 6   wind         31 non-null     int64 
dtypes: int64(5), object(2)
memory usage: 1.8+ KB


In [41]:
df.isnull().sum()

date           0
time           0
pm25           0
temperature    0
pressure       0
humidity       0
wind           0
dtype: int64

### Data cleaning

Remove time column, as it is not needed

In [42]:
df.drop(columns=['date', 'time'], axis=1, inplace=True)
df

Unnamed: 0,pm25,temperature,pressure,humidity,wind
0,73,31,1015,62,3
1,76,31,1014,66,3
2,76,31,1014,66,3
3,76,31,1014,66,3
4,76,31,1014,66,3
5,77,32,1013,62,4
6,80,32,1013,59,3
7,83,33,1012,55,3
8,83,33,1012,55,3
9,83,33,1012,55,3


### Duplicate values

In [43]:
df.duplicated().sum()

16

Drop duplicate values

In [44]:
df.drop_duplicates(inplace=True)
df

Unnamed: 0,pm25,temperature,pressure,humidity,wind
0,73,31,1015,62,3
1,76,31,1014,66,3
5,77,32,1013,62,4
6,80,32,1013,59,3
7,83,33,1012,55,3
19,156,29,1013,79,1
20,150,29,1013,79,1
21,152,28,1012,83,1
22,155,28,1012,83,1
24,157,27,1011,88,1


### Model training

In [45]:
X = df.drop(columns=['pm25'])
y = df['pm25']

In [46]:
X

Unnamed: 0,temperature,pressure,humidity,wind
0,31,1015,62,3
1,31,1014,66,3
5,32,1013,62,4
6,32,1013,59,3
7,33,1012,55,3
19,29,1013,79,1
20,29,1013,79,1
21,28,1012,83,1
22,28,1012,83,1
24,27,1011,88,1


In [47]:
y

0      73
1      76
5      77
6      80
7      83
19    156
20    150
21    152
22    155
24    157
25     55
27     73
28     92
29     87
30     94
Name: pm25, dtype: int64

### Train test split

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [49]:
lreg = LinearRegression()

In [50]:
# lreg.fit(X_train, y_train)
lreg.fit(X, y)

In [51]:
# y_pred = lreg.predict(X_test)
y_pred = lreg.predict(X)

### Getting accuracy score

In [52]:
# print(f'R2 Score: {metrics.r2_score(y_test, y_pred)}')
# print(f'RMSE: {metrics.mean_squared_error(y_test, y_pred)}')

print(f'R2 Score: {metrics.r2_score(y, y_pred)}')
print(f'RMSE: {metrics.mean_squared_error(y, y_pred)}')

R2 Score: 0.5134629977760781
RMSE: 646.1211389533685


### Trying other models

In [53]:
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=1.0)

In [54]:
ridge.fit(X, y)
ridge_pred = ridge.predict(X)

In [55]:
print(f'R2 Score: {metrics.r2_score(y, ridge_pred)}')
print(f'RMSE: {metrics.mean_squared_error(y, ridge_pred)}')

R2 Score: 0.513058853778426
RMSE: 646.6578421822504


In [56]:
cross_val_score(ridge, X, y, cv=5, scoring='r2')

array([-324.53317333,    0.75489151, -319.5626994 ,   -1.07569377,
        -80.02046956])

### Saving model

In [57]:
import pickle
with open('model.pickle', mode='wb') as file:
    pickle.dump(lreg, file)