In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import datetime
# Source: https://www.kaggle.com/rohanrao/air-quality-data-in-india?select=city_day.csv
# The path is relative to the notebook's working directory (local checkout).
# When running elsewhere (e.g. Colab), download city_day.csv from the Kaggle page
# above and point CITY_DAY_CSV at it. Do not paste tokenised download URLs into
# the notebook: access tokens must never be committed.
CITY_DAY_CSV = "data/india/city_day.csv"
data = pd.read_csv(CITY_DAY_CSV)
data.info()

In [None]:
# Distinct cities in the dataset (26 in the original run).
# Only the last expression of a cell is displayed, so the count is printed explicitly.
cities = data['City'].unique()
print(len(cities), "cities")
list(cities)

In [None]:
# Drop the categorical AQI_Bucket column; PM10 and the numeric AQI column are kept.
# Background on PM2.5 vs AQI:
# https://smartairfilters.com/en/blog/difference-pm2-5-aqi-measurements/
# errors='ignore' keeps the cell safe to re-run once the column is already gone.
data = data.drop(columns=['AQI_Bucket'], errors='ignore')

In [None]:
# Convert the Date column to pandas datetimes so it can be compared with datetime objects.
data['Date'] = data['Date'].pipe(pd.to_datetime)

In [None]:
# Print the frame summary again (column dtypes and non-null counts).
data.info()

In [None]:
# Worst single-day AQI reading before and during the COVID period.
# In the original run both maxima (AQI 2049 and AQI 1291) were attributed to Ahmedabad.
# The maxima are computed instead of hard-coded, and both periods are shown in
# one frame because a cell only displays its last expression.
# Rows are not passed through dropna(): a missing pollutant reading must not
# hide the row that holds the maximum AQI.
covid_start = pd.Timestamp(2020, 1, 1)
is_before = data['Date'] < covid_start
is_covid = data['Date'] >= covid_start

worst_before = data[is_before & (data['AQI'] == data.loc[is_before, 'AQI'].max())]
worst_covid = data[is_covid & (data['AQI'] == data.loc[is_covid, 'AQI'].max())]

poll_city = pd.concat([worst_before, worst_covid], keys=['before covid', 'covid period'])
poll_city

In [None]:
# Replace every NaN in positional columns 2..14 with the mean of its column.
# NOTE(review): column 14 is later used as the AQI target, so missing targets are
# mean-filled here as well, and the imputer sees all rows before the train/test
# split. Confirm this is intended.
from sklearn.impute import SimpleImputer
import numpy as np
value_columns = slice(2, 15)
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
data.iloc[:, value_columns] = imputer.fit_transform(data.iloc[:, value_columns])

In [None]:
# Print the frame summary again to check the non-null counts after imputation.
data.info()

In [None]:
# Observation dates split at the start of 2020.
# The boundary is 2020-01-01 with a half-open comparison, so 2019-12-31 lands in
# the pre-COVID list instead of being dropped from both lists.
covid_start = datetime.datetime(2020, 1, 1)
before_covid = data.loc[data['Date'] < covid_start, 'Date'].tolist()   # 2015-2019
covid_period = data.loc[data['Date'] >= covid_start, 'Date'].tolist()  # 2020 onwards
# Counts from the original run, which excluded 2019-12-31: 24862 before, 4646 after.

In [None]:
# Rows before / during the COVID period, with incomplete rows removed.
# Half-open split at 2020-01-01 so that 2019-12-31 is not lost from both frames.
covid_start = datetime.datetime(2020, 1, 1)
l_before = data[data['Date'] < covid_start].dropna()
l_after = data[data['Date'] >= covid_start].dropna()

# find the main factors/pollutants responsible using ML.

# Series.max() is used instead of the builtin max(): it is vectorised and does
# not raise when a period happens to be empty.

# Ozone
print('Ozone before', l_before['O3'].max())
print('Ozone after', l_after['O3'].max())

# AQI
print('AQI before', l_before['AQI'].max())
print('AQI after', l_after['AQI'].max())


In [None]:
# Feature matrix: positional columns 2..13 (PM2.5, PM10, ... Xylene).
feature_frame = data.iloc[:, 2:14]
x = feature_frame.to_numpy()
# Target vector: positional column 14 (AQI).
target_column = data.iloc[:, 14]
y = target_column.to_numpy()

In [None]:
# Count how many AQI observations fall into each National AQI category.
# https://pib.gov.in/newsite/PrintRelease.aspx?relid=110654
# Category upper limits follow the AQI column of that table
# (0-50, 51-100, 101-200, 201-300, 301-400, 401+). The previous limits
# (250 / 350 / 430) are the PM10 concentration breakpoints, not AQI limits.
# Comparisons use "<= upper limit" so fractional values (e.g. the imputed mean or
# 50.5) fall into the next category instead of dropping through to 'severe'.
aqi_upper_limits = (
  ('good', 50),
  ('satisfactory', 100),
  ('moderate', 200),
  ('poor', 300),
  ('very_poor', 400),
)
AQI_bucket = {'good': 0, 'satisfactory': 0, 'moderate': 0, 'poor': 0, 'very_poor': 0, 'severe': 0}
for val in y:
  if not val >= 0:
    # NaN or negative: not a valid AQI reading, so it is not counted.
    continue
  for bucket_name, upper_limit in aqi_upper_limits:
    if val <= upper_limit:
      AQI_bucket[bucket_name] += 1
      break
  else:
    AQI_bucket['severe'] += 1

In [None]:
# Display the per-bucket observation counts.
AQI_bucket

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(x, y, test_size=0.3, random_state=1)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Feature scaling: bring high-magnitude features onto a comparable scale.
# Standardisation (zero mean, unit variance; values mostly within -3..3) is used
# here rather than normalisation (values within 0..1).
#
# The scaler is fitted on the training set only and the SAME fitted statistics
# are applied to the test set. Re-fitting on x_test would scale the two sets
# differently and leak test-set statistics into the evaluation.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

### [LinearRegression model.](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression)

In [None]:
# Ordinary least-squares baseline trained on the scaled training set.
# Docs: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression
from sklearn.linear_model import LinearRegression
linear_regressor = LinearRegression()
linear_regressor.fit(X=x_train, y=y_train)

In [None]:
# Predictions of the linear model for the held-out rows.
y_pred = linear_regressor.predict(X=x_test)

In [None]:
# Regression metrics: https://scikit-learn.org/stable/modules/classes.html#regression-metrics
from sklearn.metrics import r2_score, mean_absolute_error
linear_r2 = r2_score(y_test, y_pred)
linear_mae = mean_absolute_error(y_test, y_pred)
print("r2_score: ", linear_r2)
print("mean_absolute_error: ", linear_mae)  # author's note: considered bad

In [None]:
# Predicted vs. actual values on the test set; labelled so the figure stands alone.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set(xlabel="y_test", ylabel="y_pred", title="Predicted vs. actual");

In [None]:
# Distribution of the residuals (actual minus predicted), with labelled axes.
residual_grid = sns.displot(y_test - y_pred)
residual_grid.set_axis_labels("y_test - y_pred", "Count");

### [SGDRegressor model.](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html#sklearn.linear_model.SGDRegressor)

In [None]:
from sklearn.linear_model import SGDRegressor
# Linear model fitted with stochastic gradient descent.
# SGD shuffles the training data, so the seed is pinned to make the fitted
# coefficients (and the reported metrics) reproducible between runs.
sdg_regressor = SGDRegressor(max_iter=6000, alpha=0.0005, tol=1e-3, random_state=42)
sdg_regressor.fit(x_train, y_train)

In [None]:
# Predictions of the SGD model for the held-out rows.
y_pred = sdg_regressor.predict(X=x_test)

In [None]:
# Regression metrics: https://scikit-learn.org/stable/modules/classes.html#regression-metrics
# Both metrics are imported here so the cell does not rely on an import made in
# a distant earlier cell.
from sklearn.metrics import r2_score, mean_absolute_error
print("r2_score: ", r2_score(y_test, y_pred))
print("mean_absolute_error: ", mean_absolute_error(y_test, y_pred))

### [XG Boost Regressor model.](https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.sklearn)

In [None]:
from xgboost import XGBRegressor
# Default-configured regressor; passed as `estimator` to the randomized search below.
xgb_regressor = XGBRegressor()

In [None]:
# in order to find the best parameters for XGBRegressor.
# we can use grid search CV https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
# or randomised search CV https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate hyper-parameter values for XGBRegressor.

# Number of boosting rounds: 100, 200, ..., 1200.
n_estimators = list(range(100, 1201, 100))

# Learning rates must be numeric; they were previously given as strings.
learning_rate = [0.05, 0.1, 0.2, 0.3, 0.5, 0.6]

# Maximum tree depth: 5, 10, ..., 30.
max_depth = list(range(5, 31, 5))

# Fraction of the training rows sampled for each tree.
subsample = [0.7, 0.6, 0.8]

min_child_weight = [3, 4, 5, 6, 7]

In [None]:
# Search space handed to the XGBoost randomized search (parameter name -> candidates).
param = dict(
  n_estimators=n_estimators,
  learning_rate=learning_rate,
  max_depth=max_depth,
  subsample=subsample,
  min_child_weight=min_child_weight,
)

In [None]:
# Randomized hyper-parameter search for XGBoost, scored by R^2 ('r2').
# Scoring names: https://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values
xgb_search_settings = dict(
  scoring='r2',
  n_iter=10,        # number of sampled parameter combinations
  cv=5,             # 5-fold cross-validation
  verbose=2,
  random_state=42,
  n_jobs=1,
)
xgb_select_param = RandomizedSearchCV(xgb_regressor, param, **xgb_search_settings)

In [None]:
# DONT train locally. use google colab.
# xgb_select_param.fit(x_train, y_train) 

In [None]:
# Value of xgb_select_param.best_params_ -- presumably copied from an earlier
# run of the (now commented-out) search; kept as a bare string for display.
'''
for: n_iter = 10
{'learning_rate': '0.05',
'max_depth': 15,
'min_child_weight': 5,
'n_estimators': 1200,
'subsample': 0.6}
'''

In [None]:
# Re-create the XGBoost regressor with the hyper-parameters selected by the search.
best_xgb_params = {
  'learning_rate': 0.05,
  'max_depth': 15,
  'min_child_weight': 5,
  'n_estimators': 1200,
  'subsample': 0.6,
}
xgb_regressor = XGBRegressor(**best_xgb_params)

In [None]:
# Train the tuned XGBoost model on the training set.
xgb_regressor.fit(X=x_train, y=y_train)

In [None]:
# Predictions of the XGBoost model for the held-out rows.
y_pred = xgb_regressor.predict(X=x_test)

In [None]:
# Test-set metrics for the XGBoost model.
# mean_absolute_error is imported here too, so the cell does not depend on an
# import made in a distant earlier cell.
from sklearn.metrics import r2_score, mean_absolute_error
print("r2_score: ", r2_score(y_test, y_pred))
print("mean_absolute_error: ", mean_absolute_error(y_test, y_pred))

In [None]:
# plt.scatter(y_test, y_pred)

### [RandomForestRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn-ensemble-randomforestregressor)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Base estimator for the randomized search. Random forests draw bootstrap samples
# and feature subsets at random, so the seed is pinned for reproducible results.
rf_regressor = RandomForestRegressor(random_state=42)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate hyper-parameter values for the random forest.

# Number of trees: 100, 200, ..., 1200.
n_estimators = list(range(100, 1201, 100))

# 'auto' has been removed from scikit-learn; for regressors it meant "use all
# features", which is spelled 1.0.
max_features = [1.0, 'sqrt']

# Maximum tree depth: 5, 10, ..., 30.
max_depth = list(range(5, 31, 5))

min_samples_split = [2, 5, 10, 15, 100]

min_samples_leaf = [1, 2, 5, 10]

In [None]:
# Search space handed to the random-forest randomized search.
param = dict(
  n_estimators=n_estimators,
  max_features=max_features,
  max_depth=max_depth,
  min_samples_split=min_samples_split,
  min_samples_leaf=min_samples_leaf,
)

In [None]:
# Randomized hyper-parameter search for the random forest,
# scored by negative mean squared error.
# Scoring names: https://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values
rf_search_settings = dict(
  scoring='neg_mean_squared_error',
  n_iter=10,        # number of sampled parameter combinations
  cv=5,             # 5-fold cross-validation
  verbose=2,
  random_state=42,
  n_jobs=1,
)
rf_select_param = RandomizedSearchCV(rf_regressor, param, **rf_search_settings)

In [None]:
# DONT train locally. use google colab.
# rf_select_param.fit(x_train, y_train)

In [None]:
# Value of rf_select_param.best_params_ -- presumably copied from an earlier
# run of the (now commented-out) search; the bare dict is only displayed.
{'max_depth': 25,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [None]:
# Random forest with the hyper-parameters selected by the search.
# random_state pins the bootstrap / feature sampling so the metrics reported
# below are reproducible from run to run.
rf_regressor = RandomForestRegressor(
  max_depth=25,
  max_features='sqrt',
  min_samples_leaf=1,
  min_samples_split=2,
  n_estimators=1000,
  random_state=42,
)

In [None]:
# Train the tuned random forest on the training set.
rf_regressor.fit(X=x_train, y=y_train)

In [None]:
# Predictions of the random forest for the held-out rows.
y_pred = rf_regressor.predict(X=x_test)

In [None]:
# Test-set metrics for the random forest.
# mean_absolute_error is imported here too, so the cell does not depend on an
# import made in a distant earlier cell.
from sklearn.metrics import r2_score, mean_absolute_error
print("r2_score: ", r2_score(y_test, y_pred))
print("mean_absolute_error: ", mean_absolute_error(y_test, y_pred))

### [SVR](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn-svm-svr)

In [None]:
from sklearn.svm import SVR
# Default-configured support vector regressor; passed as `estimator` to the randomized search below.
svr_regressor = SVR()

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Sampling distributions for the SVR randomized search.
from scipy import stats

# stats.uniform(loc, scale) is uniform on [loc, loc + scale]:
#   C     is drawn from [2, 12]
#   gamma is drawn from [0.1, 1.1]
C = stats.uniform(loc=2, scale=10)
gamma = stats.uniform(loc=0.1, scale=1)

In [None]:
# Search space handed to the SVR randomized search (parameter name -> distribution).
param = dict(gamma=gamma, C=C)

In [None]:
# Randomized hyper-parameter search for SVR, scored by negative mean squared error.
# Scoring names: https://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values
svr_search_settings = dict(
  scoring='neg_mean_squared_error',
  n_iter=10,        # number of sampled parameter combinations
  cv=5,             # 5-fold cross-validation
  verbose=2,
  random_state=42,
  n_jobs=1,
)
svr_select_param = RandomizedSearchCV(svr_regressor, param, **svr_search_settings)

In [None]:
# DONT train locally. use google colab.
# svr_select_param.fit(x_train, y_train)

In [None]:
# Value of svr_select_param.best_params_ -- presumably copied from an earlier
# run of the (now commented-out) search; the bare dict is only displayed.
{'C': 10.324426408004218, 'gamma': 0.31233911067827613}

In [None]:
# SVR with the hyper-parameters selected by the search.
best_svr_params = {'C': 10.324426408004218, 'gamma': 0.31233911067827613}
svr_regressor = SVR(**best_svr_params)

In [None]:
# Train the tuned SVR on the training set.
svr_regressor.fit(X=x_train, y=y_train)

In [None]:
# Predictions of the SVR model for the held-out rows.
y_pred = svr_regressor.predict(X=x_test)

In [None]:
# Test-set metrics for the SVR model.
from sklearn.metrics import r2_score, mean_absolute_error
for metric_label, metric_fn in (("r2_score: ", r2_score),
                                ("mean_absolute_error: ", mean_absolute_error)):
  print(metric_label, metric_fn(y_test, y_pred))

### [DecisionTreeRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn-tree-decisiontreeregressor)

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Base estimator for the grid search. The searched options include random
# splitters and feature subsets, so the seed is pinned for reproducible results.
dt_regressor = DecisionTreeRegressor(random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate hyper-parameter values for the decision tree grid search.
splitter = ['best', 'random']

max_depth = [1, 3, 5, 7, 9, 11, 12]

min_samples_leaf = list(range(1, 11))

# scikit-learn only accepts min_weight_fraction_leaf within [0.0, 0.5]; the
# previous grid went up to 1.0, so every fit above 0.5 was invalid.
min_weight_fraction_leaf = [round(float(v), 2) for v in np.linspace(start=0.0, stop=0.5, num=6)]

# 'auto' has been removed from scikit-learn; for a regression tree it meant
# "use all features", which the None entry already covers.
max_features = ['log2', 'sqrt', None]

max_leaf_nodes = [None, 10, 20, 30, 40, 50, 60, 70, 80, 90]

In [None]:
# Parameter grid handed to the decision-tree grid search.
param = dict(
  splitter=splitter,
  max_depth=max_depth,
  min_samples_leaf=min_samples_leaf,
  min_weight_fraction_leaf=min_weight_fraction_leaf,
  max_features=max_features,
  max_leaf_nodes=max_leaf_nodes,
)

In [None]:
# Exhaustive grid search for the decision tree, scored by negative mean squared
# error with 3-fold cross-validation.
dt_search_settings = dict(scoring='neg_mean_squared_error', cv=3, verbose=3)
dt_select_param = GridSearchCV(dt_regressor, param, **dt_search_settings)

In [None]:
# dt_select_param.fit(x_train, y_train)

In [None]:
# Value of dt_select_param.best_params_ -- presumably copied from an earlier
# run of the (now commented-out) search; the bare dict is only displayed.
{'max_depth': 5,
 'max_features': 'sqrt',
 'max_leaf_nodes': 10,
 'min_samples_leaf': 9,
 'min_weight_fraction_leaf': 0.1,
 'splitter': 'best'}

In [None]:
# Decision tree with the hyper-parameters selected by the grid search.
# max_features='sqrt' makes every split consider a random feature subset, so
# random_state is pinned to keep the fitted tree (and its metrics) reproducible.
dt_regressor = DecisionTreeRegressor(
  max_depth=5,
  max_features='sqrt',
  max_leaf_nodes=10,
  min_samples_leaf=9,
  min_weight_fraction_leaf=0.1,
  splitter='best',
  random_state=42,
)

In [None]:
# Train the tuned decision tree on the training set.
dt_regressor.fit(X=x_train, y=y_train)

In [None]:
# Predictions of the decision tree for the held-out rows.
y_pred = dt_regressor.predict(X=x_test)

In [None]:
# Test-set metrics for the decision tree.
from sklearn.metrics import r2_score, mean_absolute_error
dt_r2 = r2_score(y_test, y_pred)
dt_mae = mean_absolute_error(y_test, y_pred)
print("r2_score: ", dt_r2)
print("mean_absolute_error: ", dt_mae)