# AIR QUALITY ANALYSIS - INDIA

# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [3]:
import warnings

# Silence every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")


# Data Analysis

## Data Exploration

In [35]:
# Load the air-quality measurements from the CSV file kept next to the notebook.
air_quality_df = pd.read_csv(filepath_or_buffer='./data/data.csv')


In [36]:
# Row count of the freshly loaded frame (first entry of the shape tuple).
number_of_rows = air_quality_df.shape[0]
row_count_message = 'Number of rows in the DataFrame: {}'.format(number_of_rows)
print(row_count_message)


Number of rows in the DataFrame: 435742


In [37]:
# Preview the first rows to sanity-check the column names and value formats.
air_quality_df.head()

Unnamed: 0,stn_code,sampling_date,state,location,agency,type,so2,no2,rspm,spm,location_monitoring_station,pm2_5,date
0,150.0,February - M021990,Andhra Pradesh,Hyderabad,,"Residential, Rural and other Areas",4.8,17.4,,,,,1990-02-01
1,151.0,February - M021990,Andhra Pradesh,Hyderabad,,Industrial Area,3.1,7.0,,,,,1990-02-01
2,152.0,February - M021990,Andhra Pradesh,Hyderabad,,"Residential, Rural and other Areas",6.2,28.5,,,,,1990-02-01
3,150.0,March - M031990,Andhra Pradesh,Hyderabad,,"Residential, Rural and other Areas",6.3,14.7,,,,,1990-03-01
4,151.0,March - M031990,Andhra Pradesh,Hyderabad,,Industrial Area,4.7,7.5,,,,,1990-03-01


In [38]:
# List each column's dtype and non-null count to see where data is missing.
air_quality_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435742 entries, 0 to 435741
Data columns (total 13 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   stn_code                     291665 non-null  object 
 1   sampling_date                435739 non-null  object 
 2   state                        435742 non-null  object 
 3   location                     435739 non-null  object 
 4   agency                       286261 non-null  object 
 5   type                         430349 non-null  object 
 6   so2                          401096 non-null  float64
 7   no2                          419509 non-null  float64
 8   rspm                         395520 non-null  float64
 9   spm                          198355 non-null  float64
 10  location_monitoring_station  408251 non-null  object 
 11  pm2_5                        9314 non-null    float64
 12  date                         435735 non-null  object 
dtyp

In [8]:
# Summary statistics (count, mean, std, quartiles, extremes) for the numeric columns.
air_quality_df.describe()

Unnamed: 0,so2,no2,rspm,spm,pm2_5
count,401096.0,419509.0,395520.0,198355.0,9314.0
mean,10.829414,25.809623,108.832784,220.78348,40.791467
std,11.177187,18.503086,74.87243,151.395457,30.832525
min,0.0,0.0,0.0,0.0,3.0
25%,5.0,14.0,56.0,111.0,24.0
50%,8.0,22.0,90.0,187.0,32.0
75%,13.7,32.2,142.0,296.0,46.0
max,909.0,876.0,6307.033333,3380.0,504.0


## Data Cleaning

### Missing Values

#### Dropping Columns
1) If the percentage of missing values in a column is more than 50%, that particular column will be dropped 
2) Such columns are dropped because, with most of their values missing, they won't add any value to our Air Quality Analysis
3) Based on the percentages below, the `spm` (54.5% missing) and `pm2_5` (97.9% missing) columns will be dropped

In [40]:
def show_mising_values(df=None):
    """Print the percentage of missing values in every column.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``air_quality_df``
        is used, so existing ``show_mising_values()`` calls keep working.

    Returns
    -------
    None
        The per-column percentages are printed, not returned.
    """
    if df is None:
        df = air_quality_df
    # The mean of the boolean null-mask is the fraction of missing entries.
    # It is taken over the frame's *current* length, so the figures stay
    # correct after rows are dropped (a row count captured right after
    # loading would be stale by then).
    print(df.isnull().mean() * 100)

In [41]:
# Percentage of missing values per column, before any cleaning.
show_mising_values()

stn_code                       33.064749
sampling_date                   0.000688
state                           0.000000
location                        0.000688
agency                         34.304933
type                            1.237659
so2                             7.951035
no2                             3.725370
rspm                            9.230692
spm                            54.478797
location_monitoring_station     6.309009
pm2_5                          97.862497
date                            0.001606
dtype: float64


In [42]:
# Drop the two pollutant columns that are more than half empty.
# errors='ignore' keeps the cell safe to re-run: a second execution would
# otherwise raise KeyError because the columns are already gone. The result is
# rebound instead of using inplace=True so the step reads as a plain assignment.
air_quality_df = air_quality_df.drop(columns=['spm', 'pm2_5'], errors='ignore')

* The column `stn_code` also has 33% of its values missing
* No statistical method can be used to impute its null values
* It also has 803 unique values. Hence, it should be removed

In [43]:
# Summarise `stn_code` (non-null count, distinct values, most frequent value).
air_quality_df['stn_code'].describe()


count     291665.0
unique       803.0
top          193.0
freq        1428.0
Name: stn_code, dtype: float64

* The column `agency` also has 34% of its values missing
* No statistical method can be used to impute its null values
* It also has 64 unique values. Hence, it should be removed

In [44]:
# Summarise `agency` (non-null count, distinct values, most frequent value).
air_quality_df['agency'].describe()

count                                        286261
unique                                           64
top       Maharashtra State Pollution Control Board
freq                                          27857
Name: agency, dtype: object

In [45]:
# Drop the two sparsely populated identifier-like columns.
# errors='ignore' keeps the cell safe to re-run: a second execution would
# otherwise raise KeyError because the columns are already gone. The result is
# rebound instead of using inplace=True so the step reads as a plain assignment.
air_quality_df = air_quality_df.drop(columns=['agency', 'stn_code'], errors='ignore')

#### Replacing NaN values with statistical values

In [46]:
# Replace missing pollutant readings with the mean of their column.
# The filled column is assigned back to the frame. Calling
# fillna(inplace=True) on the `air_quality_df[column]` selection is chained
# assignment: it is deprecated in recent pandas and, with Copy-on-Write
# enabled, only modifies a temporary object and leaves the frame untouched.
for column in ['so2', 'no2', 'rspm']:
    air_quality_df[column] = air_quality_df[column].fillna(air_quality_df[column].mean())

In [47]:
# Re-check: the imputed pollutant columns should now report 0% missing.
show_mising_values()

sampling_date                  0.000688
state                          0.000000
location                       0.000688
type                           1.237659
so2                            0.000000
no2                            0.000000
rspm                           0.000000
location_monitoring_station    6.309009
date                           0.001606
dtype: float64


#### Removing Rows with Missing Values in String (object) Columns

In [49]:
# Discard every row that still contains at least one missing value.
air_quality_df.dropna(how='any', axis='index', inplace=True)

In [50]:
# Final check: no column should have any missing values left.
show_mising_values()

sampling_date                  0.0
state                          0.0
location                       0.0
type                           0.0
so2                            0.0
no2                            0.0
rspm                           0.0
location_monitoring_station    0.0
date                           0.0
dtype: float64


### Outliers and Duplicates