In [3]:
%matplotlib inline

import matplotlib.pyplot as plt
import geopandas as gpd
import palettable as pltt
import seaborn as sns
from seaborn import palplot
import pandas as pd

In [4]:
# Location of the district-level pollution table (Stata .dta format)
pollution_dta_path = 'data/Indicators/ddl_pollution_sedac_2011.dta'

# Load the Stata file into a DataFrame
pollution_df = pd.read_stata(filepath_or_buffer=pollution_dta_path)
# Preview the first rows of the loaded table
pollution_df.head()

Unnamed: 0,pc11_district_id,pc11_district_name,pc11_state_id,pc11_state_name,mean_pollution
0,1,Kupwara,1,Jammu & Kashmir,29.339097
1,2,Badgam,1,Jammu & Kashmir,38.904636
2,3,LehLadakh,1,Jammu & Kashmir,3.99317
3,4,Kargil,1,Jammu & Kashmir,8.861835
4,5,Punch,1,Jammu & Kashmir,35.535978


### Data Documentation

Pollution data: We use global surface concentrations (micrograms per cubic meter) of mineral dust and sea-salt filtered fine particulate matter of 2.5 micrometers or smaller (PM2.5) for 2016, taken from the Global Annual PM2.5 Grids hosted at the Socioeconomic Data and Applications Center (SEDAC) (https://sedac.ciesin.columbia.edu/data/set/sdei-global-annual-gwr-pm2-5-modis-misr-seawifs-aod).

We load these data into ArcGIS, where we join them with shapefiles containing district boundaries from 2011 (census year) and from 2019 (latest) to obtain the average pollution estimates at the district level.


|   Variable Name                   |             Definition                                      |
|-----------------------------------|-------------------------------------------------------------|
|pc11_district_id                   |Unique district ID in census 2011                            |
|pc11_district_name                 |District name in census 2011                                 |
|pc11_state_id                      |State ID in census 2011                                      |
|pc11_state_name                    |State name in census 2011                                    |
|mean_pollution                     |PM2.5 mean in 2016                                           |


In [5]:
# General info: index range, per-column non-null counts and dtypes,
# and the frame's memory usage
pollution_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 640 entries, 0 to 639
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   pc11_district_id    640 non-null    object 
 1   pc11_district_name  640 non-null    object 
 2   pc11_state_id       640 non-null    int8   
 3   pc11_state_name     640 non-null    object 
 4   mean_pollution      640 non-null    float64
dtypes: float64(1), int8(1), object(3)
memory usage: 25.6+ KB


In [7]:
# Count the missing (NaN) entries in each column
pollution_df.isna().sum()

pc11_district_id      0
pc11_district_name    0
pc11_state_id         0
pc11_state_name       0
mean_pollution        0
dtype: int64

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
pollution_df.describe()

Unnamed: 0,pc11_state_id,mean_pollution
count,640.0,640.0
mean,17.114062,47.819471
std,9.426486,27.068609
min,1.0,3.99317
25%,9.0,28.019641
50%,18.0,39.533115
75%,24.0,62.502497
max,35.0,123.469135


In [10]:
# Drop the district/state name columns; the ID columns and the pollution
# measure are kept.
# The result is assigned back to `pollution_df`, so without `errors='ignore'`
# re-running this cell would raise KeyError once the columns are already gone.
# Ignoring missing labels keeps the cell idempotent.
pollution_df = pollution_df.drop(
    columns=['pc11_district_name', 'pc11_state_name'],
    errors='ignore',
)
# Display top of table
pollution_df.head()

Unnamed: 0,pc11_district_id,pc11_state_id,mean_pollution
0,1,1,29.339097
1,2,1,38.904636
2,3,1,3.99317
3,4,1,8.861835
4,5,1,35.535978
