## Imports and Settings

In [1]:
import pandas
import numpy

In [2]:
# Location of the original (unmodified) air-quality extract on the local machine.
raw_csv_path = r"C:\Users\justi\Downloads\Final Project\2. Data\Original\air_quality.csv"
# Read the whole CSV into memory with pandas' default parsing settings.
dataframe = pandas.read_csv(raw_csv_path)

In [11]:
# Remove the column cap so wide frames are displayed without truncation.
pandas.set_option("display.max_columns", None)

## Consistency Checks

In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
dataframe.shape

(608699, 24)

In [5]:
# Data type pandas assigned to each column when the CSV was read.
dataframe.dtypes

Date                  object
Year                   int64
Month                  int64
Day                    int64
Address               object
State                 object
County                object
City                  object
O3 Mean              float64
O3 1st Max Value     float64
O3 1st Max Hour        int64
O3 AQI                 int64
CO Mean              float64
CO 1st Max Value     float64
CO 1st Max Hour        int64
CO AQI                 int64
SO2 Mean             float64
SO2 1st Max Value    float64
SO2 1st Max Hour       int64
SO2 AQI                int64
NO2 Mean             float64
NO2 1st Max Value    float64
NO2 1st Max Hour       int64
NO2 AQI                int64
dtype: object

In [6]:
# Count the missing entries in every column (all zeros means nothing to impute).
missing_per_column = dataframe.isna().sum()
missing_per_column

Date                 0
Year                 0
Month                0
Day                  0
Address              0
State                0
County               0
City                 0
O3 Mean              0
O3 1st Max Value     0
O3 1st Max Hour      0
O3 AQI               0
CO Mean              0
CO 1st Max Value     0
CO 1st Max Hour      0
CO AQI               0
SO2 Mean             0
SO2 1st Max Value    0
SO2 1st Max Hour     0
SO2 AQI              0
NO2 Mean             0
NO2 1st Max Value    0
NO2 1st Max Hour     0
NO2 AQI              0
dtype: int64

In [7]:
# Drop unneeded columns
# These labels are not carried forward into the prepared dataset.
columns_to_drop = [
    'Year', 'Month', 'Day', 'Address', 'County',
    'O3 1st Max Hour', 'CO 1st Max Hour', 'SO2 1st Max Hour', 'NO2 1st Max Hour',
]
# Rebind instead of mutating in place, and skip labels that are already gone, so
# re-running this cell is a no-op rather than a KeyError.
dataframe = dataframe.drop(columns = columns_to_drop, errors = 'ignore')

In [8]:
# Rename Columns
# Shorten each "<pollutant> 1st Max Value" header to "<pollutant> Max".
max_column_names = {
    'O3 1st Max Value': 'O3 Max',
    'CO 1st Max Value': 'CO Max',
    'SO2 1st Max Value': 'SO2 Max',
    'NO2 1st Max Value': 'NO2 Max',
}
dataframe.rename(columns = max_column_names, inplace = True)

In [9]:
# Check for mixed-type values
# A column is mixed when its cells do not all share a single Python type; the
# name of every such column is printed. Series.map is used because
# DataFrame.applymap is deprecated in current pandas releases, and counting the
# distinct types also works on an empty frame (no reliance on row 0 existing).
for col in dataframe.columns:
    if dataframe[col].map(type).nunique() > 1:
        print(col)

In [24]:
# Check for Duplicates
# Flag every row that repeats an earlier row across all columns, then show them.
duplicate_mask = dataframe.duplicated()
dataframe.loc[duplicate_mask]

Unnamed: 0,Date,State,City,O3 Mean,O3 Max,O3 AQI,CO Mean,CO Max,CO AQI,SO2 Mean,SO2 Max,SO2 AQI,NO2 Mean,NO2 Max,NO2 AQI
87221,7/6/2003,North Carolina,Charlotte,0.019412,0.025,23,0.300000,0.3,3,2.666667,22.0,31,5.000000,14.0,13
87265,7/28/2003,North Carolina,Charlotte,0.054647,0.079,129,0.400000,0.5,6,2.625000,17.0,24,10.000000,19.0,18
87301,8/15/2003,North Carolina,Charlotte,0.036882,0.068,93,0.554167,0.8,9,0.333333,2.0,3,18.708333,42.0,40
108678,5/17/2004,North Carolina,Charlotte,0.035235,0.048,44,0.350000,0.4,5,2.750000,16.0,23,7.333333,18.0,17
108700,5/28/2004,North Carolina,Charlotte,0.023412,0.051,47,0.337500,0.6,7,2.791667,8.0,11,13.708333,35.0,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
600161,8/23/2021,Missouri,St. Louis,0.024118,0.047,44,0.316667,0.4,5,0.829167,4.8,6,11.158333,28.3,26
600187,9/5/2021,Missouri,St. Louis,0.023294,0.043,40,0.275000,0.6,7,0.227273,1.2,1,7.370833,27.6,25
600189,9/6/2021,Missouri,St. Louis,0.037000,0.050,46,0.375000,0.7,8,1.583333,23.1,33,5.725000,24.0,23
602582,1/16/2021,Ohio,Cleveland,0.019364,0.021,19,0.400000,0.4,5,0.000000,0.0,0,3.875000,7.0,7


In [33]:
# Check for accuracy
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the recorded output shows negative minimums for several Mean/Max
# columns — confirm whether those readings are valid before relying on them.
dataframe.describe()

Unnamed: 0,O3 Mean,O3 Max,O3 AQI,CO Mean,CO Max,CO AQI,SO2 Mean,SO2 Max,SO2 AQI,NO2 Mean,NO2 Max,NO2 AQI
count,608699.0,608699.0,608699.0,608699.0,608699.0,608699.0,608699.0,608699.0,608699.0,608699.0,608699.0,608699.0
mean,0.028477,0.038903,39.112517,0.337258,0.476798,5.377387,1.523358,4.210271,5.568708,11.738067,23.610711,22.12424
std,0.012218,0.015031,22.480864,0.282958,0.446321,5.142873,2.495094,7.983793,10.736164,9.083091,15.412489,14.610125
min,-0.000706,0.0,0.0,-0.4375,-0.4,0.0,-2.508333,-2.3,0.0,-4.629167,-4.4,0.0
25%,0.019647,0.029,27.0,0.179167,0.2,2.0,0.1875,0.6,0.0,4.978261,11.2,10.0
50%,0.028235,0.038,35.0,0.2625,0.4,5.0,0.666667,1.7,1.0,9.541667,21.8,20.0
75%,0.036765,0.048,44.0,0.420833,0.6,7.0,1.772727,4.0,6.0,16.304348,33.7,31.0
max,0.107353,0.14,237.0,7.508333,15.5,201.0,321.625,351.0,200.0,140.65,269.2,133.0


## Exports

In [28]:
# Destination for the cleaned dataset; the row index is left out of the file.
prepared_csv_path = r"C:\Users\justi\Downloads\Final Project\2. Data\Prepared\air_quality_prepared.csv"
dataframe.to_csv(prepared_csv_path, index = False)