In [3]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from scipy import stats

%matplotlib inline

In [88]:
# Load the raw 2021 AQI extract. The first CSV column is used as the row index,
# and the bytes are decoded with Python's 'unicode_escape' codec.
data = pd.read_csv(
    "10_state_aqi_2021.csv",
    encoding="unicode_escape",
    index_col=0,
)

In [89]:
# Dimensions first: (row count, column count).
print(f"Number of rows and columns in the data: {data.shape}")

# DataFrame.info() writes its own summary (dtypes, non-null counts, memory) to stdout.
print("Information about the data structure:")
data.info()

# Column labels exactly as stored in the frame.
print(f"Names of the attributes in the dataset: {data.columns!s}")

# Keep head() as the final expression so the notebook renders the preview table.
print("Some top rows of the data:")
data.head()

Number of rows and columns in the data: (64988, 11)
Information about the data structure:
<class 'pandas.core.frame.DataFrame'>
Index: 64988 entries, Alaska to Virgin Islands
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   county Name                64988 non-null  object
 1   State Code                 64988 non-null  int64 
 2   County Code                64988 non-null  int64 
 3   Date                       64988 non-null  object
 4   AQI                        64988 non-null  int64 
 5   Category                   64988 non-null  object
 6   Defining Parameter         64988 non-null  object
 7   Defining Site              64988 non-null  object
 8   Number of Sites Reporting  64988 non-null  int64 
 9   Created                    64988 non-null  object
 10  Last Updated               64988 non-null  object
dtypes: int64(4), object(7)
memory usage: 5.9+ MB
Names of the attributes in 

Unnamed: 0_level_0,county Name,State Code,County Code,Date,AQI,Category,Defining Parameter,Defining Site,Number of Sites Reporting,Created,Last Updated
State Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Alaska,Aleutians East,2,13,2021-01-01,7,Good,PM2.5,02-013-0002,1,2021-01-01 16:30:00,2021-12-31 23:12:00
Alaska,Aleutians East,2,13,2021-01-04,8,Good,PM2.5,02-013-0002,1,2021-01-04 16:30:00,2021-12-31 23:12:00
Alaska,Aleutians East,2,13,2021-01-07,11,Good,PM2.5,02-013-0002,1,2021-01-07 16:30:00,2021-12-31 23:12:00
Alaska,Aleutians East,2,13,2021-01-10,4,Good,PM2.5,02-013-0002,1,2021-01-10 16:30:00,2021-12-31 23:12:00
Alaska,Aleutians East,2,13,2021-01-13,4,Good,PM2.5,02-013-0002,1,2021-01-13 16:30:00,2021-12-31 23:12:00


In [90]:
# Tally absent entries column by column (isna is the same check as isnull).
missing_values = data.isna().sum()

# Header and per-column counts on separate lines, as two print calls would produce.
print("Missing Values:", missing_values, sep="\n")

Missing Values:
county Name                  0
State Code                   0
County Code                  0
Date                         0
AQI                          0
Category                     0
Defining Parameter           0
Defining Site                0
Number of Sites Reporting    0
Created                      0
Last Updated                 0
dtype: int64


In [91]:
# Re-express the temporal columns as day-first text.
# NOTE(review): each column is overwritten with formatted strings, so running this
# cell a second time would re-parse that text instead of the raw values — confirm
# the cell is only executed once per load.

# 'Date' is parsed with format='mixed' (format inferred per element), then
# rendered as day-month-year.
data['Date'] = (
    pd.to_datetime(data['Date'], format='mixed')
    .dt.strftime('%d-%m-%Y')
)

# Both timestamp columns share one layout: time of day followed by the date.
for stamp_col in ('Created', 'Last Updated'):
    parsed = pd.to_datetime(data[stamp_col])
    data[stamp_col] = parsed.dt.strftime('%H:%M:%S %d-%m-%Y')

# Preview the reformatted columns.
data.head()

Unnamed: 0_level_0,county Name,State Code,County Code,Date,AQI,Category,Defining Parameter,Defining Site,Number of Sites Reporting,Created,Last Updated
State Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Alaska,Aleutians East,2,13,01-01-2021,7,Good,PM2.5,02-013-0002,1,16:30:00 01-01-2021,23:12:00 31-12-2021
Alaska,Aleutians East,2,13,04-01-2021,8,Good,PM2.5,02-013-0002,1,16:30:00 04-01-2021,23:12:00 31-12-2021
Alaska,Aleutians East,2,13,07-01-2021,11,Good,PM2.5,02-013-0002,1,16:30:00 07-01-2021,23:12:00 31-12-2021
Alaska,Aleutians East,2,13,10-01-2021,4,Good,PM2.5,02-013-0002,1,16:30:00 10-01-2021,23:12:00 31-12-2021
Alaska,Aleutians East,2,13,13-01-2021,4,Good,PM2.5,02-013-0002,1,16:30:00 13-01-2021,23:12:00 31-12-2021


In [92]:
# Sanity-check that every AQI reading lies on the 0-500 scale (both ends inclusive).
# The mask is computed once and reused by both branches.
aqi_in_range = data['AQI'].between(0, 500)

if aqi_in_range.all():
    print("The AQI column is within the valid range (0-500).")
else:
    print("The AQI column has values outside the valid range (0-500).")
    # Report 0-based row positions (usable with data.iloc) instead of index labels:
    # the frame is indexed by state name, which repeats across rows and therefore
    # cannot pinpoint an individual offending record.
    invalid_rows = np.flatnonzero(~aqi_in_range.to_numpy())
    print(f"Invalid rows: {', '.join(map(str, invalid_rows))}")

The AQI column is within the valid range (0-500).


In [93]:
# Show the labels that actually occur before validating them.
print(data['Category'].unique())

# The six category labels this check accepts.
category_values = ["Good", "Moderate", "Unhealthy for Sensitive Groups", "Unhealthy", "Very Unhealthy", "Hazardous"]

# Membership mask, computed once and reused by both branches.
category_is_valid = data['Category'].isin(category_values)

# The messages name the column that is actually being validated ('Category').
if category_is_valid.all():
    print("All values in the 'Category' column are valid.")
else:
    print("The 'Category' column contains invalid values.")
    # Report 0-based row positions (usable with data.iloc) instead of index labels:
    # the frame is indexed by state name, which repeats across rows and therefore
    # cannot pinpoint an individual offending record.
    invalid_rows = np.flatnonzero(~category_is_valid.to_numpy())
    print(f"Invalid rows: {', '.join(map(str, invalid_rows))}")

['Good' 'Moderate' 'Unhealthy for Sensitive Groups' 'Unhealthy'
 'Very Unhealthy' 'Hazardous']
All values in the 'Defining Parameter' column are valid.


In [94]:
# Show the pollutant labels that actually occur before validating them.
print(data['Defining Parameter'].unique())

# The pollutant labels this check accepts.
pollutant_values = ["Ozone", "PM2.5", "PM10", "CO", "NO2"]

# Membership mask, computed once and reused by both branches.
pollutant_is_valid = data['Defining Parameter'].isin(pollutant_values)

if pollutant_is_valid.all():
    print("All values in the 'Defining Parameter' column are valid.")
else:
    print("The 'Defining Parameter' column contains invalid values.")
    # Report 0-based row positions (usable with data.iloc) instead of index labels:
    # the frame is indexed by state name, which repeats across rows and therefore
    # cannot pinpoint an individual offending record.
    invalid_rows = np.flatnonzero(~pollutant_is_valid.to_numpy())
    print(f"Invalid rows: {', '.join(map(str, invalid_rows))}")

['PM2.5' 'PM10' 'CO' 'Ozone' 'NO2']
All values in the 'Defining Parameter' column are valid.


In [95]:
# Persist the cleaned frame. The row index holds the state name (the file was
# loaded with index_col=0), so the index must be written out; omitting it would
# drop the state name from the cleaned CSV.
data.to_csv("10_state_aqi_2021_cleaned.csv", index=True)