In [104]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm


'''
Notes: Some of these graphs are very large and so I had difficulty importing them
into my final deliverable. Additionally, I wrote most of this in a Jupyter Notebook,
so I didn't have to explicitly call plt.show() to display my graphs. It also means
that sometimes you see output at the end of a cell, which can be disregaded except
as a sanity/type check

'''

#Begin by parsing data

rawData = ["EPADataSets\\daily_06_037_1103_2016.csv", "EPADataSets\\daily_06_037_1103_2017.csv", 
           "EPADataSets\\daily_06_037_1103_2018.csv", "EPADataSets\\daily_06_037_1103_2019.csv",
           "EPADataSets\\daily_06_037_1103_2020.csv"]

# keepList = ["Ammonium Ion PM2.5 LC", "Chromium PM2.5 LC", "Cesium PM2.5 LC", "Lead PM2.5 LC",
#             "Zinc PM2.5 LC", "Chlorine PM2.5 LC", "Magnesium PM2.5 LC", "Phosophorus PM2.5 LC",
#            "Nickel PM2.5 LC", "Strontium PM2.5 LC", "Cadmium PM2.5 LC", "Iron PM2.5 LC", "Sulfate PM2.5 LC",
#             "Bromine PM2.5 LC", "Antimony PM2.5 LC", "Selenium PM2.5 LC", "Sodium PM2.5 LC", 
#             "Rubidium PM2.5 LC", "Aluminum PM2.5 LC", "Sodium Ion Pm2.5" "Silicon PM2.5 LC", 
#             "Potassium PM2.5 LC", "Sulfur PM2.5 LC", "Chloride PM2.5 LC", "Barium PM2.5 LC",
#             "Calcium PM2.5 LC", "Titanium PM2.5 LC", "Zirconium PM2.5 LC", "Potassium PM2.5 LC", 
#             "Copper PM2.5 LC", "Tin PM2.5 LC", "Manganese PM2.5 LC", "Silver PM2.5 LC", "Indium PM2.5 LC", 
#             "Cerium PM2.5 LC"]

#https://pubs-acs-org.libproxy2.usc.edu/doi/full/10.1021/es400416g

# keepList = ["Wind Direction - Scalar", "Wind Direction - Resultant", "Chlorine PM2.5 LC", "Calcium PM2.5LC", "Magnesium PM2.5LC",
#            "Sodium PM2.5LC", "OC1 PM2.5 LC","OC2 PM2.5 LC","OC3 PM2.5 LC","EC1 PM2.5 LC",
#            "EC2 PM2.5 LC","EC3 PM2.5 LC",]
        
keepList = ["Ozone", "Nitrogen dioxide (NO2)", "Nitric oxide (NO)", "Carbon monoxide", "Sulfur "]    

allYears = []
for i in rawData:
    # Read the columns we want
    df = pd.read_csv(i, usecols=["Parameter Name", "Date (Local)", "Arithmetic Mean"])

    # Change date to datetime object
    #df["Date (Local)"] = pd.to_datetime(df["Date (Local)"])

    #Only keep the rows that contain data of relevant elements
    df = df[df["Parameter Name"].isin(keepList)]    
    
    #Rename wind directions
    df.loc[df["Parameter Name"] == "Wind Direction - Scalar", "Parameter Name"] = "Wind Direction - Resultant"
    
    allYears.append(df)

print(allYears[0].head())

       Parameter Name Date (Local)  Arithmetic Mean
0        OC3 PM2.5 LC   2016-01-01            2.261
22  Chlorine PM2.5 LC   2016-01-01            0.008
49       OC1 PM2.5 LC   2016-01-01            0.664
79  Chlorine PM2.5 LC   2016-01-01            0.027
87  Chlorine PM2.5 LC   2016-01-01            0.027


In [105]:
from statistics import mean 

#reformat the data the way we want it
cleandf = pd.DataFrame(columns=["Parameter", "Date", "Average"])

#average out repeats per day per parameter name
for year in allYears:
    year = year.groupby("Date (Local)")
    
    for key, grp in year:
        temp = {}
        #get the counts for each parameter per day
        for i, row in grp.iterrows():
            if(row[0] in temp):
                 temp[row[0]].append(row[2])
            else:
                 temp[row[0]] = [row[2]]
                
        #average out the counts for each perameter
        for parameter in temp:
            temp[parameter] = mean(temp[parameter])
        
        temp2 = {"Date": [], "Parameter":[], "Average":[]} 
        for parameter in temp:
            temp2["Date"].append(key)
            temp2["Parameter"].append(parameter)
            temp2["Average"].append(temp[parameter])
            
        tempdf = pd.DataFrame(temp2)
        cleandf = cleandf.append(tempdf)
    
cleandf = cleandf.pivot(index="Date", columns="Parameter", values="Average")


print(cleandf)

Parameter   Chlorine PM2.5 LC  EC1 PM2.5 LC  EC2 PM2.5 LC  EC3 PM2.5 LC  \
Date                                                                      
2016-01-01           0.020667         1.296         0.036         0.000   
2016-01-02                NaN           NaN           NaN           NaN   
2016-01-03                NaN           NaN           NaN           NaN   
2016-01-04           0.013000         0.694         0.078         0.018   
2016-01-05                NaN           NaN           NaN           NaN   
...                       ...           ...           ...           ...   
2020-09-26                NaN           NaN           NaN           NaN   
2020-09-27                NaN           NaN           NaN           NaN   
2020-09-28                NaN           NaN           NaN           NaN   
2020-09-29                NaN           NaN           NaN           NaN   
2020-09-30                NaN           NaN           NaN           NaN   

Parameter   OC1 PM2.5 LC

In [106]:
#clean up the data even more
cleandf = cleandf.dropna()
cleandf["FromSea"] = cleandf['Wind Direction - Resultant'].apply(lambda x: 0 if x < 180 else 1)

In [107]:
#Let's explore the data a bit
print(len(cleandf), "\n")
print(cleandf.shape, "\n")

520 

(520, 9) 



In [22]:
#To get a sense of the information:
for year in allYears:
    print(year.info(), "\n")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1321 entries, 0 to 36231
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Parameter Name   1321 non-null   object 
 1   Date (Local)     1321 non-null   object 
 2   Arithmetic Mean  1321 non-null   float64
dtypes: float64(1), object(2)
memory usage: 41.3+ KB
None 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1703 entries, 3 to 38668
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Parameter Name   1703 non-null   object 
 1   Date (Local)     1703 non-null   object 
 2   Arithmetic Mean  1703 non-null   float64
dtypes: float64(1), object(2)
memory usage: 53.2+ KB
None 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1563 entries, 0 to 31937
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Pa

In [23]:
for year in allYears:
    print(year.isnull().sum())

Parameter Name     0
Date (Local)       0
Arithmetic Mean    0
dtype: int64
Parameter Name     0
Date (Local)       0
Arithmetic Mean    0
dtype: int64
Parameter Name     0
Date (Local)       0
Arithmetic Mean    0
dtype: int64
Parameter Name     0
Date (Local)       0
Arithmetic Mean    0
dtype: int64
Parameter Name     0
Date (Local)       0
Arithmetic Mean    0
dtype: int64


In [17]:
direction = allYears[0][(allYears[0]["Parameter Name"] == "Wind Direction - Scalar") | 
                       (allYears[0]["Parameter Name"] == "Wind Direction - Resultant")]
variables = allYears[0][(allYears[0]["Parameter Name"] != "Wind Direction - Scalar")
                       & (allYears[0]["Parameter Name"] != "Wind Direction - Resultant")]

In [None]:
fig, ax = plt.subplots(7, 1, figsize = (5, 30))

graph = 0
byParameter = variables.groupby("Parameter Name")

for key, grp in byParameter:
    for i in range(len(grp["Date (Local)"])):        
        ax[graph].set_title(key)
        for j in range(len(direction["Date (Local)"])):
            if direction["Date (Local)"].iloc[j] == grp["Date (Local)"].iloc[i]:
                ax[graph].scatter(grp["Arithmetic Mean"].iloc[i], direction["Arithmetic Mean"].iloc[j], 
                                  s = 5, c="BLUE")#c=colors[graph], s = 5)
                break
    graph += 1
                
        
fig.tight_layout()

In [None]:
#EC3 stands out as mostly being 0s, so lets drop it because it probably isn't really needed anyway


In [18]:
#Define "blowing from the sea" as degrees >=180,  "blowing from the land" < 180
direction['FromSea'] = direction['Arithmetic Mean'].apply(lambda x: 0 if x < 180 else 1)
print(direction.head())

              Parameter Name Date (Local)  Arithmetic Mean  FromSea
147  Wind Direction - Scalar   2016-01-01       105.791667        0
363  Wind Direction - Scalar   2016-01-02        84.125000        0
398  Wind Direction - Scalar   2016-01-03       104.291667        0
509  Wind Direction - Scalar   2016-01-04       105.083333        0
536  Wind Direction - Scalar   2016-01-05       192.791667        1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  direction['FromSea'] = direction['Arithmetic Mean'].apply(lambda x: 0 if x < 180 else 1)


In [None]:
new_df = pd.DataFrame()

byParameter = variables.groupby("Parameter Name")

for key, grp in byParameter:
    for i in range(len(grp["Date (Local)"])):        
        ax[graph].set_title(key)
        for j in range(len(direction["Date (Local)"])):
            if direction["Date (Local)"].iloc[j] == grp["Date (Local)"].iloc[i]:
                
                
                
                
                break
    graph += 1
                