In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Read the four raw inputs in one pass; the file order below matches the
# order of the names on the left-hand side.
aqi_data, ozone_data, pm_data, sf_data = [
    pd.read_csv(csv_name)
    for csv_name in (
        'aqi_data.csv',
        'ozone_data.csv',
        'pm2.5_data.csv',
        'sanfrancisco.csv',
    )
]

In [2]:
# Preview the AQI table read from 'aqi_data.csv' (bare last expression, so the
# notebook renders the frame itself).
aqi_data

Unnamed: 0.1,Unnamed: 0,AQI,Date
0,26299,87,2007-01-01
1,26300,70,2007-01-02
2,26301,52,2007-01-03
3,26302,27,2007-01-04
4,26303,25,2007-01-05
...,...,...,...
15334,26521,36,2021-12-27
15335,26522,30,2021-12-28
15336,26523,25,2021-12-29
15337,26524,24,2021-12-30


In [3]:
# Preview the ozone table read from 'ozone_data.csv' (bare last expression, so
# the notebook renders the frame itself).
ozone_data

Unnamed: 0.1,Unnamed: 0,Daily Max 8-hour Ozone Concentration,Date,DAILY_AQI_VALUE
0,0,0.010,01/01/2009,9
1,1,0.025,01/02/2009,23
2,2,0.024,01/03/2009,22
3,3,0.020,01/04/2009,19
4,4,0.010,01/05/2009,9
...,...,...,...,...
8277,355,0.030,12/26/2004,28
8278,356,0.024,12/27/2004,22
8279,357,0.006,12/28/2004,6
8280,358,0.039,12/29/2004,36


In [4]:
# Preview the PM2.5 table read from 'pm2.5_data.csv' (bare last expression, so
# the notebook renders the frame itself).
pm_data

Unnamed: 0.1,Unnamed: 0,Daily Mean PM2.5 Concentration,Date,DAILY_AQI_VALUE
0,0,22.1,01/01/2018,72
1,1,26.7,01/02/2018,82
2,2,39.5,01/03/2018,111
3,3,15.6,01/04/2018,58
4,4,5.2,01/05/2018,22
...,...,...,...,...
7021,190,12.3,12/19/2001,51
7022,191,4.2,12/20/2001,18
7023,192,10.4,12/22/2001,43
7024,193,9.7,12/23/2001,40


In [5]:
# Preview the San Francisco weather table read from 'sanfrancisco.csv' (bare
# last expression, so the notebook renders the frame itself).
sf_data

Unnamed: 0.1,Unnamed: 0,Date,Max.TemperatureF,Mean.TemperatureF,Min.TemperatureF,Max.Dew.PointF,MeanDew.PointF,Min.DewpointF,Max.Humidity,Mean.Humidity,...,Min.VisibilityMiles,Max.Wind.SpeedMPH,Mean.Wind.SpeedMPH,Max.Gust.SpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees.br...,city,season
0,1,1948-01-01,55,50.0,46.0,53.0,48.0,42.0,96.0,87.0,...,2.0,16.0,7.0,,0.00,0.0,Fog-Rain,124<br />,San Francisco (CA),Winter
1,2,1948-01-02,57,55.0,53.0,56.0,53.0,51.0,93.0,91.0,...,3.0,22.0,11.0,,0.00,0.0,Rain,202<br />,San Francisco (CA),Winter
2,3,1948-01-03,57,54.0,50.0,54.0,51.0,50.0,100.0,94.0,...,0.0,8.0,2.0,,0.00,0.0,Fog,79<br />,San Francisco (CA),Winter
3,4,1948-01-04,59,57.0,55.0,55.0,54.0,54.0,97.0,91.0,...,3.0,18.0,7.0,,0.00,0.0,Fog-Rain,191<br />,San Francisco (CA),Winter
4,5,1948-01-05,59,55.0,51.0,55.0,52.0,50.0,100.0,91.0,...,0.0,12.0,4.0,,0.00,0.0,Fog,289<br />,San Francisco (CA),Winter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24738,24806,2015-12-27,48,41.0,34.0,37.0,34.0,29.0,82.0,71.0,...,9.0,15.0,4.0,17.0,0.01,5.0,Rain,156<br />,San Francisco (CA),Winter
24739,24807,2015-12-28,48,45.0,41.0,40.0,37.0,34.0,89.0,77.0,...,4.0,13.0,5.0,15.0,0.11,4.0,Rain,52<br />,San Francisco (CA),Winter
24740,24808,2015-12-29,54,45.0,36.0,41.0,36.0,28.0,92.0,67.0,...,10.0,12.0,3.0,13.0,0.00,1.0,,250<br />,San Francisco (CA),Winter
24741,24809,2015-12-30,49,45.0,40.0,44.0,40.0,37.0,93.0,80.0,...,2.0,16.0,7.0,18.0,0.01,6.0,Rain,19<br />,San Francisco (CA),Winter


In [6]:
# Dates stored as YYYY-MM-DD: the year is the first four characters.
aqi_data['Year'] = aqi_data['Date'].str[:4].astype(int)
sf_data['Year'] = sf_data['Date'].str[:4].astype(int)

# Dates stored as MM/DD/YYYY: the year sits at character positions 6 through 9.
ozone_data['Year'] = ozone_data['Date'].str[6:10].astype(int)
pm_data['Year'] = pm_data['Date'].str[6:10].astype(int)

In [7]:
# Distinct years present in the AQI table, ascending.
# De-duplicate on the Series first and sort only the handful of unique values:
# passing a plain Python list to pd.unique() is deprecated in recent pandas,
# and sorting every row before de-duplicating is wasted work.
np.sort(aqi_data['Year'].unique())

array([1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990,
       1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
       2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021])

In [8]:
# Distinct years present in the ozone table, ascending.
# De-duplicate on the Series first and sort only the handful of unique values:
# passing a plain Python list to pd.unique() is deprecated in recent pandas,
# and sorting every row before de-duplicating is wasted work.
np.sort(ozone_data['Year'].unique())

array([1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
       2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020,
       2021])

In [9]:
# Distinct years present in the PM2.5 table, ascending.
# De-duplicate on the Series first and sort only the handful of unique values:
# passing a plain Python list to pd.unique() is deprecated in recent pandas,
# and sorting every row before de-duplicating is wasted work.
np.sort(pm_data['Year'].unique())

array([1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
       2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020,
       2021])

In [10]:
# Distinct years present in the weather table, ascending.
# De-duplicate on the Series first and sort only the handful of unique values:
# passing a plain Python list to pd.unique() is deprecated in recent pandas,
# and sorting every row before de-duplicating is wasted work.
np.sort(sf_data['Year'].unique())

array([1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958,
       1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969,
       1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980,
       1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991,
       1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
       2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
       2014, 2015])

In [None]:
# Filter: keep 1999-2021 for the three air-quality tables; the weather table
# is capped at 2015 because that is the last year it contains.
aqi_data = aqi_data[aqi_data['Year'].between(1999, 2021)]
ozone_data = ozone_data[ozone_data['Year'].between(1999, 2021)]
pm_data = pm_data[pm_data['Year'].between(1999, 2021)]
sf_data = sf_data[sf_data['Year'].between(1999, 2015)]


def _daily_frame(frame, date_format, renames=None):
    """Return a copy of ``frame`` that can be joined on a calendar day.

    Parameters
    ----------
    frame : pd.DataFrame
        One of the source tables; must contain a string ``Date`` column.
    date_format : str
        ``strptime`` pattern of ``frame['Date']``.
    renames : dict, optional
        Column renames applied before the join so that identically named
        columns from different sources do not collide.

    Returns
    -------
    pd.DataFrame
        New frame (the input is not modified) with ``Date`` parsed to
        datetime, the saved-index ``Unnamed: *`` columns and the helper
        ``Year`` column removed, and at most one row per date (first kept).

    Raises
    ------
    ValueError
        If a ``Date`` value does not match ``date_format``.
    """
    leftovers = [
        col for col in frame.columns
        if str(col).startswith('Unnamed') or col == 'Year'
    ]
    return (
        frame
        .drop(columns=leftovers)
        .rename(columns=renames or {})
        .assign(Date=lambda df: pd.to_datetime(df['Date'], format=date_format))
        .drop_duplicates(subset='Date')
    )


# Merge: join on the actual day, not on 'Year'. Joining daily tables on
# 'Year' pairs every day of a year with every day of the same year in the
# other tables (a per-year Cartesian product), which explodes to billions of
# rows. The sources write dates differently (YYYY-MM-DD vs MM/DD/YYYY), so
# each is parsed with its own format first.
merged_df = (
    _daily_frame(aqi_data, '%Y-%m-%d')
    .merge(
        _daily_frame(ozone_data, '%m/%d/%Y', {'DAILY_AQI_VALUE': 'Ozone_AQI'}),
        on='Date', how='outer',
    )
    .merge(
        _daily_frame(pm_data, '%m/%d/%Y', {'DAILY_AQI_VALUE': 'PM2.5_AQI'}),
        on='Date', how='outer',
    )
    .merge(_daily_frame(sf_data, '%Y-%m-%d'), on='Date', how='outer')
    .sort_values('Date')          # chronological order so ffill is meaningful
    .reset_index(drop=True)
)
merged_df['Year'] = merged_df['Date'].dt.year

# Handle missing values: forward fill along the time axis.
# NOTE(review): the weather table stops in 2015 while the outer join keeps
# dates through 2021, so weather columns after 2015 are copies of the last
# available day. Confirm this is intended or restrict the date range.
merged_df = merged_df.ffill()

In [None]:
# Display the merged frame built in the previous cell.
merged_df