In [15]:
import pandas as pd
from datetime import datetime

# Data Collection
### Climate data of Virginia Beach extracted from https://en.tutiempo.net/climate

In [2]:
# Scrape the monthly climate tables for Virginia Beach (station ws-723075)
# from tutiempo.net and attach a full calendar date to every daily row.
monthly_frames = []
for year in range(2014, 2021):
    for month in range(1, 13):
        # '{:02d}' zero-pads single-digit months, so one URL template covers 01..12.
        url = 'https://en.tutiempo.net/climate/{:02d}-{}/ws-723075.html'.format(month, year)
        # Take the third table that read_html finds on the page and drop its
        # last two rows (presumably monthly summary rows -- TODO confirm).
        # .copy() detaches the slice so the column assignment below writes to
        # an independent frame and cannot trigger SettingWithCopyWarning.
        df = pd.read_html(url)[2][:-2].copy()
        # Build the date from the 'Day' column plus the month/year being fetched.
        df['date'] = pd.to_datetime(
            df['Day'].astype(str) + '/{}/{}'.format(month, year),
            format='%d/%m/%Y',
        )
        monthly_frames.append(df)

# Concatenate once at the end; growing `data` with pd.concat inside the loop
# re-copies every previously collected row on each iteration.
data = pd.concat(monthly_frames)

In [3]:
# Preview the first rows of the scraped climate table
data.head()

Unnamed: 0,Day,T,TM,Tm,SLP,H,PP,VV,V,VM,VG,RA,SN,TS,FG,date
0,1,4.4,11.7,-1.7,1027.7,68.0,0.0,16.1,7.2,16.5,-,,,,,2014-01-01
1,2,7.3,11.7,-1.7,1014.8,71.0,0.0,15.8,3.1,7.6,-,o,,,,2014-01-02
2,3,1.2,8.9,-4.4,1016.4,69.0,13.97,12.6,32.4,51.9,68.3,o,o,,,2014-01-03
3,4,,,,,,,,,,,,,,,2014-01-04
4,5,,,,,,,,,,,,,,,2014-01-05


### AQI data downloaded from https://www.epa.gov/outdoor-air-quality-data/air-quality-index-daily-values-report

In [4]:
# Read the yearly AQI csv files (2014-2020) and stack them into one frame.
# Each file is read exactly once and all frames are concatenated in a single
# call, instead of re-reading files and growing `aqi` inside the loop.
aqi = pd.concat([
    pd.read_csv('Data/AQI Data/virginia beach/aqidaily{}.csv'.format(year))
    for year in range(2014, 2021)
])

In [5]:
# Dropping irrelevant features. Rebinding `aqi` (rather than inplace=True)
# together with errors='ignore' lets this cell be re-run without raising
# KeyError once the columns are already gone.
aqi = aqi.drop(columns=['Site Name', 'Site ID', 'Source'], errors='ignore')
aqi.head()

Unnamed: 0,Date,PM2.5 AQI Value,AQI Category
0,01/01/2014,64,Moderate
1,01/02/2014,51,Moderate
2,01/03/2014,25,Good
3,01/04/2014,31,Good
4,01/05/2014,22,Good


In [6]:
# Change the 'Date' feature from str to datetime. The raw values are written
# month/day/year (e.g. 01/02/2014 is the second row of January), so the format
# is stated explicitly rather than left to pandas' inference.
aqi['Date'] = pd.to_datetime(aqi['Date'], format='%m/%d/%Y')

In [12]:
# Join the AQI readings to the climate rows on matching calendar dates
# (aqi['Date'] against data['date'])
all_data = pd.merge(aqi, data, left_on='Date', right_on='date')
all_data.head()

Unnamed: 0,Date,PM2.5 AQI Value,AQI Category,Day,T,TM,Tm,SLP,H,PP,VV,V,VM,VG,RA,SN,TS,FG,date
0,2014-01-01,64,Moderate,1,4.4,11.7,-1.7,1027.7,68.0,0.0,16.1,7.2,16.5,-,,,,,2014-01-01
1,2014-01-02,51,Moderate,2,7.3,11.7,-1.7,1014.8,71.0,0.0,15.8,3.1,7.6,-,o,,,,2014-01-02
2,2014-01-03,25,Good,3,1.2,8.9,-4.4,1016.4,69.0,13.97,12.6,32.4,51.9,68.3,o,o,,,2014-01-03
3,2014-01-04,31,Good,4,,,,,,,,,,,,,,,2014-01-04
4,2014-01-05,22,Good,5,,,,,,,,,,,,,,,2014-01-05


In [13]:
# Remove the two date columns used for the join and the 'Day' column.
# Rebinding with errors='ignore' keeps this cell safe to re-run after the
# columns have already been dropped.
all_data = all_data.drop(columns=['Date', 'date', 'Day'], errors='ignore')

In [16]:
# Write the merged climate + AQI table to a csv file, leaving out the row index
output_path = 'extracted_data.csv'
all_data.to_csv(output_path, index=False)