This script scrapes archived snapshots of https://gasprices.aaa.com/state-gas-price-averages/ from the Wayback Machine (https://web.archive.org/) to collect the average gas prices for each U.S. state on every day from the start of the war in Ukraine (February 24th, 2022) through June 11th, 2022. The data was then imported into Tableau to visualize the trends in gas prices: https://public.tableau.com/app/profile/ja.rey.corcuera/viz/GasPrices_16547935894400/Dashboard

In [None]:
from urllib.request import Request, urlopen
import requests
import ssl
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

In [9]:
def parse_price(cell_text):
    """Convert the text of a price cell (e.g. "$3.532") into a float.

    Surrounding whitespace and a leading dollar sign are removed before the
    conversion. float() raises ValueError if what remains is not a number.
    """
    return float(cell_text.strip().lstrip("$"))


def snapshot_url(day):
    """Return the web.archive.org URL requesting the AAA page at 12:00:00 on *day*.

    day: any object with a datetime-style ``strftime`` method
         (a pandas Timestamp, datetime.date, ...).
    """
    return ("https://web.archive.org/web/" + day.strftime("%Y%m%d")
            + "120000/https://gasprices.aaa.com/state-gas-price-averages/")


# Set to true to begin webscraping
run = False

if run:

    # Obtain list of states from a csv file
    states_df = pd.read_csv("states.csv")
    states = states_df["State"].tolist()

    # Starting day (February 24th: The first day of Russia's invasion on Ukraine)
    start_date = "2022-02-24"
    total_days = 108  # Number of days from Feb. 24 to June 11th

    # SECURITY NOTE: this disables HTTPS certificate verification for every
    # urlopen call in the process. It is kept because the original script
    # relied on it, but it only needs to be set once, not once per page.
    ssl._create_default_https_context = ssl._create_unverified_context

    # Rows are accumulated in plain lists and turned into DataFrames once at
    # the end: DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every call.
    regular_rows = []  # wide table: one row per day, one column per state
    gas_rows = []      # long table: one row per (day, state)

    # pd.date_range handles month/year boundaries, so total_days can be
    # raised without maintaining a hand-written calendar.
    for current_day in pd.date_range(start=start_date, periods=total_days):

        date_label = current_day.strftime("%m/%d/%Y")

        # Construct URL that will retrieve the average gas prices for each state at a particular day
        url = snapshot_url(current_day)
        print("Opening page URL:", url)

        # Obtain html from URL using BeautifulSoup; the response is closed
        # as soon as its body has been read.
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urlopen(req) as response:
            html = response.read()
        soup = BeautifulSoup(html, 'html.parser')

        # Scrape the average regular, midgrade, premium, and diesel
        # gas price cells; cells are matched to states by position.
        regular_cells = soup.find_all('td', class_="regular")
        midgrade_cells = soup.find_all('td', class_="mid_grade")
        premium_cells = soup.find_all('td', class_="premium")
        diesel_cells = soup.find_all('td', class_="diesel")

        # Fail with a readable message instead of an IndexError deep in the
        # loops when the page layout does not line up with states.csv.
        expected = len(states)
        fewest_other = min(len(midgrade_cells), len(premium_cells), len(diesel_cells))
        if len(regular_cells) != expected or fewest_other < expected:
            raise ValueError(
                "Expected {} price cells per grade but found regular={}, "
                "mid_grade={}, premium={}, diesel={} at {}".format(
                    expected, len(regular_cells), len(midgrade_cells),
                    len(premium_cells), len(diesel_cells), url))

        # Wide row: Date followed by the regular price of every state.
        wide_row = {"Date": date_label}
        for state, cell in zip(states, regular_cells):
            wide_row[state] = parse_price(cell.get_text())
        regular_rows.append(wide_row)

        # Long rows: a fresh dict per state so the per-state columns of the
        # wide row do not leak into gas.csv (the original reused one dict).
        for state, regular, midgrade, premium, diesel in zip(
                states, regular_cells, midgrade_cells, premium_cells, diesel_cells):
            gas_rows.append({
                "Date": date_label,
                "State": state,
                "Regular Gas Price": parse_price(regular.get_text()),
                "Midgrade Gas Price": parse_price(midgrade.get_text()),
                "Premium Gas Price": parse_price(premium.get_text()),
                "Diesel Gas Price": parse_price(diesel.get_text()),
            })

        # Random delay between accessing webpages
        time.sleep(random.uniform(1, 2))

    # Build the dataframes once and write them to csv files
    regular_df = pd.DataFrame(regular_rows)
    gas_df = pd.DataFrame(gas_rows)
    regular_df.to_csv('regular_gas.csv', index=False)
    gas_df.to_csv('gas.csv', index=False)

    print("DONE")
