# Table and File Extractions

This script holds two functions: one for downloading files from the web and one for extracting tables from a webpage.

It is set up for the extractions that need to be made.

Information needs to be transferred to a Lambda function in AWS:
    a. There, the code needs to run the functions for each source and write the output to an S3 bucket, ready for archiving and transforming.
    b. Error handling and logging of successful and unsuccessful extractions is required, e.g. a website change preventing extractions from occurring (log what isn't downloading and for which days).


In [1]:
# load library Dependencies ---- Needed for implementation into AWS Lambda function

from io import StringIO
from urllib.error import HTTPError

import pandas as pd
import requests
from bs4 import BeautifulSoup # web scraping

# Source URLs to be used in the Lambda function.
# "List index" in the notes below is the position of the wanted table in the
# list returned by pd.read_html, i.e. the list_index argument passed to
# table_website_extraction.

########### Exchange Rates #########
current_rate_table = "https://www.rba.gov.au/statistics/frequency/exchange-rates.html"
    # list index is 0

historical_rates_csv = "https://www.rba.gov.au/statistics/tables/csv/f11.1-data.csv"

############ TGP ###############
caltex_pds = "https://www.caltex.com.au/-/media/pricing/caltex-terminal-gate-prices.ashx"
bp_pfd = "https://www.bp.com/content/dam/bp/country-sites/en_au/australia/home/products-services/pricing/terminal-gate-price.pdf"
bp_xls = "https://www.bp.com/content/dam/bp/country-sites/en_au/australia/home/products-services/pricing/tgp-excel.xls"

Mobile_table = "http://apps.exxonmobil.com.au/apps/htm/mn_mobil_products_automotive_pricing.asp"
    # Mobile list index is 1

Liberty_table = "https://www.libertyoil.com.au/terminal-gate-pricing"
    # pd.read_html cannot find a table on this page - handled by the
    # ValueError branch of table_website_extraction
    # no list index

Viva_table = "https://www.vivaenergy.com.au/products/terminal-gate-pricing/current-tgp/tgp-current"
    # Viva list index is 0

# The date for the extraction needs to be put into the Puma website URL
# (the ?date= query parameter, currently hard-coded below).
Puma_table = "https://www.pumaenergy.com.au/for-business/terminal-gate-price/?date=2020-05-02"
    # Forbidden (403) error on a plain request - handled by the HTTPError
    # branch of table_website_extraction, which retries with browser-like headers
    # Puma list index is 0-6 for various states/regions
    # Sydney is list index 1

# The client said wholesale price, but there is a separate page for TGP.
###### CLARIFY with client ########
United_table_wholesale = "https://www.unitedpetroleum.com.au/wholesale/list-pricing/"
    # United Wholesale list index is 0
United_table_tgp = "https://www.unitedpetroleum.com.au/wholesale/tgp-pricing/"
    # United TGP list index is 0

In [2]:
def file_website_extraction(website, filename, timeout=30):
    """Download the resource at *website* and save its bytes to *filename*.

    Args:
        website: URL of the file to download.
        filename: Destination path. The caller must supply the correct file
            extension for the content being downloaded.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot hang the extraction indefinitely.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
            Raised before the file is opened, so an error page is never
            written to disk as if it were the data file.
    """
    response = requests.get(website, timeout=timeout)
    response.raise_for_status()

    # The context manager closes the file even if the write fails.
    with open(filename, 'wb') as out_file:
        out_file.write(response.content)

In [3]:
# website table download
def table_website_extraction(website, list_index, timeout=30):
    """Extract one table from a web page as a DataFrame.

    The page is first read directly with ``pd.read_html``. Two fallbacks
    cover the sources that fail that way:

    * HTTP 403: the page is re-requested with browser-like headers and the
      returned HTML is parsed with ``pd.read_html``.
    * No ``<table>`` found (``ValueError``): the text of every ``div`` with
      class ``col-md-2`` or ``col-md-3`` is collected (selector set for the
      Liberty page) and returned as a single-column DataFrame. ``list_index``
      is not used on this path.

    Args:
        website: URL of the page to read.
        list_index: Position of the wanted table in the list returned by
            ``pd.read_html``.
        timeout: Seconds passed to the ``requests.get`` calls made by the
            fallbacks. It does not apply to the initial ``pd.read_html`` call.

    Returns:
        pandas.DataFrame holding the selected table, or the scraped cell
        texts when the ``div`` fallback is used.

    Raises:
        urllib.error.HTTPError: For any HTTP error other than 403.
        ValueError: If neither a table nor any non-empty matching ``div``
            is found on the page.
        IndexError: If ``list_index`` is out of range for the tables found.
    """
    try:
        tables = pd.read_html(website)  # list of every table on the page
    except HTTPError as err:
        if err.code != 403:
            # Only "Forbidden" has a workaround; do not hide other failures
            # by silently returning None.
            raise

        # Pretend to be a browser
        browser_headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
                           "X-Requested-With": "XMLHttpRequest"}
        response = requests.get(website, headers=browser_headers, timeout=timeout)
        # Wrap the markup in StringIO: passing literal HTML to read_html is
        # deprecated in recent pandas releases.
        tables = pd.read_html(StringIO(response.text))
    except ValueError as err:  # no tables found
        page = requests.get(website, timeout=timeout)
        soup = BeautifulSoup(page.content, 'lxml')

        # Classes that hold the data of the pseudo-table (set for Liberty)
        cells = soup.find_all('div', {'class': ['col-md-2', 'col-md-3']})

        # Non-empty cell texts, in document order
        stripped = (cell.text.strip() for cell in cells)
        texts = [text for text in stripped if text]
        if not texts:
            raise ValueError(
                "No tables or matching div elements found at " + str(website)
            ) from err

        # Store the data in a dataframe ready for transformation
        return pd.DataFrame(texts)

    # Pick the requested table out of the list
    return pd.DataFrame(tables[list_index])
        