<a href="https://colab.research.google.com/github/hainesdata/gas/blob/main/lr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%shell
# Ubuntu no longer distributes chromium-browser outside of snap
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Add debian buster
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

# Install chromium and chromium-driver
apt-get update
apt-get install chromium chromium-driver

# Install selenium
pip install selenium

Executing: /tmp/apt-key-gpghome.fpvS52KXPd/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
gpg: key DCC9EFBF77E11517: "Debian Stable Release Key (10/buster) <debian-release@lists.debian.org>" not changed
gpg: Total number processed: 1
gpg:              unchanged: 1
Executing: /tmp/apt-key-gpghome.PaxIFJ28vu/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
gpg: key DC30D7C23CBBABEE: "Debian Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" not changed
gpg: Total number processed: 1
gpg:              unchanged: 1
Executing: /tmp/apt-key-gpghome.J8EwBRHfle/gpg.1.sh --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A
gpg: key 4DFAB270CAA96DFA: "Debian Security Archive Automatic Signing Key (10/buster) <ftpmaster@debian.org>" not changed
gpg: Total number processed: 1
gpg:              unchanged: 1
gpg: cannot open '/dev/tty': No such device or address
gpg: [stdout]: write error: Broken pipe
gpg: filter_flush failed on c



In [None]:
# Imports
!pip install http-request-randomizer # proxy library

import pandas as pd
import plotly.express as px
import numpy as np
import datetime
import requests
import regex as re
import random
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from statistics import stdev, mean, median, variance
from math import sqrt
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from http_request_randomizer.requests.proxy.requestProxy import RequestProxy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Init (Code 1)
raw = pd.read_csv('gas_buddy_2022-04-18.csv')
raw.info()
raw.nunique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53737 entries, 0 to 53736
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   services_included  53737 non-null  object 
 1   price_time_stamp   53737 non-null  object 
 2   currency           53737 non-null  object 
 3   postal_code        53699 non-null  object 
 4   loc_name           53737 non-null  object 
 5   city               53737 non-null  object 
 6   review_count       52907 non-null  float64
 7   state              53737 non-null  object 
 8   zip_code_searched  53737 non-null  int64  
 9   latitude           53737 non-null  float64
 10  product_name       53737 non-null  object 
 11  payment_type       53737 non-null  object 
 12  DATE_SCRAPED       53737 non-null  object 
 13  RUN_START_DATE     53737 non-null  object 
 14  source_url         53737 non-null  object 
 15  phone              50483 non-null  object 
 16  loc_number         537

services_included     2932
price_time_stamp     11960
currency                 1
postal_code           8508
loc_name              1292
city                  1010
review_count           469
state                    2
zip_code_searched      547
latitude              9459
product_name             6
payment_type             2
DATE_SCRAPED          1472
RUN_START_DATE           1
source_url            9471
phone                 8605
loc_number            9471
price_current          283
country                  1
longitude             9460
address_1             9446
address_2             5826
overall_rating          40
dtype: int64

In [None]:
# Exploratory (Code 2 - 18)

# CODE 2----------------------------------------------------------------
# Create working copy of raw data to preserve raw data
sandbox = raw.copy()

# Instantiate LabelEncoder and StandardScaler objects from scikit-learn
le = LabelEncoder()
s = StandardScaler()

# Encode
sandbox['services_included'] = le.fit_transform(sandbox['services_included'])
si_1d = np.array(sandbox['services_included']).reshape(-1, 1)

# Standardize
s.fit(si_1d)
sandbox['services_included'] = s.transform(si_1d)

# Return distribution histogram
px.histogram(x=sandbox['services_included'], height=300, width=500).show()

# CODE 3----------------------------------------------------------------
sandbox = sandbox[sandbox['price_current'] != 0]
px.box(sandbox, x='payment_type', y='price_current', width=400, height=500).show()

# CODE 4----------------------------------------------------------------
px.histogram(x=sandbox['overall_rating'], height=300, width=500).show()

# CODE 5----------------------------------------------------------------
px.histogram(x=sandbox['review_count'], height=300, width=500).show()


# CODE 6----------------------------------------------------------------
le.fit(sandbox[['loc_number']])
sandbox['loc_number'] = le.transform(sandbox[['loc_number']])
px.histogram(x=sandbox['loc_number'], height=300, width=500).show()


# CODE 7----------------------------------------------------------------
le.fit(sandbox[['loc_name']])
sandbox['loc_name'] = le.transform(sandbox[['loc_name']])
px.histogram(x=sandbox['loc_name'], height=300, width=500).show()

# CODE 8----------------------------------------------------------------
print(len(sandbox['city'].unique()))

le.fit(sandbox[['city']])
sandbox['city'] = le.transform(sandbox[['city']])
px.histogram(x=sandbox['city'], height=300, width=500).show()

# CODE 9-18--------------------------------------------------------------
px.box(sandbox, x='product_name', y='price_current', width=600).show()

px.histogram(sandbox, x='price_current', color='product_name', barmode='stack', nbins=64, width=1000).show()

px.scatter(sandbox, x='review_count', y='price_current', trendline='ols', width=900).show()

px.scatter(sandbox, x='overall_rating', y='price_current', trendline='ols', width=900).show()

px.scatter(sandbox, x='overall_rating', y='review_count', trendline='ols', width=900).show()

px.scatter(sandbox, x='latitude', y='price_current', trendline='ols', width=900).show()

px.scatter(sandbox, x='longitude', y='price_current', trendline='ols', width=900).show()

fig = px.scatter_mapbox(sandbox, lat="latitude", lon="longitude", 
                        hover_name='loc_name',
                        hover_data=["latitude","longitude"],
                        zoom=4, height=500, width=400
                        )
fig.update_layout(mapbox_style="open-street-map")
fig.show()

fig = px.scatter_mapbox(sandbox[sandbox['price_current'] < 5], lat="latitude", lon="longitude", 
                        hover_name='loc_name',
                        hover_data=["latitude","longitude"],
                        zoom=4, height=500, width=400,
                        color='product_name'
                        )
fig.update_layout(mapbox_style="open-street-map")
fig.show()

px.scatter(sandbox, x='loc_name', y='price_current', width=900)

In [2]:
# Feature Selection (Code 19)
# Drop features
sandbox = raw.copy()

# sandbox['price_weekday'] = pd.to_datetime(sandbox['price_time_stamp'], format='%Y-%m-%d %H:%M:%S').apply(lambda i: str(i.weekday()))
# sandbox['zip2'] = sandbox['postal_code'].apply(lambda i: str(str(i)[1]))

sandbox.drop(columns=['source_url', 'DATE_SCRAPED', 'RUN_START_DATE', 
                      'zip_code_searched', 'country', 'currency', 'state',
                      'address_1', 'address_2', 'services_included', 
                      'price_time_stamp'
                      ],
             inplace=True)
sandbox = sandbox[sandbox['price_current'] != 0]
# sandbox = sandbox.dropna(how='any', axis=0)

print(sandbox.nunique())
print(sandbox.info())

# Encode and standardize (includes processing done for histograms)
s = StandardScaler()

def label_encode(df):
    le = LabelEncoder()
    for col in df.columns:
        if type(df[col][0]) is str:
            le = le.fit(df[[col]].values.ravel())
            df[col] = le.transform(df[[col]].values.ravel())

def one_hot(df, cols):
    ohe = OneHotEncoder()
    for col in cols:
        if type(df[col][0]) is str:
            ohe = ohe.fit(df[[col]])
            enc_arr = ohe.transform(df[[col]]).toarray()
            onehot_df = pd.DataFrame(enc_arr, columns=ohe.get_feature_names_out([col]))
            if len(onehot_df.columns) > 5000:
                raise MemoryError(f'There are > 5000 columns in this encoded feature ({col}). Concatenating on input dataframe is expensive and may crash. Please reduce the number of columns for this feature or reduce the number of possible values for this feature.')
            df = df.drop(columns=[col])
            df = pd.concat([df, onehot_df], axis=1)
    return df

# s = s.fit(sandbox[['services_included']])
# sandbox['services_included'] = s.transform(sandbox[['services_included']])
y_name = 'price_current'
x_name = [name for name in sandbox.columns if name != y_name]

# sandbox = one_hot(sandbox, x_name)
label_encode(sandbox)

# Drop NAs and display metadata
sandbox = sandbox.dropna(how='any', axis=0)
sandbox.info()

# Model Training (Code 20-25)

# CODE 20----------------------------------------------------------------
def train_model(df):
    y_name = 'price_current'
    x_name = [name for name in df.columns if name != y_name]
    X = df[x_name]
    y = df[y_name]

    X_t, X_v, y_t, y_v = train_test_split(X, y, test_size=0.2, random_state=42)

    lr = LinearRegression()
    lr.fit(X_t, y_t)

    weights = lr.coef_
    print('WEIGHTS---------------------')
    for i, w in zip(x_name, weights):
        print(f'{i.ljust(20)} {w}')
    print('')

    y_hat = lr.predict(X_v)
    err = y_v-y_hat
    sigma_y = stdev(y_v)
    sigma_e = stdev(err)
    mse = mean_squared_error(y_v, y_hat)
    lmbda = 1
    me = sqrt(mse)
    pe = me/(2*lmbda*sigma_y)
    performance = me/(2*lmbda*sigma_e)

    print('PERFORMANCE----------------')
    print(f'{"Mean Square Error".ljust(20)} {mse}')
    print(f'{"Mean Error".ljust(20)} {me}')
    print(f'{"Mean Percent Error".ljust(20)} {pe}')
    print(f'{"Error Variance".ljust(20)} {variance(err)}')
    print(f'{"Adj MSE Performance".ljust(20)} {performance}')

    # CODE 21----------------------------------------------------------------
    px.histogram(x=err, width=900).show()

    # CODE 22----------------------------------------------------------------
    x_p = []
    y_p = []

    for i in range(1, 5):
        x_p.append(i)
        y_p.append(me/(2*i*sigma_y))

    x_e = []
    y_e = []

    for i in range(1, 5):
        x_e.append(i)
        y_e.append(me/(2*i*sigma_e))

    fig = px.line(x=x_p, y=[y_p, y_e], labels={'x':'Lambda', 'value': 'Value'}, width=900)
    newnames = {'wide_variable_0':'MPE', 'wide_variable_1': 'Adjusted MSE'}
    fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
                                          legendgroup = newnames[t.name],
                                          hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
                                        )
                      )

    return fig, lr

fig, model = train_model(sandbox)
fig.show()
# # CODE 23----------------------------------------------------------------
# sandbox = sandbox.drop(columns=['loc_name', 'city'])
# train_model(sandbox)

# # CODE 24----------------------------------------------------------------
# sandbox = sandbox.drop(columns=['review_count'])
# train_model(sandbox)

# # CODE 25----------------------------------------------------------------
# sandbox = sandbox.drop(columns=['services_included'])
# train_model(sandbox)

# CODE 26----------------------------------------------------------------


NameError: ignored

In [36]:
def load_page(url, use='requests'):
    if use == 'selenium':
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        driver = webdriver.Chrome('/usr/bin/chromedriver', options=options)
        driver.get(url)
        html = driver.page_source
        driver.quit()
        return html
    elif use == 'requests':
        hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
        resp = requests.get(url, headers=hdr)
        return resp
    else:
        raise ValueError(f'Invalid parameter passed to load_page(). Expected one of ("selenium", "requests") but got {use} instead')


def get_ids(zipcode):
    url = f"https://www.gasbuddy.com/home?search={zipcode}&fuel=1&method=all&maxAge=24"
    resp = load_page(url)
    soup = BeautifulSoup(resp.text, "html.parser")
    print(resp)
    u = []
    ids = soup.select('div[class*="GenericStationListItem-module__station___"]')
    for i in ids:
        u.append(i.get('id'))
    return u

def get_info(id, city):
    url = f"https://www.gasbuddy.com/station/{id}"
    resp = load_page(url)
    soup = BeautifulSoup(resp.text, "html.parser")

    features = []

    zip_match = re.search(r'CA,([0-9]{5})', str(soup.select('a[class*="Station-module__directionsLink___"]')[0].get('href')))
    postal_code = zip_match.group(1) if zip_match else None
    features.append(postal_code)

    try:
        loc_name = [item.text.split("\xa0")[0] for item in soup.select('h2[class*="StationInfoBox-module__header___"]')][0]
    except IndexError:
        try:
            loc_name = [item.text for item in soup.select('h2[class*="StationInfoBox-module__header___"]')]
        except IndexError:
            print('Error parsing loc_name: no stations exist in the zipcode or zipcode does not exist. Skipping...')
            return 0
    features.append(loc_name)

    features.append(city)

    review_count = int([re.split("[()]+", item.text)[1] for item in soup.select('span[class*="StationInfoBox-module__ratings___"]')][0])
    features.append(review_count)

    try:
        latitude = float(str(soup.select('a[class*="Station-module__directionsLink___"]')[0].get('href')).split('@')[1].split(',')[0])
    except:
        print(f'Error parsing latitude. Skipping...')
        latitude = 0.0
    features.append(latitude)

    features.append(id)

    try:
        phone = [item.text for item in soup.select('a[class*="PhoneLink-module__blue___"]')][0]
    except IndexError:
        phone = ''
        print(f'Error parsing phone: Either phone was empty or incorrect. Skipping...')
    features.append(phone)

    try:
        longitude = float(str(soup.select('a[class*="Station-module__directionsLink___"]')[0].get('href')).split('@')[1].split(',')[1])
    except:
        longitude = 0.0
    features.append(longitude)

    payment_method = 'Credit'
    features.append(payment_method)

    try:
        overall_rating = float([item.text for item in soup.select('span[class*="Station-module__ratingAverage___"]')][0])
    except ValueError:
        return 0
    features.append(overall_rating)

    price_val = [item.text for item in soup.select('span[class*="FuelTypePriceDisplay-module__price___"]')]
    price_key = [item.text for item in soup.select('span[class*="GasPriceCollection-module__fuelTypeDisplay"]')]
    prices = {k:v for k,v in zip(price_key, price_val)}
    features.append(prices)

    return features

df_test = pd.DataFrame(columns=['postal_code', 'loc_name', 'city', 'review_count', 'latitude', 'loc_number', 'phone', 'longitude', 'payment_type', 'overall_rating', 'prices'])
zipcodes = pd.read_csv('zipcodes.us.csv', usecols=['state_code', 'zipcode', 'place'])
zipcodes = zipcodes[zipcodes['state_code'] == 'CA'].drop(columns=['state_code'])

for k, i in enumerate(zipcodes['zipcode'].sample(10)):
    print(f'[Zip: {i}] Retrieving IDs...')
    ids = get_ids(i)
    print(f'[Zip: {i}] Done.')
    failed = False
    for j in ids:
        print(f'[Zip: {i}] Retrieving ID {j} features...')
        features = get_info(j, zipcodes.iloc[k]['place'])
        if features == 0:
            failed = True
            break
        df_test.loc[len(df_test)] = features
        print(f'[Zip: {i}] Done.')
    if failed:
        print(f'[Zip: {i}] Parse failed.\n\n')
        continue
    print(f'[Zip: {i}] Parse successful.\n\n')

df_test[:10]

[Zip: 95322] Retrieving IDs...
<Response [200]>
[Zip: 95322] Done.
[Zip: 95322] Retrieving ID 40602 features...
[Zip: 95322] Done.
[Zip: 95322] Retrieving ID 40586 features...
[Zip: 95322] Done.
[Zip: 95322] Retrieving ID 40588 features...
[Zip: 95322] Done.
[Zip: 95322] Retrieving ID 36367 features...
[Zip: 95322] Done.
[Zip: 95322] Retrieving ID 36855 features...
[Zip: 95322] Done.
[Zip: 95322] Retrieving ID 40603 features...
[Zip: 95322] Done.
[Zip: 95322] Retrieving ID 40605 features...
[Zip: 95322] Done.
[Zip: 95322] Retrieving ID 40587 features...
[Zip: 95322] Done.
[Zip: 95322] Parse successful.


[Zip: 93514] Retrieving IDs...
<Response [200]>
[Zip: 93514] Done.
[Zip: 93514] Retrieving ID 203515 features...
Error parsing phone: Either phone was empty or incorrect. Skipping...
[Zip: 93514] Done.
[Zip: 93514] Retrieving ID 16224 features...
[Zip: 93514] Done.
[Zip: 93514] Retrieving ID 19714 features...
[Zip: 93514] Done.
[Zip: 93514] Parse successful.


[Zip: 92556] Retrieving I

Unnamed: 0,postal_code,loc_name,city,review_count,latitude,loc_number,phone,longitude,payment_type,overall_rating,prices
0,95322,Shell,Alameda,103,37.110153,40602,1 (209) 826-0741,-121.015034,Credit,4.2,"{'Regular': '$4.42', 'Midgrade': '$4.77', 'Pre..."
1,95322,ARCO,Alameda,148,37.111532,40586,1 (209) 827-9065,-121.01611,Credit,3.9,"{'Regular': '- - -', 'Midgrade': '- - -', 'Pre..."
2,95322,Love's Travel Stop,Alameda,188,37.111423,40588,1 (209) 827-1399,-121.016986,Credit,4.2,"{'Regular': '$4.29', 'Midgrade': '$4.59', 'Pre..."
3,95322,Rotten Robbie,Alameda,93,37.102766,36367,1 (209) 826-4418,-121.015357,Credit,4.1,"{'Regular': '$4.29', 'Midgrade': '$4.59', 'Pre..."
4,95322,Petro Travel Center,Alameda,33,37.054781,36855,1 (209) 827-8025,-121.015963,Credit,3.9,"{'Regular': '$4.32', 'Midgrade': '$4.57', 'Pre..."
5,95322,Valero,Alameda,34,37.104324,40603,1 (209) 829-1002,-121.01539,Credit,4.4,"{'Regular': '$4.67', 'Midgrade': '$4.87', 'Pre..."
6,95322,76,Alameda,92,37.10289,40605,1 (209) 826-4383,-121.016112,Credit,4.1,"{'Regular': '$4.69', 'Midgrade': '$4.84', 'Pre..."
7,95322,Chevron,Alameda,14,37.103508,40587,1 (209) 826-1046,-121.016253,Credit,4.3,"{'Regular': '$4.89', 'Midgrade': '$5.05', 'Pre..."
8,93514,Yuhubi Nobi,Alameda,57,37.361617,203515,,-118.41345,Credit,4.7,"{'Regular': '- - -', 'Midgrade': '- - -', 'Pre..."
9,93514,Sinclair,Alameda,49,37.365603,16224,1 (760) 872-7084,-118.395071,Credit,4.7,"{'Regular': '$4.99', 'Midgrade': '- - -', 'Pre..."


In [37]:
df_test_2 = df_test.copy()

# Create duplicate rows and fill col2 using dictionary keys
product_prices = pd.DataFrame(df_test_2['prices'].tolist()).stack().reset_index(level=1).rename(columns={0:'price_current'})
product_prices['product_name'] = product_prices['level_1']
product_prices = product_prices.drop('level_1', axis=1)
df_test_2 = pd.merge(df_test_2, product_prices, left_on=df_test_2.index, how='left', right_on=product_prices.index)
df_test_2 = df_test_2.drop(columns=['prices', 'key_0'])
df_test_2 = df_test_2[df_test_2['price_current'] != '- - -']
df_test_2['price_current'] = [float(i.replace('$','')) for i in df_test_2['price_current']]
df_test_2[:3]

Unnamed: 0,postal_code,loc_name,city,review_count,latitude,loc_number,phone,longitude,payment_type,overall_rating,price_current,product_name
0,95322,Shell,Alameda,103,37.110153,40602,1 (209) 826-0741,-121.015034,Credit,4.2,4.42,Regular
1,95322,Shell,Alameda,103,37.110153,40602,1 (209) 826-0741,-121.015034,Credit,4.2,4.77,Midgrade
2,95322,Shell,Alameda,103,37.110153,40602,1 (209) 826-0741,-121.015034,Credit,4.2,5.27,Premium


In [1]:
y_name = 'price_current'
x_name = [name for name in sandbox.columns if name != y_name]

label_encode(df_test_2)

# Drop NAs and display metadata
df_test_2 = df_test_2.dropna(how='any', axis=0)

df_test_2[:5]
# model.predict(df_test_2[x_name])

NameError: ignored