### Scraping current Supercharger data from Tesla's website

In [None]:
# Make the notebook use the complete width of the screen by injecting a CSS
# rule that overrides the width of the ".container" element.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin,urlparse,parse_qs
import re
from datetime import datetime, timezone

In [None]:
# Get list of countries that have Superchargers.
# The overview page is scanned for <section class="row"> elements; a section
# counts when one of its <h2> headings mentions "Superchargers", and every
# link inside such a section contributes one country identifier.
url = 'https://www.tesla.com/findus/list'
# Without a timeout a stalled connection would block the notebook forever
response = requests.get(url, timeout=30)
# Name the parser explicitly (the same one the other cells use); otherwise
# BeautifulSoup guesses a parser and the result can differ between machines
soup = BeautifulSoup(response.content, 'lxml')
countries = set()
for row in soup.find_all('section', {'class': 'row'}):
    if any('Superchargers' in h2.text for h2 in row.find_all('h2')):
        for href in row.find_all('a', href=True):
            # The country is the last path segment; strip a trailing slash
            # first so it cannot produce an empty name
            country = href['href'].rstrip('/').rsplit('/', 1)[-1]
            if country:
                countries.add(country)
# De-duplicated and sorted, stored as a list
countries = sorted(countries)

In [None]:
# Function to grab locations of Superchargers from country website
def get_locations(url, country, timeout=30):
    """Scrape the Supercharger locations listed on one country page.

    Args:
        url: Address of the country's Supercharger list page.
        country: Country identifier; only used in the failure message.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts, one per ``<address class="vcard">`` element that
        contains a link. Each dict has a 'URL' key (absolute link built from
        the anchor's href) and, when the matching ``<span>`` exists,
        'street_address' and 'locality'. If fetching or parsing fails, a
        message is printed and whatever was collected so far is returned.
    """
    locations = []
    # Optional fields: dict key -> CSS class of the <span> holding the value
    optional_fields = (
        ('street_address', 'street-address'),
        ('locality', 'locality'),
    )
    try:
        # The request is inside the try so that a network error on one
        # country is reported instead of aborting the whole scrape
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'lxml')
        for address in soup.find_all('address', {'class': 'vcard'}):
            link = address.find('a', href=True)
            if link is None:
                # No link means no location page to visit later; skip this
                # entry but keep processing the rest of the country
                continue
            location = {'URL': urljoin(url, link['href'])}
            # Some locations do not have an address, so these are optional
            for key, css_class in optional_fields:
                span = address.find('span', {'class': css_class})
                if span is not None:
                    location[key] = span.text.strip()
            locations.append(location)
    except Exception:
        # Deliberately best-effort per country: report and carry on
        print("Fail on {}".format(country))
    return locations

In [None]:
locations = []
# Show progressbar
from tqdm.notebook import tqdm
# Loop over countries, and store info in list of dicts.
# sorted(set(...)) removes duplicates and fixes the visiting order, so the
# resulting rows come out in the same order on every run.
base_url = 'https://www.tesla.com/findus/list/superchargers/'
for country in tqdm(sorted(set(countries))):
    url = urljoin(base_url, country)
    # extend() appends in place instead of copying the whole list each time
    locations.extend(get_locations(url, country))

In [None]:
# Create pandas dataframe from locations: one row per scraped location dict,
# with the dict keys as columns
df = pd.DataFrame(locations)

In [None]:
# Collapse every run of whitespace (spaces, tabs, newlines) into one space.
# The pattern is a raw string so that "\s" reaches the regex engine untouched
# instead of being treated as an (invalid) string escape sequence.
df = df.replace(to_replace=r'\s+', value=' ', regex=True)

In [None]:
# Show summary statistics of the dataframe (rendered as the cell output)
df.describe()

In [None]:
# Store intermediate result, so we have it if the notebook fails.
# The file name carries the current UTC date; datetime.now(timezone.utc) is
# the timezone-aware replacement for the deprecated datetime.utcnow().
filename = 'df.' + datetime.now(timezone.utc).strftime('%Y%m%d') + '.parquet'
df.to_parquet(filename)

In [None]:
# Helper: find the coordinates on a parsed location page
def _extract_coordinates(soup):
    """Return ``(latitude, longitude)`` as strings, or ``('', '')`` if absent."""
    coords = ''
    # Do we have a "Driving Directions" entry ? Its query string carries the
    # coordinates after the first "="
    for link in soup.find_all('a', href=True):
        if link.text == 'Driving Directions':
            query_parts = urlparse(link['href']).query.split('=')
            if len(query_parts) > 1:
                coords = query_parts[1]
    # If not, grab the info from the "center" parameter of the map image
    if not coords:
        map_div = soup.find('div', {'id': 'location-map'})
        img = map_div.find('img') if map_div is not None else None
        img_src = img.get('src') if img is not None else None
        if img_src:
            coords = parse_qs(urlparse(img_src).query).get('center', [''])[0]
    parts = coords.split(',')
    if len(parts) < 2:
        # Nothing usable found (or no "lat,lon" pair): report as missing
        return '', ''
    return parts[0], parts[1]


# Helper: pull the first stall count and first kW value out of the text
def _parse_charging_text(charging_text):
    """Return ``(number, kw)`` as strings; each is ``''`` when not found."""
    number = re.search(r'(\d+).*Superchargers', charging_text)
    kw = re.search(r'(\d+)kW', charging_text)
    return (number.group(1) if number else '', kw.group(1) if kw else '')


# Function to extract info from location websites
def get_info_from_url(url, index, debug=False, timeout=30):
    """Fetch one location page and store its details in the global ``df``.

    The row labelled ``index`` gets the columns 'latitude', 'longitude',
    'charging_text', 'number', 'kw' and 'open'. Every column is always
    written; a value that cannot be found on the page is stored as ''.

    Args:
        url: Address of the location page.
        index: Row label in ``df`` to update.
        debug: When true, print the values that were found.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        Whatever ``requests.get`` raises (including on timeout); the caller
        is expected to handle it.
    """
    # Use the global dataframe, so we do not have to shuffle data around
    global df
    # Grab data from URL
    response = requests.get(url, timeout=timeout)
    # We use the lxml parser, as the html parser gave weird results (&center -> ¢er)
    soup = BeautifulSoup(response.content, 'lxml')

    # Add coordinate info to dataframe, leave empty if missing
    latitude, longitude = _extract_coordinates(soup)
    df.loc[index, 'latitude'] = latitude
    df.loc[index, 'longitude'] = longitude

    # Grab charging info from the paragraph containing "Charging"
    charging_tag = soup.select_one('p:-soup-contains("Charging")')
    if charging_tag is not None:
        charging_text = charging_tag.get_text(separator=" ")
    else:
        charging_text = ''
    number, kw = _parse_charging_text(charging_text)
    # Store the raw text as well, for later, better parsing
    df.loc[index, 'charging_text'] = charging_text
    df.loc[index, 'number'] = number
    df.loc[index, 'kw'] = kw
    if debug:
        # Only print what was actually found
        for value in (charging_text, number, kw):
            if value:
                print(value)

    # Grab info on possibility of charging for Non-Tesla cars
    open_tag = soup.select_one('i:-soup-contains("Non-Tesla")')
    if open_tag is not None:
        df.loc[index, 'open'] = open_tag.get_text(separator=" ")
        if debug:
            print(open_tag)
    else:
        df.loc[index, 'open'] = ''


In [None]:
from tqdm.notebook import tqdm
import traceback
# Visit each location page in turn and add its details to the row. The first
# failure is reported with the row label, the URL and a traceback, after
# which the loop stops.
row_count = len(df)
for index, location in tqdm(df.iterrows(), total=row_count):
    try:
        page_url = location['URL']
        get_info_from_url(page_url, index)
    except Exception:
        failure_message = "Fail on ({}):{}".format(index, location['URL'])
        print(failure_message)
        traceback.print_exc()
        break

In [None]:
# Store intermediate result, so we have it if the notebook fails.
# The file name carries the current UTC date; datetime.now(timezone.utc) is
# the timezone-aware replacement for the deprecated datetime.utcnow().
filename = 'df.filled.' + datetime.now(timezone.utc).strftime('%Y%m%d') + '.parquet'
df.to_parquet(filename)

In [None]:
# What does the dataframe look like now? Show its summary statistics.
df.describe()

In [None]:
# Now we parse the charging text, and grab all kW values and number of stalls.
from tqdm.notebook import tqdm
import traceback
# Compile the patterns once, outside the loop. Raw strings keep "\d" and "\s"
# from being read as (invalid) string escape sequences.
stall_count_pattern = re.compile(r'(\d+)\s+Superchargers')
power_pattern = re.compile(r'(\d+)kW')
for index, location in tqdm(df.iterrows(), total=df.shape[0]):
    # .get() returns None when the column does not exist at all
    charging_text = location.get("charging_text")
    try:
        # Check the type first: only real, non-empty strings are parsed, and
        # missing-value markers are never evaluated for truthiness
        if isinstance(charging_text, str) and charging_text:
            counts = stall_count_pattern.findall(charging_text)
            powers = power_pattern.findall(charging_text)
            # For each pair of kW and stall count, add the kW value as column, and count as value
            for count, power in zip(counts, powers):
                df.loc[index, power] = count
    except Exception:
        print(index, charging_text, type(charging_text))
        traceback.print_exc()
        break

In [None]:
# Columns whose name consists only of digits are converted to numbers;
# values that cannot be converted become NaN.
digit_columns = [name for name in df.columns if name.isdigit()]
for column in digit_columns:
    converted = pd.to_numeric(df[column], errors='coerce')
    df[column] = converted

In [None]:
# Store final result.
# The file name carries the current UTC date; datetime.now(timezone.utc) is
# the timezone-aware replacement for the deprecated datetime.utcnow().
filename = 'df.done.' + datetime.now(timezone.utc).strftime('%Y%m%d') + '.parquet'
df.to_parquet(filename)

In [None]:
# Now we can show a preview of the final result (head() returns only the
# first rows, not the complete dataframe).
df.head()

In [None]:
#df[(df['150'] > 0)& (df.open.str.contains('Non'))]

In [None]:
#df[(df['150'] > 0)& (df.open.str.contains('Non'))].to_csv("150_world_open.csv",index=False)