# Scraping the data
First step is to scrape data from www.datacenters.com
There are 85 pages, each with many data centres listed.
So, iterate through each page, and then iterate through each data centre and retrieve its data.

In [None]:
import time
import requests
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

base_url = 'https://www.datacenters.com/locations'  # base URL of initial page

# Function to get the HTML content of a page
def get_html(url: str) -> str | None:
    '''
    Get the HTML content of a page.

    Args:
        url (str): text URL.

    Returns:
        str | None: text content of page, if available.
    '''
    response = requests.get(url)
    if response.status_code == 200:  # successful get
        return response.text
    return None

def get_data_center_links(page_number: int) -> list:
    '''
    Parse the initial page and extract links to individual data centres.

    Args:
        page_number (int): number between 1 and 86 representing the current page.

    Returns:
        list: a list of all the data centres on this page.
    '''
    url = f'{base_url}?page={page_number}'
    html = get_html(url)
    soup = BeautifulSoup(html, 'html.parser')
    links = []
    grid = soup.find_all('div', class_='LocationsIndex__tiles__Sc6sW')  # find the grid list of data centres
    for grid_item in grid[0].find_all('div', class_='LocationTile__location__tZKRS'):  # iterate through and extract the individual hrefs
        a_tag = grid_item.find('a')
        if a_tag:
            links.append(a_tag['href'])
    return links

def extract_data_center_info(url: str) -> dict:
    '''
    Extract data from an individual data centre page.

    Args:
        url (str): text URL of individual data centre page.

    Returns:
        dict: a dictionary containing all extracted data.
    '''
    html = get_html(url)
    try:
        soup = BeautifulSoup(html, 'html.parser')
    except:
        print('Page not found! Skipping!')
    try:
        info = soup.find('div', class_='LocationShow__sidebar__Pqjuu') # try to locate the main block of information on the page
    except:
        return {}
    try:
        name = info.find('a', class_='LocationShowSidebar__sidebarProviderLink__CRcRB').text.lstrip('View ')  # extract the owning company name
    except:
        name = np.nan
    try:
        location = info.find('span', class_='LocationShowSidebar__sidebarAddress__AZdxu').text.split(',')  # extract the address
    except:
        location = np.nan

    if location:  # if the address was successfully extracted, then try splitting it into components
        try:
            country = location[-1]
        except IndexError:
            country = np.nan
        try:
            city = location[-2]
        except IndexError:
            city = np.nan
        try:
            town = location[-3]
        except IndexError:
            town = np.nan
        try:
            address = location[-4]
        except IndexError:
            address = np.nan

    data_dict = {  # store current data in a dictionary
        'name': name,
        'country': country,
        'city': city,
        'town': town,
        'address': address,
        'total space (sqft)': np.nan,
        'colocation space (sqft)': np.nan,
        'total power (MW)': np.nan,
    }

    try:
        stats = info.find('div', class_='LocationShowSidebar__sidebarStats__OxlOT')  # extract statistics block
    except:
        return data_dict
    for i, stat in enumerate(stats.find_all('div', class_='LocationShowSidebarStat__statContainer__LPgsu')):  # iterate through individual stats
        if i < 3:  # only interested in the first three
            stat_text = stat.find_all('div')[-1].text
            stat_text = stat_text.split(' ')
            try:
                data_dict[f'{stat_text[2]} {stat_text[3]} ({stat_text[1]})'] = f'{stat_text[0]}'  # extract formatted stat, i.e. total power (MW) = 1
            except IndexError:
                pass  # missing data for this stat
    return data_dict

def main() -> None:
    '''
    Main script to collect all data and output to csv.
    '''
    data_centers = []
    for page in range(1, 86):  # there are 85 pages
        print(f'Scraping page {page}')
        links = get_data_center_links(page)  # get all data centres on the current page
        for i, link in enumerate(links):
            print(f'    link {i}')
            data_center_url = f'https://www.datacenters.com{link}'
            data_center_info = extract_data_center_info(data_center_url)  # extract individual data
            data_centers.append(data_center_info)
            time.sleep(1)  # to prevent overwhelming the server with requests
    df = pd.DataFrame(data_centers)  # convert to dataframe
    df.to_csv('data/data_centers.csv', index=False)  # output to csv

if __name__ == '__main__':  # run the main script
    data_centers = main()

# Geocoding data
Now that the data is scraped and in csv format, the address must be geocoded to get latitude and longitude values which can then be plotted on a geographical plot. Geocoding can be done with the free geocode.maps API (though at a limited rate).

In [None]:
import pandas as pd

df = pd.read_csv('data/data_centers.csv', sep=',')  # read in the data
df.drop(columns=['Unnamed: 0'], inplace=True)  # remove useless column

In [None]:
# clean the addresses first
df['country'] = df['country'].str.strip().str.lower()  # remove whitespace and convert to lower case
df['country'] = df['country'].replace('u.s.', 'usa')  # consistent naming
df['country'] = df['country'].replace('united kingdom', 'uk')
df.dropna(subset=['country'], inplace=True)  # remove blanks
pattern = r'^[a-zA-Z\s]+$'  # match all english alphabetic characters (including spaces)
df_cleaned = df[df['country'].str.contains(pattern, na=False)]  # filter by above regex
df_cleaned['country'] = df_cleaned['country'].str.replace(r'\bU\.S\.?\b', 'USA', regex=True)  # replace inconsistent USA naming
df_cleaned = df_cleaned[~df_cleaned['country'].str.contains(r'[0-9一-龯]', na=False)]  # filter out non matching characters
df = df_cleaned

In [None]:
import requests

def geocode(location: str, index: int, df: pd.DataFrame) -> None:
    '''
    Geocode address and store lat and lon values in the dataframe.

    Args:
        location (str): address to geocode.
        index (int): position to store geocoded data in the dataframe.
        df (pd.DataFrame): dataframe in which to store the data.
    '''
    url = f'https://geocode.maps.co/search?q={location}&api_key=668d02c6d9de4757313352qgufc7ba3'
    response = requests.get(url)  # geocode by sending get request to API containing address
    if response.status_code == 200 and response.json():  # successful get
        print(response.json()[0])
        df.loc[index, 'latitude'] = float(response.json()[0]['lat'])  # store in dataframe
        df.loc[index, 'longitude'] = float(response.json()[0]['lon'])

In [None]:
import time
import numpy as np

df['latitude'] = np.nan  # initialise the lat and lon columns as nan
df['longitude'] = np.nan

for index in range(len(df)):
    try:
        location: str = f'{df.loc[index, "city"]}, {df.loc[index, "country"]}'  # format the address for the API
    except:
        continue
    geocode(location, index)
    time.sleep(1.5)  # geocode at a limited rate

df.to_csv('data/geo_data_centers_cleaned.csv')  # save to csv

# Visualisation
Now the data has been cleaned and geocoded, visualisation can begin.
This will be done using plotly and its associated dashboard library dash.

In [1]:
import pandas as pd

data: pd.DataFrame = pd.read_csv('data/geo_data_centers_cleaned.csv', sep=',')  # read in csv data

data.drop(columns=['Unnamed: 0.1'], inplace=True)  # remove useless columns
data.drop(columns=['Unnamed: 0'], inplace=True)

In [2]:
location_data: pd.DataFrame = data[data['latitude'].notna() & data['longitude'].notna()]  # drop data centres that were not geocoded successfully

In [5]:
def clean_total_power(value) -> float:
    '''
    Convert the power column to float values.

    Args:
        value (np.nan | str | float): power value to be cleaned.

    Returns:
        float: the power value converted to a float
    '''
    if pd.isna(value):  # nan values are set to zero
        return 0.0
    if isinstance(value, str):  # remove erroneous data bits
        value = value.replace('MW', '').strip()
    try:
        value = float(value)  # convert to float
        if value > 1000000:  # account for incorrect units
            value /= 1000000
        elif value > 100:
            value /= 1000
        return value
    except ValueError:
        return 0.0  # set to zero if exception occurs

location_data['total power (MW)'] = location_data['total power (MW)'].apply(clean_total_power)  # clean the power column by applying the above function

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  location_data['total power (MW)'] = location_data['total power (MW)'].apply(clean_total_power)  # clean the power column by applying the above function


In [6]:
import dash

app = dash.Dash(__name__, external_stylesheets=['https://stackpath.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css', 'styles.css'])  # create dash application and reference bootstrap layout style sheet

In [7]:
import dash_core_components as dcc
import dash_html_components as html

from dash import dcc, html
from plotly import express as px
from plotly import graph_objects as go
from dash.dependencies import Input, Output
from pandas.core.groupby.generic import DataFrameGroupBy

BACKGROUND = '#121212'  # constant for background colour

app.layout = html.Div(className='main-container', children=[  # create the app layout with html and css styling (under assets directory)
    html.Div(className='top-row', children=[  # top row contains descriptive paragraph and map plot
        html.Div(className='col-5', children=[  # descriptive paragraph
            html.H1('Visualising Global Data Centres'),  # visualisation title
            html.P(  # description
                '''
                Data centres play a vital but largely unseen role in everyday
                internet activity. This dashboard explores where these centres
                are located, who owns them, and how large they are.
                '''
            ),
            html.P(
                '''
                *Note, some data (particularly related to China) is not openly
                available. Keep this in consideration during comparisons.
                '''
            ),
            html.P('''Use the drop down to filter all graphs and compare by country or by owning company.
                   Use the range slider to filter based on the number of data centres owned.'''),
            html.Div(className='compare-widget-container', children=[  # comparison dropdown and slider widgets container
                html.Div(className='compare-by', children=[  # dropdown widget container
                    html.Label(className='compare-widget-label', children='Compare by:'),  # dropdown widget label
                    dcc.Dropdown(className='compare-widget', id='compare-filter', options=[  # dropdown widget
                        {'label': html.Span(className='drop-text', children='Country'), 'value': 'COUNTRY'},  # selectable options
                        {'label': html.Span(className='drop-text', children='Company'), 'value': 'COMPANY'},
                    ], value='COUNTRY', searchable=False, clearable=False),  # cannot be searched or cleared (there must be a selected option)
                ]),
                html.Div(className='adjust-minimum', children=[  # slider widget container
                    html.Label(className='slider-label', children='# Centres:'),  # slider widget label
                    dcc.RangeSlider(min=0.99, max=(max_ := 8), step=None, marks={i: str(2 ** i) for i in range(max_ + 1)}, value=[4, 8], id='minimum-slider', className='slider-widget'),
                ]),  # slider widget that range from 1 to 11 but is represented by powers of 2 for filtering
            ]),
        ]),
        html.Div(className='col-7', children=[  # map graph column
            html.H2('Geographical Distribution of Data Centres'),  # map graph heading
            html.Div(className='map-widget-container', children=[  # map graph container
                dcc.Graph(className='map-container', id='map-graph'),  # map graph
                html.Div(className='map-widget', children=[  # map widget container
                    html.Label(className='map-label', children='Filter by Power'),  # map widget label
                    dcc.Checklist(className='checklist-container', id='power-filter', options=[  # map widget
                        {'label': html.Span(className='check-text', children='Power Known'), 'value': 'WITH_POWER'},  # selectable options
                        {'label': html.Span(className='check-text', children='Power Unknown'), 'value': 'WITHOUT_POWER'},
                    ], value=['WITH_POWER']),  # default to data centres with power data available
                    html.P(className='map-description', children=
                           '''
                           Explore data centre locations on this interactive globe with sizes
                           proportionate to power consumption (where known).
                           '''
                    ),  # map description
                ]),
            ])
        ]),
    ]),
    html.Div(className='bottom-row', children=[  # bottom row contains three interactive comparison plots
        html.Div(className='compare-container', children=[  # row of plots container
            html.Div(className='plot1', children=[  # lollipop graph container
                html.H3('Data Centre Quantity'),  # plot heading
                dcc.Graph(className='quantity-container', id='quantity-graph'),  # lollipop graph
                html.P(className='plot-description', children=
                       '''
                       Compare the number of data centres located in a specific country or
                       owned by a certain company.
                       '''
                ),  # plot description
            ]),
            html.Div(className='plot2', children=[  # box plot container
                html.H3('Data Centre Size'),  # plot heading
                dcc.Graph(className='size-container', id='size-graph'),  # box plot
                html.P(className='plot-description', children=
                       '''
                       Compare the average and spread of data centre sizes located in a specific
                       country or owned by a certain company.
                       '''
                ),
            ]),
            html.Div(className='plot3', children=[
                html.H3('Data Centre Power'),
                dcc.Graph(className='power-container', id='power-graph'),  # pie chart
                html.P(className='plot-description', children=
                       '''
                       Compare the proportion of total power that data centres located in a
                       specific country or owned by a certain company account for. Shares of < 2% have 
                       been accumulated into the "other" column.
                       '''
                ),
            ]),
        ]),
    ])
])

def world_plot(lon: list, lat: list, text: list, marker: dict) -> go.Figure:
    '''
    Create the map figure.

    Args:
        lon (list): longitude values to plot.
        lat (list): latitude values to plot.
        text (list): additional hover information to plot.
        marker (dict): marker display information (what size to plot the points).

    Returns:
        go.Figure: the map figure.
    '''
    fig = go.Figure()
    fig.add_trace(go.Scattergeo(lon=lon, lat=lat, text=text, marker=marker))  # plot the points
    fig.update_layout(
        geo=dict(
            showland=True, landcolor=BACKGROUND,  # set map attributes and colours
            showocean=True, oceancolor='grey',
            showlakes=True, lakecolor='grey',
            bgcolor=BACKGROUND, # set background colour
            projection=dict(
                type='orthographic',  # globe type
                rotation=dict(lon=-100, lat=40)  # default lon and lat
            ),
            scope='world', showcountries=True,
        ),
        margin=dict(l=0, r=0, t=0, b=0),  # remove outer whitespace
        paper_bgcolor=BACKGROUND,  # set outer plot background colour
    )
    return fig

@app.callback(  # callback to update map based on checklist selection
    Output('map-graph', 'figure'),  # outputs to map plot
    [Input('power-filter', 'value'), Input('compare-filter', 'value'), Input('minimum-slider', 'value')],
)  # takes the power filter, comparison filter, and range slider values in to produce the plot
def update_map(power: list, owner: str, num: list) -> go.Figure:
    '''
    Redraw the map depending on which data is selected for plotting by the user.

    Args:
        power (list): list of strings representing power widget selection.
        owner (str): text representing comparison widget selection.
        num (list): list of values representing range slider selection.

    Returns:
        go.Figure: the map figure.
    '''
    if max(num) == 8:  # if range slider is at the max value
        num[num.index(max(num))] = 2048  # extend the maximum to include all values

    if owner == 'COUNTRY':
        key = 'country'
    else:
        key = 'name'
    counts = location_data[key].value_counts()  # entry counts by key
    filtered = counts[(counts > 2 ** min(num)) & (counts < 2 ** max(num))].index  # all entries within the slider range
    filtered_data = location_data[location_data[key].isin(filtered)]  # filtered data according to slider range

    if power == ['WITH_POWER']:  # power widget selection
        filtered_data: pd.DataFrame = filtered_data[filtered_data['total power (MW)'] > 0]  # entries with power values
    elif power == ['WITHOUT_POWER']:
        filtered_data = filtered_data[filtered_data['total power (MW)'] == 0]  # entries without power values
    elif len(power) == 2:  # both selected
        filtered_data = filtered_data
    else:  # none selected
        return world_plot(lon=[0], lat=[0], text=[''], marker={'size': [0.5]})  # clear the plot

    # Constant size for centres without power data, proportional for those with power data.
    sizes: pd.Series = filtered_data['total power (MW)'].apply(lambda x: 7 if x == 0 else x / 3)
    # Blue for centres without power data, red for those with power data.
    colours: list = ['blue' if x == 0 else 'red' for x in filtered_data['total power (MW)']]

    return world_plot(
        lon=filtered_data['longitude'],
        lat=filtered_data['latitude'],
        text=filtered_data.apply(
            lambda row: f"Power: {row['total power (MW)']} MW<br>Colocation Space: {row['colocation space (sqft)']} sqft<br>Total Space: {row['total space (sqft)']} sqft",
            axis=1,
        ),
        marker=dict(
            size=sizes,
            color=colours,
            line=dict(width=0)
        )
    )

def create_lollipop(data: DataFrameGroupBy, filter: str, min_: int, max_: int) -> go.Figure:
    '''
    Create the lollipop graph (custom as not supported by plotly).

    Args:
        data (DataFrameGroupBy): data grouped by filter.
        filter (str): text xaxis label.
        min_ (int): minimum number of data centres to filter by.
        max_ (int): maximum number of data centres to filter by.

    Returns:
        go.Figure: the lollipop graph.
    '''
    if max_ == 256:
        max_ = 2048  # set the max to go above the range slider and show top values
    fig = go.Figure()
    fig.add_trace(go.Scatter(  # create a scatter trace of the data points initially
            x=[a[0] for a in data if len(a[1]) > min_ and len(a[1]) < max_],  # strings for xaxis
            y=[len(a[1]) for a in data if len(a[1]) > min_ and len(a[1]) < max_],  # data values (counts of data centres) for yaxis
            mode='markers', marker=dict(color='red'), fillcolor=BACKGROUND
    ))
    shapes: list = []
    i = 0
    for _, group in data:  # iterate through the groups and create an appropriate line
        if len(group) > min_ and len(group) < max_:
            shapes.append(  # create the lines for the lollipop diagram
                dict(type='line', xref='x', yref='y', x0=i, y0=0.9, x1=i, y1=len(group), line=dict(color='#fff', width=1))
            )
            i += 1

    fig.update_layout(
        shapes=shapes,  # add the line shapes with the scatter plot to create the custom lollipop diagram
        xaxis=dict(type='category', color='#fff', title=filter), yaxis=dict(type='log', color='#fff', title='Number'),
        margin=dict(l=0, r=0, t=0, b=0),
        paper_bgcolor=BACKGROUND,
        plot_bgcolor=BACKGROUND,
    )

    fig.update_xaxes(  # customise plot formatting
        gridcolor=BACKGROUND,  # hide grid lines
        showline=True,  # show axis line
        ticks='outside',  # show axis ticks
        title_font=dict(size=20),  # set font attributes
        tickfont=dict(size=15),
    )

    fig.update_yaxes(
        gridcolor=BACKGROUND,
        ticks='outside',
        title_font=dict(size=20),
        tickfont=dict(size=15),
    )

    return fig

@app.callback(
    Output('quantity-graph', 'figure'),  # outputs to lollipop graph
    [Input('compare-filter', 'value'), Input('minimum-slider', 'value')],
)  # takes comparison filter and slider range in to produce the plot
def update_quantity(filter: str, num: list) -> go.Figure:
    '''
    Redraw the quantity lollipop chart depending on which comparison category is selected.

    Args:
        filter (str): text representing comparison widget selection.
        num (list): list of values representing range slider selection.

    Returns:
        go.Figure: the custom lollipop graph.
    '''
    if filter == 'COUNTRY':
        filtered_data = location_data.groupby('country')
        return create_lollipop(data=filtered_data, filter='Country', min_=2 ** min(num), max_=2 ** max(num))
    else:
        filtered_data = location_data.groupby('name')
        return create_lollipop(data=filtered_data, filter='Company', min_=2 ** min(num), max_=2 ** max(num))

def create_box(data: DataFrameGroupBy, filter: str, min_: int, max_: int) -> go.Figure:
    '''
    Create the go.Box plot.

    Args:
        data (DataFrameGroupBy): data grouped by filter.
        filter (str): text xaxis label.
        min_ (int): minimum number of data centres to filter by.
        max_ (int): maximum number of data centres to filter by.

    Returns:
        go.Figure: the box plot.
    '''
    if max_ == 256:
        max_ = 2048  # extend the range as above
    traces: list = []
    colours = px.colors.qualitative.Plotly  # select plotly's qualitative contrasting colours for colour blind accessibility
    for i, (key, group) in enumerate(data):
        if len(y := group['total space (sqft)']) > min_ and len(y := group['total space (sqft)']) < max_:  # filter within slider range
            traces.append(go.Box(  # plot each box plot
                name=key, y=y,
                marker=dict(color=colours[i % len(colours)])
            ))

    layout = go.Layout(  # customise box plot layoutd
        xaxis=dict(title=filter, title_font=dict(size=20), tickfont=dict(size=15)),
        yaxis=dict(title='Size (sqft)', title_font=dict(size=20), tickfont=dict(size=15), type='log'),
        template='plotly_dark',
        margin=dict(l=0, r=0, t=0, b=0),
    )

    return go.Figure(data=traces, layout=layout)

@app.callback(
    Output('size-graph', 'figure'),  # outputs to box plot
    [Input('compare-filter', 'value'), Input('minimum-slider', 'value')],
)  # takes comparison filter and slider range in to produce the plot
def update_size(filter: str, num: list) -> go.Figure:
    '''
    Redraw the box plots depending on which comparison category is selected.

    Args:
        filter (str): text representing comparison widget selection.
        num (list): list of values representing range slider selection.

    Returns:
        go.Figure: the box plot.
    '''
    if filter == 'COUNTRY':
        filtered_data = location_data.groupby('country')
        return create_box(data=filtered_data, filter='Country', min_=2 ** min(num), max_=2 ** max(num))
    else:
        filtered_data = location_data.groupby('name')
        return create_box(data=filtered_data, filter='Company', min_=2 ** min(num), max_=2 ** max(num))

def create_pie(data: DataFrameGroupBy, val_col: str, lab_col: str) -> go.Figure:
    '''
    Create the pie chart.

    Args:
        data (DataFrameGroupBy): data grouped by filter.
        val_col (str): data column of interest in the grouped data object.
        lab_col (str): label column of interest in the grouped data object.

    Returns:
        go.Figure: the pie chart.
    '''
    total: float = data[val_col].sum()  # pie chart total
    data['percentage'] = data[val_col] / total  # calculate percentage shares
    large: pd.DataFrame = data[data['percentage'] >= 0.02]  # include if >= 2% share
    small: pd.DataFrame = data[data['percentage'] < 0.02]  # otherwise exclude
    other: float = small[val_col].sum()  # accumulate < 2% shares
    if other > 0:
        large = pd.concat([large, pd.DataFrame({lab_col: ['Other'], val_col: [other], 'percentage': [other / total]})])  # add 'other' column

    pie_data = go.Pie(  # create pie chart
        labels=large[lab_col],
        values=large[val_col],
        textinfo='label+percent',  # show labels and percentage share values
        insidetextorientation='radial',  # display text radially in chart
        hoverinfo='label+percent+value',
        marker=dict(
            colors=px.colors.qualitative.Plotly,  # use Plotly's qualitative color palette
            line=dict(color='#121212', width=2)  # white borders with width 2
        ),
    )

    layout = go.Layout(  # customise layout
        template='plotly_dark',
        legend=dict(  # text attributes for legend
            title=dict(text=lab_col.capitalize(), font=dict(size=16)),
            x=1, y=1,
        )
    )

    return go.Figure(data=[pie_data], layout=layout)

@app.callback(
    Output('power-graph', 'figure'),  # output to pie chart.
    [Input('compare-filter', 'value'), Input('minimum-slider', 'value')]
)  # takes comparison filter and slider range in to produce the plot
def update_power(filter, num) -> go.Figure:
    '''
    Redraw the pie chart depending on which comparsion category is selected.

    Args:
        filter (str): text representing comparison widget selection.
        num (list): list of values representing range slider selection.

    Returns:
        go.Figure: the pie chart.
    '''
    if max(num) == 8:
        num[num.index(max(num))] = 11
    if filter == 'COUNTRY':
        key = 'country'
    else:
        key = 'name'
    datacenter_counts = location_data[key].value_counts()  # entry counts by key
    filtered = datacenter_counts[(datacenter_counts > 2 ** min(num)) & (datacenter_counts < 2 ** max(num))].index  # all entries within slider range
    filtered_data = location_data[location_data[key].isin(filtered)]  # filter by slider range
    grouped_data = filtered_data.groupby(key)['total power (MW)'].sum().reset_index()  # group by key and sum power values
    return create_pie(grouped_data, 'total power (MW)', key)

app.run_server(debug=True)  # run the server to display the dashboard



The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`



The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`

