# Net Zero Coalition - Embodied Carbon Work Package

## Graphic summary of pipeline projects
The code in this notebook will provide a graphic summary of projects selected for further analysis from the Infrastructure and Projects Authority National Infrastructure and Construction Pipeline.

Code was authored by Dr Jannik Giesekam at the University of Leeds in March 2020. Contact J.Giesekam@leeds.ac.uk for more details.

In [77]:
# Import necessary packages
import pandas as pd
import re
from ipywidgets import interact
from bokeh.layouts import layout, column
from bokeh.models import ColumnDataSource, Select, HoverTool, BoxAnnotation, Span, Label, NumeralTickFormatter
from bokeh.plotting import figure, gridplot
from bokeh.themes import Theme
from bokeh.io import show, output_notebook, push_notebook
from bokeh.palettes import Category10
from bokeh.transform import factor_cmap

## Import data

Import selected projects that were previously filtered

In [78]:
# Load the previously filtered project list.
# The 'utf-8-sig' codec also consumes a leading byte-order mark if the file has one.
projects = pd.read_csv(
    'selected_projects.csv',
    encoding='utf-8-sig',
    engine='python',
)

## Prepare and visualise data

Tidy the data, put the projects into groups and create interactive graphics for exploratory data analysis (EDA).

In [79]:
# Tidy data
#
# NOTE: this cell modifies `projects` (renamed columns, rescaled Capex). Re-run the
# import cell before re-running this one - a second run fails at the Capex clean-up
# because the column is already numeric by then.

# Start by renaming columns to formats that bokeh can handle
projects = projects.rename(columns={'Total Capex Cost all funding (£m)':'Capex',
                                    'Project / Programme Name':'Project',
                                    'Project Summary':'Description',
                                    'Key Client':'Client',
                                    'Scheme Status':'Status',
                                    'Start of Works / Construction (Projected)':'Start_Date',
                                    'Date in Service (Projected)':'Date_In_Service'
                                   })

# Set data types
for text_column in ('Sector', 'Sub-Sector', 'Project', 'Description'):
    projects[text_column] = projects[text_column].astype(str)

# Clean numeric values
capex_millions = projects['Capex'].str.replace(',', '', regex=False) # remove commas from capex values
capex_millions = pd.to_numeric(capex_millions, errors='coerce') # anything non-numeric becomes NaN
projects['Capex'] = capex_millions*1000000 # convert to £ from millions

# Dates
two_year_range_pat = re.compile(r'/\d\d$') # trailing "/19" of entries such as 2018/19
alt_pat = re.compile(r'-\d*') # a hyphen and any digits after it, e.g. the "-19" of 2018-19


def _strip_year_ranges(dates):
    """Reduce range-style entries (2018/19, 2018-19) to the text before the separator.

    `regex=True` is passed explicitly: pandas >= 2.0 defaults to literal matching and
    rejects a compiled pattern unless regex matching is requested.
    """
    dates = dates.str.replace(two_year_range_pat, '', regex=True)
    return dates.str.replace(alt_pat, '', regex=True)


def _replace_literals(dates, replacements):
    """Apply (old, new) literal substring replacements to a string Series, in order."""
    for old, new in replacements:
        dates = dates.str.replace(old, new, regex=False)
    return dates


# Clean and convert start dates to datetime, preserving the original 'Start_Date' column
start_text = _strip_year_ranges(projects['Start_Date'])
start_text = _replace_literals(start_text, [
    ('Active Programme', '2020'), # Insert 2020 for active
    ('Started', '2020'),          # Insert 2020 for started
    ('Ongoing', '2020'),          # Insert 2020 for ongoing
    ('Various', '2020'),          # Insert 2020 for various
    ('TBC', '2020'),              # Insert 2020 for TBC
])
# Assign the filled result back rather than calling fillna(inplace=True) on a column
# selection, which does not reliably update the parent frame in recent pandas.
projects['Start_Date_Clean'] = start_text.fillna('2020') # fill blanks with 2020

# First pass: plain years such as 2019
projects['Start_Date_Cleaned'] = pd.to_datetime(projects['Start_Date_Clean'], format='%Y', errors='coerce')
# Further passes over the rows that are still unparsed: month + two-digit year (Oct 16),
# then month + four-digit year (Oct 2016). Rows matching neither stay NaT.
for month_year_format in ('%b %y', '%b %Y'):
    mask = projects['Start_Date_Cleaned'].isnull()
    projects.loc[mask, 'Start_Date_Cleaned'] = pd.to_datetime(
        projects.loc[mask, 'Start_Date_Clean'], format=month_year_format, errors='coerce')

# Clean and convert date in service to datetime, preserving the original 'Date_In_Service' column
service_text = _strip_year_ranges(projects['Date_In_Service'])
service_text = _replace_literals(service_text, [
    ('Ongoing', '2020'),  # Insert 2020 for ongoing
    ('Various', '2021'),  # Insert 2021 for various
    ('TBC', '2021'),      # Insert 2021 for TBC
    ('2021/22 ', '2021'), # the trailing space stops the end-anchored range pattern matching this entry
    ('21', '2021'),       # expand a two-digit year; this also turns '2021' into '202021' ...
    ('202021', '2021'),   # ... which is collapsed back here
])
projects['Date_In_Service_Clean'] = service_text
# NOTE(review): no explicit format is given here. On pandas >= 2.0 to_datetime infers a
# single format from the first parseable value, so a column with mixed formats may
# coerce more rows to NaT than on older versions - confirm against the data.
service_dates = pd.to_datetime(projects['Date_In_Service_Clean'], errors='coerce')
projects['Date_In_Service_Cleaned'] = service_dates.fillna(pd.Timestamp('2025-01-01')) # fill blanks with 2025

# Other clean up bits and bobs
projects['Sub-Sector'] = projects['Sub-Sector'].str.strip() # remove spaces
projects.loc[projects['Sub-Sector']=='Electricity transmission','Sub-Sector'] = 'Electricity Transmission' # remove difference in case
projects.loc[projects['Status']=='Active programme','Status'] = 'Active Programme' # remove difference in case

# Create short description column
projects['Short_Description'] = projects['Description'].str.slice(0, 200) # use first 200 characters

In [80]:
# Plot bar charts showing aggregated headline data by sector, sub-sector and scheme status

output_notebook()


def _category_bar_chart(title, y_axis_label, categories, bar_source, x_field, top_field,
                        fill_color, money_axis=False):
    """Build one headline bar chart.

    title, y_axis_label -- chart title and y-axis caption
    categories          -- groupby object handed to figure() as the categorical x range
    bar_source          -- data for the bars (a groupby object or an aggregated DataFrame)
    x_field, top_field  -- names of the source columns holding the categories and bar heights
    fill_color          -- colour mapping applied to the bars
    money_axis          -- when True, y ticks use the "(0 a)" numeral format

    Returns the configured figure.
    """
    chart = figure(plot_width=900, plot_height=500, title=title, x_range=categories)
    chart.vbar(x=x_field, top=top_field, source=bar_source, width=1,
               fill_color=fill_color, line_color="black")
    chart.y_range.start = 0
    chart.x_range.range_padding = 0.05
    chart.xgrid.grid_line_color = None
    chart.yaxis.axis_label = y_axis_label
    chart.xaxis.major_label_orientation = 1.2
    if money_axis:
        chart.yaxis[0].formatter = NumeralTickFormatter(format="(0 a)")
    return chart


# set colormaps based on sector (end=1 maps on the first level of each grouped category)
sector_factors = sorted(projects['Sector'].unique())
index_cmap = factor_cmap('Sector_Sub-Sector', palette=Category10[7], factors=sector_factors, end=1)
index_cmap2 = factor_cmap('Sector_Status', palette=Category10[7], factors=sector_factors, end=1)

# Plot number of projects by sector and sub-sector
# 'Capex_count' is read from the columns bokeh builds when handed a groupby object.
group = projects.groupby(by=['Sector', 'Sub-Sector'])
p1 = _category_bar_chart("Count of projects by category", "Number of projects",
                         group, group, 'Sector_Sub-Sector', 'Capex_count', index_cmap)

# Plot total capex of projects by sector and sub-sector
# numeric_only=True keeps the sum away from the text and datetime columns; pandas >= 2.0
# no longer drops those silently and raises on the datetime columns instead.
group2 = projects.groupby(by=['Sector','Sub-Sector']).sum(numeric_only=True)
p2 = _category_bar_chart("Capex of projects by category", "Total Capex £ (where stated)",
                         group, group2, 'Sector_Sub-Sector', 'Capex', index_cmap,
                         money_axis=True)

# Plot by sector and project status
group3 = projects.groupby(by=['Sector','Status']).sum(numeric_only=True)
group3b = projects.groupby(by=['Sector','Status'])
p3 = _category_bar_chart("Capex of projects by scheme status", "Total Capex £ (where stated)",
                         group3b, group3, 'Sector_Status', 'Capex', index_cmap2,
                         money_axis=True)

for chart in (p1, p2, p3):
    show(chart)

In [82]:
# Create EDA graphic: one circle per project, dropdowns to filter by sector and status,
# and a tooltip showing key info on hover

# Dropdown choices: every distinct value found in the data, followed by an 'All' option
sector_list = projects.Sector.unique().tolist() + ['All']
status_list = projects.Status.unique().tolist() + ['All']

# Custom tooltip markup; each @Name is filled from the matching column of the data source
TOOLTIPS_EDA = """
    <div style="margin:5px; width:200px">
        <div>
            <span style="font-size: 13px; font-weight: bold;">@Project</span>
        </div>
        <div>
            <span style="font-size: 11px; font-weight: bold;">£@Capex{0 a} - @Client</span>
        </div>       
        <div>
            <span style="color: #31476b">Status:</span>&nbsp<span>@Status</span>
        </div>
        <div>
            <span style="color: #31476b">Start of Works:</span>&nbsp<span>@Start_Date</span>
        </div>
        <div>
            <span style="color: #31476b">Date in Service (Projected):</span>&nbsp<span>@Date_In_Service</span>
        </div>        
        <div>        
            <span style="font-size: 9px">@Short_Description</span>
        </div>
    </div>
"""
hover = HoverTool(tooltips=TOOLTIPS_EDA)

# Colour each circle by scheme status
status_cmap = factor_cmap(
    'Status',
    palette=Category10[8],
    factors=sorted(projects['Status'].unique()),
    end=1,
)

output_notebook()

# Both plots draw from this one data source
source = ColumnDataSource(projects)

# Options common to the two figures and to the two circle glyphs
shared_figure_options = dict(plot_height=450, plot_width=450, x_axis_type='datetime')
shared_circle_options = dict(source=source, color=status_cmap, size=12, line_color=None)

# Left plot: capex against start of works
left_p = figure(
    title="Filter projects with dropdowns below and hover for info",
    tools=[hover,'pan','wheel_zoom','box_zoom','save','reset'],
    **shared_figure_options
)
left = left_p.circle("Start_Date_Cleaned", "Capex", **shared_circle_options)

# Right plot: capex against projected date in service (this one also offers lasso selection)
right_p = figure(
    tools=[hover,'pan','wheel_zoom','lasso_select','box_zoom','save','reset'],
    **shared_figure_options
)
right = right_p.circle("Date_In_Service_Cleaned", "Capex", **shared_circle_options)

# Set axes
left_p.yaxis.axis_label = 'Capex £'
left_p.xaxis.axis_label = 'Start of works or construction'
right_p.xaxis.axis_label = 'Date in service (projected)'
for eda_plot in (left_p, right_p):
    eda_plot.yaxis.formatter = NumeralTickFormatter(format="0 a")

# Function to update plot based on interactions
def _select_projects(sector, status):
    """Return the rows of `projects` matching the dropdown choices; "All" disables a filter."""
    selected = projects
    if sector != "All":
        selected = selected[selected['Sector'] == sector]
    if status != "All":
        selected = selected[selected['Status'] == status]
    return selected


def update(sector, status):
    """Redraw both scatter plots for the chosen sector and status.

    sector -- a value of the 'Sector' column, or "All" for no sector filter
    status -- a value of the 'Status' column, or "All" for no status filter

    Every column of the plot data is replaced with the filtered rows in one assignment.
    Overwriting only some columns would leave the rest at their old length, so the
    right-hand plot's x values, the status colours and the hover fields would no longer
    line up with the points being drawn.
    """
    selected = _select_projects(sector, status)

    # The two glyphs normally share a single data source; only touch a second one if it differs
    targets = [left.data_source]
    if right.data_source is not left.data_source:
        targets.append(right.data_source)
    for target in targets:
        target.data = ColumnDataSource.from_df(selected)

    push_notebook()
    
# Lay the two scatter plots out side by side and display them with a notebook handle,
# which push_notebook() inside update() needs in order to refresh the rendered plots.
p = gridplot([[left_p, right_p]])
show(p, notebook_handle=True)
# Wire the dropdowns to update(). The trailing semicolon stops the notebook echoing
# interact()'s return value (the update function itself) beneath the widgets.
interact(update, sector=sector_list, status=status_list);

interactive(children=(Dropdown(description='sector', options=('Communications', 'Energy', 'Flood', 'Transport'…

<function __main__.update(sector, status)>