<center><span style="font-size:30px; font-weight: bold;">Nordic Compass Database</span></center>
<center><span style="font-size:24px;">Analysis of ESG Performance and CSRD Compliance</span></center>

<center><span style="font-size:22px;"><b>Section 2:</b> Gap analysis </span></center>

## Introduction to this section

## Imports

In [155]:
import pandas as pd
import numpy as np
import sys
import os

pd.set_option("display.max_columns", None)
sys.path.append(os.path.abspath(".."))
import random

from functions import (
    test_company,
)

# Reporting: Gap analysis

In [156]:
# Load the reporting dataset. The path is relative, so it is resolved against
# the current working directory of the kernel.
reporting_df = pd.read_csv("../datasets/reporting_df_original.csv")

In [157]:
# Parse "year" with the explicit "%Y" format and write only the calendar-year
# component back into the same column.
reporting_df["year"] = pd.to_datetime(reporting_df["year"], format="%Y").dt.year

In [158]:
# Preview the first rows of the loaded frame.
reporting_df.head()

Unnamed: 0,company,ticker,year,segment,industry,hq_country,external_audit_of_ESG_report,years_esg_data,base_year,ceo_sust_statem,environmental_policy_and_assessment,environmental_performance_targets,reduced_environmental_impact,increased_renewable_energy,disclosure_of_raw_material_use,resource_efficiency_target,disclosure_of_water_discharges,supplier_guidelines,disclosure_of_suppliers_audited,disclosure_of_supplier_evaluation_procedures,supplier_environmental_assessment,energy_consump_bool,water_withdraw_bool,ghg_emis_bool,transport_emis_bool
0,Archer Ltd.,ARCHO,2020,Mid,Energy,Norway,1,1,2020,1,1,1,1,0,0,1,0,1,1,0,0,1,0,0,0
1,AutoStore Holdings Ltd.,AUTO,2021,Large,Industrial Goods and Services,Bermuda,0,1,2021,1,1,0,1,0,1,0,0,1,0,1,0,0,0,1,1
2,Avance Gas Holding ltd,AGAS,2019,Mid,Energy,Norway,0,2,2019,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0
3,Avance Gas Holding ltd,AGAS,2020,Mid,Energy,Norway,1,2,2019,1,1,1,1,0,0,1,0,1,0,0,0,1,0,0,0
4,Borr Drilling Ltd,BDRILL,2019,Mid,Energy,Bermuda,0,4,2019,0,1,0,1,0,0,1,0,1,0,0,0,1,0,1,1


In [159]:
# List the column labels available in the loaded frame.
reporting_df.columns

Index(['company', 'ticker', 'year', 'segment', 'industry', 'hq_country',
       'external_audit_of_ESG_report', 'years_esg_data', 'base_year',
       'ceo_sust_statem', 'environmental_policy_and_assessment',
       'environmental_performance_targets', 'reduced_environmental_impact',
       'increased_renewable_energy', 'disclosure_of_raw_material_use',
       'resource_efficiency_target', 'disclosure_of_water_discharges',
       'supplier_guidelines', 'disclosure_of_suppliers_audited',
       'disclosure_of_supplier_evaluation_procedures',
       'supplier_environmental_assessment', 'energy_consump_bool',
       'water_withdraw_bool', 'ghg_emis_bool', 'transport_emis_bool'],
      dtype='object')

# Distribution of reporting parameters

In [160]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Columns to chart with chart_visualisations. The list is defined at module
# level (outside the function) so the selection can be changed without editing
# the function itself. The identifier columns "company" and "ticker" are not
# included.
columns_to_include = [
    "year",
    "segment",
    "industry",
    "hq_country",
    "years_esg_data",
    "base_year",
    "external_audit_of_ESG_report",
    "ceo_sust_statem",
    "environmental_policy_and_assessment",
    "environmental_performance_targets",
    "reduced_environmental_impact",
    "increased_renewable_energy",
    "disclosure_of_raw_material_use",
    "resource_efficiency_target",
    "disclosure_of_water_discharges",
    "supplier_guidelines",
    "disclosure_of_suppliers_audited",
    "disclosure_of_supplier_evaluation_procedures",
    "supplier_environmental_assessment",
    "energy_consump_bool",
    "water_withdraw_bool",
    "ghg_emis_bool",
    "transport_emis_bool",
]


def chart_visualisations(
    df: pd.DataFrame,
    columns_to_include: list,
    legend_column: "str | None" = None,
    n_cols: int = 3,
    row_height: int = 400,
    width: int = 1600,
) -> "go.Figure":
    """
    Build a grid of count bar charts, one subplot per requested column.

    Each subplot shows the value counts of one column, ordered by value.
    Integer-typed columns (other than "year" and "years_esg_data") are
    coloured per value: 1 is green, 0 is cream and any other value is the
    default blue. All other columns use the default blue for every bar.

    Parameters:
    ----------
    df : pd.DataFrame
        The input DataFrame containing the data to visualize.

    columns_to_include : list
        Column names to chart. Names that are not present in ``df`` are
        skipped.

    legend_column : str, optional
        Used only as the legend title text. The legend is hidden
        (``showlegend=False``), so this currently has no visible effect and
        no grouping is applied.

    n_cols : int, optional
        The number of columns in the subplot layout (default 3).

    row_height : int, optional
        Height in pixels allotted to each subplot row (default 400).

    width : int, optional
        Total figure width in pixels (default 1600).

    Returns:
    -------
    plotly.graph_objects.Figure
        The assembled figure. It is rendered inline when it is the last
        expression of a notebook cell.

    Raises:
    ------
    ValueError
        If ``n_cols`` is smaller than 1, or if none of the requested columns
        exist in ``df``.
    """
    # The return and legend_column annotations are quoted so they are not
    # evaluated when the function is defined.
    color_yes = "rgb(6,212,124)"
    color_no = "rgb(254,240,205)"
    color_default = "rgb(31, 119, 180)"
    # Integer columns that are not 0/1 flags and therefore keep a single colour.
    non_flag_int_columns = {"year", "years_esg_data"}

    if n_cols < 1:
        raise ValueError(f"n_cols must be a positive integer, got {n_cols!r}")

    # Keep only the requested columns that actually exist in df.
    relevant_columns = [col for col in columns_to_include if col in df.columns]
    if not relevant_columns:
        raise ValueError(
            "None of the requested columns are present in the DataFrame; "
            "nothing to plot."
        )

    # Accept any integer dtype (int32, int64, nullable Int64, ...) rather than
    # only the exact "int64" dtype, so flag columns are coloured consistently.
    flag_columns = {
        col
        for col in relevant_columns
        if col not in non_flag_int_columns
        and pd.api.types.is_integer_dtype(df[col])
    }

    # Ceiling division: enough rows to hold every subplot.
    n_rows = -(-len(relevant_columns) // n_cols)

    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        subplot_titles=[str(col) for col in relevant_columns],
    )

    for idx, col in enumerate(relevant_columns):
        # Fill the grid left-to-right, top-to-bottom (plotly is 1-indexed).
        grid_row, grid_col = divmod(idx, n_cols)
        row_num, col_num = grid_row + 1, grid_col + 1

        # value_counts() leaves out missing values by default.
        counts = df[col].value_counts().sort_index()

        if col in flag_columns:
            colors = [
                color_yes
                if value == 1
                else color_no if value == 0 else color_default
                for value in counts.index
            ]
        else:
            colors = color_default

        fig.add_trace(
            go.Bar(
                x=counts.index.astype(str).tolist(),
                y=counts.tolist(),
                marker=dict(color=colors),
                name=f"{col} Count",
                showlegend=False,
            ),
            row=row_num,
            col=col_num,
        )

        # Set per subplot so that unused grid cells stay untitled.
        fig.update_yaxes(title_text="Count", row=row_num, col=col_num, showgrid=False)

    fig.update_layout(
        showlegend=False,
        legend_title_text=legend_column,
        height=row_height * n_rows,
        width=width,
        template="plotly_white",
    )

    return fig

In [163]:
# Draw one count bar chart per selected column. The figure returned by the
# function is the cell's last expression, so it is rendered inline.
chart_visualisations(reporting_df, columns_to_include)

# TODO: Create a new visualisation called `parameters_by_year`. Every subplot is a stacked bar chart with year along the x-axis, using the same colour scheme as above. For the columns that are not in `int_columns`, stack the categories in different shades of blue. For `hq_country`, group the values into 'Sweden', 'Denmark', 'Norway', 'Finland' and 'Other'. Skip `industry`.

# To do:

Create 'gap analysis: total missing metrics (coverage of metrics)'

#### New columns (emissions)


Create a column: 'GHG per EUR revenue_ranking_all_companies' - This is binned from 1 to 10 (using deciles, and calculated using only values from the same year)

Create a column: 'GHG per EUR revenue_ranking_sector' - This is also binned from 1 to 10 (and calculated using only values from the same year)

Calculate the average GHG per EUR revenue as well as IQR--apply the outlier transformation and put all outliers in the '0' bin

Create a column: 'GHG per EUR revenue_ranking_all_PY' -- This is to compare to the values from the previous year

Create a column: 'GHG per EUR revenue_ranking_sector_PY' -- This is to compare to the values from the previous year

Create a column: '% change in GHG per EUR revenue vs PY'

Create a column: '% change in GHG emissions vs PY'

Create a column: 'transport emissions as a % of total emissions'

Create a column: '% change in transport emissions vs PY'

Create a column: 'Transport emissions as % of total emissions' (compare to sector)


Use the bins only for GHG emissions/EUR--compare values in each bin for all columns...

See how bin values vary from year to year

Calculate the number of companies that have migrated from bin to bin





#### Bonus columns

'GHG intensity reduction % vs sector-specific targets'--normalise so make it a % above or below target

'GHG intensity reduction % vs others in the sector_CY'--also normalise (and consider whether positive is good or bad)



#### Summary columns

Summarise results by:

- Segment/Industry

- HQ country

Declarations per year

--check which industry has the highest % of missing values

Percentage of companies in each industry that have their sustainability work audited

## Appendix

In [None]:
# # 2. Create 'consecutive_years_esg_data' by checking consecutive years starting from 2022
# def calculate_consecutive_years(group):
#     # Create a set of years for the current 'comp_name'
#     years = set(group["year"])
#     # Start from 2022 and count consecutive years backwards
#     count = 0
#     for year in range(2022, 2019, -1):  # Checking years 2022, 2021, 2020, ...
#         if year in years:
#             count += 1
#         else:
#             break  # Stop if any year is missing in the consecutive sequence

#     return count


# # Apply the function to each group of 'comp_name'
# df["consecutive_years_esg_data"] = (
#     df.groupby("comp_name")
#     .apply(calculate_consecutive_years)
#     .reset_index(level=0, drop=True)
# )