- Emphasise that this can be used to target new customers--you can immediately see which companies have missing X, Y and Z as part of their gap analysis...
- You can see which data companies find it hardest to collect, etc...

<center><span style="font-size:30px; font-weight: bold;">Nordic Compass Database</span></center>
<center><span style="font-size:24px;">Analysis of ESG Performance and CSRD Compliance</span></center>

<center><span style="font-size:22px;"><b>Section 2:</b> Gap analysis </span></center>

## Introduction to this section

## Imports

In [348]:
import pandas as pd
import numpy as np
import sys
import os

pd.set_option("display.max_columns", None)
sys.path.append(os.path.abspath(".."))
import random

from functions import test_company, show_missing_values

pd.options.display.float_format = "{:,.2f}".format
from IPython.display import display

# Reporting: Gap analysis

In [349]:
reporting_df = pd.read_csv("../datasets/reporting_df_original.csv")

In [350]:
reporting_df["year"] = pd.to_datetime(reporting_df["year"], format="%Y").dt.year

In [351]:
reporting_df.head()

Unnamed: 0,company,ticker,year,segment,industry,hq_country,external_audit_of_ESG_report,years_esg_data,base_year,ceo_sust_statem,environmental_policy_and_assessment,environmental_performance_targets,reduced_environmental_impact,increased_renewable_energy,disclosure_of_raw_material_use,resource_efficiency_target,disclosure_of_water_discharges,supplier_guidelines,disclosure_of_suppliers_audited,disclosure_of_supplier_evaluation_procedures,supplier_environmental_assessment,energy_consump_bool,water_withdraw_bool,ghg_emis_bool,transport_emis_bool
0,Archer Ltd.,ARCHO,2020,Mid,Energy,Norway,1,1,2020,1,1,1,1,0,0,1,0,1,1,0,0,1,0,0,0
1,AutoStore Holdings Ltd.,AUTO,2021,Large,Industrial Goods and Services,Bermuda,0,1,2021,1,1,0,1,0,1,0,0,1,0,1,0,0,0,1,1
2,Avance Gas Holding ltd,AGAS,2019,Mid,Energy,Norway,0,2,2019,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0
3,Avance Gas Holding ltd,AGAS,2020,Mid,Energy,Norway,1,2,2019,1,1,1,1,0,0,1,0,1,0,0,0,1,0,0,0
4,Borr Drilling Ltd,BDRILL,2019,Mid,Energy,Bermuda,0,4,2019,0,1,0,1,0,0,1,0,1,0,0,0,1,0,1,1


In [352]:
reporting_df.columns

Index(['company', 'ticker', 'year', 'segment', 'industry', 'hq_country',
       'external_audit_of_ESG_report', 'years_esg_data', 'base_year',
       'ceo_sust_statem', 'environmental_policy_and_assessment',
       'environmental_performance_targets', 'reduced_environmental_impact',
       'increased_renewable_energy', 'disclosure_of_raw_material_use',
       'resource_efficiency_target', 'disclosure_of_water_discharges',
       'supplier_guidelines', 'disclosure_of_suppliers_audited',
       'disclosure_of_supplier_evaluation_procedures',
       'supplier_environmental_assessment', 'energy_consump_bool',
       'water_withdraw_bool', 'ghg_emis_bool', 'transport_emis_bool'],
      dtype='object')

# HERE

In [353]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Define discrete columns outside the function for flexibility
columns_to_include = [
    "year",
    "segment",
    "industry",
    "hq_country",
    "years_esg_data",
    "base_year",
    "external_audit_of_ESG_report",
    "ceo_sust_statem",
    "environmental_policy_and_assessment",
    "environmental_performance_targets",
    "reduced_environmental_impact",
    "increased_renewable_energy",
    "disclosure_of_raw_material_use",
    "resource_efficiency_target",
    "disclosure_of_water_discharges",
    "supplier_guidelines",
    "disclosure_of_suppliers_audited",
    "disclosure_of_supplier_evaluation_procedures",
    "supplier_environmental_assessment",
    "energy_consump_bool",
    "water_withdraw_bool",
    "ghg_emis_bool",
    "transport_emis_bool",
]


def chart_visualisations(
    df: pd.DataFrame,
    columns_to_include: list,
    legend_column: str = None,
    n_cols: int = 3,
) -> None:
    """
    Creates a 3-column subplot visualization for discrete columns in the provided DataFrame.

    Parameters:
    ----------
    df : pd.DataFrame
        The input DataFrame containing the data to visualize.

    columns_to_include : list
        A list of discrete columns to visualize.

    legend_column : str, optional
        The name of the column that differentiates the data groups. If None, no grouping is applied.

    n_cols : int, optional
        The number of columns in the subplot layout.

    Returns:
    -------
    None
        Displays the final Plotly figure inline.
    """
    # Identify relevant columns that exist in df
    relevant_columns = [col for col in columns_to_include if col in df.columns]

    # Identify integer columns (excluding year and years_esg_data))
    int_columns = [
        col
        for col in relevant_columns
        if df[col].dtype == "int64" and col not in ["year", "years_esg_data"]
    ]

    n_rows = -(-len(relevant_columns) // n_cols)
    subplot_titles = [f"{col}" for col in relevant_columns]

    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        subplot_titles=subplot_titles,
    )

    for idx, col in enumerate(relevant_columns):
        row_num = (idx // n_cols) + 1
        col_num = (idx % n_cols) + 1

        grouped_data = df[col].value_counts().sort_index()

        if col in int_columns:
            colors = grouped_data.index.map(
                lambda x: (
                    "rgb(6,212,124)"
                    if x == 1
                    else "rgb(254,240,205)" if x == 0 else "rgb(31, 119, 180)"
                )
            )
        else:
            colors = "rgb(31, 119, 180)"  # Default color

        fig.add_trace(
            go.Bar(
                x=grouped_data.index.astype(str),
                y=grouped_data.values,
                marker=dict(color=colors),
                name=f"{col} Count",
                showlegend=False,
            ),
            row=row_num,
            col=col_num,
        )

        fig.update_yaxes(title_text="Count", row=row_num, col=col_num, showgrid=False)

    # fig.update_annotations(dict(yshift=50))

    fig.update_layout(
        showlegend=False,
        legend_title_text=legend_column,
        height=400 * n_rows,
        width=1600,
        template="plotly_white",
    )

    return fig

# Figure out how to make this much faster

In [354]:
chart_visualisations(reporting_df, columns_to_include)

# Create a new visualisation called parameters_by_year. All are stacked charts with the same colour scheme as above, but year runs along the x-axis of each. For those non-int_columns, have the stacked bar chart as different shades of blue, but for hq_country do a 'Sweden', 'Denmark', 'Norway', 'Finland' and 'Other' group... For industry, just skip it...

In [355]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots


def parameters_by_year(df: pd.DataFrame, columns_to_include: list) -> go.Figure:
    """
    Creates stacked bar charts for selected parameters by year.

    Parameters:
    -----------
    df : pd.DataFrame
        The input DataFrame containing the data to visualize.

    columns_to_include : list
        A list of discrete columns to visualize.

    Returns:
    --------
    go.Figure
        A Plotly figure displaying stacked bar charts for each parameter over time.
    """

    # Exclude columns that should not be visualized
    exclude_columns = {
        "year",
        "industry",
        "years_esg_data",
        "base_year",
        "segment",
        "hq_country",
    }
    relevant_columns = [
        col
        for col in columns_to_include
        if col in df.columns and col not in exclude_columns
    ]

    # Modify hq_country to group non-Nordic countries into "Other"
    if "hq_country" in df.columns:
        df["hq_country"] = df["hq_country"].apply(
            lambda x: x if x in {"Sweden", "Norway", "Denmark", "Finland"} else "Other"
        )

    # Identify integer columns that are binary (0/1)
    binary_columns = [
        col
        for col in relevant_columns
        if df[col].nunique() == 2 and sorted(df[col].unique()) == [0, 1]
    ]

    # Create subplot layout
    n_cols = 3
    n_rows = -(-len(relevant_columns) // n_cols)
    fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=relevant_columns)

    for idx, col in enumerate(relevant_columns):
        row_num = (idx // n_cols) + 1
        col_num = (idx % n_cols) + 1

        # Aggregate counts per year per category
        grouped_data = df.groupby(["year", col]).size().unstack(fill_value=0)

        grouped_data = grouped_data.div(grouped_data.sum(axis=1), axis=0) * 100

        # Assign colors (binary columns get green/beige, others get default)
        if col in binary_columns:
            colors = ["rgb(6,212,124)", "rgb(254,240,205)"]  # Green for 1, Beige for 0
        else:
            colors = None  # Default Plotly colors

        # Add traces for each unique category
        for i, category in enumerate(grouped_data.columns):
            fig.add_trace(
                go.Bar(
                    x=grouped_data.index.astype(str),
                    y=grouped_data[category],
                    name=f"{category}",
                    marker=dict(color=colors[i] if colors else None),
                    showlegend=False,
                ),
                row=row_num,
                col=col_num,
            )

    fig.update_layout(
        barmode="stack",
        height=400 * n_rows,
        width=1600,
        template="plotly_white",
        showlegend=False,
        yaxis=dict(title="Percentage", tickformat=".1f%"),
    )

    return fig

In [356]:
parameters_by_year(reporting_df, columns_to_include)

In [357]:
metrics = [
    "external_audit_of_ESG_report",
    "ceo_sust_statem",
    "environmental_policy_and_assessment",
    "environmental_performance_targets",
    "reduced_environmental_impact",
    "increased_renewable_energy",
    "disclosure_of_raw_material_use",
    "resource_efficiency_target",
    "disclosure_of_water_discharges",
    "supplier_guidelines",
    "disclosure_of_suppliers_audited",
    "disclosure_of_supplier_evaluation_procedures",
    "supplier_environmental_assessment",
    "energy_consump_bool",
    "water_withdraw_bool",
    "ghg_emis_bool",
    "transport_emis_bool",
]

# Calculate count_of_metrics (sum of columns with value 1)
reporting_df["metrics_reported"] = reporting_df[metrics].sum(axis=1)

# # Identify missing metrics (those that are 0)
# reporting_df["missing_metrics"] = reporting_df[metrics].apply(
#     lambda row: [col for col in metrics if row[col] == 0], axis=1
# )

In [358]:
reporting_df.head()

Unnamed: 0,company,ticker,year,segment,industry,hq_country,external_audit_of_ESG_report,years_esg_data,base_year,ceo_sust_statem,environmental_policy_and_assessment,environmental_performance_targets,reduced_environmental_impact,increased_renewable_energy,disclosure_of_raw_material_use,resource_efficiency_target,disclosure_of_water_discharges,supplier_guidelines,disclosure_of_suppliers_audited,disclosure_of_supplier_evaluation_procedures,supplier_environmental_assessment,energy_consump_bool,water_withdraw_bool,ghg_emis_bool,transport_emis_bool,metrics_reported
0,Archer Ltd.,ARCHO,2020,Mid,Energy,Norway,1,1,2020,1,1,1,1,0,0,1,0,1,1,0,0,1,0,0,0,9
1,AutoStore Holdings Ltd.,AUTO,2021,Large,Industrial Goods and Services,Other,0,1,2021,1,1,0,1,0,1,0,0,1,0,1,0,0,0,1,1,8
2,Avance Gas Holding ltd,AGAS,2019,Mid,Energy,Norway,0,2,2019,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,4
3,Avance Gas Holding ltd,AGAS,2020,Mid,Energy,Norway,1,2,2019,1,1,1,1,0,0,1,0,1,0,0,0,1,0,0,0,8
4,Borr Drilling Ltd,BDRILL,2019,Mid,Energy,Other,0,4,2019,0,1,0,1,0,0,1,0,1,0,0,0,1,0,1,1,7


In [359]:
summary_by_industry_df = (
    reporting_df.groupby(["industry", "year"])[metrics + ["metrics_reported"]]
    .mean()
    .reset_index()
    .set_index(["industry", "year"])
)

median_count_metrics = (
    reporting_df.groupby(["industry", "year"])["metrics_reported"]
    .median()
    .reset_index()
)

median_count_metrics.rename(
    columns={"metrics_reported": "metrics_reported_median"}, inplace=True
)

# Merge the median with the original summary DataFrame
summary_by_industry_df = summary_by_industry_df.merge(
    median_count_metrics, on=["industry", "year"], how="left"
)

Calculate the median metrics count. Note to self: still need to calculate percentiles

In [360]:
# use this as a benchmark to judge the performance of each company
median_metrics_by_industry = summary_by_industry_df[
    ["industry", "year", "metrics_reported", "metrics_reported_median"]
].set_index(["industry", "year"])

median_metrics_by_industry

Unnamed: 0_level_0,Unnamed: 1_level_0,metrics_reported,metrics_reported_median
industry,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Basic Materials,2019,11.48,13.00
Basic Materials,2020,12.16,13.00
Basic Materials,2021,14.28,14.00
Basic Materials,2022,12.45,13.00
Biotechnology,2019,1.50,1.50
...,...,...,...
Unknown,2022,8.00,8.00
Utilities,2019,10.75,11.00
Utilities,2020,13.00,12.50
Utilities,2021,11.80,10.00


In [361]:
summary_overall_df = (
    reporting_df.groupby(["year"])[metrics + ["metrics_reported"]]
    .mean()
    .reset_index()
    .set_index(["year"])
)

median_count_metrics = (
    reporting_df.groupby(["year"])["metrics_reported"].median().reset_index()
)

median_count_metrics.rename(
    columns={"metrics_reported": "metrics_reported_median"}, inplace=True
)

# Merge the median with the original summary DataFrame
summary_overall_df = summary_overall_df.merge(
    median_count_metrics, on=["year"], how="left"
)

In [362]:
median_metrics_overall = summary_overall_df[
    ["year", "metrics_reported", "metrics_reported_median"]
].set_index(["year"])

median_metrics_overall

Unnamed: 0_level_0,metrics_reported,metrics_reported_median
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2019,8.42,8.0
2020,9.25,10.0
2021,9.63,10.0
2022,8.91,10.0


In [363]:
# Ensure DataFrame is sorted by company and year
# reporting_df = reporting_df.sort_values(by=["company", "year"])

# Calculate the change directly without keeping the previous year column
reporting_df["metrics_change_from_prev_year"] = reporting_df[
    "metrics_reported"
] - reporting_df.groupby(["company", "year"])["metrics_reported"].shift(1)

In [364]:
reporting_df = reporting_df.merge(
    reporting_df.loc[
        reporting_df["year"] == reporting_df["base_year"],
        ["company", "metrics_reported"],
    ],
    on="company",
    how="left",
    suffixes=("", "_base_year"),
)

# Compute the change from base year
reporting_df["metrics_change_from_base_year"] = (
    reporting_df["metrics_reported"] - reporting_df["metrics_reported_base_year"]
)

reporting_df.loc[
    reporting_df["year"] == reporting_df["base_year"], "metrics_change_from_base_year"
] = float("nan")

reporting_df.drop("metrics_reported_base_year", axis=1, inplace=True)

In [365]:
reporting_df.head(5)

Unnamed: 0,company,ticker,year,segment,industry,hq_country,external_audit_of_ESG_report,years_esg_data,base_year,ceo_sust_statem,environmental_policy_and_assessment,environmental_performance_targets,reduced_environmental_impact,increased_renewable_energy,disclosure_of_raw_material_use,resource_efficiency_target,disclosure_of_water_discharges,supplier_guidelines,disclosure_of_suppliers_audited,disclosure_of_supplier_evaluation_procedures,supplier_environmental_assessment,energy_consump_bool,water_withdraw_bool,ghg_emis_bool,transport_emis_bool,metrics_reported,metrics_change_from_prev_year,metrics_change_from_base_year
0,Archer Ltd.,ARCHO,2020,Mid,Energy,Norway,1,1,2020,1,1,1,1,0,0,1,0,1,1,0,0,1,0,0,0,9,,
1,AutoStore Holdings Ltd.,AUTO,2021,Large,Industrial Goods and Services,Other,0,1,2021,1,1,0,1,0,1,0,0,1,0,1,0,0,0,1,1,8,,
2,Avance Gas Holding ltd,AGAS,2019,Mid,Energy,Norway,0,2,2019,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,4,,
3,Avance Gas Holding ltd,AGAS,2020,Mid,Energy,Norway,1,2,2019,1,1,1,1,0,0,1,0,1,0,0,0,1,0,0,0,8,,4.0
4,Borr Drilling Ltd,BDRILL,2019,Mid,Energy,Other,0,4,2019,0,1,0,1,0,0,1,0,1,0,0,0,1,0,1,1,7,,


# Then calculate relative change vs. previous year--ie. rating change (whether it has migrated up or not)

# Now consider how I am going to calculate my percentiles and rankings...

In [366]:
from typing import Union


def assign_rating(percentile: float) -> str:
    """
    Assigns a rating (A-F) based on the given percentile.

    Parameters:
    percentile (float): The percentile value (0 to 1).

    Returns:
    str: The corresponding rating ("A", "B", "C", "D", "E", or "F").
    """
    if percentile >= 0.83:
        return "A"
    elif percentile >= 0.67:
        return "B"
    elif percentile >= 0.50:
        return "C"
    elif percentile >= 0.33:
        return "D"
    elif percentile >= 0.17:
        return "E"
    else:
        return "F"



# Ensure DataFrame is sorted



reporting_df = reporting_df.sort_values(by=["year", "industry", "metrics_reported"])



# Compute industry percentile (within each year & industry)



reporting_df["industry_percentile"] = reporting_df.groupby(["year", "industry"])[
    "metrics_reported"
].rank(pct=True)



# Compute overall percentile (within each year)


reporting_df["overall_percentile"] = reporting_df.groupby("year")[
    "metrics_reported"
].rank(pct=True)



# Assign ratings based on the defined scale



reporting_df["industry_rating"] = reporting_df["industry_percentile"].apply(
    assign_rating

)



reporting_df["overall_rating"] = reporting_df["overall_percentile"].apply(assign_rating)



# Display relevant columns


reporting_df[
    [
        "company",
        "year",
        "industry",
        "metrics_reported",
        "industry_percentile",
        "industry_rating",
        "overall_percentile",
        "overall_rating",
    ]
]

Unnamed: 0,company,year,industry,metrics_reported,industry_percentile,industry_rating,overall_percentile,overall_rating
475,Afarak Group Plc,2019,Basic Materials,1,0.04,F,0.04,F
52,Josemaria Resources Inc.,2019,Basic Materials,2,0.08,F,0.07,F
56,Lundin Gold Inc.,2019,Basic Materials,5,0.12,F,0.23,E
160,H+H International A/S,2019,Basic Materials,7,0.16,F,0.37,D
54,Lucara Diamond Corp.,2019,Basic Materials,8,0.20,E,0.46,D
...,...,...,...,...,...,...,...,...
269,GreenMobility A/S,2022,Unknown,8,1.00,A,0.35,D
725,Arendals Fossekompani ASA,2022,Utilities,8,0.25,E,0.35,D
1796,OX2 AB,2022,Utilities,9,0.50,C,0.44,D
190,Ørsted A/S,2022,Utilities,13,0.75,B,0.88,A


In [367]:
import pandas as pd
from typing import Optional


def top_n_companies(
    df: pd.DataFrame,
    n_companies: int = 20,
    industry: Optional[str] = None,
    year: Optional[int] = None,
    hq_country: Optional[str] = None,
    segment: Optional[str] = None,
) -> pd.DataFrame:
    """
    Returns the top N companies based on metrics_reported.

    Parameters:
    df (pd.DataFrame): The reporting DataFrame.
    n_companies (int): Number of top companies to return (default = 20).
    industry (Optional[str]): Industry to filter by (default = None, includes all industries).
    year (Optional[int]): Year to filter by (default = None, includes all years).
    hq_country (Optional[str]): HQ country to filter by (default = None, includes all countries).
    segment (Optional[str]): Segment to filter by (default = None, includes all segments).

    Returns:
    pd.DataFrame: Top N companies sorted by metrics_reported.
    """

    # Create a filtered DataFrame based on user input
    filtered_df = df.copy()

    if industry is not None:
        filtered_df = filtered_df[filtered_df["industry"] == industry]

    if year is not None:
        filtered_df = filtered_df[filtered_df["year"] == year]

    if hq_country is not None:
        filtered_df = filtered_df[filtered_df["hq_country"] == hq_country]

    if segment is not None:
        filtered_df = filtered_df[filtered_df["segment"] == segment]

    # Sort by metrics_reported in descending order
    top_companies = filtered_df.sort_values(
        by="metrics_reported", ascending=False
    ).head(n_companies)

    return top_companies

In [368]:
reporting_df["industry"].unique()

array(['Basic Materials', 'Biotechnology', 'Consumer Goods and Services',
       'Energy', 'Finance', 'Health Care',
       'Industrial Goods and Services', 'Leisure', 'Media', 'Real Estate',
       'Retail', 'Technology', 'Telecommunications', 'Travel and Leisure',
       'Utilities', 'Unknown'], dtype=object)

In [369]:
desired_columns = [
    "company",
    "year",
    "industry",
    "metrics_reported",  # Move this near the beginning
    "industry_rating",  # Move this near the beginning
    "overall_rating",  # Move this near the beginning
    "industry_percentile",
    "overall_percentile",
]

# Ensure all other columns are preserved in the order
remaining_columns = [col for col in reporting_df.columns if col not in desired_columns]

# Reorder the DataFrame columns
reporting_df = reporting_df[desired_columns + remaining_columns]

# Fix this duplicated year at source

In [372]:
reporting_df[reporting_df["years_esg_data"] == 5]

Unnamed: 0,company,year,industry,metrics_reported,industry_rating,overall_rating,industry_percentile,overall_percentile,ticker,segment,hq_country,external_audit_of_ESG_report,years_esg_data,base_year,ceo_sust_statem,environmental_policy_and_assessment,environmental_performance_targets,reduced_environmental_impact,increased_renewable_energy,disclosure_of_raw_material_use,resource_efficiency_target,disclosure_of_water_discharges,supplier_guidelines,disclosure_of_suppliers_audited,disclosure_of_supplier_evaluation_procedures,supplier_environmental_assessment,energy_consump_bool,water_withdraw_bool,ghg_emis_bool,transport_emis_bool,metrics_change_from_prev_year,metrics_change_from_base_year


In [373]:
pd.set_option("display.max_colwidth", 100)

# Create a DataFrame with unique values for each column
unique_values_df = pd.DataFrame(
    {
        "columns": reporting_df.columns,
        "unique_values": [
            reporting_df[col].unique().tolist() for col in reporting_df.columns
        ],
    }
).set_index("columns")

unique_values_df

Unnamed: 0_level_0,unique_values
columns,Unnamed: 1_level_1
company,"[Afarak Group Plc, Josemaria Resources Inc., Lundin Gold Inc., H+H International A/S, Lucara Dia..."
year,"[2019, 2020, 2021, 2022]"
industry,"[Basic Materials, Biotechnology, Consumer Goods and Services, Energy, Finance, Health Care, Indu..."
metrics_reported,"[1, 2, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 3, 4, 6, 0, 17]"
industry_rating,"[F, E, D, C, B, A]"
overall_rating,"[F, E, D, C, B, A]"
industry_percentile,"[0.04, 0.08, 0.12, 0.16, 0.2, 0.26, 0.34, 0.4, 0.46, 0.58, 0.68, 0.76, 0.92, 0.5, 1.0, 0.0235294..."
overall_percentile,"[0.0411522633744856, 0.06995884773662552, 0.2294238683127572, 0.3734567901234568, 0.460905349794..."
ticker,"[AFAGR, JOSE, LUG, HH, LUC, SPG, BOL, AM1, ELK, NSKOG, LUMI, HPOL, STERV, BRG, HOLM, BEIA, UPM, ..."
segment,"[Mid, Large, Small]"


In [374]:
top_n_companies(reporting_df, n_companies=None, industry="Energy", year=2021)

Unnamed: 0,company,year,industry,metrics_reported,industry_rating,overall_rating,industry_percentile,overall_percentile,ticker,segment,hq_country,external_audit_of_ESG_report,years_esg_data,base_year,ceo_sust_statem,environmental_policy_and_assessment,environmental_performance_targets,reduced_environmental_impact,increased_renewable_energy,disclosure_of_raw_material_use,resource_efficiency_target,disclosure_of_water_discharges,supplier_guidelines,disclosure_of_suppliers_audited,disclosure_of_supplier_evaluation_procedures,supplier_environmental_assessment,energy_consump_bool,water_withdraw_bool,ghg_emis_bool,transport_emis_bool,metrics_change_from_prev_year,metrics_change_from_base_year
452,Neste Oyj,2021,Energy,17,A,A,1.0,1.0,NESTE,Large,Finland,1,4,2019,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,,2.0
306,Vestas Wind Systems A/S,2021,Energy,15,A,A,0.96,0.94,VWS,Large,Denmark,0,4,2019,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,,4.0
856,Aker BP ASA,2021,Energy,14,A,A,0.92,0.87,AKRBP,Large,Norway,1,4,2019,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,,1.0
1225,Lundin Energy AB,2021,Energy,13,B,B,0.81,0.77,LUNE,Large,Sweden,1,4,2019,1,1,1,1,1,0,0,1,1,0,0,1,1,1,1,1,,-1.0
931,Aker Solutions ASA,2021,Energy,13,B,B,0.81,0.77,AKSO,Large,Norway,0,4,2019,1,1,1,1,1,0,1,0,1,1,1,1,1,0,1,1,,1.0
810,Equinor ASA (formerly Statoil ASA),2021,Energy,13,B,B,0.81,0.77,EQNR,Large,Norway,1,4,2019,1,1,1,1,0,0,1,1,1,0,0,1,1,1,1,1,,-1.0
297,The Drilling Company of 1972 A/S,2021,Energy,13,B,B,0.81,0.77,DRLCO,Large,Denmark,1,4,2019,1,1,1,1,0,0,1,0,0,1,1,1,1,1,1,1,,1.0
13,BW Offshore Limited,2021,Energy,12,C,C,0.65,0.67,BWO,Large,Other,0,4,2019,1,1,1,1,0,0,1,0,1,1,1,1,1,0,1,1,,6.0
997,Aker Carbon Capture AS,2021,Energy,12,C,C,0.65,0.67,ACC,Large,Norway,0,1,2021,1,1,1,1,0,0,1,0,1,0,1,1,1,1,1,1,,
991,Prosafe SE,2021,Energy,12,C,C,0.65,0.67,PRS,Large,Other,0,4,2019,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,,2.0


# Create a summary_df to make it easier to query individual companies

# To do:

Create 'gap analysis: total missing metrics (coverage of metrics)'

#### New columns (emissions)


Create a column: 'GHG per EUR revenue_ranking_all_companies' - This is binned from 1 to 10 (using quartiles and calculated using only values 
from the same year)

Create a column: 'GHG per EUR revenue_ranking_sector' - This is also binned from 1 to 10 (and calculated using only values from the same year)

Calculate the average GHG per EUR revenue as well as IQR--apply the outlier transformation and put all outliers in the '0' bin

Create a column: 'GHG per EUR revenue_ranking_all_PY' -- This is to compare to the values from the previous year

Create a column: 'GHG per EUR revenue_ranking_sector_PY' -- This is to compare to the values from the previous year

Create a column: '% change in GHG per EUR revenue vs PY'

Create a column: '% change in GHG emissions vs PY'

Create a column: 'transport emissions as a % of total emissions'

Create a column: '% change in transport emissions vs PY'

Create a column: 'Transport emissions as % of total emissions' (compare to sector)


Use the bins only for GHG emissions/EUR--compare values in each bin for all columns...

See how bin values vary from year to year

Calculate the number of companies that have migrated from bin to bin





#### Bonus columns

'GHG intensity reduction % vs sector-specific targets'--normalise so make it a % above or below target

'GHG intensity reduction % vs others in the sector_CY'--also normalise (and consider whether positive is good or bad)



#### Summary columns

Summarise results by:

- Segment/Industry

- HQ country

Declarations per year

--check which industry has the highest % of missing values

Percentage of companies in each industry that have their sustainability work audited

## Appendix

In [375]:
# # 2. Create 'consecutive_years_esg_data' by checking consecutive years starting from 2022
# def calculate_consecutive_years(group):
#     # Create a set of years for the current 'comp_name'
#     years = set(group["year"])
#     # Start from 2022 and count consecutive years backwards
#     count = 0
#     for year in range(2022, 2019, -1):  # Checking years 2022, 2021, 2020, ...
#         if year in years:
#             count += 1
#         else:
#             break  # Stop if any year is missing in the consecutive sequence

#     return count


# # Apply the function to each group of 'comp_name'
# df["consecutive_years_esg_data"] = (
#     df.groupby("comp_name")
#     .apply(calculate_consecutive_years)
#     .reset_index(level=0, drop=True)
# )