In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
import plotly.express as px

### Build Scoring System and Create Banckmark

In [30]:
# Step 1: Load Data
esg_file_path = './data/Technology Industry ESG data.csv'
esg_data = pd.read_csv(esg_file_path)

In [31]:
esg_data.shape

(1250, 23)

In [None]:
def ESG_scoring(esg_data):
    company_info = esg_data[['Sub-sector', 'Company Name', 'Year']]
    esg_data_numeric = esg_data.drop(columns=['Sub-sector', 'Company Name', 'Year', 'GHG Emissions (Total) (tCO2e)',
                                              'Recordable work-related ill health cases', 
                                              'Average Training Hours per Employee', 'Total Energy Consumption (MWhs)',
                                              'Women on the Board (%)', 'Current Employees by Gender (Female %)', 
                                              'Women in Management Team (%)', 'Fatalities', 'Board Independence (%)'])
    
    imputer = SimpleImputer(strategy='median')
    esg_data_imputed = pd.DataFrame(imputer.fit_transform(esg_data_numeric), columns=esg_data_numeric.columns)
    
    scaler = StandardScaler()
    esg_data_scaled = pd.DataFrame(scaler.fit_transform(esg_data_imputed), columns=esg_data_numeric.columns)
    
    wcss = []  # Within-cluster sum of squares
    for i in range(1, 11):
        kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=42)
        kmeans.fit(esg_data_scaled)
        wcss.append(kmeans.inertia_)
        
    optimal_clusters = 3
    kmeans = KMeans(n_clusters=optimal_clusters, init='k-means++', max_iter=300, n_init=10, random_state=42)
    esg_data['Cluster'] = kmeans.fit_predict(esg_data_scaled)
    
    esg_cluster_centers = pd.DataFrame(scaler.inverse_transform(kmeans.cluster_centers_), columns=esg_data_numeric.columns)
    esg_cluster_centers['Cluster'] = range(optimal_clusters)
    
    def categorize_performance(cluster):
        if cluster == esg_cluster_centers['Cluster'].idxmax():
            return 'Good'
        elif cluster == esg_cluster_centers['Cluster'].idxmin():
            return 'Poor'
        else:
            return 'Average'

    esg_data['Performance Category'] = esg_data['Cluster'].apply(categorize_performance)
    
    X = esg_data_scaled
    print(X.columns)
    reg = LinearRegression()
    reg.fit(X, esg_data['Cluster'])
    
    esg_data['predicted_score'] = reg.predict(X)
    
    esg_feature_weights = pd.Series(reg.coef_, index=esg_data_numeric.columns).sort_values(ascending=False)
    
    esg_intercept_b = reg.intercept_
    
    
    return esg_data, kmeans, esg_cluster_centers, esg_feature_weights, esg_intercept_b
    

In [28]:
esg_data_scored, kmeans, esg_cluster_centers, esg_feature_weights, esg_intercept_b = ESG_scoring(esg_data)

Index(['GHG Emissions (Scope 1) (tCO2e)', 'GHG Emissions (Scope 2) (tCO2e)',
       'GHG Emissions (Scope 3) (tCO2e)', 'Total Water Consumption (ML)',
       'Total Waste Generated (t)',
       'New Hires and Turnover by Gender (Female %)',
       'Current Employees by Age Groups (Millennials %)', 'Total Turnover (%)',
       'Total Number of Employees', 'High-consequence injuries',
       'Recordable injuries', 'Anti-Corruption Training for Employees (%)'],
      dtype='object')


In [9]:
esg_data

Unnamed: 0,Sub-sector,Company Name,Year,GHG Emissions (Scope 1) (tCO2e),GHG Emissions (Scope 2) (tCO2e),GHG Emissions (Scope 3) (tCO2e),Total Energy Consumption (MWhs),Total Water Consumption (ML),Total Waste Generated (t),Current Employees by Gender (Female %),...,High-consequence injuries,Recordable injuries,Recordable work-related ill health cases,Board Independence (%),Women on the Board (%),Women in Management Team (%),Anti-Corruption Training for Employees (%),Cluster,Performance Category,predicted_score
0,Software and Services,Company_1,2020,439.330088,692.120861,934.282272,9795.219615,190.466071,10.890496,28.490657,...,1,5,0,70.013601,14.053968,38.372039,84.106898,1,Average,1.287343
1,Software and Services,Company_1,2021,378.090962,980.739585,715.153231,18050.236590,458.761923,25.159481,26.762862,...,0,5,2,57.922977,30.047620,13.504147,79.688451,1,Average,1.113681
2,Software and Services,Company_1,2022,399.235518,287.726900,1519.212044,48655.677070,963.649338,39.016966,24.904117,...,2,3,1,76.934140,32.222135,36.897553,68.053200,0,Poor,0.888878
3,Software and Services,Company_1,2023,104.208569,845.016119,1373.225502,19265.752670,815.201098,36.155657,22.467246,...,0,7,2,87.083579,28.552195,36.052544,75.196871,1,Average,1.032026
4,Software and Services,Company_1,2024,309.991397,483.561409,999.316148,7567.449918,228.246568,35.196124,28.689775,...,0,7,1,59.730267,27.132033,11.988359,66.669347,1,Average,1.521602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1245,Engineering Services,Company_50,2020,186.537926,828.858112,438.343618,20422.508250,558.859469,65.763378,40.047658,...,0,7,4,97.086754,24.967339,24.462984,79.575658,1,Average,0.889140
1246,Engineering Services,Company_50,2021,123.727518,401.307286,358.203579,40465.950530,429.302159,72.145962,35.766343,...,0,5,3,96.307085,30.545766,21.846929,93.350050,2,Good,1.576278
1247,Engineering Services,Company_50,2022,93.612265,841.584214,1743.159319,14315.390990,431.663571,51.365446,47.129219,...,0,10,3,88.293865,42.920380,19.228261,51.609238,1,Average,1.384389
1248,Engineering Services,Company_50,2023,156.467997,115.054571,1391.104798,17642.003990,144.745709,98.688071,31.141008,...,1,3,1,92.154256,26.656797,39.887140,90.081546,1,Average,1.832074


In [59]:
esg_weights = dict(zip(esg_feature_weights.index, esg_feature_weights.values))
weights_df = pd.DataFrame([esg_weights])
weights_df

Unnamed: 0,New Hires and Turnover by Gender (Female %),Current Employees by Age Groups (Millennials %),Total Turnover (%),GHG Emissions (Scope 3) (tCO2e),Anti-Corruption Training for Employees (%),Total Number of Employees,Recordable injuries,High-consequence injuries,Total Waste Generated (t),Total Water Consumption (ML),GHG Emissions (Scope 2) (tCO2e),GHG Emissions (Scope 1) (tCO2e)
0,0.115291,0.043742,-0.011126,-0.050072,-0.070935,-0.079174,-0.118035,-0.182032,-0.194137,-0.241883,-0.260472,-0.340581


In [60]:
# weights_df.to_csv("ESG Scoring Weights.csv", index = False)

In [61]:
# esg_data.to_csv("ESG Scoring data.csv", index = False)

In [62]:
esg_data_scored["predicted_score"].describe()

count    1250.000000
mean        1.044000
std         0.629875
min        -0.790077
25%         0.607696
50%         1.051628
75%         1.497920
max         2.644386
Name: predicted_score, dtype: float64

In [63]:
def ESG_trend(esg_data):
    ESG_score_trend = esg_data.groupby('Year')['predicted_score'].mean().reset_index()
    ESG_score_trend.rename(columns = {'predicted_score' : 'Industry mean'}, inplace = True)
    
    soft_serve_esg = esg_data[esg_data['Sub-sector'] == 'Software and Services'].reset_index(drop=True)
    tech_equip_esg = esg_data[esg_data['Sub-sector'] == 'Technology Hardware and Equipment'].reset_index(drop=True)
    semi_esg = esg_data[esg_data['Sub-sector'] == 'Semiconductors and Semiconductor Equipment'].reset_index(drop=True)
    info_tech_esg = esg_data[esg_data['Sub-sector'] == 'Information Technology'].reset_index(drop=True)
    engin_esg = esg_data[esg_data['Sub-sector'] == 'Engineering Services'].reset_index(drop=True)
    
    soft_serve_esg_trend = soft_serve_esg.groupby('Year')['predicted_score'].mean().reset_index()
    soft_serve_esg_trend.rename(columns = {"predicted_score" : "Software and Services"}, inplace = True)

    tech_equip_esg_trend = tech_equip_esg.groupby('Year')['predicted_score'].mean().reset_index()
    tech_equip_esg_trend.rename(columns = {"predicted_score" : "Technology Hardware and Equipment"}, inplace = True)

    semi_esg_trend = semi_esg.groupby('Year')['predicted_score'].mean().reset_index()
    semi_esg_trend.rename(columns = {"predicted_score" : "Semiconductors and Semiconductor Equipment"}, inplace = True)

    info_tech_esg_trend = info_tech_esg.groupby('Year')['predicted_score'].mean().reset_index()
    info_tech_esg_trend.rename(columns = {"predicted_score" : "Information Technology"}, inplace = True)

    engin_esg_trend = engin_esg.groupby('Year')['predicted_score'].mean().reset_index()
    engin_esg_trend.rename(columns = {"predicted_score" : "Engineering Services"}, inplace = True)
    
    sub_sectors_df = ESG_score_trend.merge(soft_serve_esg_trend, on='Year').merge(tech_equip_esg_trend, on='Year').merge(semi_esg_trend, on='Year').merge(info_tech_esg_trend, on='Year').merge(engin_esg_trend, on='Year')
    
    esg_industry_plot_data = sub_sectors_df.melt(id_vars = ["Year"],
                                                 var_name = "sub-sectors", value_name = "predicted_score")
    
    return ESG_score_trend, esg_industry_plot_data

In [64]:
ESG_score_trend, esg_industry_plot_data = ESG_trend(esg_data_scored)

In [65]:
def ESG_trend_plot(esg_industry_plot_data):
    fig_esg_trend = px.line(esg_industry_plot_data, x = "Year", y = "predicted_score", color = "sub-sectors",
                            markers = True, 
                            title = "Environment score trend of the technology industry and sub-sectors")
    
    fig_esg_trend.update_xaxes(dtick = 1)
    
    fig_esg_trend.update_traces(
        hovertemplate = 'Year: %{x} <br> ESG Score: %{y} <extra></extra>', 
        marker = dict(size = 8)
        )
    
    fig_esg_trend.update_traces(selector = dict(name = 'Industry mean'),
                                line = dict(width = 4, color = "black"), 
                                marker = dict(size = 10, color = "black")) 
    
    return fig_esg_trend

In [66]:
fig_esg_trend = ESG_trend_plot(esg_industry_plot_data)
fig_esg_trend.show()

### Company Scoring

In [11]:
company_file_path = '../data/Singtel_ESG_test.xlsx'
company_data = pd.read_excel(company_file_path)

In [12]:
company_data

Unnamed: 0,Company Name,Year,GHG Emissions (Scope 1) (tCO2e),GHG Emissions (Scope 2) (tCO2e),GHG Emissions (Scope 3) (tCO2e),GHG Emissions (Total) (tCO2e),Total Energy Consumption (MWhs),Total Water Consumption (ML),Total Waste Generated (t),Current Employees by Gender (Female %),...,Total Number of Employees,Average Training Hours per Employee,Fatalities,High-consequence injuries,Recordable injuries,Recordable work-related ill health cases,Board Independence (%),Women on the Board (%),Women in Management Team (%),Anti-Corruption Training for Employees (%)
0,Singtel,2017,,,,,,,,37.11,...,,30.5,,,,,,,,
1,Singtel,2018,4085.0,164470.0,6554.0,164629.0,387528.1,752.207,7538.0,42.0,...,13095.0,67.0,,,,,,,,
2,Singtel,2019,8000.0,154152.0,6392.0,164629.0,1347094.0,7532.385,7658.0,45.0,...,12589.0,68.0,0.0,,,2.0,70.0,,30.0,
3,Singtel,2020,95574.0,343383.0,3568343.0,165331.0,1466802.0,30000.0,7658.0,45.0,...,24000.0,75.0,5.0,12.0,50.0,15.0,85.0,35.0,35.0,1000.0
4,Singtel,2021,10500.0,540699.0,9050538.0,9586782.0,1602698.0,3000.0,4150.0,45.0,...,12391.0,48.3,0.0,0.04,15.0,2.0,75.0,30.0,40.0,
5,Singtel,2022,12000.0,534233.0,8146316.0,8641260.0,820015.4,15000.0,5000.0,45.0,...,19464.0,56.3,0.0,0.0,50.0,2.0,50.0,30.0,35.0,
6,Singtel,2023,500000.0,700000.0,3836769.22,4277369.0,808809.0,10000.0,5000.0,45.0,...,20532.0,26100000.0,1.0,3.0,15.0,3.0,75.0,46.0,45.0,0.0
7,Singtel,2024,409120.0,2962121.0,2553001.37,2962121.0,820311.2,698.819,12692.0,33.37,...,19944.0,51.2,0.0,,,,,,31.0,
8,Singtel,2025,,,,,,49.5,,,...,,,,,,,,,,


In [17]:
company_data.iloc[:,2:]

Unnamed: 0,GHG Emissions (Scope 1) (tCO2e),GHG Emissions (Scope 2) (tCO2e),GHG Emissions (Scope 3) (tCO2e),GHG Emissions (Total) (tCO2e),Total Energy Consumption (MWhs),Total Water Consumption (ML),Total Waste Generated (t),Current Employees by Gender (Female %),New Hires and Turnover by Gender (Female %),Total Turnover (%),Total Number of Employees,Average Training Hours per Employee,Fatalities,High-consequence injuries,Recordable injuries,Recordable work-related ill health cases,Board Independence (%),Women on the Board (%),Women in Management Team (%),Anti-Corruption Training for Employees (%)
0,,,,,,,,37.11,,16.4,,30.5,,,,,,,,
1,4085.0,164470.0,6554.0,164629.0,387528.1,752.207,7538.0,42.0,,17.1,13095.0,67.0,,,,,,,,
2,8000.0,154152.0,6392.0,164629.0,1347094.0,7532.385,7658.0,45.0,5.9,22.0,12589.0,68.0,0.0,,,2.0,70.0,,30.0,
3,95574.0,343383.0,3568343.0,165331.0,1466802.0,30000.0,7658.0,45.0,782.0,20.0,24000.0,75.0,5.0,12.0,50.0,15.0,85.0,35.0,35.0,1000.0
4,10500.0,540699.0,9050538.0,9586782.0,1602698.0,3000.0,4150.0,45.0,439.0,13.2,12391.0,48.3,0.0,0.04,15.0,2.0,75.0,30.0,40.0,
5,12000.0,534233.0,8146316.0,8641260.0,820015.4,15000.0,5000.0,45.0,6.7,19.9,19464.0,56.3,0.0,0.0,50.0,2.0,50.0,30.0,35.0,
6,500000.0,700000.0,3836769.22,4277369.0,808809.0,10000.0,5000.0,45.0,7.7,19.1,20532.0,26100000.0,1.0,3.0,15.0,3.0,75.0,46.0,45.0,0.0
7,409120.0,2962121.0,2553001.37,2962121.0,820311.2,698.819,12692.0,33.37,5.1,14.3,19944.0,51.2,0.0,,,,,,31.0,
8,,,,,,49.5,,,,,,,,,,,,,,


In [68]:
sub_sectors = {'Software and Services':['Captii','CSE Global','V2Y Corp','SinoCloud Grp'],
               'Technology Hardware and Equipment':['Addvalue Tech','Nanofilm','Venture'],
               'Semiconductors and Semiconductor Equipment':['AdvancedSystems','AEM SGD','Asia Vets','ASTI','UMS'],
               'Information Technology':['Audience'],
               'Engineering Services':['ST Engineering','Singtel','GSS Energy']}

In [None]:
def company_scoring(company_data, kmeans, esg_cluster_centers, esg_feature_weights, esg_intercept_b, sub_sectors, esg_industry_plot_data, ESG_score_trend):
    company_data = company_data[company_data['Year'].between(2020, 2024)]
    company_info = company_data[['Company Name', 'Year']]
    company_name = str(company_data.iloc[0]['Company Name'])
    company_numeric = company_data.drop(columns=['Company Name', 'Year','Recordable work-related ill health cases', 
                                                 'Average Training Hours per Employee', 'Total Energy Consumption (MWhs)',
                                                 'Women on the Board (%)', 'Current Employees by Gender (Female %)', 
                                                 'Women in Management Team (%)', 'Fatalities', 'Board Independence (%)'])
    
    # Handle missing values(Implement techniques to handle missing data and ensure fair comparisons across companies.)
    imputer = SimpleImputer(strategy='median')
    company_data = pd.DataFrame(imputer.fit_transform(company_numeric), columns=company_numeric.columns)
    
    # Standardize Data
    scaler = StandardScaler()
    company_scaled = pd.DataFrame(scaler.fit_transform(company_data), columns=company_numeric.columns)
    
    # Use KMeans to set Performance Category
    # Use trained KMeans on input company
    company_scaled_data = company_scaled.values  
    
    company_clusters = kmeans.predict(company_scaled_data)
    company_data['Cluster'] = company_clusters
    
    # Set Performance Category based on clusters
    def categorize_performance_by_cluster(cluster):
        if cluster == esg_cluster_centers['Cluster'].idxmax():
            return 'Good'
        elif cluster == esg_cluster_centers['Cluster'].idxmin():
            return 'Poor'
        else:
            return 'Average'
    
    company_data['Performance Category'] = company_data['Cluster'].apply(categorize_performance_by_cluster)
    
    esg_weights = np.array(esg_feature_weights)
    
    def calculate_score(features, weights, intercept):
        return np.dot(features, weights) + intercept
    
    company_scores = company_scaled.apply(lambda row: calculate_score(row, esg_weights, esg_intercept_b), axis=1)
    company_data['Calculated Score'] = company_scores
    company_data = pd.concat([company_info.reset_index(drop=True), company_data], axis=1)
    
    company_score = company_data[['Year', 'Calculated Score']]
    company_score.rename(columns = {'Calculated Score': company_name}, inplace = True)
    
    # Loop through each company and check if target_value is in its list of industries using isin
    for sub_sector in sub_sectors:
        if pd.Series(sub_sectors[sub_sector]).isin([company_name]).any():
            company_sub_sector = sub_sector
    
    sub_sector_select = esg_industry_plot_data[esg_industry_plot_data["sub-sectors"] == company_sub_sector]
    sub_sector_select = sub_sector_select.drop(columns = {'sub-sectors'})
    sub_sector_select.rename(columns = {'predicted_score':company_sub_sector}, inplace = True)
    
    compare_data = ESG_score_trend.merge(company_score, on = 'Year').merge(sub_sector_select, on = 'Year')
    compare_data = compare_data.melt(id_vars = ["Year"],
                                     var_name = "Type", value_name = "predicted_score")
    
    fig_compare = px.line(compare_data, x = "Year", y = "predicted_score", color = "Type",
                          markers = True, title = "Comparison on ESG score trend")
    
    fig_compare.update_traces(
        hovertemplate = 'Year: %{x} <br> ESG Score: %{y} <extra></extra>', 
        marker = dict(size = 8)
        )
  
    return fig_compare
    

In [70]:
fig_compare = company_scoring(company_data, kmeans, esg_cluster_centers, esg_feature_weights, 
                               esg_intercept_b, sub_sectors, esg_industry_plot_data, ESG_score_trend)
fig_compare.show()


X does not have valid feature names, but KMeans was fitted with feature names



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

