In [55]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
import plotly.express as px

### Build Scoring System and Create Banckmark

In [56]:
# Step 1: Load Data
esg_file_path = './data/Technology Industry ESG data.csv'
esg_data = pd.read_csv(esg_file_path)

In [57]:
def ESG_scoring(esg_data):
    company_info = esg_data[['Sub-sector', 'Company Name', 'Year']]
    esg_data_numeric = esg_data.drop(columns=['Sub-sector', 'Company Name', 'Year','Recordable work-related ill health cases', 
                                              'Average Training Hours per Employee', 'Total Energy Consumption (MWhs)',
                                              'Women on the Board (%)', 'Current Employees by Gender (Female %)', 
                                              'Women in Management Team (%)', 'Fatalities', 'Board Independence (%)'])
    
    imputer = SimpleImputer(strategy='median')
    esg_data_imputed = pd.DataFrame(imputer.fit_transform(esg_data_numeric), columns=esg_data_numeric.columns)
    
    scaler = StandardScaler()
    esg_data_scaled = pd.DataFrame(scaler.fit_transform(esg_data_imputed), columns=esg_data_numeric.columns)
    
    wcss = []  # Within-cluster sum of squares
    for i in range(1, 11):
        kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=42)
        kmeans.fit(esg_data_scaled)
        wcss.append(kmeans.inertia_)
        
    optimal_clusters = 3
    kmeans = KMeans(n_clusters=optimal_clusters, init='k-means++', max_iter=300, n_init=10, random_state=42)
    esg_data['Cluster'] = kmeans.fit_predict(esg_data_scaled)
    
    esg_cluster_centers = pd.DataFrame(scaler.inverse_transform(kmeans.cluster_centers_), columns=esg_data_numeric.columns)
    esg_cluster_centers['Cluster'] = range(optimal_clusters)
    
    def categorize_performance(cluster):
        if cluster == esg_cluster_centers['Cluster'].idxmax():
            return 'Good'
        elif cluster == esg_cluster_centers['Cluster'].idxmin():
            return 'Poor'
        else:
            return 'Average'

    esg_data['Performance Category'] = esg_data['Cluster'].apply(categorize_performance)
    
    X = esg_data_scaled
    reg = LinearRegression()
    reg.fit(X, esg_data['Cluster'])
    
    esg_data['predicted_score'] = reg.predict(X)
    
    esg_feature_weights = pd.Series(reg.coef_, index=esg_data_numeric.columns).sort_values(ascending=False)
    
    esg_intercept_b = reg.intercept_
    
    
    return esg_data, kmeans, esg_cluster_centers, esg_feature_weights, esg_intercept_b
    

In [58]:
esg_data_scored, kmeans, esg_cluster_centers, esg_feature_weights, esg_intercept_b = ESG_scoring(esg_data)

In [59]:
esg_weights = dict(zip(esg_feature_weights.index, esg_feature_weights.values))
weights_df = pd.DataFrame([esg_weights])
weights_df

Unnamed: 0,New Hires and Turnover by Gender (Female %),Current Employees by Age Groups (Millennials %),Total Turnover (%),GHG Emissions (Scope 3) (tCO2e),Anti-Corruption Training for Employees (%),Total Number of Employees,Recordable injuries,High-consequence injuries,Total Waste Generated (t),Total Water Consumption (ML),GHG Emissions (Scope 2) (tCO2e),GHG Emissions (Scope 1) (tCO2e)
0,0.115291,0.043742,-0.011126,-0.050072,-0.070935,-0.079174,-0.118035,-0.182032,-0.194137,-0.241883,-0.260472,-0.340581


In [60]:
# weights_df.to_csv("ESG Scoring Weights.csv", index = False)

In [61]:
# esg_data.to_csv("ESG Scoring data.csv", index = False)

In [62]:
esg_data_scored["predicted_score"].describe()

count    1250.000000
mean        1.044000
std         0.629875
min        -0.790077
25%         0.607696
50%         1.051628
75%         1.497920
max         2.644386
Name: predicted_score, dtype: float64

In [63]:
def ESG_trend(esg_data):
    ESG_score_trend = esg_data.groupby('Year')['predicted_score'].mean().reset_index()
    ESG_score_trend.rename(columns = {'predicted_score' : 'Industry mean'}, inplace = True)
    
    soft_serve_esg = esg_data[esg_data['Sub-sector'] == 'Software and Services'].reset_index(drop=True)
    tech_equip_esg = esg_data[esg_data['Sub-sector'] == 'Technology Hardware and Equipment'].reset_index(drop=True)
    semi_esg = esg_data[esg_data['Sub-sector'] == 'Semiconductors and Semiconductor Equipment'].reset_index(drop=True)
    info_tech_esg = esg_data[esg_data['Sub-sector'] == 'Information Technology'].reset_index(drop=True)
    engin_esg = esg_data[esg_data['Sub-sector'] == 'Engineering Services'].reset_index(drop=True)
    
    soft_serve_esg_trend = soft_serve_esg.groupby('Year')['predicted_score'].mean().reset_index()
    soft_serve_esg_trend.rename(columns = {"predicted_score" : "Software and Services"}, inplace = True)

    tech_equip_esg_trend = tech_equip_esg.groupby('Year')['predicted_score'].mean().reset_index()
    tech_equip_esg_trend.rename(columns = {"predicted_score" : "Technology Hardware and Equipment"}, inplace = True)

    semi_esg_trend = semi_esg.groupby('Year')['predicted_score'].mean().reset_index()
    semi_esg_trend.rename(columns = {"predicted_score" : "Semiconductors and Semiconductor Equipment"}, inplace = True)

    info_tech_esg_trend = info_tech_esg.groupby('Year')['predicted_score'].mean().reset_index()
    info_tech_esg_trend.rename(columns = {"predicted_score" : "Information Technology"}, inplace = True)

    engin_esg_trend = engin_esg.groupby('Year')['predicted_score'].mean().reset_index()
    engin_esg_trend.rename(columns = {"predicted_score" : "Engineering Services"}, inplace = True)
    
    sub_sectors_df = ESG_score_trend.merge(soft_serve_esg_trend, on='Year').merge(tech_equip_esg_trend, on='Year').merge(semi_esg_trend, on='Year').merge(info_tech_esg_trend, on='Year').merge(engin_esg_trend, on='Year')
    
    esg_industry_plot_data = sub_sectors_df.melt(id_vars = ["Year"],
                                                 var_name = "sub-sectors", value_name = "predicted_score")
    
    return ESG_score_trend, esg_industry_plot_data

In [64]:
ESG_score_trend, esg_industry_plot_data = ESG_trend(esg_data_scored)

In [65]:
def ESG_trend_plot(esg_industry_plot_data):
    fig_esg_trend = px.line(esg_industry_plot_data, x = "Year", y = "predicted_score", color = "sub-sectors",
                            markers = True, 
                            title = "Environment score trend of the technology industry and sub-sectors")
    
    fig_esg_trend.update_xaxes(dtick = 1)
    
    fig_esg_trend.update_traces(
        hovertemplate = 'Year: %{x} <br> ESG Score: %{y} <extra></extra>', 
        marker = dict(size = 8)
        )
    
    fig_esg_trend.update_traces(selector = dict(name = 'Industry mean'),
                                line = dict(width = 4, color = "black"), 
                                marker = dict(size = 10, color = "black")) 
    
    return fig_esg_trend

In [66]:
fig_esg_trend = ESG_trend_plot(esg_industry_plot_data)
fig_esg_trend.show()

### Company Scoring

In [67]:
company_file_path = './data/Singtel_ESG_test.xlsx'
company_data = pd.read_excel(company_file_path)

In [68]:
sub_sectors = {'Software and Services':['Captii','CSE Global','V2Y Corp','SinoCloud Grp'],
               'Technology Hardware and Equipment':['Addvalue Tech','Nanofilm','Venture'],
               'Semiconductors and Semiconductor Equipment':['AdvancedSystems','AEM SGD','Asia Vets','ASTI','UMS'],
               'Information Technology':['Audience'],
               'Engineering Services':['ST Engineering','Singtel','GSS Energy']}

In [69]:
def company_scoring(company_data, kmeans, esg_cluster_centers, esg_feature_weights, esg_intercept_b, sub_sectors, esg_industry_plot_data, ESG_score_trend):
    company_data = company_data[company_data['Year'].between(2020, 2024)]
    company_info = company_data[['Company Name', 'Year']]
    company_name = str(company_data.iloc[0]['Company Name'])
    company_numeric = company_data.drop(columns=['Company Name', 'Year','Recordable work-related ill health cases', 
                                                 'Average Training Hours per Employee', 'Total Energy Consumption (MWhs)',
                                                 'Women on the Board (%)', 'Current Employees by Gender (Female %)', 
                                                 'Women in Management Team (%)', 'Fatalities', 'Board Independence (%)'])
    
    # Handle missing values(Implement techniques to handle missing data and ensure fair comparisons across companies.)
    imputer = SimpleImputer(strategy='median')
    company_data = pd.DataFrame(imputer.fit_transform(company_numeric), columns=company_numeric.columns)
    
    # Standardize Data
    scaler = StandardScaler()
    company_scaled = pd.DataFrame(scaler.fit_transform(company_data), columns=company_numeric.columns)
    
    # Use KMeans to set Performance Category
    # Use trained KMeans on input company
    company_scaled_data = company_scaled.values  
    
    company_clusters = kmeans.predict(company_scaled_data)
    company_data['Cluster'] = company_clusters
    
    # Set Performance Category based on clusters
    def categorize_performance_by_cluster(cluster):
        if cluster == esg_cluster_centers['Cluster'].idxmax():
            return 'Good'
        elif cluster == esg_cluster_centers['Cluster'].idxmin():
            return 'Poor'
        else:
            return 'Average'
    
    company_data['Performance Category'] = company_data['Cluster'].apply(categorize_performance_by_cluster)
    
    esg_weights = np.array(esg_feature_weights)
    
    def calculate_score(features, weights, intercept):
        return np.dot(features, weights) + intercept
    
    company_scores = company_scaled.apply(lambda row: calculate_score(row, esg_weights, esg_intercept_b), axis=1)
    company_data['Calculated Score'] = company_scores
    company_data = pd.concat([company_info.reset_index(drop=True), company_data], axis=1)
    
    company_score = company_data[['Year', 'Calculated Score']]
    company_score.rename(columns = {'Calculated Score': company_name}, inplace = True)
    
    # Loop through each company and check if target_value is in its list of industries using isin
    for sub_sector in sub_sectors:
        if pd.Series(sub_sectors[sub_sector]).isin([company_name]).any():
            company_sub_sector = sub_sector
    
    sub_sector_select = esg_industry_plot_data[esg_industry_plot_data["sub-sectors"] == company_sub_sector]
    sub_sector_select = sub_sector_select.drop(columns = {'sub-sectors'})
    sub_sector_select.rename(columns = {'predicted_score':company_sub_sector}, inplace = True)
    
    compare_data = ESG_score_trend.merge(company_score, on = 'Year').merge(sub_sector_select, on = 'Year')
    compare_data = compare_data.melt(id_vars = ["Year"],
                                     var_name = "Type", value_name = "predicted_score")
    
    fig_compare = px.line(compare_data, x = "Year", y = "predicted_score", color = "Type",
                          markers = True, title = "Comparison on ESG score trend")
    
    fig_compare.update_traces(
        hovertemplate = 'Year: %{x} <br> ESG Score: %{y} <extra></extra>', 
        marker = dict(size = 8)
        )
  
    return fig_compare
    

In [70]:
fig_compare = company_scoring(company_data, kmeans, esg_cluster_centers, esg_feature_weights, 
                               esg_intercept_b, sub_sectors, esg_industry_plot_data, ESG_score_trend)
fig_compare.show()


X does not have valid feature names, but KMeans was fitted with feature names



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

