In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import plotly.express as px

In [3]:
# ---------------------------------------------------------------------------
# Load the raw ESG table.
# ---------------------------------------------------------------------------
esg_file_path = './data/Technology Industry ESG data.csv'
esg_data = pd.read_csv(esg_file_path)

# Columns removed before clustering. Whatever remains is passed straight to
# the median imputer, so it is presumably all numeric -- TODO confirm against
# the CSV schema.
columns_excluded_from_features = [
    'Sub-sector',
    'Company Name',
    'Year',
    'Recordable work-related ill health cases',
    'Average Training Hours per Employee',
    'Total Energy Consumption (MWhs)',
    'Women on the Board (%)',
    'Current Employees by Gender (Female %)',
    'Women in Management Team (%)',
    'Fatalities',
    'Board Independence (%)',
]
esg_data_numeric = esg_data.drop(columns=columns_excluded_from_features)
feature_names = esg_data_numeric.columns

# ---------------------------------------------------------------------------
# Fill missing values with each column's median, then standardise.
# fit_transform returns a bare array, so the column labels are re-attached.
# ---------------------------------------------------------------------------
imputer = SimpleImputer(strategy='median')
imputed_values = imputer.fit_transform(esg_data_numeric)
esg_data_imputed = pd.DataFrame(imputed_values, columns=feature_names)

scaler = StandardScaler()
scaled_values = scaler.fit_transform(esg_data_imputed)
esg_data_scaled = pd.DataFrame(scaled_values, columns=feature_names)
# esg_data_scaled.to_csv('./data/technology_esg_data_scaled.csv', index=False)

# Disabled elbow-method sweep (within-cluster sum of squares for k = 1..10);
# presumably what motivated the choice of 3 clusters below -- TODO confirm.
# wcss = []  # Within-cluster sum of squares
# for i in range(1, 11):
#     kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=42)
#     kmeans.fit(esg_data_scaled)
#     wcss.append(kmeans.inertia_)

# ---------------------------------------------------------------------------
# Cluster the standardised features and attach the label to a copy of the
# raw table, so the original `esg_data` frame stays untouched.
# ---------------------------------------------------------------------------
optimal_clusters = 3
kmeans = KMeans(
    n_clusters=optimal_clusters,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=42,
)
cluster_labels = kmeans.fit_predict(esg_data_scaled)

scored_esg_data = esg_data.copy()
scored_esg_data['Cluster'] = cluster_labels

# Cluster centres mapped back through the scaler (undoing the standardisation),
# one row per cluster, plus a 'Cluster' column holding the label ids 0..k-1.
centers_in_original_units = scaler.inverse_transform(kmeans.cluster_centers_)
esg_cluster_centers = pd.DataFrame(centers_in_original_units, columns=feature_names)
esg_cluster_centers['Cluster'] = range(optimal_clusters)

def categorize_performance(cluster, cluster_centers=None, rank_by='Cluster'):
    """Translate a cluster label into 'Good', 'Poor' or 'Average'.

    The row of the centres table with the largest value in ``rank_by`` is
    treated as the 'Good' cluster, the row with the smallest value as the
    'Poor' cluster, and every other label is 'Average'.

    Parameters
    ----------
    cluster
        Cluster label to categorise. It is compared against the *index
        labels* of the centres table.
    cluster_centers : pandas.DataFrame, optional
        Table with one row per cluster, indexed by cluster label. When
        omitted, the notebook-level ``esg_cluster_centers`` frame is used.
    rank_by : str, default 'Cluster'
        Column of the centres table used to order the clusters.

    Returns
    -------
    str
        'Good', 'Poor' or 'Average'. If the best and worst rows coincide
        (e.g. a single cluster), 'Good' wins because it is checked first.

    Notes
    -----
    NOTE(review): with the default ``rank_by='Cluster'`` the ordering is based
    on the label ids themselves, so the highest label id is always 'Good' and
    the lowest is always 'Poor'. KMeans label numbering is not a quality
    ordering, so this default is kept only for backward compatibility. Pass a
    ``rank_by`` column that actually measures performance to get a meaningful
    categorisation.
    """
    centers = esg_cluster_centers if cluster_centers is None else cluster_centers
    ranking = centers[rank_by]

    # idxmax / idxmin return index labels of the centres table, not the values
    # stored in the ranking column.
    best_label, worst_label = ranking.idxmax(), ranking.idxmin()

    if cluster == best_label:
        return 'Good'
    if cluster == worst_label:
        return 'Poor'
    return 'Average'

# Label every row with the performance category of its cluster.
assigned_clusters = scored_esg_data['Cluster']
scored_esg_data['Performance Category'] = assigned_clusters.map(categorize_performance)

# Fit an ordinary least-squares model from the standardised features to the
# cluster label, then store its in-sample prediction as a continuous score.
# NOTE(review): the regression target is the KMeans label id, so the score
# treats the label numbering as if it were an ordered scale -- confirm that
# this is intended.
X = esg_data_scaled
reg = LinearRegression().fit(X, scored_esg_data['Cluster'])
scored_esg_data['predicted_score'] = reg.predict(X)

# Disabled: per-feature weights and intercept of the fitted model.
# esg_feature_weights = pd.Series(reg.coef_, index=esg_data_numeric.columns).sort_values(ascending=False)

# esg_intercept_b = reg.intercept_



In [None]:
# Write the scored table to disk; the DataFrame index is not written.
scored_output_path = './data/scored_tech_industry_esg_data.csv'
scored_esg_data.to_csv(scored_output_path, index=False)

Unnamed: 0,Sub-sector,Company Name,Year,GHG Emissions (Scope 1) (tCO2e),GHG Emissions (Scope 2) (tCO2e),GHG Emissions (Scope 3) (tCO2e),Total Energy Consumption (MWhs),Total Water Consumption (ML),Total Waste Generated (t),Current Employees by Gender (Female %),...,High-consequence injuries,Recordable injuries,Recordable work-related ill health cases,Board Independence (%),Women on the Board (%),Women in Management Team (%),Anti-Corruption Training for Employees (%),Cluster,Performance Category,predicted_score
0,Software and Services,Company_1,2020,439.330088,692.120861,934.282272,9795.219615,190.466071,10.890496,28.490657,...,1,5,0,70.013601,14.053968,38.372039,84.106898,1,Average,1.287343
1,Software and Services,Company_1,2021,378.090962,980.739585,715.153231,18050.236590,458.761923,25.159481,26.762862,...,0,5,2,57.922977,30.047620,13.504147,79.688451,1,Average,1.113681
2,Software and Services,Company_1,2022,399.235518,287.726900,1519.212044,48655.677070,963.649338,39.016966,24.904117,...,2,3,1,76.934140,32.222135,36.897553,68.053200,0,Poor,0.888878
3,Software and Services,Company_1,2023,104.208569,845.016119,1373.225502,19265.752670,815.201098,36.155657,22.467246,...,0,7,2,87.083579,28.552195,36.052544,75.196871,1,Average,1.032026
4,Software and Services,Company_1,2024,309.991397,483.561409,999.316148,7567.449918,228.246568,35.196124,28.689775,...,0,7,1,59.730267,27.132033,11.988359,66.669347,1,Average,1.521602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1245,Engineering Services,Company_50,2020,186.537926,828.858112,438.343618,20422.508250,558.859469,65.763378,40.047658,...,0,7,4,97.086754,24.967339,24.462984,79.575658,1,Average,0.889140
1246,Engineering Services,Company_50,2021,123.727518,401.307286,358.203579,40465.950530,429.302159,72.145962,35.766343,...,0,5,3,96.307085,30.545766,21.846929,93.350050,2,Good,1.576278
1247,Engineering Services,Company_50,2022,93.612265,841.584214,1743.159319,14315.390990,431.663571,51.365446,47.129219,...,0,10,3,88.293865,42.920380,19.228261,51.609238,1,Average,1.384389
1248,Engineering Services,Company_50,2023,156.467997,115.054571,1391.104798,17642.003990,144.745709,98.688071,31.141008,...,1,3,1,92.154256,26.656797,39.887140,90.081546,1,Average,1.832074
