In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import plotly.express as px

In [9]:
# --- Load ---------------------------------------------------------------
# Read the raw technology-industry ESG table from disk.
esg_file_path = './data/Technology Industry ESG data.csv'
esg_data = pd.read_csv(esg_file_path)

# --- Select features ----------------------------------------------------
# Columns removed before modelling; every remaining column is fed to the
# imputer, scaler and KMeans below.
# NOTE(review): the remaining columns are presumably all numeric (the median
# imputer requires that) -- confirm against the CSV schema.
excluded_columns = [
    'Sub-sector',
    'Company Name',
    'Year',
    'Recordable work-related ill health cases',
    'Current Employees by Age Groups (Millennials %)',
    'Average Training Hours per Employee',
    'Total Energy Consumption (MWhs)',
    'Women on the Board (%)',
    'Current Employees by Gender (Female %)',
    'Women in Management Team (%)',
    'Fatalities',
    'Board Independence (%)',
]
esg_data_numeric = esg_data.drop(columns=excluded_columns)
feature_names = esg_data_numeric.columns

# --- Impute and standardize ---------------------------------------------
# Missing values are replaced by the per-column median, then every feature is
# standardized with StandardScaler. Both fitted transformers are kept because
# `scaler` is reused below to map the cluster centers back to original units.
imputer = SimpleImputer(strategy='median')
imputed_values = imputer.fit_transform(esg_data_numeric)
esg_data_imputed = pd.DataFrame(imputed_values, columns=feature_names)

scaler = StandardScaler()
scaled_values = scaler.fit_transform(esg_data_imputed)
esg_data_scaled = pd.DataFrame(scaled_values, columns=feature_names)
# esg_data_scaled.to_csv('./data/technology_esg_data_scaled.csv', index=False)

# Disabled exploration: inertia (within-cluster sum of squares) for k = 1..10.
# Presumably how the value of `optimal_clusters` was picked -- TODO confirm.
# wcss = []  # Within-cluster sum of squares
# for i in range(1, 11):
#     kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=42)
#     kmeans.fit(esg_data_scaled)
#     wcss.append(kmeans.inertia_)

# --- Cluster ------------------------------------------------------------
# Fit KMeans on the standardized features (fixed random_state) and store each
# row's cluster id on a copy of the raw table, leaving `esg_data` untouched.
optimal_clusters = 3
scored_esg_data = esg_data.copy()
kmeans = KMeans(
    n_clusters=optimal_clusters,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=42,
)
scored_esg_data['Cluster'] = kmeans.fit_predict(esg_data_scaled)

# --- Cluster centers in original units ----------------------------------
# Undo the standardization on the fitted centers and tag each row with its
# cluster id (0 .. optimal_clusters - 1).
centers_in_original_units = scaler.inverse_transform(kmeans.cluster_centers_)
esg_cluster_centers = pd.DataFrame(centers_in_original_units, columns=feature_names)
esg_cluster_centers['Cluster'] = range(optimal_clusters)

def categorize_performance(cluster):
    """Translate a cluster id into a 'Good' / 'Poor' / 'Average' label.

    Args:
        cluster: Cluster id to label; compared with ``==``.

    Returns:
        'Good' when ``cluster`` equals the index label of the largest value in
        ``esg_cluster_centers['Cluster']``, 'Poor' when it equals the index
        label of the smallest value, and 'Average' otherwise.

    NOTE(review): the lookup runs on the 'Cluster' id column of the centers
    table, not on any feature column, so the labels follow the cluster
    numbering rather than a measured ESG quantity -- confirm this is intended.
    """
    cluster_ids = esg_cluster_centers['Cluster']
    if cluster == cluster_ids.idxmax():
        return 'Good'
    return 'Poor' if cluster == cluster_ids.idxmin() else 'Average'

# Attach the Good / Average / Poor label derived from each row's cluster id.
cluster_labels = scored_esg_data['Cluster']
scored_esg_data['Performance Category'] = cluster_labels.map(categorize_performance)

# Fit an ordinary least-squares model from the standardized features to the
# cluster id, and store its in-sample prediction as a continuous score.
# NOTE(review): the regression target is the KMeans cluster id itself, so the
# score inherits whatever ordering the cluster numbering has -- confirm this
# is the intended scoring scheme.
X = esg_data_scaled
reg = LinearRegression().fit(X, cluster_labels)
scored_esg_data['predicted_score'] = reg.predict(X)

# Disabled: per-feature weights and intercept of the fitted model.
# esg_feature_weights = pd.Series(reg.coef_, index=esg_data_numeric.columns).sort_values(ascending=False)

# esg_intercept_b = reg.intercept_



In [11]:
# Display the fitted regression coefficients, one per column of X.
reg.coef_

array([ 0.07523737,  0.24805192,  0.15405089, -0.23695093,  0.34414953,
       -0.00201837,  0.00939058, -0.01146471, -0.10783211,  0.19902809,
        0.20018062])

In [10]:
import pickle

# Persist the fitted estimators to the working directory, one pickle each.
# NOTE(review): `reg` and `kmeans` were fitted on `esg_data_scaled`; the fitted
# `imputer` and `scaler` are not saved in this cell -- confirm that consumers
# of these files can reproduce the same preprocessing.
# Only unpickle these files from a trusted location.
for model_path, fitted_model in (
    ('scoring_model.pkl', reg),
    ('cluster_model.pkl', kmeans),
):
    with open(model_path, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)

In [5]:
# Export the ESG table together with the added 'Cluster',
# 'Performance Category' and 'predicted_score' columns (row index omitted).
scored_output_path = './data/scored_tech_industry_esg_data.csv'
scored_esg_data.to_csv(scored_output_path, index=False)

In [6]:
# Export the cluster centers (in original feature units, plus the 'Cluster'
# id column) without the row index.
centers_output_path = './data/tech_esg_cluster_centers.csv'
esg_cluster_centers.to_csv(centers_output_path, index=False)