In [23]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import plotly.express as px

In [24]:
# Step 1: Load Data
e_file_path = '../ChatGPT-simulate data/Simulate-E-data-9 years.xlsx'
e_data = pd.read_excel(e_file_path)

# Step 2: Data Preprocessing
# Drop non-numeric columns that are not needed for imputation
e_company_names = e_data['Company Name']  # Assuming the first column is 'Company Name'
e_data_numeric = e_data.drop(columns=['Company Name', 'Year', 'GHG Emissions (Total)'])

# Handle missing values(Implement techniques to handle missing data and ensure fair comparisons across
# companies.)
imputer = SimpleImputer(strategy='median')
e_data_imputed = pd.DataFrame(imputer.fit_transform(e_data_numeric), columns=e_data_numeric.columns)

# Step 3: Standardize Data
scaler = StandardScaler()
e_data_scaled = pd.DataFrame(scaler.fit_transform(e_data_imputed), columns=e_data_numeric.columns)

# Step 4: Apply KMeans Clustering
# Determine the optimal number of clusters using the Elbow Method
wcss = []  # Within-cluster sum of squares
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=42)
    kmeans.fit(e_data_scaled)
    wcss.append(kmeans.inertia_)

# Step 5: Train KMeans Model with Optimal Number of Clusters
# Assuming the optimal number of clusters is 3 based on the elbow plot
optimal_clusters = 3
kmeans = KMeans(n_clusters=optimal_clusters, init='k-means++', max_iter=300, n_init=10, random_state=42)
e_data['Cluster'] = kmeans.fit_predict(e_data_scaled)

# Step 6: Create General Industry Benchmark
# Since all companies are in the technology industry, create benchmark categories based on cluster assignments
e_cluster_centers = pd.DataFrame(scaler.inverse_transform(kmeans.cluster_centers_), columns=e_data_numeric.columns)
e_cluster_centers['Cluster'] = range(optimal_clusters)

# Categorize companies based on their cluster
def categorize_performance(cluster):
    if cluster == e_cluster_centers['Cluster'].idxmax():
        return 'Good'
    elif cluster == e_cluster_centers['Cluster'].idxmin():
        return 'Poor'
    else:
        return 'Average'

e_data['Performance Category'] = e_data['Cluster'].apply(categorize_performance)

# Step 7: Create Scoring System
# Use a Linear Regression model to calculate an ESG performance score
X = e_data_scaled
reg = LinearRegression()
reg.fit(X, e_data['Cluster'])

# Predicted ESG performance score
e_data['predicted_score'] = reg.predict(X)

# Get feature importance (coefficients)
e_feature_weights = pd.Series(reg.coef_, index=e_data_numeric.columns).sort_values(ascending=False)
print("Feature Weights for ESG Performance Scoring System:")
print(e_feature_weights)

# Get intercept (b)
e_intercept_b = reg.intercept_
print("Intercept (b):", e_intercept_b)

# Step 9: Save Results
e_data['Company Name'] = e_company_names  # Add company names back to the dataset

# Save the results to a CSV file
# e_data.to_csv('esg_scoring_results_E_part.csv', index=False)  

# Optional: Display some sample results
# e_data[['Company Name', 'Year', 'Cluster', 'Performance Category', 'predicted_score']].head()

# Step 10: Generate Automated ESG Performance Summaries and Recommendations
# Function to generate summaries and recommendations for new companies
def generate_summary(company_data):
    # Preprocess the new company's data
    company_imputed = pd.DataFrame(imputer.transform(company_data), columns=e_data_numeric.columns)
    company_scaled = pd.DataFrame(scaler.transform(company_imputed), columns=e_data_numeric.columns)

    # Predict cluster and performance score
    cluster = kmeans.predict(company_scaled)[0]
    predicted_score = reg.predict(company_scaled)[0]
    performance_category = categorize_performance(cluster)

    # Generate summary
    summary = f"The company belongs to the '{performance_category}' performance category with an ESG score of {predicted_score:.2f}."

    # Generate recommendations based on performance
    if performance_category == 'Good':
        recommendation = "Maintain current practices and look for areas to further innovate."
    elif performance_category == 'Average':
        recommendation = "Focus on improving key ESG metrics to move into the 'Good' category. Consider areas such as energy efficiency and waste reduction."
    else:
        recommendation = "Significant improvements are needed in ESG practices. Start by addressing major contributors to emissions and inefficiencies."

    return summary, recommendation

Feature Weights for ESG Performance Scoring System:
Total Water Consumption     0.175373
Total Waste Generated       0.079954
Total Energy Consumption   -0.030653
GHG Emissions (Scope 2)    -0.128821
GHG Emissions (Scope 3)    -0.132487
GHG Emissions (Scope 1)    -0.605261
dtype: float64
Intercept (b): 1.0466666666666664


Scoring Functon

In [25]:
e_company_file_path = '../ChatGPT-simulate data/Simulate-E-data-9 years.xlsx'
e_company_data = pd.read_excel(e_company_file_path)

In [None]:
company_name = 'Company_1'
######### replace with extracted info
company_e_data = e_company_data[e_company_data["Company Name"] == company_name].reset_index(drop=True)

e_company_info = company_e_data[['Company Name', 'Year']]
e_company_numeric = company_e_data.drop(columns=['Company Name', 'Year', 'GHG Emissions (Total)'])

# Handle missing values(Implement techniques to handle missing data and ensure fair comparisons across
# companies.)
imputer = SimpleImputer(strategy='median')
e_company_imputed = pd.DataFrame(imputer.fit_transform(e_company_numeric), columns=e_company_numeric.columns)

# Step 3: Standardize Data
scaler = StandardScaler()
e_company_scaled = pd.DataFrame(scaler.fit_transform(e_company_imputed), columns=e_company_numeric.columns)

# Add back company name and year columns
# e_company_scaled_data = pd.concat([e_company_info.reset_index(drop=True), e_company_scaled], axis=1)


In [27]:
# Step 3: 重新使用 KMeans 聚类来分配 Performance Category
# 使用之前训练好的 KMeans 模型对Company 1的数据重新进行预测
company_e_scaled_data = e_company_scaled.values  # 标准化后的数据作为特征输入

# 使用 KMeans 模型进行聚类预测
company_e_clusters = kmeans.predict(company_e_scaled_data)
company_e_data['Cluster'] = company_e_clusters

# 根据 KMeans 预测的 Cluster 给出 Performance Category
def categorize_performance_by_cluster(cluster):
    if cluster == e_cluster_centers['Cluster'].idxmax():
        return 'Good'
    elif cluster == e_cluster_centers['Cluster'].idxmin():
        return 'Poor'
    else:
        return 'Average'

# 使用新的聚类结果为 Company 1 分配 Performance Category
company_e_data['Performance Category'] = company_e_data['Cluster'].apply(categorize_performance_by_cluster)


# Step 2: 使用线性回归公式计算每年的 ESG 得分
# 假设 reg.coef_ 已经存储了模型的权重 (weights)，并且 intercept_b 已经存储了截距 (b)
e_weights = reg.coef_
e_intercept_b = reg.intercept_

# 将权重转换为NumPy数组，以便于矩阵运算
e_weights = np.array(e_weights)

# 定义计算得分的函数
def calculate_score(features, weights, intercept):
    return np.dot(features, weights) + intercept

# 计算每年 ESG 得分并保存到新的 DataFrame 中
company_e_scores = e_company_scaled.apply(lambda row: calculate_score(row, e_weights, e_intercept_b), axis=1)
company_e_data['Calculated Score'] = company_e_scores

# Add back company name and year columns
# company_e = pd.concat([e_company_info.reset_index(drop=True), company_e_data], axis=1)

company_e_data



X does not have valid feature names, but KMeans was fitted with feature names



Unnamed: 0,Company Name,Year,GHG Emissions (Scope 1),GHG Emissions (Scope 2),GHG Emissions (Scope 3),GHG Emissions (Total),Total Energy Consumption,Total Water Consumption,Total Waste Generated,Cluster,Performance Category,Calculated Score
0,Company_1,2015,170.10468,25.818188,364.565239,560.488106,4362.711869,3893.196444,797.507302,2,Good,1.907366
1,Company_1,2016,468.277567,113.632928,294.223826,876.134322,7198.907644,1591.129253,59.128381,0,Poor,-0.509667
2,Company_1,2017,127.600875,122.083167,242.773468,492.45751,7038.95915,2860.495934,856.537777,2,Good,1.724461
3,Company_1,2018,310.834606,121.007206,381.414399,813.256211,2441.767719,4908.816197,661.580852,1,Average,0.988428
4,Company_1,2019,299.986281,105.553473,195.324121,600.863875,1070.055732,4495.813645,440.079027,1,Average,1.092987
5,Company_1,2020,382.829758,162.308534,116.781928,661.92022,7776.834556,4765.693869,352.96852,0,Poor,0.454597
6,Company_1,2021,274.659308,44.037756,824.177528,1142.874592,1957.313497,1961.375173,29.056474,0,Poor,0.570073
7,Company_1,2022,172.528547,75.903863,224.105448,472.537858,1741.142296,3945.294124,826.962796,2,Good,1.860201
8,Company_1,2023,126.228425,130.83069,798.379165,1055.438279,1024.310952,3118.494861,151.294635,2,Good,1.331554


In [28]:
E_score_trend = e_data.groupby('Year')['predicted_score'].mean().reset_index()
E_score_trend.rename(columns = {'predicted_score' : 'mean'}, inplace = True)


In [29]:
company_e_score = company_e_data[['Year', 'Calculated Score']]
company_e_score.rename(columns = {'Calculated Score': company_name}, inplace = True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [30]:
e_plot_data = E_score_trend.merge(company_e_score, how = 'outer', on = 'Year')

In [31]:
e_plot_data = e_plot_data.melt(id_vars = ["Year"],
                           var_name = "Type", value_name = "predicted_score")

In [32]:
fig_e_trend = px.line(e_plot_data, x = "Year", y = "predicted_score", color = "Type",
                      markers = True, 
                      title = "Environment score trend of the technology industry")

fig_e_trend.update_traces(
    hovertemplate = 'Year: %{x} <br> Environment Score: %{y} <extra></extra>', 
    marker = dict(size = 8)
)

fig_e_trend.show()