In [1]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle

In [2]:
# Load the dataset.
data = pd.read_csv('ml_lca_data_variable.csv')

# The eight columns used as independent variables (X).
FEATURE_COLUMNS = [
    'COAL, Thousand Short Tons',
    'NATURALGAS, Billion Cubic Feet',
    'ELECTRICITY, Million Kilowatthours',
    'PETRO_INDUSTRIAL, Thousand Barrels per Day',
    'PETRO_RESIDENTIAL, Thousand Barrels per Day',
    'PETRO_COMMERCIAL, Thousand Barrels per Day',
    'PETRO_TRANSPORTATION, Thousand Barrels per Day',
    'PETRO_ELECTRICPOWER, Thousand Barrels per Day',
]

# One seed shared by the shuffle, the split and the forest.
SEED = 42

X = data[FEATURE_COLUMNS]

# Dependent variables: y_ml is the regression target; y_lca is only read
# from the frame here and is neither shuffled nor split in this cell.
y_lca = data['CO2_ based on LCA calculation (Million Metric Tons)']
y_ml = data['CO2, Million Metric Tons']

# Shuffle X and y_ml together, then hold out 30% of the rows for testing.
X, y_ml = shuffle(X, y_ml, random_state=SEED)
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(
    X, y_ml, test_size=0.3, random_state=SEED
)

# Fit the random forest on the training split.
# Original author's note next to n_estimators: 1, 10, 100, 1000, 10000
# (presumably the values that were tried).
regressor = RandomForestRegressor(n_estimators=1000, random_state=SEED)
regressor.fit(X_train_ml, y_train_ml)

# R-squared on the data the model was trained on.
y_pred_train_ml = regressor.predict(X_train_ml)
r2_train = r2_score(y_train_ml, y_pred_train_ml)
print("R² (Training):", r2_train)

# Predictions for the held-out rows.
y_pred_ml = regressor.predict(X_test_ml)

# Plain-list copies of the test features and of every row of the full frame.
X_test_ml_list = X_test_ml.values.tolist()
data_list = data.values.tolist()


def is_sublist(list1, list2):
    """Return True if ``list1`` occurs as a contiguous, ordered run in ``list2``.

    A window of ``len(list1)`` elements is slid over ``list2`` and compared
    to ``list1`` element by element. Order and multiplicity both matter:
    ``[1, 2]`` is not a sublist of ``[2, 1]`` and ``[1, 1, 2]`` is not a
    sublist of ``[1, 2, 2]``. (Comparing the windows as sets, as was done
    previously, accepted both of those and failed on unhashable elements.)

    Parameters
    ----------
    list1 : sequence
        The candidate sublist (the "needle").
    list2 : sequence
        The sequence that is searched (the "haystack").

    Returns
    -------
    bool
        True when some window of ``list2`` equals ``list1``; an empty
        ``list1`` is a sublist of every sequence. False otherwise, including
        when ``list1`` is longer than ``list2``.
    """
    # Normalise to lists so a tuple needle still compares equal to a list window.
    needle = list(list1)
    haystack = list(list2)
    span = len(needle)

    # A needle longer than the haystack can never fit.
    if span > len(haystack):
        return False

    # Slide a window of the needle's length across the haystack.
    return any(
        haystack[start:start + span] == needle
        for start in range(len(haystack) - span + 1)
    )


def findlcaml(X_test_ml_list, data_list):
    """Return, for each test feature row, the full data row that contains it.

    The result has exactly one entry per element of ``X_test_ml_list``, in
    the same order, so it can be zipped position-by-position with the
    predictions made for those test rows.

    Parameters
    ----------
    X_test_ml_list : list of list
        Feature rows of the test set.
    data_list : list of list
        Rows of the full dataset.

    Returns
    -------
    list of list
        ``result[k]`` is the first row of ``data_list`` for which
        ``is_sublist(X_test_ml_list[k], row)`` is true.

    Raises
    ------
    ValueError
        If a test row is not found in any data row. Skipping it silently
        would shift every later row against its prediction.

    Notes
    -----
    When several data rows contain the same feature values only the first
    is used; appending every match would make the result longer than the
    test set and misalign it with the predictions.
    """
    result = []
    for position, features in enumerate(X_test_ml_list):
        for row in data_list:
            if is_sublist(features, row):
                result.append(row)
                break  # one data row per test row keeps positions aligned
        else:
            raise ValueError(
                "no data row contains test row %d: %r" % (position, features)
            )
    return result
    
# Full data row for each test sample, to be paired with its prediction.
result = findlcaml(X_test_ml_list, data_list)
y_pred_ml_list = y_pred_ml.tolist()

# zip() stops at the shorter input without complaint, which would pair
# predictions with the wrong rows; fail loudly instead.
if len(result) != len(y_pred_ml_list):
    raise ValueError(
        "matched %d data rows for %d predictions; rows and predictions "
        "would be misaligned" % (len(result), len(y_pred_ml_list))
    )

final = zip(result, y_pred_ml_list)

# Each data row with its prediction appended as the last element.
final_result = [row + [prediction] for row, prediction in final]

# Keep the last three values of each extended row: the final two data columns
# followed by the prediction. The next cell reads them as lca, ml and pred;
# this assumes the last two CSV columns are those two CO2 series -- TODO confirm.
lca_ml_pred = [row[-3:] for row in final_result]

R² (Training): 0.9929874299126865


In [3]:
from scipy.stats import ttest_ind
from scipy.stats import f_oneway
from scipy.stats import pearsonr

# Split the three-element rows of lca_ml_pred into one list per position.
lca_ttest = [triple[0] for triple in lca_ml_pred]
ml_ttest = [triple[1] for triple in lca_ml_pred]
pred_ttest = [triple[2] for triple in lca_ml_pred]

# Two-sample t-tests (scipy's ttest_ind), each series against ml_ttest.
t_statistic_lca_ml, p_value_lca_ml = ttest_ind(lca_ttest, ml_ttest)
t_statistic_pred_ml, p_value_pred_ml = ttest_ind(pred_ttest, ml_ttest)

# Report: lca vs ml
print("T-Statistic for lca and ml:", t_statistic_lca_ml)
print("P-Value for lca and ml:", p_value_lca_ml)

# Report: pred vs ml
print("T-Statistic for pred and ml:", t_statistic_pred_ml)
print("P-Value for pred and ml:", p_value_pred_ml)

print('############################################')

# One-way ANOVA across all three series.
f_statistic, p_value = f_oneway(lca_ttest, ml_ttest, pred_ttest)

print("F-Statistic:", f_statistic)
print("P-Value:", p_value)

print('############################################')

# Pearson correlation of each series with ml_ttest.
# Note: p_value_lca_ml and p_value_pred_ml are reassigned here, so the
# t-test p-values stored under those names above are overwritten.
corr_coeff_lca_ml, p_value_lca_ml = pearsonr(lca_ttest, ml_ttest)
corr_coeff_pred_ml, p_value_pred_ml = pearsonr(pred_ttest, ml_ttest)

print("Pearson Correlation Coefficient lca vs ml:", corr_coeff_lca_ml)
print("P-Value lca vs ml:", p_value_lca_ml)
print("Pearson Correlation Coefficient pred vs ml:", corr_coeff_pred_ml)
print("P-Value pred vs ml:", p_value_pred_ml)

T-Statistic for lca and ml: -10.864374383228395
P-Value for lca and ml: 5.911342581703951e-24
T-Statistic for pred and ml: -0.020312039344838096
P-Value for pred and ml: 0.9838057676290923
############################################
F-Statistic: 89.56449439280836
P-Value: 2.7098418368738755e-34
############################################
Pearson Correlation Coefficient lca vs ml: 0.9136968970039316
P-Value lca vs ml: 1.6139983161249754e-71
Pearson Correlation Coefficient pred vs ml: 0.982708610030719
P-Value pred vs ml: 2.5498389311715003e-132
