In [20]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn import svm
from sklearn.feature_selection import RFE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle

In [21]:
# Read the data
data = pd.read_csv('ml_lca_data_variable.csv')

# Extract the independent variables (X) and dependent variable (y)
X = data[['COAL, Thousand Short Tons', 'NATURALGAS, Billion Cubic Feet', 'ELECTRICITY, Million Kilowatthours',
          'PETRO_INDUSTRIAL, Thousand Barrels per Day', 'PETRO_RESIDENTIAL, Thousand Barrels per Day',
          'PETRO_COMMERCIAL, Thousand Barrels per Day', 'PETRO_TRANSPORTATION, Thousand Barrels per Day',
          'PETRO_ELECTRICPOWER, Thousand Barrels per Day']]

# y_lca is loaded here but not used anywhere else in this cell.
y_lca = data['CO2_ based on LCA calculation (Million Metric Tons)']
y = data['CO2, Million Metric Tons']

# Shuffle data (X and y are shuffled together so rows stay paired)
X, y = shuffle(X, y, random_state=42)

# Split data into training and testing sets (30% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Feature scaling: fit on the training split only, then apply the same
# transform to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature selection using RFE, keeping 4 of the 8 features.
# SVR is reached through the already-imported `svm` module; the bare name
# `SVR` is never imported in this notebook and raised NameError on a fresh kernel.
svr_model = svm.SVR(kernel='linear')  # We will use the linear kernel for feature selection
rfe = RFE(estimator=svr_model, n_features_to_select=4)
X_train_selected = rfe.fit_transform(X_train_scaled, y_train)
X_test_selected = rfe.transform(X_test_scaled)

# Perform hyperparameter tuning using GridSearchCV with RBF kernel
parameters = {
    'kernel': ['rbf'],
    'C': [1, 10, 100],
    'gamma': [0.1, 1, 10]
}

regressor = GridSearchCV(svm.SVR(), parameters, cv=3)
regressor.fit(X_train_selected, y_train)

# Make predictions on the training data
y_pred_train_ml = regressor.predict(X_train_selected)

# Calculate the R-squared score for the training data
r2_train = r2_score(y_train, y_pred_train_ml)
print("R² (Training):", r2_train)

# Use the model to make predictions on the test data
y_pred_ml = regressor.predict(X_test_selected)

# Raw (unscaled) test rows, in the same order as y_pred_ml. They are taken
# from this cell's X_test rather than from an `X_test_ml` variable that is not
# defined in this cell, so the rows always correspond to the predictions above.
X_test_ml_list = X_test.values.tolist()

data_list = data.values.tolist()


def is_sublist(list1, list2):
    """Report whether some contiguous window of ``list2`` holds the items of ``list1``.

    A window matches when it contains exactly the same elements as ``list1``
    with the same multiplicities; the order of elements inside the window is
    ignored. Comparing multisets (rather than plain sets) prevents a window
    such as ``[1, 2, 2]`` from being accepted for ``[1, 1, 2]``.

    Args:
        list1: The sequence to look for. Its elements must be hashable.
        list2: The sequence to search in. Its elements must be hashable.

    Returns:
        True if a window of ``len(list1)`` consecutive elements of ``list2``
        matches ``list1`` as described above, otherwise False. An empty
        ``list1`` always matches.
    """
    from collections import Counter

    len1 = len(list1)
    len2 = len(list2)

    # A needle longer than the haystack can never fit in any window.
    if len1 > len2:
        return False

    # Count the needle once; it does not change between windows.
    wanted = Counter(list1)

    # Slide a window of len1 elements across list2.
    for start in range(len2 - len1 + 1):
        if Counter(list2[start:start + len1]) == wanted:
            return True

    return False


def findlcaml(X_test_ml_list, data_list):
    """Gather the rows of ``data_list`` that ``is_sublist`` matches to a test row.

    Test rows are visited in order and, for each one, ``data_list`` is scanned
    in order; every data row accepted by ``is_sublist(test_row, data_row)`` is
    added to the output. A test row may therefore contribute zero, one, or
    several rows to the result.

    Args:
        X_test_ml_list: List of test rows (each row is a list).
        data_list: List of candidate rows (each row is a list).

    Returns:
        A list of the matching rows from ``data_list``.
    """
    return [
        data_row
        for test_row in X_test_ml_list
        for data_row in data_list
        if is_sublist(test_row, data_row)
    ]
    
# Look up the full data row for each test row, then pair the rows with the
# test-set predictions. zip() stops at the shorter of its two inputs.
result = findlcaml(X_test_ml_list, data_list)
final = zip(result, y_pred_ml.tolist())

# Append each prediction to its paired row as one extra trailing element.
final_result = [matched_row + [prediction] for matched_row, prediction in final]

# Keep only the last three entries of every extended row.
# NOTE(review): presumably these are the LCA-based CO2 value, the reported CO2
# value and the model prediction; that depends on the CSV column order, so confirm.
lca_ml_pred = [extended_row[-3:] for extended_row in final_result]

R² (Training): 0.9805122986273803


In [22]:
from scipy.stats import ttest_ind
from scipy.stats import f_oneway
from scipy.stats import pearsonr

# Split every three-element entry of lca_ml_pred into three parallel series:
# position 0 -> lca, position 1 -> ml, position 2 -> pred.
lca_ttest = [triple[0] for triple in lca_ml_pred]
ml_ttest = [triple[1] for triple in lca_ml_pred]
pred_ttest = [triple[2] for triple in lca_ml_pred]

# Two-sample t-tests (scipy's ttest_ind), each compared against the ml series
t_statistic_lca_ml, p_value_lca_ml = ttest_ind(lca_ttest, ml_ttest)
t_statistic_pred_ml, p_value_pred_ml = ttest_ind(pred_ttest, ml_ttest)

# Report the lca-vs-ml t-test
print("T-Statistic for lca and ml:", t_statistic_lca_ml)
print("P-Value for lca and ml:", p_value_lca_ml)

# Report the pred-vs-ml t-test
print("T-Statistic for pred and ml:", t_statistic_pred_ml)
print("P-Value for pred and ml:", p_value_pred_ml)

print('############################################')

# One-way ANOVA across all three series
f_statistic, p_value = f_oneway(lca_ttest, ml_ttest, pred_ttest)

print("F-Statistic:", f_statistic)
print("P-Value:", p_value)

print('############################################')

# Pearson correlation of each series against ml.
# Note: p_value_lca_ml and p_value_pred_ml are reassigned here, so from this
# point on they hold the Pearson p-values, not the t-test p-values.
corr_coeff_lca_ml, p_value_lca_ml = pearsonr(lca_ttest, ml_ttest)
corr_coeff_pred_ml, p_value_pred_ml = pearsonr(pred_ttest, ml_ttest)

print("Pearson Correlation Coefficient lca vs ml:", corr_coeff_lca_ml)
print("P-Value lca vs ml:", p_value_lca_ml)
print("Pearson Correlation Coefficient pred vs ml:", corr_coeff_pred_ml)
print("P-Value pred vs ml:", p_value_pred_ml)

T-Statistic for lca and ml: -10.864374383228395
P-Value for lca and ml: 5.911342581703951e-24
T-Statistic for pred and ml: 0.07285664256842568
P-Value for pred and ml: 0.9419608838289476
############################################
F-Statistic: 86.98948371941195
P-Value: 1.8817305068048797e-33
############################################
Pearson Correlation Coefficient lca vs ml: 0.9136968970039316
P-Value lca vs ml: 1.6139983161249754e-71
Pearson Correlation Coefficient pred vs ml: 0.989416452810979
P-Value pred vs ml: 3.626253524671529e-151


In [23]:
from scipy.stats import mannwhitneyu

def _run_mannwhitney(sample_a, sample_b):
    """Run the Mann-Whitney U test on two samples, print it, and return the result.

    Args:
        sample_a: First sequence of observations.
        sample_b: Second sequence of observations.

    Returns:
        A ``(statistic, p_value)`` tuple as produced by ``mannwhitneyu``.
    """
    u_statistic, u_p_value = mannwhitneyu(sample_a, sample_b)
    print("Mann-Whitney U Statistic:", u_statistic)
    print("P-value:", u_p_value)
    return u_statistic, u_p_value


# lca series against ml series
lca_ml_statistic, lca_ml_p_value = _run_mannwhitney(lca_ttest, ml_ttest)

# ml series against pred series
ml_pred_statistic, ml_pred_p_value = _run_mannwhitney(ml_ttest, pred_ttest)


Mann-Whitney U Statistic: 7100.0
P-value: 3.059847613500028e-20
Mann-Whitney U Statistic: 16185.0
P-value: 0.9882819062701224
