# descriptors

In [20]:
import pandas as pd
from rdkit.Chem import Descriptors
from rdkit import Chem
df = pd.read_csv('/home/hkqiu/work/PolyGPT/polymer properties/reg-cls/exp_val/8个验证Tg的smiles.csv')
can_smiles = df['Smiles'].tolist()

# 计算描述符并添加到DataFrame中
num_processed = 0
print_frequency = 20

for desc_name, desc_func in Descriptors.descList:
    try:
        descriptors = [desc_func(Chem.MolFromSmiles(smiles)) if Chem.MolFromSmiles(smiles) else None for smiles in can_smiles] 
    except Exception as e:
        print(f"SMILES Parse Error: {e}")
        descriptors = None
    df[desc_name] = descriptors

    # 打印处理进度
    num_processed += 1
    if num_processed % print_frequency == 0:
        print(f"已计算 {num_processed} 个描述符")

# 保存包含描述符的DataFrame到CSV文件
df.to_csv('/home/hkqiu/work/PolyGPT/polymer properties/reg-cls/exp_val/date_with_descriptors.csv', index=False)

已计算 20 个描述符
已计算 40 个描述符
已计算 60 个描述符
已计算 80 个描述符
已计算 100 个描述符
已计算 120 个描述符
已计算 140 个描述符
已计算 160 个描述符
已计算 180 个描述符
已计算 200 个描述符


# ML models

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR
from sklearn.svm import NuSVR
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score, mean_absolute_error

import warnings
warnings.filterwarnings("ignore")

In [2]:
train_df = pd.read_csv('/home/hkqiu/work/PolyGPT/T5/0data&code/data/train/merge/merged_date_with_descriptors_filtered_data.csv')
test_df = pd.read_csv('/home/hkqiu/work/PolyGPT/T5/0data&code/data/test/merge/merged_date_with_descriptors_filtered_data.csv')


X_train = train_df.drop(['prompt','target'], axis=1)
X_test = test_df.drop(['prompt','target'], axis=1)
y_train = train_df[['task','target']]
y_test = test_df[['task','target']]


unique_tasks = X_train['task'].unique()
unique_tasks

array(['atomization energy', 'bandgap-crystal', 'Tg',
       'heat resistance class'], dtype=object)

In [3]:
import pandas as pd


# 假设元素类别列表
categories = ['atomization energy', 'bandgap-crystal', 'Tg',
       'heat resistance class']

# 创建一个空字典来存储每种元素对应的索引范围
train_index_ranges = {}
test_index_ranges = {}

# 遍历每个元素类别
for category in categories:
    # 获取特定元素类别的索引范围
    indices = train_df[train_df['task'] == category].index
    train_index_ranges[category] = (indices.min(), indices.max())

# 打印每种元素对应的索引范围
print("Train set:")
for category, index_range in train_index_ranges.items():
    
    print(f"{category}: {index_range}")


# 遍历每个元素类别
for category in categories:
    # 获取特定元素类别的索引范围
    indices = test_df[test_df['task'] == category].index
    test_index_ranges[category] = (indices.min(), indices.max())

# 打印每种元素对应的索引范围
print("Test set:")
for category, index_range in test_index_ranges.items():
    
    print(f"{category}: {index_range}")

Train set:
atomization energy: (0, 5264)
bandgap-crystal: (5265, 9504)
Tg: (9505, 15654)
heat resistance class: (15655, 20649)
Test set:
atomization energy: (0, 584)
bandgap-crystal: (585, 1064)
Tg: (1065, 1764)
heat resistance class: (1765, 2319)


In [18]:
def get_data(property):
    """
    a:tg_X_train
    b:tg_X_test
    c:tg_y_train
    d:tg_y_test
    """
    a = X_train[X_train['task']==property].drop('task', axis=1)
    b = X_test[X_test['task']==property].drop('task', axis=1)
    c = y_train[y_train['task']==property].drop('task', axis=1)
    d = y_test[y_test['task']==property].drop('task', axis=1)

    return a,b,c,d

"""['band gap chain', 'atomization energy', 'ionization energy',
       'band gap bulk', 'electron affinity', 'bandgap-crystal', 'Tg',
       'crystallization tendency', 'dielectric constant',
       'refractive index', 'heat resistance class']
"""


tg_X_train, tg_X_test, tg_y_train, tg_y_test = get_data('Tg')
bc_X_train, bc_X_test, bc_y_train, bc_y_test = get_data('bandgap-crystal')
ae_X_train, ae_X_test, ae_y_train, ae_y_test = get_data('atomization energy')

hrc_X_train, hrc_X_test, hrc_y_train, hrc_y_test = get_data('heat resistance class')

# Reg

In [29]:
# Create an empty DataFrame to store the model performance data
performance_data = pd.DataFrame(columns=['Model', 'R2', 'MAE', 'MSE'])

def save_performance_data(model_name, r2, mae, mse):
    global performance_data
    new_row = pd.DataFrame([[model_name, r2, mae, mse]], columns=['Model', 'R2', 'MAE', 'MSE'])
    performance_data = pd.concat([performance_data, new_row], ignore_index=True)

def RandomForest(X_train, X_test, y_train, y_test):
    model = RandomForestRegressor(n_estimators=1, random_state=42)

    # 在训练集上训练模型
    model.fit(X_train, y_train)

    # 在测试集上进行预测
    y_pred = model.predict(X_test)

    # 计算准确率
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    MSE = mean_squared_error(y_test, y_pred)
    save_performance_data('RandomForest', r2, mae, MSE)
    print(f"RandomForest R2: {r2}")
    print(f"RandomForest MAE: {mae}")
    print(f"RandomForest MSE: {MSE}")
    print("=======================")

def Linear(X_train, X_test, y_train, y_test):
    model = LinearRegression()

    # 在训练集上训练模型
    model.fit(X_train, y_train)

    # 在测试集上进行预测
    y_pred = model.predict(X_test)

    # 计算准确率
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    MSE = mean_squared_error(y_test, y_pred)
    save_performance_data('Linear', r2, mae, MSE)
    print(f"LinearRegression R2: {r2}")
    print(f"LinearRegression MAE: {mae}")
    print(f"LinearRegression MSE: {MSE}")
    print("=======================")

def Svr(X_train, X_test, y_train, y_test):
    model = SVR()

    # 在训练集上训练模型
    model.fit(X_train, y_train)

    # 在测试集上进行预测
    y_pred = model.predict(X_test)

    # 计算准确率
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    MSE = mean_squared_error(y_test, y_pred)

    print(f"SVR R2: {r2}")
    print(f"SVR MAE: {mae}")
    print(f"SVR MSE: {MSE}")
    print("=======================")

    # 保存性能数据
    save_performance_data("SVR", r2, mae, MSE)

def DecisionTree(X_train, X_test, y_train, y_test):
    model = DecisionTreeRegressor(random_state=42)

    # 在训练集上训练模型
    model.fit(X_train, y_train)

    # 在测试集上进行预测
    y_pred = model.predict(X_test)

    # 计算准确率
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    MSE = mean_squared_error(y_test, y_pred)

    print(f"DecisionTree R2: {r2}")
    print(f"DecisionTree MAE: {mae}")
    print(f"DecisionTree MSE: {MSE}")
    print("=======================")

    # 保存性能数据
    save_performance_data("DecisionTree", r2, mae, MSE)

def RidgeRegression(X_train, X_test, y_train, y_test):
    model = Ridge(alpha=1.0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    MSE = mean_squared_error(y_test, y_pred)
    print(f"Ridge Regression R2: {r2}")
    print(f"Ridge Regression MAE: {mae}")
    print(f"Ridge Regression MSE: {MSE}")
    print("=======================")

    # 保存性能数据
    save_performance_data("Ridge Regression", r2, mae, MSE)

def GaussianProcess(X_train, X_test, y_train, y_test):
    model = GaussianProcessRegressor()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    MSE = mean_squared_error(y_test, y_pred)
    print(f"GaussianProcessRegressor R2: {r2}")
    print(f"GaussianProcessRegressor MAE: {mae}")
    print(f"GaussianProcessRegressor MSE: {MSE}")
    print("=======================")

    # 保存性能数据
    save_performance_data("GaussianProcessRegressor", r2, mae, MSE)

def AdaBoost(X_train, X_test, y_train, y_test):
    model = AdaBoostRegressor()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    MSE = mean_squared_error(y_test, y_pred)
    save_performance_data("AdaBoost", r2, mae, MSE)
    print(f"AdaBoostRegressor R2: {r2}")
    print(f"AdaBoostRegressor MAE: {mae}")
    print(f"AdaBoostRegressor MSE: {MSE}")
    print("=======================")

def GradientBoosting(X_train, X_test, y_train, y_test):
    model = GradientBoostingRegressor()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    MSE = mean_squared_error(y_test, y_pred)
    save_performance_data("GradientBoosting", r2, mae, MSE)
    print(f"GradientBoostingRegressor R2: {r2}")
    print(f"GradientBoostingRegressor MAE: {mae}")
    print(f"GradientBoostingRegressor MSE: {MSE}")
    print("=======================")

def Bagging(X_train, X_test, y_train, y_test):
    model = BaggingRegressor()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    MSE = mean_squared_error(y_test, y_pred)
    save_performance_data("Bagging", r2, mae, MSE)
    print(f"BaggingRegressor R2: {r2}")
    print(f"BaggingRegressor MAE: {mae}")
    print(f"BaggingRegressor MSE: {MSE}")
    print("=======================")

def ExtraTrees(X_train, X_test, y_train, y_test):
    model = ExtraTreesRegressor()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    MSE = mean_squared_error(y_test, y_pred)
    save_performance_data("ExtraTrees", r2, mae, MSE)
    print(f"ExtraTreesRegressor R2: {r2}")
    print(f"ExtraTreesRegressor MAE: {mae}")
    print(f"ExtraTreesRegressor MSE: {MSE}")
    print("=======================")


def train_predict(X_train, X_test, y_train, y_test):
    RandomForest(X_train, X_test, y_train, y_test)
    Linear(X_train, X_test, y_train, y_test)
    Svr(X_train, X_test, y_train, y_test)
    DecisionTree(X_train, X_test, y_train, y_test)
    RidgeRegression(X_train, X_test, y_train, y_test)
    GaussianProcess(X_train, X_test, y_train, y_test)
    AdaBoost(X_train, X_test, y_train, y_test)
    Bagging(X_train, X_test, y_train, y_test)
    ExtraTrees(X_train, X_test, y_train, y_test)

In [20]:
print("***Tg***")
train_predict(tg_X_train, tg_X_test, tg_y_train, tg_y_test)
performance_data.to_csv('/home/hkqiu/work/PolyGPT/T5/0data&code/ml models/model_performance/Tg.csv', index=False)


***Tg***
RandomForest R2: 0.7877011170752712
RandomForest MAE: 40.48248727880983
RandomForest MSE: 3671.944417743777
LinearRegression R2: -0.00017849022655913593
LinearRegression MAE: 111.20036190435586
LinearRegression MSE: 17299.19523522385
SVR R2: -0.06575069272608647
SVR MAE: 106.07285704606691
SVR MSE: 18433.339134665242
DecisionTree R2: 0.7871190673834627
DecisionTree MAE: 39.0525
DecisionTree MSE: 3682.0116121031742
Ridge Regression R2: 0.8910406051493795
Ridge Regression MAE: 32.17445792077355
Ridge Regression MSE: 1884.5734662877614
GaussianProcessRegressor R2: -1.905496239946293
GaussianProcessRegressor MAE: 188.0253322547167
GaussianProcessRegressor MSE: 50253.77690201503
AdaBoostRegressor R2: 0.8683799411630398
AdaBoostRegressor MAE: 35.934305296038396
AdaBoostRegressor MSE: 2276.5147590571137
BaggingRegressor R2: 0.8359317519911869
BaggingRegressor MAE: 30.63707833220199
BaggingRegressor MSE: 2837.7421449672133
ExtraTreesRegressor R2: 0.8477586146264753
ExtraTreesRegressor

In [21]:
print("***bandgap_crystal***")
performance_data = pd.DataFrame(columns=['Model', 'R2', 'MAE', 'MSE'])
train_predict(bc_X_train, bc_X_test, bc_y_train, bc_y_test)
performance_data.to_csv('./model_performance/bandgap_crystal.csv', index=False)


***bandgap_crystal***
RandomForest R2: 0.7726795069833294
RandomForest MAE: 0.803925
RandomForest MSE: 1.003354361666667
LinearRegression R2: 0.583020059358073
LinearRegression MAE: 0.9918139570307278
LinearRegression MSE: 1.8404792133716836
SVR R2: -0.10742247588789988
SVR MAE: 1.7029543823655093
SVR MSE: 4.887976251698249
DecisionTree R2: 0.7602583212723335
DecisionTree MAE: 0.8172874999999997
DecisionTree MSE: 1.0581793829166666
Ridge Regression R2: 0.8242596072821184
Ridge Regression MAE: 0.6467277824526799
Ridge Regression MSE: 0.7756884881538952
GaussianProcessRegressor R2: -5.984309079857041
GaussianProcessRegressor MAE: 5.139429166666667
GaussianProcessRegressor MSE: 30.827563698749994
AdaBoostRegressor R2: 0.8308044513458083
AdaBoostRegressor MAE: 0.6708269388039324
AdaBoostRegressor MSE: 0.7468006490040402
BaggingRegressor R2: 0.8263578736730555
BaggingRegressor MAE: 0.6479037499999999
BaggingRegressor MSE: 0.7664270937791665
ExtraTreesRegressor R2: 0.8568143755853017
ExtraTr

In [22]:
print("***atomization energy***")
performance_data = pd.DataFrame(columns=['Model', 'R2', 'MAE', 'MSE'])
train_predict(ae_X_train, ae_X_test, ae_y_train, ae_y_test)
performance_data.to_csv('./model_performance/atomization energy.csv', index=False)

***atomization energy***
RandomForest R2: 0.955150215095907
RandomForest MAE: 0.0497435897435901
RandomForest MSE: 0.005025641025641048
LinearRegression R2: -14.791845637509647
LinearRegression MAE: 0.2795157935647559
LinearRegression MSE: 1.7695546918713607
SVR R2: -0.010928641842724574
SVR MAE: 0.27042470423729176
SVR MSE: 0.11327957240608147
DecisionTree R2: 0.917531313880434
DecisionTree MAE: 0.06871794871794865
DecisionTree MSE: 0.00924102564102558
Ridge Regression R2: 0.9552727203657102
Ridge Regression MAE: 0.05417531247403009
Ridge Regression MSE: 0.005011913702063093
GaussianProcessRegressor R2: -318.3582902954816
GaussianProcessRegressor MAE: 5.96007647623907
GaussianProcessRegressor MSE: 35.78568167093004
AdaBoostRegressor R2: 0.8797193381717081
AdaBoostRegressor MAE: 0.0926267400743825
AdaBoostRegressor MSE: 0.01347804521176991
BaggingRegressor R2: 0.9650000058673188
BaggingRegressor MAE: 0.04028205128205136
BaggingRegressor MSE: 0.003921923076923073
ExtraTreesRegressor R2:

## validation

In [55]:
def RandomForest(X_train, X_test, y_train, y_test):
    model = RandomForestRegressor(n_estimators=1, random_state=42)

    # 在训练集上训练模型
    model.fit(X_train, y_train)

    return model

def Linear(X_train, X_test, y_train, y_test):
    model = LinearRegression()

    # 在训练集上训练模型
    model.fit(X_train, y_train)

    return model

def Svr(X_train, X_test, y_train, y_test):
    model = SVR()

    # 在训练集上训练模型
    model.fit(X_train, y_train)

    return model

def DecisionTree(X_train, X_test, y_train, y_test):
    model = DecisionTreeRegressor(random_state=42)

    # 在训练集上训练模型
    model.fit(X_train, y_train)

    return model

def RidgeRegression(X_train, X_test, y_train, y_test):
    model = Ridge(alpha=1.0)
    model.fit(X_train, y_train)
    return model

def GaussianProcess(X_train, X_test, y_train, y_test):
    model = GaussianProcessRegressor()
    model.fit(X_train, y_train)
    return model

def AdaBoost(X_train, X_test, y_train, y_test):
    model = AdaBoostRegressor()
    model.fit(X_train, y_train)
    return model

def GradientBoosting(X_train, X_test, y_train, y_test):
    model = GradientBoostingRegressor()
    model.fit(X_train, y_train)
    return model

def Bagging(X_train, X_test, y_train, y_test):
    model = BaggingRegressor()
    model.fit(X_train, y_train)
    return model

def ExtraTrees(X_train, X_test, y_train, y_test):
    model = ExtraTreesRegressor()
    model.fit(X_train, y_train)
    return model

def train_predict_val(X_train, X_test, y_train, y_test):
    """
    only for getting the trained models here
    """
    rf = RandomForest(X_train, X_test, y_train, y_test)
    lr = Linear(X_train, X_test, y_train, y_test)
    svr = Svr(X_train, X_test, y_train, y_test)
    dt = DecisionTree(X_train, X_test, y_train, y_test)
    rg = RidgeRegression(X_train, X_test, y_train, y_test)
    gp = GaussianProcess(X_train, X_test, y_train, y_test)
    ada = AdaBoost(X_train, X_test, y_train, y_test)
    bag = Bagging(X_train, X_test, y_train, y_test)
    ext = ExtraTrees(X_train, X_test, y_train, y_test)
    
    return rf,lr,svr,dt,rg,gp,ada,bag,ext

In [60]:
df_val = pd.read_csv('/home/hkqiu/work/PolyGPT/polymer properties/reg-cls/exp_val/data_with_descriptors.csv')
df_val = df_val.drop("Smiles", axis=1)

print("***Tg***")
rf,lr,svr,dt,rg,gp,ada,bag,ext = train_predict_val(tg_X_train, tg_X_test, tg_y_train, tg_y_test)
rf_pred = rf.predict(df_val)
lr_pred = lr.predict(df_val)
svr_pred = svr.predict(df_val)
dt_pred = dt.predict(df_val)

rg_pred = rg.predict(df_val)
gp_pred = gp.predict(df_val)
ada_pred = ada.predict(df_val)
bag_pred = bag.predict(df_val)

ext_pred = ext.predict(df_val)
# xgb_pred = xgb.predict(df_val)

print("Ground truth: 361, 381")
print(f"RF prediction: {rf_pred}")
print(f"LR prediction: {lr_pred}")
print(f"SVR prediction: {svr_pred}")
print(f"DT prediction: {dt_pred}")
print(f"RG prediction: {rg_pred}")
print(f"GP prediction: {gp_pred}")
print(f"ADA prediction: {ada_pred}")
print(f"BAG prediction: {bag_pred}")
print(f"EXT prediction: {ext_pred}")
# print(f"XGB prediction: {xgb_pred}")

***Tg***
Ground truth: 361, 381
RF prediction: [274.         330.         394.85       374.54545455 330.
 378.875      332.         420.         332.         276.85      ]
LR prediction: [[194.85355533]
 [194.8879272 ]
 [194.85352993]
 [194.86679209]
 [194.88790605]
 [194.84727622]
 [194.85125875]
 [194.92762258]
 [194.85130049]
 [194.86684644]]
SVR prediction: [228.89999996 228.89999996 228.89999996 228.89999996 228.89999996
 228.89999996 228.89999996 228.89999996 228.89999996 228.89999996]
DT prediction: [274.   400.   394.85 510.   400.   380.5  321.   330.   321.   276.85]
RG prediction: [[289.42955484]
 [291.77911777]
 [297.91116513]
 [313.55233816]
 [297.32353536]
 [344.9166446 ]
 [297.04468341]
 [317.42285777]
 [290.60898823]
 [313.38074568]]
GP prediction: [[  0.       ]
 [  0.       ]
 [  0.       ]
 [  0.       ]
 [  0.       ]
 [380.4977417]
 [  0.       ]
 [  0.       ]
 [  0.       ]
 [  0.       ]]
ADA prediction: [313.93513816 319.30694444 319.78571739 313.444869   319.7

In [61]:
# 先创建一个空的DataFrame，用于保存预测结果
predictions = pd.DataFrame()

# 进行预测并将结果保存到DataFrame中
predictions['RF'] = rf_pred
predictions['LR'] = lr_pred
predictions['SVR'] = svr_pred
predictions['DT'] = dt_pred
predictions['RG'] = rg_pred
predictions['GP'] = gp_pred
predictions['ADA'] = ada_pred
predictions['BAG'] = bag_pred
predictions['EXT'] = ext_pred

# 将DataFrame保存为CSV文件
predictions.to_csv('/home/hkqiu/work/PolyGPT/polymer properties/reg-cls/exp_val/ml_predictions.csv', index=False)


In [54]:
print("***atomization energy***")
train_predict(ae_X_train, ae_X_test, ae_y_train, ae_y_test)

***atomization energy***
RandomForest R2: 0.955150215095907
RandomForest MAE: 0.0497435897435901
RandomForest MSE: 0.005025641025641048
LinearRegression R2: -17.259364423441333
LinearRegression MAE: 0.2956599531004891
LinearRegression MSE: 2.0460524202024186
SVR R2: -0.010928641842724574
SVR MAE: 0.27042470423729176
SVR MSE: 0.11327957240608147
DecisionTree R2: 0.917531313880434
DecisionTree MAE: 0.06871794871794865
DecisionTree MSE: 0.00924102564102558
MLP R2: -86098.6933184913
MLP MAE: 29.443596991899526
MLP MSE: 9647.898021402405


## all properties

In [30]:
X_train = train_df.drop(['prompt','target','task'], axis=1)
X_test = test_df.drop(['prompt','target','task'], axis=1)
y_train = train_df[['target']]
y_test = test_df[['target']]

X_train = X_train.head(15654)
X_test = X_test.head(1764)
y_train = y_train.head(15654)
y_test = y_test.head(1764)

print("***all***")
performance_data = pd.DataFrame(columns=['Model', 'R2', 'MAE', 'MSE'])
train_predict(X_train, X_test, y_train, y_test)
performance_data.to_csv('./model_performance/reg-all.csv', index=False)


***all***
RandomForest R2: 0.8388710101234426
RandomForest MAE: 27.30894414478224
RandomForest MSE: 2591.1833496087784
LinearRegression R2: 0.002284679254769717
LinearRegression MAE: 107.18461334337869
LinearRegression MSE: 16044.68152345037
SVR R2: -0.3152832949069524
SVR MAE: 84.33632152195749
SVR MSE: 21151.626261621073
DecisionTree R2: 0.827571049549177
DecisionTree MAE: 26.761988916256158
DecisionTree MSE: 2772.9027888835126
Ridge Regression R2: 0.8362969987616238
Ridge Regression MAE: 37.012967155790115
Ridge Regression MSE: 2632.5771136208186
GaussianProcessRegressor R2: -0.2628318323923091
GaussianProcessRegressor MAE: 81.3712345257274
GaussianProcessRegressor MSE: 20308.13213660548
AdaBoostRegressor R2: 0.8540478706203829
AdaBoostRegressor MAE: 34.8719277487462
AdaBoostRegressor MSE: 2347.1178450143857
BaggingRegressor R2: 0.8483749016867923
BaggingRegressor MAE: 25.765970519369553
BaggingRegressor MSE: 2438.3472547862043
ExtraTreesRegressor R2: 0.9029059806371644
ExtraTreesRe

# Cls

In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import xgboost as xgb

# Create a DataFrame to store the performance data
performance_data = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

# Function to save performance data
def save_performance_data(model_name, accuracy, precision, recall, f1):
    global performance_data
    new_row = pd.DataFrame([[model_name, accuracy, precision, recall, f1]], columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
    performance_data = pd.concat([performance_data, new_row], ignore_index=True)

# Function to train and evaluate the Random Forest model
def RandomForest(X_train, X_test, y_train, y_test):
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    save_performance_data('Random Forest', accuracy, precision, recall, f1)
    print(f"Random Forest Accuracy: {accuracy}")
    print(f"Random Forest Precision: {precision}")
    print(f"Random Forest Recall: {recall}")
    print(f"Random Forest F1 Score: {f1}")
    print("=======================")

# Function to train and evaluate the Logistic Regression model
def LogisticReg(X_train, X_test, y_train, y_test):
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    save_performance_data('Logistic Regression', accuracy, precision, recall, f1)
    print(f"Logistic Regression Accuracy: {accuracy}")
    print(f"Logistic Regression Precision: {precision}")
    print(f"Logistic Regression Recall: {recall}")
    print(f"Logistic Regression F1 Score: {f1}")
    print("=======================")

# Function to train and evaluate the SVC model
def SVC_Classifier(X_train, X_test, y_train, y_test):
    from sklearn.svm import SVC
    model = SVC()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    save_performance_data('SVC', accuracy, precision, recall, f1)
    print(f"SVC Accuracy: {accuracy}")
    print(f"SVC Precision: {precision}")
    print(f"SVC Recall: {recall}")
    print(f"SVC F1 Score: {f1}")
    print("=======================")

# Function to train and evaluate the Decision Tree model
def DecisionTree(X_train, X_test, y_train, y_test):
    from sklearn.tree import DecisionTreeClassifier
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    save_performance_data('Decision Tree', accuracy, precision, recall, f1)
    print(f"Decision Tree Accuracy: {accuracy}")
    print(f"Decision Tree Precision: {precision}")
    print(f"Decision Tree Recall: {recall}")
    print(f"Decision Tree F1 Score: {f1}")
    print("=======================")

# Function to train and evaluate the AdaBoost model
def AdaBoost(X_train, X_test, y_train, y_test):
    from sklearn.ensemble import AdaBoostClassifier
    model = AdaBoostClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    save_performance_data('AdaBoost', accuracy, precision, recall, f1)
    print(f"AdaBoost Accuracy: {accuracy}")
    print(f"AdaBoost Precision: {precision}")
    print(f"AdaBoost Recall: {recall}")
    print(f"AdaBoost F1 Score: {f1}")
    print("=======================")

# Function to train and evaluate the XGBoost model
def XGBoost(X_train, X_test, y_train, y_test):
    # 将字符串类别标签转换为数值
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)

    # 训练 XGBoost 模型
    model = xgb.XGBClassifier()
    model.fit(X_train, y_train_encoded)
    y_pred_encoded = model.predict(X_test)

    # 将预测标签转换回原始的字符串类别标签
    y_pred = label_encoder.inverse_transform(y_pred_encoded)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    save_performance_data('XGBoost', accuracy, precision, recall, f1)
    print(f"XGBoost 精确度：{precision}")
    print(f"XGBoost 召回率：{recall}")
    print(f"XGBoost F1 分数：{f1}")
    print("=======================")

# Function to train and evaluate the K-Nearest Neighbors model
def KNN(X_train, X_test, y_train, y_test):
    from sklearn.neighbors import KNeighborsClassifier
    model = KNeighborsClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    save_performance_data('K-Nearest Neighbors', accuracy, precision, recall, f1)
    print(f"K-Nearest Neighbors Accuracy: {accuracy}")
    print(f"K-Nearest Neighbors Precision: {precision}")
    print(f"K-Nearest Neighbors Recall: {recall}")
    print(f"K-Nearest Neighbors F1 Score: {f1}")
    print("=======================")

# Function to train and evaluate the Naive Bayes model
def NaiveBayes(X_train, X_test, y_train, y_test):
    from sklearn.naive_bayes import GaussianNB
    model = GaussianNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    save_performance_data('Naive Bayes', accuracy, precision, recall, f1)
    print(f"Naive Bayes Accuracy: {accuracy}")
    print(f"Naive Bayes Precision: {precision}")
    print(f"Naive Bayes Recall: {recall}")
    print(f"Naive Bayes F1 Score: {f1}")
    print("=======================")

def cls_train_predict(X_train, X_test, y_train, y_test):
    RandomForest(X_train, X_test, y_train, y_test)
    LogisticReg(X_train, X_test, y_train, y_test)
    SVC_Classifier(X_train, X_test, y_train, y_test)
    DecisionTree(X_train, X_test, y_train, y_test)
    AdaBoost(X_train, X_test, y_train, y_test)
    KNN(X_train, X_test, y_train, y_test)
    NaiveBayes(X_train, X_test, y_train, y_test)
    XGBoost(X_train, X_test, y_train, y_test)


In [25]:
cls_train_predict(hrc_X_train, hrc_X_test, hrc_y_train, hrc_y_test)
performance_data.to_csv('./model_performance/heat resistence.csv', index=False)

Random Forest Accuracy: 0.7297297297297297
Random Forest Precision: 0.6655405405405406
Random Forest Recall: 0.7297297297297297
Random Forest F1 Score: 0.6954954954954955
Logistic Regression Accuracy: 0.08108108108108109
Logistic Regression Precision: 0.006574141709276844
Logistic Regression Recall: 0.08108108108108109
Logistic Regression F1 Score: 0.012162162162162163
SVC Accuracy: 0.5675675675675675
SVC Precision: 0.32213294375456536
SVC Recall: 0.5675675675675675
SVC F1 Score: 0.4109972041006524
Decision Tree Accuracy: 0.6756756756756757
Decision Tree Precision: 0.6400635930047694
Decision Tree Recall: 0.6756756756756757
Decision Tree F1 Score: 0.6537903757415953
AdaBoost Accuracy: 0.4864864864864865
AdaBoost Precision: 0.7913851351351351
AdaBoost Recall: 0.4864864864864865
AdaBoost F1 Score: 0.4251651651651651
K-Nearest Neighbors Accuracy: 0.7027027027027027
K-Nearest Neighbors Precision: 0.7327702702702703
K-Nearest Neighbors Recall: 0.7027027027027027
K-Nearest Neighbors F1 Score