In [1]:
import os
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.cross_decomposition import PLSRegression
from xgboost import XGBRegressor
from sklearn.datasets import dump_svmlight_file

# Running table of evaluation metrics; build_model() appends one row per
# fitted model and input CSV.
results = pd.DataFrame(
    columns=[
        "Classifier",
        "Input CSV",
        "R2 Train",
        "RMSE Train",
        "Q2 CV",
        "RMSE CV",
        "Q2 Test",
        "RMSE Test",
    ]
)

# Column-stacked prediction matrices collected by build_model(). None means
# "nothing collected yet", so the first call can tell it has to create the
# arrays instead of appending to them.
feat_train = feat_test = None

def correlation(df, threshold):
    """Drop descriptors that are highly correlated with an earlier column.

    A column is removed when its correlation (as computed by ``df.corr()``)
    with at least one column positioned before it is ``>= threshold``.

    Args:
        df: DataFrame of descriptors. It is modified in place.
        threshold: Correlation value at or above which the later column of a
            pair is dropped.

    Returns:
        A tuple ``(df, des3, des4)``: the same (now filtered) DataFrame, the
        number of columns before filtering and the number after.
    """
    des3 = len(df.columns)
    corr_matrix = df.corr()

    # Evaluate every (i, j) pair with j < i in a single vectorised pass rather
    # than one .iloc lookup per pair. NaN correlations compare as False, so
    # they never trigger a drop.
    # NOTE(review): the comparison is signed, so strongly *negative*
    # correlations are kept -- confirm that this is intended.
    with np.errstate(invalid="ignore"):
        above_threshold = corr_matrix.to_numpy() >= threshold
    earlier_pairs = np.tril(above_threshold, k=-1)  # strict lower triangle
    redundant = corr_matrix.columns[earlier_pairs.any(axis=1)]

    # Callers may rely on the input frame itself being filtered, so keep the
    # in-place semantics.
    df.drop(columns=redundant, inplace=True)

    des4 = len(df.columns)
    print(f"Correlation filter reduced descriptors from {des3} to {des4}.")
    return df, des3, des4

def build_model(X, Y, seed, hx=None, input_csv_name="unknown"):
    """Fit each regressor on one descriptor set and record its metrics.

    The data are split 80/20 into an internal (training) and an external
    (test) part. For every model the function computes cross-validated
    predictions (``cv=10``) on the internal part, refits on the whole internal
    part, and predicts both parts.

    Side effects on module-level state:
        * ``feat_train`` / ``feat_test`` gain one column per model: the
          cross-validated predictions for the internal part and the
          predictions for the external part, respectively.
        * ``results`` gains one row of metrics per model.

    Args:
        X: Descriptor table, one row per sample.
        Y: Target values, aligned with the rows of ``X``.
        seed: ``random_state`` passed to ``train_test_split``.
        hx: Unused by this function; kept so existing callers keep working.
        input_csv_name: Label written to the "Input CSV" column of ``results``.

    Returns:
        The updated module-level ``results`` DataFrame.
    """
    global feat_train, feat_test, results

    # Split data into internal (train) and external (test)
    X_internal, X_external, Y_internal, Y_external = train_test_split(
        X, Y, test_size=0.2, random_state=seed
    )

    # Models to evaluate
    models = {
        "RF": RandomForestRegressor(n_estimators=400, max_features='sqrt',
                                    min_samples_leaf=1, random_state=13, n_jobs=-1),
        "MLP": MLPRegressor(hidden_layer_sizes=(100,), activation='relu',
                            solver='adam', random_state=42, max_iter=1000),
        "XGB": XGBRegressor(n_estimators=100, max_depth=6, learning_rate=0.1,
                            subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1),
        "SVMR": SVR(kernel='rbf', C=1.0, epsilon=0.1),
        "SVMP": SVR(kernel='poly', degree=1, C=1.0, epsilon=0.1),
        "PLS": PLSRegression(n_components=2),
    }

    for name, model in models.items():
        # Cross-validated predictions come first; cross_val_predict does not
        # fit the estimator passed in, so the explicit fit below is required.
        cv_pred = cross_val_predict(model, X_internal, Y_internal, cv=10, n_jobs=-1)
        model.fit(X_internal, Y_internal)

        train_pred = model.predict(X_internal)
        test_pred = model.predict(X_external)

        # reshape(-1, 1) turns each prediction vector into a single column so
        # it can be stacked next to the columns collected so far.
        cv_column = cv_pred.reshape(-1, 1)
        test_column = test_pred.reshape(-1, 1)
        if feat_train is None:
            # First model of the first call: start the feature matrices.
            feat_train, feat_test = cv_column, test_column
        else:
            feat_train = np.hstack((feat_train, cv_column))
            feat_test = np.hstack((feat_test, test_column))

        new_row = pd.DataFrame({
            "Classifier": [name],
            "Input CSV": [input_csv_name],
            "R2 Train": [r2_score(Y_internal, train_pred)],
            "RMSE Train": [np.sqrt(mean_squared_error(Y_internal, train_pred))],
            "Q2 CV": [r2_score(Y_internal, cv_pred)],  # R2 of cross-validation
            "RMSE CV": [np.sqrt(mean_squared_error(Y_internal, cv_pred))],
            "Q2 Test": [r2_score(Y_external, test_pred)],
            "RMSE Test": [np.sqrt(mean_squared_error(Y_external, test_pred))],
        })

        # Concatenating onto an empty frame is deprecated in recent pandas
        # (FutureWarning) and would leave every column with object dtype, so
        # the first row simply becomes the table.
        if results.empty:
            results = new_row
        else:
            results = pd.concat([results, new_row], ignore_index=True)

    return results

# Main script
pIC50=[4.046723663,5.110138279,5.345630909,4.287552343,4.647817482,4.616184634,4.910094889,4.017728767,4.017728767,10,7.835647144,5.526221165,6.187086643,7.397940009,6.142667504,6.387216143,5.657577319,7.045757491,8.522878745,9,5.8569852,3,4.312203589,5.493494968,5.096910013,5.455931956,5.086186148,6.036212173,5.59345982,6.327902142,6.823908741,6.619788758,6.004364805,7.397940009,6.638272164,5.872895202,5.075720714,6.638272164,5.193820026,8.15490196,8.104577454,9.113509275,4.562249437,6.236572006,4.397940009,7.619788758,5.619788758,6.494850022,5.522878745,4.327902142,8.045757491,7.920818754,8.096910013,7.337242168,3.886056648,7.853871964,4.721246399,8.301029996,7.283996656,5.537602002,6.508638306,4.721246399,3.37675071,4.920818754,4.853871964,4.638272164,4.004364805,5.193820026,5.785156152,9,7.657577319,7.22184875,6.698970004,6.647817482,7.045757491,6.207608311,6.27572413,5.823908741,7.602059991,4.698970004,4.060480747,7.431798276,6.552841969,5.346787486,7.096910013,8.397940009,5,5,7.962573502,-2.060701617,6.77314243,6.622875958,6.835052627,7.698970004,8.004364805,2.420216403,4.522878745,4.036212173,2,6.958607315,6.698970004,7,4.508638306,7.494850022,7.292429824,6,6.508638306,7.602059991,7.455931956,7.585026652,7.124938737,8.096910013,5.721246399,6.585026652,5.200659451,7,5.638272164,8.522878745,7.677780705,-1.6021,-1.7782,8.625251654,5.239577517,3.985437462,4.276134036,5.061980903,7.15490196,6.585026652,9,3.84575867,6.619788758,7.200659451,3.797457195,5.454692884,3.037228288,6.795880017,5.647817482,6.970616222,3.162171756,4.221631657,6.522878745,4.63638802,5.187086643,5.66756154,4.477555766,5.147520006,3.291664097,3.304868702,3.912573543,3.118729496,3.286509457,10.52287875,7.698970004,8.638272164,7.853871964,7.823908741,7.537602002,7.920818754,8.045757491,7.853871964,8,7.920818754,7.823908741,8.638272164,3.24131515,4.164309429,3.948847478,3.151995729,5.180456064,3.994390555,3.178486472,6.236572006,4.235823868,3.201487467,4.264800452,3.406381692,4.651695137,3.109746
695,3.529883647,4.172630727,3.04885684,3.171082438,4.243363892,4.15490196,4.096910013,7.920818754,5.544698228,5.399462706,6.001740662,6.229147988,5.406713933,4.195179321,4.958607315,6.301029996,4.251811973,7,5.004364805,6.698970004,4.48148606,6.301029996,4.677780705,6,5.008773924,5.749579998,2.318758763,2.552841969,2.142667504,2.568636236,2.301029996,2.903089987,2.193820026,2.301029996,2.301029996,3,7.384049948,8.508638306,7.931814138,8.537602002,8.26760624,8.065501549,8.537602002,9.15490196,9.22184875,7.995678626,8.070581074,8.431798276,7.958607315,3.602059991,-2.875100059,4.752026734,4.913640169,7.30980392,7.387216143,-3.7403998,5,4.382999659,5.779999191,6.292429824,6.589999287,5.844663963,5.889999849,5.849999245,5.749999856,3.987162775,4.301029996,3.974694135,4.473660723,5.379863945,5.619788758,5.369572125,5.54515514,4.089375595,4.197910742,4.467245621,4.392544977,4.325138859,4.392544977,6.602059991,5.549750892,6.292429824,4.518557371,3.583692413,6.301029996,5.417936637,4.721246399,4.935542011,3.869666232,3.829738285,5.212539525,5.412289035,4.938924676,5.522878745,5.397940009,4.886056648,5.397940009,4.721246399,4.721246399,5.886056648,5.886056648,5.698970004,5.698970004,4.337242168,5.677780705,5.677780705,4.337242168,4.677780705,4.677780705,5.886056648,5.886056648,2.397940009,5.870000928,3.943095149,3.782516056,4.428291168,5.367542708,5.215382707,5.334419009,5.022733788,4.166852888,6.397940009,5.954677021,5.189095719,4.732828272,4.847711656,4.431798276,4.001479117,3.792688849,4.280088936,3.783860499,6.387216143,4.795880017,4.795880017,4.504455662,5.568636236,4.754487332,5.801342913,6.387216143,6.698970004,8,3.301000004,6.022276395,6.13076828,5.798602876,5.527243551,4.657577319,4.657577319,4.406713933,5.397940009,4.943095149,4.785156152,4.684029655,5.142667504,4.26520017,4.079876674,5.236572006,4.847711656,5.036212173,4.303643611,5.026872146,5.102372909,5.070581074,5.008773924,4.333482019,4.160521953,5.494850022,4.801342913,4.104577454,6.022276395,4.946921557,7.56
8636236,6.698970004,8.22184875,7.096910013,7.22184875,4.602059991,7.958607315,7.318758763,3.653647026,9.004364805,9.455931956,6.301029996,5.638272164,8.522878745,6.13667714,7.552841969,5.600326279,6.869666232,5.392544977,5.281498311,6.337242168,5.665546249,5.372634143,4.818728228,5.498940738,5.052566278,5.458420756,5.247183569,5.653647026,6.017728767,5.110698297,4.526221165,5.684029655,4.876801925,5.327902142,5.076755981,5.512861625,5.056505484,5.536107011,6.026872146,4.562407968,6.026872146,4.654822383,4.695294102,4.460422117,5.079354999,5.407823243,5.519993057,5.293282218,5.005682847,5.203425667,5.801342913,6.107905397,6.420216403,5.402304814,6.443697499,5.498940738,5.946921557,7.30980392,5.707743929,5.661543506,6.086186148,5.896196279,5.698970004,5.669586227,6.065501549,5.555955204,5.634512015,6.036212173,5.966576245,6.236572006,6.180456064,6.193820026,6.853871964,5.798602876,6.065501549,6.356547324,6.958607315,6.795880017,7.342944147,8.166215625,8.193820026,6.820448209,7.507239611,8.07109231,5.919734373,7.026872146,6.241088108,7.424812155,7.170696227,7.709965389,5.886056648,7.054039296,4.154715874,5.617982957,6.677780705,5.029653124,4.333107789,5.89279003,6.517126416,5.972242795,5.1837587,4.659754238,4.93330145,5.835647144,6.638272164,6.431798276,6.102372909,6.823908741,5.841637508,5.931814138,6.602059991,7.055517328,10,7.27572413,8.761953897,8.431798276,9.823908741,6.575118363,6.721246399,6.642065153,5.573488739,7.924453039,7.528708289,5.105407052,5.91829273,5.633763876,5.386052523,5.814457845,5.038389092,5.608711951,6.057991947,6.246416941,5.232844134,6.4867824,5.362010219,6.232102384,5.558304864,9.522878745,7.782516056,8.060480747,7.853871964,5.888065724,7.107905397,6.022276395,9.229147988,7.494850022,9.301029996,8.725842151,5.785818691,7.565590792,7.15739076,7.716698771,7.7044329,7.563837353,8.175223538,8.787812396,6.331427731,5.372378149,6.647238808,6.08249449,6.227238353,6.375408541,6.598944274,6.776504059,6.387216143,6.593971055,6.45432185,5.684113584,6.7
03839586,6.019165216,7.230253533,8.119758224,7.285837954,5.721475035,6.270373613,8.721246399,8.494850022,8.063989204,7.958607315,8.054531415,6.244918265,5.652149144,6.055566682,5.07494588,7.072167867,7.576918042,6.095825632,8.677780705,6.878112015,8.068542129,7.841637508,8.585026652,6.785156152,6.410497204,5.7012124,7.621057301,9.022276395,7.807711387,8.568636236,8.327902142,6.541362151,7.418961051,6.573537539,5.496713336,7.249029016,8.388276692,8.640164518,9.455931956,6.560193789,8.742321425,7.778846678,7.315334136,7.962175249,7.562407968,6.880645119,7.37716452,7.356547324,6.021531875,8.725842151,8.459670525,7.841637508,7.625251654,5.270592203,6.309272456,8.42945706,9.13667714,6.709809516,6.534617149,8.148741651,7.886056648,8.443697499,8,7.677780705,7.508638306,7.229147988,7.823908741,8.716698771,6.494850022,5.823908741,4.631397041,10,8.657577319,9.283996656,10.39794001,8.769551079,8.744727495,8.494850022,6.48148606,8.008773924,8.744727495,8.886056648,7.346787486,8.698970004,6.744727495,8.619788758,5.468521083,7.698970004,7.657577319,8.657577319,4.958607315,5.318758763,5.94195377,9.522878745,8.537602002,7.752026734,10.15490196,9.045757491,9.026872146,9.356547324,9.193820026,9.045757491,9.173925197,5.657577319,6.576754126,6.537602002,6.744727495,7.283996656,4.366531544,5.602059991,5.443697499,5.795880017,5.494850022,6.301029996,6.823908741,7.397940009,6.961777362,5.060480747,5.744727495,7.301029996,6.744727495,8.096910013,7.406713933,6.229147988,5.812479279,5.57024772,6.22184875,7.638272164,8.15490196,6.677780705,6.744727495,6.240332155,6.568636236,9,4.906578315,5.302509113,8.657577319,8.387216143,8.823908741,8.795880017,9.301029996,6.823908741,6.107905397,5.95939766,6.698970004,5.943095149,5.619788758,6.744727495,6.366531544,7.070581074,8.397940009,7,7.795880017,5.004364805,4.669586227,7.416801226,7.721246399,8.161150909,8.173925197,7.568636236,8.48148606,6.522878745,8.522878745,7.15490196,7.397940009,6.853871964,6.638272164,6.183096161,7.381951903,4.512296137,7.88
6056648,4.835647144,4.835647144,7.602059991,7.886056648,6.444905551,6.48148606,4.251811973,6.173925197,6.026872146,6.958607315,8.522878745,7.187086643,6.886056648,8.522878745,7.045757491,6.698970004,7.22184875,6.327902142,5.30980392,4.494850022,4.494850022,7.366531544,2.638272164,5.30980392,6.853871964,4.769551079,9.522878745,8.301029996,8.522878745,5.522878745,8.698970004,6.619788758,6.37675071,7.920818754,7.744727495,8.698970004,7.920818754,6.568636236,8.22184875,8.045757491,8.096910013,8.698970004,8.522878745,8.301029996,8.15490196,7.769551079,8.522878745,7.301029996,9.15490196,8.698970004,8.301029996,8.15490196,8.045757491,5.37675071,4.721246399,6.301029996,6.617982957,6.698970004,8.337242168,6.853871964,5.853871964,5.823908741,6.886056648,4.602059991,8.744727495,5.899629455,6.142667504,5.447331784,5.033389013,7.15490196,4.920818754,6.552841969,5.124938737,3.886056648,5.975001527,4.886056648,5.795880017,6.552841969,5.124938737,3.886056648,7.823908741,8.15490196,5.795880017,0.699,4.602059991,4.602059991,8.153972325,8.522878745,9,6.142667504,8.301029996,6.173925197,6.48148606,4.886056648,3.526075307,6.239577517,10.22184875,9.301029996,5.709520187,3.904412253,3.915852867,3.958528359,3.921688213,3.839831707,3.724412259,3.887830852,4.061130178,4.050414849,4.121018877,4.155895769,3.973303344,4.147032309,4.527389802,3.783781299,4.047498152,4.545765104,4.015697768,3.857110195,3.780520262,3.839771643,4.240105626,4.43568909,4.332920795,4.753501419,7.886056648,5.585026652,8.657577319,8.958607315,9.22184875,9,9.045757491,5.698970004,7.494850022,6.508638306,7.443697499,5.66756154,5.639999555,5.782516056,5.695007824,6.657577319,6.509999358,7.100015437,8,9.301029996,6.346787486,7.795880017,9,7.522878745,9.522878745,6.206999324,8.045757491,9.301029996,8.769551079,3.987162775,3.102372909,5.677780705,5.567030709,5.66756154,5.93930216,7.397940009,7.920818754,7.379031564,3.520410295,4.269999997,3.829999997,3.369999999,3.721246399,5.15490196,-0.699,5.740000205,4.318668294,4.04196198
4,7.387216143,6.387216143,4.077326432,6.638272164,7.420216403,6.013228266,6.920818754,6.244125144,6.48148606,5.528708289,4.392544977,6.004364805,5.274088368,6.30980392,5.419075024,4.876148359,4.510041521,5.356547324,5.698970004,8.88941029,5.431798276,5.480172006,5.298432015,5.59345982,3.548858579,3.440248935,3.407812135,3.537691865,3.591336126,4.585026652,5.091514981,5.501689446,5.583359493,5.330683119,5.913640169,5.889999849,5.292429824,5.589999287,4.584692708,4.288192771,4.612610174,4.614393726,6.013228266,5.723538196,5.752026734,4.732828272,4.987162775,4.09420412,4.179142011,4.077793723,4.525783736,5.058488567,7.920818754,5.863279433,4.425968732,5.625251654,4.066006836,4.375717904,3.066644336,3.071552937,4.610833916,5.193820026,4.598599459,7.508638306,6.698970004,4.492144128,4.968187729,2.214670165,4.910094889,5.101274818,6.853871964,5.790484985,5.142667504,8.096910013,5.148741651,2.283996656,4.04769199,5.096910013,-2.146100116,-2,-0.9542,3.91721463,3.391902054,5.806875402,3.728158393,3.229999997,4.361910278,4.492144128,4.463441557,4.498940738,3.962573502,4.247951552,3.978810701,4.037630664,4.898940645,4.800793521,4.518557371,4.621602099,3.434305628,6.239999953,5.870000928,5.679999196,5.14327111,6.443697499,6.494850022,5.070000019,4.211124884,4.294992041,5.119186408,4.48148606,4.630784143,5.063486258,6.091514981,5.607303047,4.302770657,4.943095149,2.397940009,-2.699000404,5.301029996,4.698970004,3.602059991,3.716698771,-3.176099945,6.27572413,6.552841969,6.677780705,5.991399828,6.008773924,3.8569852,3.684029655,3.060480747,2.050609993,2.585026652,3.966576245,3.879426069,3.66756154,3.844663963,3.723538196,3.886056648,1.431798276,5.447331784,4.78968148,5.330683119,5.214670165,4.195179321,4.55129368,4.46470588,4.565431096,4.427128398,4.739928612,4.19314197,7.397940009,4.238072162,3.962573502,9.229147988,5.779999191,4.839999931,5.359518563,5.188424994,5.068542129,4.774690718,4.869666232,4.363512104,4.860120914,4.671620397,5.123205024,5.91721463,4.694648631,5.23358715
3,6.187086643,4.647817482,5.116338565,5.728158393,4.36552273,4.88941029,5.539102157,4.703115524,4.325138859,5.537602002,5.552841969,3.698970004,4.397940009,3.119186408,4.863279433,4.742321425,4.970616222,5.716698771,5.036212173,4.924453039,4.343326954,4.528708289,4.387216143,4.651695137,4.657577319,4.142667504,5.684029655,5.522878745,4.080398976,6.468521083,-2.300999594,5.026872146,5.036212173,5.102372909,4.954677021,5.119186408,4.112945622,4.575118363,4.335358024,5.585026652,5.657577319,5.050609993,4.950781977,3.431563586,4.354577731,4.415668776,4.204119983,4.558304864,4.19239733,4.200000031,4.20155654,4.090000009,4.958607315,4.324221658,4.389999976,4.33999999,3.780000002,4.343422709,4.351932871,4.303556237,4.313900228,4.300000029,4.194159451,4.345823458,4.539102157,4.328827157,4.879426069,4.4867824,4.813608784,3.330000001,4.079999985,4.040000019,3.530000004,4.169999984,3.589999997,3.970169981,3.74052558,3.983509868,3.560000001,3.869999994,3.350000002,4.476773958,4.772113295,4.910094889,5.055517328,4.484126156,4.381951903,5.384049948,4.52651303,6.040958608,5.716698771,3.900664722,4.596879479]

# Zero-based positions of the samples excluded from modelling. The list is
# kept as-is because the descriptor tables are filtered with it further down.
positions_to_remove = [89, 119, 120, 228, 233, 773, 858, 940, 941, 942, 979, 984, 1061]
# Build the lookup set once so each membership test in the filter is O(1)
# instead of a scan of the list.
_excluded_positions = set(positions_to_remove)
pIC50_cleaned = [value for index, value in enumerate(pIC50) if index not in _excluded_positions]



# Path to the Feature folder
feature_folder = 'Feature'

# Process all CSV files in the Feature folder. os.listdir() returns entries in
# arbitrary order, so the names are sorted: the column order of the stacked
# feature files and the row order of `results` then stay the same on every
# run and on every platform.
for file_name in sorted(os.listdir(feature_folder)):
    if not file_name.endswith('.csv'):
        continue

    input_csv_name = file_name
    print(f"Processing file: {input_csv_name}")

    # Read the CSV file and remove the first column
    file_path = os.path.join(feature_folder, input_csv_name)
    df = pd.read_csv(file_path)
    df_cleaned = df.iloc[:, 1:]  # Remove the first column

    # Remove the excluded samples. drop() matches index *labels*.
    # NOTE(review): this relies on the labels being the default 0..n-1 row
    # numbers so that they line up with the positions removed from pIC50.
    rows_to_remove = positions_to_remove
    df_cleaned = df_cleaned.drop(rows_to_remove)

    # Remove correlated descriptors
    df_cleaned, des3, des4 = correlation(df_cleaned, threshold=0.9)

    # Run the model; build_model() also extends feat_train / feat_test.
    hx = df_cleaned.columns.tolist()
    results = build_model(df_cleaned, pIC50_cleaned, seed=67, hx=hx, input_csv_name=input_csv_name)

# Convert the stacked prediction arrays to DataFrames
feat_train_df = pd.DataFrame(feat_train)
feat_test_df = pd.DataFrame(feat_test)

# Save all features as CSV files
feat_train_df.to_csv('feat_train.csv', index=False)
feat_test_df.to_csv('feat_test.csv', index=False)




Processing file: AP2D.csv
Correlation filter reduced descriptors from 780 to 694.
Processing file: AP2DC.csv
Correlation filter reduced descriptors from 780 to 641.
Processing file: CDK.csv
Correlation filter reduced descriptors from 1024 to 1023.
Processing file: CDKExt.csv
Correlation filter reduced descriptors from 1024 to 1022.
Processing file: CDKGraph.csv
Correlation filter reduced descriptors from 1024 to 921.
Processing file: Estate.csv
Correlation filter reduced descriptors from 79 to 78.
Processing file: FP4.csv
Correlation filter reduced descriptors from 307 to 296.
Processing file: FP4C.csv
Correlation filter reduced descriptors from 307 to 283.
Processing file: KR.csv
Correlation filter reduced descriptors from 4860 to 4459.
Processing file: KRC.csv
Correlation filter reduced descriptors from 4860 to 4390.
Processing file: MACCS.csv
Correlation filter reduced descriptors from 166 to 146.
Processing file: PubChem.csv
Correlation filter reduced descriptors from 881 to 626.
P

223