In [12]:
import json
import os

import numpy as np
import pandas as pd
#Classification
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
#Regression
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error

In [13]:
#Classification
def model_prediction_dict(X, y, model_name):
    """Fit one classifier on a stratified 70/30 split and score every row of ``X``.

    The model is trained on the 70% training part, its AUC on the 30% held-out
    part is printed, and the fitted model is then applied to the whole of ``X``.

    Parameters
    ----------
    X : feature table passed to ``train_test_split`` and to the estimator.
    y : labels aligned with the rows of ``X``; also used to stratify the split.
    model_name : str
        One of "catb", "ligb", "xgb", "gbdt", "rf", "logistic", "svm", "nn", "knn".

    Returns
    -------
    pandas.DataFrame
        Column 'Actual' holds ``y``; column 'Predicted' holds column index 1 of
        ``predict_proba`` for every row of ``X``. Rows used for training are
        included, so these scores are not purely out-of-sample.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names. This is checked
        before any data is split or any estimator is built.
    """
    # Zero-argument factories: only the requested estimator is ever constructed.
    model_factories = {
        "catb": lambda: CatBoostClassifier(random_state=2025, verbose=False),
        "ligb": lambda: LGBMClassifier(random_state=2025),
        "xgb": lambda: XGBClassifier(random_state=2025, use_label_encoder=False, eval_metric='logloss'),
        "gbdt": lambda: GradientBoostingClassifier(random_state=2025),
        "rf": lambda: RandomForestClassifier(random_state=2025),
        "logistic": lambda: LogisticRegression(random_state=2025),
        "svm": lambda: SVC(probability=True, random_state=2025),
        "nn": lambda: MLPClassifier(random_state=2025),
        "knn": lambda: KNeighborsClassifier(),
    }

    # Validate the name first so a typo is reported as such, rather than being
    # masked by (or delayed until after) the split below.
    try:
        build_model = model_factories[model_name]
    except (KeyError, TypeError):
        # TypeError covers unhashable names, which must still yield ValueError.
        raise ValueError(f"Unrecognized model name: {model_name}. Please check your input.") from None

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=2025, stratify=y
    )

    model = build_model()
    model.fit(X_train, y_train)

    # Held-out AUC, computed from the scores in column index 1 of predict_proba.
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    auc_score = roc_auc_score(y_test, y_pred_prob)
    print(f"{model_name} Test AUC: {auc_score}")

    predictions_df = pd.DataFrame({
        'Actual': y,
        'Predicted': model.predict_proba(X)[:, 1],
    })

    return predictions_df

In [14]:
def model_prediction_quan(X, y, model_name):
    """Fit one regressor on a 70/30 split and predict every row of ``X``.

    The model is trained on the 70% training part, its mean squared error on
    the 30% held-out part is printed, and the fitted model is then applied to
    the whole of ``X``.

    Parameters
    ----------
    X : feature table passed to ``train_test_split`` and to the estimator.
    y : targets aligned with the rows of ``X``.
    model_name : str
        One of "catb", "ligb", "xgb", "gbdt", "rf", "linear", "svm", "nn", "knn".

    Returns
    -------
    pandas.DataFrame
        Column 'Actual' holds ``y``; column 'Predicted' holds ``model.predict``
        for every row of ``X``. Rows used for training are included, so these
        predictions are not purely out-of-sample.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names. This is checked
        before any data is split or any estimator is built.
    """
    # Zero-argument factories: only the requested estimator is ever constructed.
    model_factories = {
        "catb": lambda: CatBoostRegressor(random_state=2025, verbose=False),
        "ligb": lambda: LGBMRegressor(random_state=2025),
        # use_label_encoder is a classifier-only (and deprecated) XGBoost option;
        # passing it to a regressor only produces an unused-parameter warning.
        "xgb": lambda: XGBRegressor(random_state=2025),
        "gbdt": lambda: GradientBoostingRegressor(random_state=2025),
        "rf": lambda: RandomForestRegressor(random_state=2025),
        "linear": lambda: LinearRegression(),
        "svm": lambda: SVR(),
        "nn": lambda: MLPRegressor(random_state=2025),
        "knn": lambda: KNeighborsRegressor(),
    }

    # Validate the name first so a typo is reported before any work is done.
    try:
        build_model = model_factories[model_name]
    except (KeyError, TypeError):
        # TypeError covers unhashable names, which must still yield ValueError.
        raise ValueError(f"Unrecognized model name: {model_name}. Please check your input.") from None

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=2025
    )

    model = build_model()
    model.fit(X_train, y_train)

    # Held-out error on the 30% test part.
    y_pred = model.predict(X_test)
    mse_score = mean_squared_error(y_test, y_pred)
    print(f"{model_name} Test MSE: {mse_score}")

    predictions_df = pd.DataFrame({
        'Actual': y,
        'Predicted': model.predict(X),
    })

    return predictions_df

In [15]:
def koml_prediction(X, y, models, binary=True, colpath="", outpath=""):
    """Fit each named model on its selected features and write its predictions to CSV.

    For every name in ``models`` this reads ``<colpath>/<name>_select.csv``,
    keeps the entries of its 'Feature' column whose 'select' value is true,
    fits the model on those columns of ``X`` and writes the resulting table to
    ``<outpath>/<name>_prediction.csv``. A model with no selected features is
    skipped with a warning.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; must contain every selected feature as a column.
    y : object with ``to_numpy()`` (e.g. a pandas DataFrame or Series)
        Target values; flattened to one dimension before use.
    models : iterable of str
        Model names understood by ``model_prediction_dict`` (``binary`` true)
        or ``model_prediction_quan`` (``binary`` false).
    binary : bool, default True
        Selects the classification helper when true, the regression helper otherwise.
    colpath : str, default ""
        Directory holding the ``*_select.csv`` files. An empty string means the
        current working directory.
    outpath : str, default ""
        Directory that receives the ``*_prediction.csv`` files. An empty string
        means the current working directory.

    Returns
    -------
    None
    """
    y = y.to_numpy().ravel()

    for model_name in models:
        # os.path.join keeps an empty directory relative; plain f"{dir}/{file}"
        # turned the default "" into a path at the filesystem root.
        file_select = os.path.join(colpath, f"{model_name}_select.csv")
        select_info = pd.read_csv(file_select)
        selected_features = select_info[select_info['select']]['Feature'].tolist()
        X_new = X[selected_features]

        if X_new.empty:
            print(f"Warning: No selected features for model '{model_name}'. Skipping this model.")
            continue

        if binary:
            prediction = model_prediction_dict(X_new, y, model_name=model_name)
        else:
            prediction = model_prediction_quan(X_new, y, model_name=model_name)

        file_path = os.path.join(outpath, f"{model_name}_prediction.csv")
        prediction.to_csv(file_path, index=False)
        print(f"{model_name}: Write Prediction, Done!")
        
        
    

In [None]:
# Binary-outcome run: binary=True makes koml_prediction use the classifier
# helper, reading <model>_select.csv from colpath and writing
# <model>_prediction.csv to outpath.
X = pd.read_csv('/Data/X.csv')
y = pd.read_csv('/Data/y_dict.csv')
koml_prediction(
    X,
    y,
    models=['catb', 'ligb', 'xgb', 'gbdt', 'rf'],
    binary=True,
    colpath="/Data",
    outpath="/Data",
)

In [None]:
# Quantitative-outcome run: binary=False makes koml_prediction use the
# regressor helper.
# NOTE(review): X is read from '/X.csv' here but from '/Data/X.csv' in the
# binary run above — confirm this path is intended.
# NOTE(review): colpath/outpath are the same "/Data" as the binary run, so the
# <model>_prediction.csv files written there by that run are overwritten.
X = pd.read_csv('/X.csv')
y = pd.read_csv('/Data/y_quan.csv')
koml_prediction(
    X,
    y,
    models=['catb', 'ligb', 'xgb', 'gbdt', 'rf'],
    binary=False,
    colpath="/Data",
    outpath="/Data",
)