In [None]:
import sys, os
sys.path.append(os.path.dirname(os.getcwd()))
from utils import custom_optuna_score, task_score, task_score_cv
import json
import pickle

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import optuna 
from optuna.study import *

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.pipeline import make_pipeline

from tqdm.notebook import tqdm  
from sklearn.metrics import precision_score, f1_score, recall_score, accuracy_score


In [None]:
# Read the features and the labels; both files are header-less and space-separated.
x_train = pd.read_csv("../../../data/x_train.txt", sep=" ", header=None)
y_train = pd.read_csv("../../../data/y_train.txt", sep=" ", header=None)

# Glue both frames together column-wise and name the columns x1, x2, ...;
# the last column of the combined frame is named "y".
df_train = pd.concat([x_train, y_train], axis=1)
df_train.columns = [f"x{idx}" for idx in range(1, df_train.shape[1])] + ["y"]

# Feature frame and target series used by every modelling cell below.
X = df_train.drop(columns="y")
y = df_train["y"]

In [None]:
# Candidate feature subsets to evaluate. Each key is the name of a
# feature-selection method (presumably the one that produced the subsets --
# TODO confirm); each value is a list of subsets, and each subset is a list of
# column names that the modelling cells use to index the feature frame.
possible_best_features = {
    "Iterative": [
        ["x103", "x106", "x101", "x5", "x102", "x105"],
        ["x103", "x106", "x101", "x102", "x105"],
        ["x103", "x106", "x101", "x102"],
        ["x103", "x106", "x101", "x105"],
        ["x103", "x106", "x102", "x105"],
        ["x103", "x101", "x102", "x105"],
        ["x106", "x101", "x102", "x105"],
    ],
    "TreeMIF": [
        ["x105", "x102", "x103", "x106", "x101"],
        ["x102", "x103", "x106", "x101", "x104"],
        ["x102", "x106", "x101", "x104"],
        ["x102", "x103", "x106", "x101"],
        ["x102", "x106", "x104"],
        ["x102", "x103", "x101", "x104"],
        ["x105", "x9", "x103", "x101", "x104"],
    ],
    "Genetic": [
        ["x423"],
        ["x459"],
        ["x329", "x352", "x413"],
    ],
    "LassoSVC": [
        [
            "x106", "x140", "x153", "x156", "x176", "x22", "x221", "x253", "x324",
            "x329", "x336", "x352", "x36", "x404", "x413", "x459", "x499", "x58",
            "x65", "x81",
        ],
    ],
    "LassoLR": [
        [
            "x1", "x101", "x102", "x103", "x104", "x105", "x106", "x132", "x140",
            "x149", "x153", "x156", "x176", "x191", "x22", "x221", "x229", "x253",
            "x286", "x304", "x322", "x323", "x324", "x329", "x336", "x35", "x352",
            "x36", "x40", "x404", "x413", "x423", "x459", "x463", "x499", "x5",
            "x58", "x65", "x74", "x8", "x81", "x99",
        ],
    ],
    "Boruta": [
        ["x101", "x102", "x103", "x104", "x105", "x106", "x9"],
    ],
    "Filters": [
        ["x106", "x176", "x413", "x459"],
    ],
}

# SVM

In [None]:
# Tune an RBF-kernel SVC with Optuna on every candidate feature subset, score
# the tuned pipeline on five stratified hold-out splits, and write one row per
# (subset, split) to optuna/results_svm.csv.
results = []

# The scorer does not depend on the trial, so it is built once instead of on
# every objective call.
# NOTE(review): `needs_proba` is deprecated since scikit-learn 1.4 in favour of
# `response_method="predict_proba"`; switch once the environment allows it.
proba_scorer = make_scorer(custom_optuna_score, greater_is_better=True, needs_proba=True)

for feature_selection_method, features_list in tqdm(possible_best_features.items(), total=len(possible_best_features), desc="Feature selection method"):
    print(f"Feature selection method: {feature_selection_method}")
    for features in tqdm(features_list, total=len(features_list), desc="Features"):
        print(f"Features: {features}")

        X_train_ = X[features]
        y_train_ = y

        def objective(trial):
            """Return the mean 5-fold CV score of a scaled RBF SVC for the sampled C and gamma."""
            # suggest_float(..., log=True) samples the same log-uniform range as
            # the deprecated suggest_loguniform.
            C = trial.suggest_float('C', 1e-3, 1e1, log=True)
            gamma = trial.suggest_float('gamma', 1e-3, 1e1, log=True)
            svc = SVC(C=C, gamma=gamma, kernel='rbf', probability=True)
            pipeline = make_pipeline(StandardScaler(), svc)
            scores = cross_val_score(pipeline, X_train_, y_train_, cv=5, scoring=proba_scorer)
            return scores.mean()

        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=10, n_jobs=-1)

        print("Best hyperparameters: ", study.best_params,)
        # Kernel is spelled out so the final model visibly matches the tuned one.
        final_model = SVC(**study.best_params, kernel='rbf', probability=True)
        final_pipeline = make_pipeline(StandardScaler(), final_model)

        # NOTE(review): the hyperparameters were tuned with CV on all rows,
        # including the rows held out below, so these scores are optimistic.
        for i in range(5):
            # Stratify on the very target that is being split rather than on a
            # separate global, so the two can never get out of sync.
            X_train, X_test, y_train, y_test = train_test_split(
                X_train_, y_train_, test_size=0.2, stratify=y_train_, random_state=i
            )
            final_pipeline.fit(X_train, y_train)
            score = task_score(final_pipeline, X_test, y_test)
            print(f"Random state: {i}, score: {score}")

            # Predict once and reuse the labels for all four metrics.
            y_pred = final_pipeline.predict(X_test)

            results.append({
                **study.best_params,
                "features": "_".join(features),
                "test_score": score,
                "random_state": i,
                "feature_selection_method": feature_selection_method,
                "precision": precision_score(y_test, y_pred),
                "f1": f1_score(y_test, y_pred),
                "recall": recall_score(y_test, y_pred),
                "accuracy": accuracy_score(y_test, y_pred),
            })

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
path = "optuna"
os.makedirs(path, exist_ok=True)

df = pd.DataFrame(results)
df.to_csv(os.path.join(path, "results_svm.csv"), index=False)

# The fitted pipeline is deliberately not pickled: the final model has to be
# refit on the whole data set anyway.

# QDA

In [None]:
# Tune the regularisation of a QDA classifier with Optuna on every candidate
# feature subset, score the tuned pipeline on five stratified hold-out splits,
# and write one row per (subset, split) to optuna/results_qda.csv.
results = []

# The scorer does not depend on the trial, so it is built once instead of on
# every objective call.
# NOTE(review): `needs_proba` is deprecated since scikit-learn 1.4 in favour of
# `response_method="predict_proba"`; switch once the environment allows it.
proba_scorer = make_scorer(custom_optuna_score, greater_is_better=True, needs_proba=True)

for feature_selection_method, features_list in tqdm(possible_best_features.items(), total=len(possible_best_features), desc="Feature selection method"):
    print(f"Feature selection method: {feature_selection_method}")
    for features in tqdm(features_list, total=len(features_list), desc="Features"):
        print(f"Features: {features}")

        X_train_ = X[features]
        y_train_ = y

        def objective(trial):
            """Return the mean 5-fold CV score of a scaled QDA for the sampled reg_param."""
            # suggest_float samples the same uniform range as the deprecated
            # suggest_uniform.
            reg_param = trial.suggest_float('reg_param', 0.0, 1.0)
            qda = QDA(reg_param=reg_param)
            pipeline = make_pipeline(StandardScaler(), qda)
            scores = cross_val_score(pipeline, X_train_, y_train_, cv=5, scoring=proba_scorer)
            return scores.mean()

        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=10, n_jobs=-1)

        print("Best hyperparameters: ", study.best_params,)
        final_model = QDA(**study.best_params)
        final_pipeline = make_pipeline(StandardScaler(), final_model)

        # NOTE(review): the hyperparameters were tuned with CV on all rows,
        # including the rows held out below, so these scores are optimistic.
        for i in range(5):
            # Stratify on the very target that is being split rather than on a
            # separate global, so the two can never get out of sync.
            X_train, X_test, y_train, y_test = train_test_split(
                X_train_, y_train_, test_size=0.2, stratify=y_train_, random_state=i
            )
            final_pipeline.fit(X_train, y_train)
            score = task_score(final_pipeline, X_test, y_test)
            print(f"Random state: {i}, score: {score}")

            # Predict once and reuse the labels for all four metrics.
            y_pred = final_pipeline.predict(X_test)

            results.append({
                **study.best_params,
                "features": "_".join(features),
                "test_score": score,
                "random_state": i,
                "feature_selection_method": feature_selection_method,
                "precision": precision_score(y_test, y_pred),
                "f1": f1_score(y_test, y_pred),
                "recall": recall_score(y_test, y_pred),
                "accuracy": accuracy_score(y_test, y_pred),
            })

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
path = "optuna"
os.makedirs(path, exist_ok=True)

df = pd.DataFrame(results)
df.to_csv(os.path.join(path, "results_qda.csv"), index=False)

# The fitted pipeline is deliberately not pickled: the final model has to be
# refit on the whole data set anyway.

# LightGBM


In [None]:
from lightgbm import LGBMClassifier

In [None]:
# Tune max_depth and n_estimators of a LightGBM classifier with Optuna on every
# candidate feature subset, score the tuned pipeline on five stratified
# hold-out splits, and write one row per (subset, split) to
# optuna/results_lgbm.csv.
results = []

# The scorer does not depend on the trial, so it is built once instead of on
# every objective call.
# NOTE(review): `needs_proba` is deprecated since scikit-learn 1.4 in favour of
# `response_method="predict_proba"`; switch once the environment allows it.
proba_scorer = make_scorer(custom_optuna_score, greater_is_better=True, needs_proba=True)

for feature_selection_method, features_list in tqdm(possible_best_features.items(), total=len(possible_best_features), desc="Feature selection method"):
    print(f"Feature selection method: {feature_selection_method}")
    for features in tqdm(features_list, total=len(features_list), desc="Features"):
        print(f"Features: {features}")

        X_train_ = X[features]
        y_train_ = y

        def objective(trial):
            """Return the mean 5-fold CV score of a scaled LGBMClassifier for the sampled parameters."""
            max_depth = trial.suggest_int('max_depth', 2, 10)
            n_estimators = trial.suggest_int('n_estimators', 50, 500)
            lgbm = LGBMClassifier(max_depth=max_depth, n_estimators=n_estimators)
            pipeline = make_pipeline(StandardScaler(), lgbm)
            scores = cross_val_score(pipeline, X_train_, y_train_, cv=5, scoring=proba_scorer)
            return scores.mean()

        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=10, n_jobs=-1)

        print("Best hyperparameters: ", study.best_params,)
        final_model = LGBMClassifier(**study.best_params)
        final_pipeline = make_pipeline(StandardScaler(), final_model)

        # NOTE(review): the hyperparameters were tuned with CV on all rows,
        # including the rows held out below, so these scores are optimistic.
        for i in range(5):
            # Stratify on the very target that is being split rather than on a
            # separate global, so the two can never get out of sync.
            X_train, X_test, y_train, y_test = train_test_split(
                X_train_, y_train_, test_size=0.2, stratify=y_train_, random_state=i
            )
            final_pipeline.fit(X_train, y_train)
            score = task_score(final_pipeline, X_test, y_test)
            print(f"Random state: {i}, score: {score}")

            # Predict once and reuse the labels for all four metrics.
            y_pred = final_pipeline.predict(X_test)

            results.append({
                **study.best_params,
                "features": "_".join(features),
                "test_score": score,
                "random_state": i,
                "feature_selection_method": feature_selection_method,
                "precision": precision_score(y_test, y_pred),
                "f1": f1_score(y_test, y_pred),
                "recall": recall_score(y_test, y_pred),
                "accuracy": accuracy_score(y_test, y_pred),
            })

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
path = "optuna"
os.makedirs(path, exist_ok=True)

df = pd.DataFrame(results)
df.to_csv(os.path.join(path, "results_lgbm.csv"), index=False)

# The fitted pipeline is deliberately not pickled: the final model has to be
# refit on the whole data set anyway.

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Tune max_depth and n_estimators of a random forest with Optuna on every
# candidate feature subset, score the tuned pipeline on five stratified
# hold-out splits, and write one row per (subset, split) to
# optuna/results_rf.csv.
results = []

# The scorer does not depend on the trial, so it is built once instead of on
# every objective call.
# NOTE(review): `needs_proba` is deprecated since scikit-learn 1.4 in favour of
# `response_method="predict_proba"`; switch once the environment allows it.
proba_scorer = make_scorer(custom_optuna_score, greater_is_better=True, needs_proba=True)

for feature_selection_method, features_list in tqdm(possible_best_features.items(), total=len(possible_best_features), desc="Feature selection method"):
    print(f"Feature selection method: {feature_selection_method}")
    for features in tqdm(features_list, total=len(features_list), desc="Features"):
        print(f"Features: {features}")

        X_train_ = X[features]
        y_train_ = y

        def objective(trial):
            """Return the mean 5-fold CV score of a scaled RandomForestClassifier for the sampled parameters."""
            max_depth = trial.suggest_int('max_depth', 2, 10)
            n_estimators = trial.suggest_int('n_estimators', 100, 500)
            rf = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators)
            pipeline = make_pipeline(StandardScaler(), rf)
            scores = cross_val_score(pipeline, X_train_, y_train_, cv=5, scoring=proba_scorer)
            return scores.mean()

        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=10, n_jobs=-1)

        print("Best hyperparameters: ", study.best_params,)
        final_model = RandomForestClassifier(**study.best_params)
        final_pipeline = make_pipeline(StandardScaler(), final_model)

        # NOTE(review): the hyperparameters were tuned with CV on all rows,
        # including the rows held out below, so these scores are optimistic.
        for i in range(5):
            # Stratify on the very target that is being split rather than on a
            # separate global, so the two can never get out of sync.
            X_train, X_test, y_train, y_test = train_test_split(
                X_train_, y_train_, test_size=0.2, stratify=y_train_, random_state=i
            )
            final_pipeline.fit(X_train, y_train)
            score = task_score(final_pipeline, X_test, y_test)
            print(f"Random state: {i}, score: {score}")

            # Predict once and reuse the labels for all four metrics.
            y_pred = final_pipeline.predict(X_test)

            results.append({
                **study.best_params,
                "features": "_".join(features),
                "test_score": score,
                "random_state": i,
                "feature_selection_method": feature_selection_method,
                "precision": precision_score(y_test, y_pred),
                "f1": f1_score(y_test, y_pred),
                "recall": recall_score(y_test, y_pred),
                "accuracy": accuracy_score(y_test, y_pred),
            })

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
path = "optuna"
os.makedirs(path, exist_ok=True)

df = pd.DataFrame(results)
df.to_csv(os.path.join(path, "results_rf.csv"), index=False)

# The fitted pipeline is deliberately not pickled: the final model has to be
# refit on the whole data set anyway.

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Tune the penalty type and C of a liblinear logistic regression with Optuna on
# every candidate feature subset, score the tuned pipeline on five stratified
# hold-out splits, and write one row per (subset, split) to
# optuna/results_lr.csv.
results = []

# The scorer does not depend on the trial, so it is built once instead of on
# every objective call.
# NOTE(review): `needs_proba` is deprecated since scikit-learn 1.4 in favour of
# `response_method="predict_proba"`; switch once the environment allows it.
proba_scorer = make_scorer(custom_optuna_score, greater_is_better=True, needs_proba=True)

for feature_selection_method, features_list in tqdm(possible_best_features.items(), total=len(possible_best_features), desc="Feature selection method"):
    print(f"Feature selection method: {feature_selection_method}")
    for features in tqdm(features_list, total=len(features_list), desc="Features"):
        print(f"Features: {features}")

        X_train_ = X[features]
        y_train_ = y

        def objective(trial):
            """Return the mean 5-fold CV score of a scaled LogisticRegression for the sampled penalty and C."""
            penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
            # suggest_float(..., log=True) samples the same log-uniform range as
            # the deprecated suggest_loguniform.
            c = trial.suggest_float('C', 1e-3, 1e1, log=True)
            lr = LogisticRegression(penalty=penalty, C=c, solver='liblinear')
            pipeline = make_pipeline(StandardScaler(), lr)
            scores = cross_val_score(pipeline, X_train_, y_train_, cv=5, scoring=proba_scorer)
            return scores.mean()

        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=10, n_jobs=-1)

        print("Best hyperparameters: ", study.best_params,)
        final_model = LogisticRegression(**study.best_params, solver='liblinear')
        final_pipeline = make_pipeline(StandardScaler(), final_model)

        # NOTE(review): the hyperparameters were tuned with CV on all rows,
        # including the rows held out below, so these scores are optimistic.
        for i in range(5):
            # Stratify on the very target that is being split rather than on a
            # separate global, so the two can never get out of sync.
            X_train, X_test, y_train, y_test = train_test_split(
                X_train_, y_train_, test_size=0.2, stratify=y_train_, random_state=i
            )
            final_pipeline.fit(X_train, y_train)
            score = task_score(final_pipeline, X_test, y_test)
            print(f"Random state: {i}, score: {score}")

            # Predict once and reuse the labels for all four metrics.
            y_pred = final_pipeline.predict(X_test)

            results.append({
                **study.best_params,
                "features": "_".join(features),
                "test_score": score,
                "random_state": i,
                "feature_selection_method": feature_selection_method,
                "precision": precision_score(y_test, y_pred),
                "f1": f1_score(y_test, y_pred),
                "recall": recall_score(y_test, y_pred),
                "accuracy": accuracy_score(y_test, y_pred),
            })

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
path = "optuna"
os.makedirs(path, exist_ok=True)

df = pd.DataFrame(results)
df.to_csv(os.path.join(path, "results_lr.csv"), index=False)

# The fitted pipeline is deliberately not pickled: the final model has to be
# refit on the whole data set anyway.