In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularDataset, TabularPredictor

import sys, os
sys.path.append(os.path.dirname(os.getcwd()))
from utils import task_score

from tqdm.notebook import tqdm


In [None]:
# Read the space-separated, header-less feature and label files.
read_options = {"header": None, "sep": " "}
x_train = pd.read_csv("../../../data/x_train.txt", **read_options)
y_train = pd.read_csv("../../../data/y_train.txt", **read_options)

# Put the label column(s) to the right of the features, then name every
# column but the last x1..xN and the last one "y".
df_train = pd.concat([x_train, y_train], axis=1)
n_columns = df_train.shape[1]
df_train.columns = [f"x{idx}" for idx in range(1, n_columns)] + ["y"]

In [None]:
# Candidate feature subsets to benchmark. Keys are labels for the group each
# subset belongs to (presumably the feature-selection method that produced
# it -- confirm with the notebook author); each value is a list of subsets,
# and each subset is a list of column names.
possible_best_features = {
    "Iterative": [
        ["x103", "x106", "x101", "x5", "x102", "x105"],
        ["x103", "x106", "x101", "x102", "x105"],
        ["x103", "x106", "x101", "x102"],
        ["x103", "x106", "x101", "x105"],
        ["x103", "x106", "x102", "x105"],
        ["x103", "x101", "x102", "x105"],
        ["x106", "x101", "x102", "x105"],
    ],
    "TreeMIF": [
        ["x105", "x102", "x103", "x106", "x101"],
        ["x102", "x103", "x106", "x101", "x104"],
        ["x102", "x106", "x101", "x104"],
        ["x102", "x103", "x106", "x101"],
        ["x102", "x106", "x104"],
        ["x102", "x103", "x101", "x104"],
        ["x105", "x9", "x103", "x101", "x104"],
    ],
    "Genetic": [
        ["x423"],
        ["x459"],
        ["x329", "x352", "x413"],
    ],
    "LassoSVC": [
        [
            "x106", "x140", "x153", "x156", "x176", "x22", "x221", "x253", "x324",
            "x329", "x336", "x352", "x36", "x404", "x413", "x459", "x499", "x58",
            "x65", "x81",
        ],
    ],
    "LassoLR": [
        [
            "x1", "x101", "x102", "x103", "x104", "x105", "x106", "x132", "x140",
            "x149", "x153", "x156", "x176", "x191", "x22", "x221", "x229", "x253",
            "x286", "x304", "x322", "x323", "x324", "x329", "x336", "x35", "x352",
            "x36", "x40", "x404", "x413", "x423", "x459", "x463", "x499", "x5",
            "x58", "x65", "x74", "x8", "x81", "x99",
        ],
    ],
    "Boruta": [
        ["x101", "x102", "x103", "x104", "x105", "x106", "x9"],
    ],
    "Filters": [
        ["x106", "x176", "x413", "x459"],
    ],
}

In [None]:
# Benchmark every candidate feature subset with AutoGluon on several
# stratified train/test splits. For each (feature subset, method, seed)
# combination a predictor is trained, and its leaderboard and a
# performance summary are written under ./AutogluonModels/.
N_RANDOM_STATES = 5    # seeds 0..4, one stratified split per seed
TEST_SIZE = 0.2        # fraction of rows held out for evaluation
FIT_TIME_LIMIT = 600   # forwarded to TabularPredictor.fit(time_limit=...)
TOP_K_MODELS = 10      # number of leading leaderboard rows scored with task_score

# Loop-invariant inputs to the split: computed once instead of once per seed.
features_all = df_train.drop(columns="y")
labels_all = df_train["y"]

for i in tqdm(range(N_RANDOM_STATES), desc="Random state"):

    # NOTE: this rebinds the notebook-level name y_train to the split labels.
    # The names are kept as-is because other cells may read them.
    X_train, X_test, y_train, y_test = train_test_split(
        features_all, labels_all, test_size=TEST_SIZE, stratify=labels_all, random_state=i
    )

    for method, best_features_list in tqdm(possible_best_features.items(), desc="Method", total=len(possible_best_features)):
        for best_features in tqdm(best_features_list, desc="Best features", total=len(best_features_list)):
            print(best_features, i)

            # One output directory per (feature subset, method, seed).
            path = f"./AutogluonModels/{'_'.join(best_features)}_{method}/{i}"
            os.makedirs(path, exist_ok=True)

            # Train on the selected columns plus the label column.
            train = pd.concat([X_train[best_features], y_train], axis=1)
            train_dataset = TabularDataset(train)

            model = TabularPredictor(label='y', eval_metric='precision', path=path)
            model.fit(train_dataset, presets=['best_quality'], time_limit=FIT_TIME_LIMIT)

            # Held-out rows restricted to the same columns, with the label attached.
            test = pd.concat([X_test[best_features], y_test], axis=1)
            test_dataset = TabularDataset(test)

            # Leaderboard evaluated on the held-out rows, tagged with the run settings.
            leaderboard = model.leaderboard(test_dataset)
            leaderboard['random_state'] = i
            leaderboard['features'] = str(best_features)

            # y_test from the split is used directly; the inner loop no longer
            # rebinds it, so every feature subset sees the outer loop's labels.
            test_data = test.drop(columns=['y'])
            y_pred = model.predict(test_data)
            perf = model.evaluate_predictions(y_true=y_test, y_pred=y_pred)

            # First column of the leaderboard holds the model names; keep the
            # first TOP_K_MODELS rows in leaderboard order.
            best_models = list(leaderboard.iloc[:TOP_K_MODELS, 0].values)

            # task_score comes from the project-local utils module; it is called
            # once per selected model on the held-out rows.
            perf['task_score'] = [
                {
                    'model_name': model_name,
                    'score': task_score(model, test_data, y_test, model_name=model_name),
                    'random_state': i,
                }
                for model_name in best_models
            ]

            # Persist the results next to the trained predictor.
            leaderboard.to_csv(f'{path}/leaderboard.csv')
            with open(f'{path}/performance.json', 'w') as f:
                json.dump(perf, f, indent=4)