In [None]:
import pandas as pd
import json
from data_loader import DataLoader
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import balanced_accuracy_score
from sklearn.ensemble import ExtraTreesClassifier
from tabpfn import TabPFNClassifier
from autogluon.tabular import TabularPredictor
from supervised.automl import AutoML 
from tabpfn import TabPFNClassifier

AutoGluon dla wszystkich cech

In [None]:
# Baseline: AutoGluon trained on every available feature, scored on a 30% hold-out.
X, y = DataLoader.read_train_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# AutoGluon takes features and label in one frame.
# NOTE(review): label='target' assumes the target Series/column is named 'target' — confirm against DataLoader.
X_y = pd.concat([X_train, y_train], axis=1)

predictor = TabularPredictor(label='target', eval_metric='balanced_accuracy')
# time_limit is expressed in seconds: 60*60*6 is a six-hour budget.
predictor.fit(X_y, time_limit=60*60*6, presets='best_quality')

y_pred = predictor.predict(X_test)
# The metric's signature is (y_true, y_pred). Balanced accuracy averages recall
# over the classes of y_true, so swapping the arguments yields a different number.
balanced_accuracy_score(y_test, y_pred)

AutoGluon dla 18 wybranych cech

In [None]:
# AutoGluon trained on a reduced feature set chosen by ExtraTrees importances.
X, y = DataLoader.read_train_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# The selector is fitted on the training split only, so the hold-out rows do not
# influence which columns are kept.
# NOTE(review): max_features=18 is an upper bound; with SelectFromModel's default
# importance threshold fewer than 18 columns may be kept — confirm this is intended.
feature_selector = SelectFromModel(ExtraTreesClassifier(random_state=0), max_features=18)
feature_selector.fit(X_train, y_train)
top_columns = X_train.columns[feature_selector.get_support()]

# Apply the same column subset to both splits so the predictor is evaluated on
# exactly the inputs it was trained on.
X_train = X_train[top_columns]
X_test = X_test[top_columns]

# NOTE(review): label='target' assumes the target Series/column is named 'target' — confirm against DataLoader.
predictor = TabularPredictor(label='target', eval_metric='balanced_accuracy')
X_y = pd.concat([X_train, y_train], axis=1)

# time_limit is expressed in seconds: 60*60*6 is a six-hour budget.
predictor.fit(X_y, time_limit=60*60*6, presets='best_quality')
y_pred = predictor.predict(X_test)

# The metric's signature is (y_true, y_pred); the order matters for balanced accuracy.
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Produce the final submission. This cell relies on `predictor` and `top_columns`
# as left by the preceding feature-selection cell; later cells rebind `predictor`,
# so run it directly after that cell.
x_final_test = DataLoader.read_test_data()[top_columns]

# Per-class probabilities wrapped in a DataFrame.
y_pred = DataFrame(predictor.predict_proba(x_final_test))

# Column label 1 is what gets saved — presumably the probability of the positive
# class; verify against the label encoding. The index is reset to 0..n-1 first.
positive_class_proba = y_pred.reset_index(drop=True)[1]
DataLoader.save_results(positive_class_proba)

MLJAR dla wszystkich cech

In [None]:
# MLJAR AutoML baseline on every feature, scored on the same 30% hold-out split
# (random_state=0) as the AutoGluon cells.
X, y = DataLoader.read_train_data()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.3
)

# NOTE(review): total_time_limit=60 is a far smaller budget than the six hours
# given to AutoGluon above — confirm the comparison is intended.
predictor = AutoML(total_time_limit=60, mode="Compete")
predictor.fit(X_train, y_train)

predictions = predictor.predict(X_test)
balanced_accuracy_score(y_test, predictions)

TabPFN dla wszystkich cech

In [None]:
# TabPFN baseline on every feature, scored on the same 30% hold-out split
# (random_state=0) as the other baselines.
X, y = DataLoader.read_train_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

predictor = TabPFNClassifier(device='cpu', N_ensemble_configurations=32, subsample_features=True)
# NOTE(review): overwrite_warning=True presumably bypasses TabPFN's input-size
# guard — verify against the installed TabPFN version.
predictor.fit(X_train, y_train, overwrite_warning = True)

predictions = predictor.predict(X_test)
balanced_acc = balanced_accuracy_score(y_test, predictions)

# End the cell with the value so the score is actually displayed, matching the
# other evaluation cells.
balanced_acc

TabPFN z forward feature selection

In [None]:
def _append_json_record(file, record : dict, is_first : bool) -> None:
    """Append ``record`` to the JSON array stored in the binary file ``file``.

    The file must be opened in binary mode and must currently end with the
    array's closing ``]``. That bracket is overwritten with the new record
    (preceded by a comma unless ``is_first``) followed by a fresh ``]``, so the
    file holds a complete JSON array after every call.
    """
    # Serialise first: if the record is not serialisable, the file is left untouched.
    payload = json.dumps(record)
    if not is_first:
        payload = ',' + payload

    # Relative seeks from the end are only well-defined in binary mode; step
    # back over the trailing ']'.
    file.seek(-1, 2)
    # json.dumps escapes non-ASCII characters by default, so ASCII encoding is safe.
    file.write((payload + ']').encode('ascii'))
    file.flush()


def tabPFN_forward_feature_selection(X : DataFrame, y : DataFrame, device : str = 'cuda',
                                     results_path : str = 'tabPFN_feature_selection_results.json'):
    """Greedy forward feature selection scored with TabPFN.

    Each round tries adding every not-yet-selected column of ``X`` to the
    current selection, fits a fresh ``TabPFNClassifier`` on a 70/30 split and
    measures balanced accuracy on the held-out part. The best column of the
    round is kept if it beats the best score so far; otherwise the search stops.

    Every evaluated candidate is appended to ``results_path`` as a JSON object
    ``{"score": ..., "columns": [...]}`` inside one JSON array. The file is
    flushed after each record so partial results survive an interrupted run.

    Parameters
    ----------
    X : DataFrame
        Candidate features, one column per feature.
    y : DataFrame
        Target values, row-aligned with ``X``.
    device : str, default 'cuda'
        Device string forwarded to ``TabPFNClassifier``.
    results_path : str, default 'tabPFN_feature_selection_results.json'
        Path of the JSON log; it is overwritten at the start of the run.

    Returns
    -------
    tuple
        ``(best_results, best_score)``: a DataFrame holding the selected columns
        of ``X`` (empty if no candidate scored above 0.0) and the best balanced
        accuracy reached.
    """
    best_results = DataFrame()
    best_score = 0.0
    first_record = True

    with open(results_path, 'wb') as file:
        # Start with an empty array so the log is valid JSON from the beginning.
        file.write(b'[]')

        # At most one column is added per round, so there are at most n_columns rounds.
        for _ in range(X.columns.size):
            round_best = None
            round_score = best_score

            for column in X.columns:
                if column in best_results.columns:
                    continue

                X_current = pd.concat([best_results, X[column]], axis=1)

                # The fixed random_state means every candidate is compared on
                # the same row split.
                X_train, X_test, y_train, y_test = train_test_split(
                    X_current, y, test_size=0.3, random_state=42)

                predictor = TabPFNClassifier(device=device, subsample_features=True)
                predictor.fit(X_train, y_train, overwrite_warning = True)
                y_pred = predictor.predict(X_test)

                # The metric's signature is (y_true, y_pred); the order matters
                # for balanced accuracy.
                balanced_acc = balanced_accuracy_score(y_test, y_pred)

                current_record = {
                    'score': float(balanced_acc),
                    'columns': list(X_current.columns)
                }
                _append_json_record(file, current_record, first_record)
                first_record = False

                # Strict comparison: ties keep the earlier candidate.
                if balanced_acc > round_score:
                    round_score = balanced_acc
                    round_best = X_current

                predictor.remove_models_from_memory()

            print("score: " + str(round_score))

            # No candidate beat the current best: the selection has converged.
            if round_best is None:
                break

            best_results = round_best
            best_score = round_score

    return best_results, best_score

In [None]:
# Run the greedy TabPFN forward selection on the full training data and keep
# the names of the selected columns for the cells below.
X, y = DataLoader.read_train_data()
selection_outcome = tabPFN_forward_feature_selection(X, y)
result, score = selection_outcome
top_columns = result.columns

In [None]:
# Reload the training data and keep only the columns chosen by the forward
# selection. Reloading inside this cell keeps it safe to re-run.
X, y = DataLoader.read_train_data()
X = X.loc[:, top_columns]

In [None]:
# Evaluate TabPFN on the forward-selected columns.
# NOTE(review): this cell uses a 50/50 split while the other evaluation cells
# hold out 30%, so the scores are not directly comparable — confirm this is intended.
# NOTE(review): the feature selection itself was scored on rows from this same
# dataset, so the score below may be optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.5
)

predictor = TabPFNClassifier(
    subsample_features=True,
    N_ensemble_configurations=32,
    device='cuda',
)
predictor.fit(X_train, y_train, overwrite_warning = True)

predictions = predictor.predict(X_test)
balanced_accuracy_score(y_test, predictions)