In [1]:
from sklearn.metrics import balanced_accuracy_score
from autogluon.tabular import TabularPredictor, TabularDataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd

# Training features. The last parsed column is discarded
# (presumably an empty column produced by a trailing separator -- TODO confirm).
train_features = pd.read_table(
    '../../../artificial_train.data', sep=' ', header=None
).iloc[:, :-1]

# Training labels; column 0 is given the name the predictor is configured with.
train_labels = pd.read_table(
    '../../../artificial_train.labels', sep=' ', header=None
).rename(columns={0: 'label'})

# Full training frame: the feature columns followed by the label column(s).
train = pd.concat([train_features, train_labels], axis=1)

# Test features get the same last-column drop as the training features.
X_test = pd.read_table(
    '../../../artificial_test.data', sep=' ', header=None
).iloc[:, :-1]

# No hold-out split is made here: X_train / y_train cover every row of `train`.
X_train = train.iloc[:, :-1]
y_train = train.iloc[:, -1]

In [4]:
# Configure (but do not train yet) a binary classifier that is scored with
# balanced accuracy, predicts the 'label' column, and stores its artifacts
# under the "autogluon" path.
predictor = TabularPredictor(
    label="label",
    eval_metric='balanced_accuracy',
    problem_type="binary",
    path="autogluon",
)



In [5]:
# Train on the full labelled frame. `train` already holds the feature columns
# plus the 'label' column, so X_train and y_train do not need to be glued back
# together. time_limit is the training budget in seconds.
predictor.fit(
    train_data=TabularDataset(train),
    time_limit=1000,
)

No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ... Time limit = 1000s
AutoGluon will save models to "autogluon"
AutoGluon Version:  1.0.0
Python Version:     3.10.11
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
CPU Count:          8
Memory Avail:       4.83 GB / 15.70 GB (30.8%)
Disk Space Avail:   288.19 GB / 463.96 GB (62.1%)
Train Data Rows:    2000
Train Data Columns: 500
Label Column

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x1e440753f10>

In [15]:
# Print the per-model fit summary; the returned summary dict is shown as the cell output.
predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                  model  score_val        eval_metric  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2     0.8600  balanced_accuracy       1.413107  195.526870                0.004024           1.550800            2       True         14
1               XGBoost     0.8475  balanced_accuracy       0.054931   18.210153                0.054931          18.210153            1       True         11
2              CatBoost     0.8450  balanced_accuracy       0.035621   60.934261                0.035621          60.934261            1       True          7
3              LightGBM     0.8350  balanced_accuracy       0.024110   12.372045                0.024110          12.372045            1       True          4
4         LightGBMLarge     0.8300  balanced_accuracy       0.052202   52.187498                0.052202          52.187498            1       



{'model_types': {'KNeighborsUnif': 'KNNModel',
  'KNeighborsDist': 'KNNModel',
  'LightGBMXT': 'LGBModel',
  'LightGBM': 'LGBModel',
  'RandomForestGini': 'RFModel',
  'RandomForestEntr': 'RFModel',
  'CatBoost': 'CatBoostModel',
  'ExtraTreesGini': 'XTModel',
  'ExtraTreesEntr': 'XTModel',
  'NeuralNetFastAI': 'NNFastAiTabularModel',
  'XGBoost': 'XGBoostModel',
  'NeuralNetTorch': 'TabularNeuralNetTorchModel',
  'LightGBMLarge': 'LGBModel',
  'WeightedEnsemble_L2': 'WeightedEnsembleModel'},
 'model_performance': {'KNeighborsUnif': 0.6825,
  'KNeighborsDist': 0.6825,
  'LightGBMXT': 0.77,
  'LightGBM': 0.835,
  'RandomForestGini': 0.6825,
  'RandomForestEntr': 0.6975,
  'CatBoost': 0.845,
  'ExtraTreesGini': 0.6875,
  'ExtraTreesEntr': 0.6625,
  'NeuralNetFastAI': 0.5575,
  'XGBoost': 0.8474999999999999,
  'NeuralNetTorch': 0.5675,
  'LightGBMLarge': 0.83,
  'WeightedEnsemble_L2': 0.86},
 'model_best': 'WeightedEnsemble_L2',
 'model_paths': {'KNeighborsUnif': ['KNeighborsUnif'],
  'KN

In [6]:
# Predict on the training features, then score those predictions against the
# training labels with balanced accuracy.
train_predictions = predictor.predict(TabularDataset(X_train))
train_acc = balanced_accuracy_score(y_train, train_predictions)

In [7]:
# NOTE: this score is computed on the same rows the predictor was fitted on,
# so it is not a held-out estimate.
print(f"Train acc: {train_acc}")

Train acc: 0.972


In [8]:
# Reuse the test features loaded earlier instead of reading the file a second
# time. X_test already has the last parsed column removed, matching how the
# training features were prepared.
test_data = X_test

In [9]:
# Predicted class probabilities for every test row.
probabilities = predictor.predict_proba(TabularDataset(test_data))

# Keep only the probability column for class label 1.
preds = probabilities[1]

In [10]:
# `preds` is a Series, so assigning to `.columns` would only attach an unused
# attribute and leave its name (and therefore the CSV header) as 1.
# Rename the Series itself so the header written to disk is the intended one.
preds = preds.rename('313325_313342')

In [11]:
# Write the predicted probabilities to disk; index=False leaves the row index out of the file.
preds.to_csv('autogluon_predictions.txt', index=False)

In [12]:
# Display the predicted probabilities as a quick visual sanity check.
preds

0      0.379835
1      0.248115
2      0.417705
3      0.673912
4      0.551963
         ...   
595    0.693915
596    0.374941
597    0.717032
598    0.717608
599    0.226864
Name: 1, Length: 600, dtype: float64