# AutoML - Praca domowa 2
Piotr Nieciecki

Laura Korona

### Wczytanie danych jako DataFrames

In [1]:
import pandas as pd

traindata_X_full  = pd.read_csv("Data/artificial_train.data", sep=" ", header=None)
traindata_X_full.drop(traindata_X_full.columns[500], axis=1, inplace = True)

traindata_y_full = pd.read_csv("Data/artificial_train.labels", sep=" ", header=None)
traindata_y_full.rename(columns={traindata_y_full.columns[0]: 'Y'},inplace=True)

testdata_X_full = pd.read_csv("Data/artificial_test.data", sep=" ", header=None)
testdata_X_full.drop(testdata_X_full.columns[500], axis=1, inplace = True)

In [2]:
traindata_X_full

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,485,477,537,479,452,471,491,476,475,473,...,477,481,477,485,511,485,481,479,475,496
1,483,458,460,487,587,475,526,479,485,469,...,463,478,487,338,513,486,483,492,510,517
2,487,542,499,468,448,471,442,478,480,477,...,487,481,492,650,506,501,480,489,499,498
3,480,491,510,485,495,472,417,474,502,476,...,491,480,474,572,454,469,475,482,494,461
4,484,502,528,489,466,481,402,478,487,468,...,488,479,452,435,486,508,481,504,495,511
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,490,505,503,474,463,461,519,476,518,467,...,467,479,449,588,499,506,475,463,507,501
1996,480,475,476,480,495,482,515,479,480,484,...,464,474,473,424,454,570,476,493,465,485
1997,480,517,631,470,485,474,535,476,493,466,...,501,483,479,687,488,488,483,500,523,481
1998,484,481,505,478,542,477,518,477,510,472,...,487,483,526,750,486,529,484,473,527,485


### Podział danych treningowych (czyli danych z pliku `artificial_train.data`) na pomocniczy zbiór treningowy i testowy

In [3]:
from sklearn.model_selection import train_test_split

traindata_X_train, traindata_X_test, traindata_y_train, traindata_y_test = train_test_split(traindata_X_full, traindata_y_full, test_size=0.33, random_state=42)

### Wyszukiwanie najistotniejszych cech w zbiorze danych

Utworzenie modelu, na podst. którego będą wyszukiwane najważniejsze cechy w zbiorze danych

In [4]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0).fit(traindata_X_train, traindata_y_train)

y_pred = clf.predict(traindata_X_test)

balanced_accuracy_score(traindata_y_test, y_pred)

  clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0).fit(traindata_X_train, traindata_y_train)


0.6305649426976198

Wyszukiwanie najważniejszych cech z wykorzystaniem `permutation_importance`

In [5]:
from sklearn.inspection import permutation_importance

r = permutation_importance(
    clf, traindata_X_train, traindata_y_train,
    n_repeats=50, random_state=0, n_jobs=-1,
    scoring='roc_auc',
)

In [6]:
import numpy as np

sort_idx = r.importances_mean.argsort()

r.importances_mean[r.importances_mean > 3e-3]

array([0.01006238, 0.00445203, 0.0030162 , 0.00450193, 0.00495447,
       0.00429632, 0.01488706, 0.00920971, 0.00322805, 0.00347911])

Usunięcie mało ważnych cech

In [7]:
mask = np.where(r.importances_mean <= 3e-3)

traindata_X_train_important = traindata_X_train.drop(traindata_X_train.columns[mask], axis=1)
traindata_X_train_important

Unnamed: 0,48,105,128,153,281,318,338,378,442,472
81,512,675,493,554,507,528,428,538,480,473
915,513,517,481,563,512,533,483,532,391,432
1018,440,333,464,449,453,440,540,436,484,475
380,497,667,493,587,516,495,398,508,467,467
1029,457,464,476,558,498,523,426,458,366,426
...,...,...,...,...,...,...,...,...,...,...
1130,486,554,485,610,528,456,488,500,613,530
1294,513,523,482,567,516,521,419,536,350,412
860,473,453,475,736,581,478,480,480,551,504
1459,499,435,473,737,590,421,272,518,318,391


In [8]:
traindata_X_test_important = traindata_X_test.drop(traindata_X_test.columns[mask], axis=1)
traindata_X_test_important

Unnamed: 0,48,105,128,153,281,318,338,378,442,472
1860,513,560,483,566,517,533,397,539,327,399
353,513,590,485,370,412,467,532,564,550,511
1333,542,572,486,664,561,517,384,582,358,422
905,421,605,487,659,557,514,334,414,429,452
1289,479,530,483,569,501,523,508,487,524,494
...,...,...,...,...,...,...,...,...,...,...
118,441,232,456,370,414,464,643,434,543,500
1249,457,505,479,359,412,521,518,454,467,466
1993,462,493,478,665,543,432,478,465,606,533
522,488,554,482,617,543,412,477,504,655,549


### Random Search

In [9]:
from sklearn.pipeline import Pipeline

rf_pipe = Pipeline([('rf', RandomForestClassifier())])

rf_params = {'rf__n_estimators': np.arange(100, 250 + 1),
              'rf__criterion': ['gini', 'entropy', 'log_loss'],
              'rf__max_depth': np.arange(5, 50 + 1),
              'rf__min_samples_split': np.arange(2, 50 + 1)}

random_state = 1
scoring = 'balanced_accuracy'
n_iter = 300 # configurations count

In [10]:
traindata_X_full_important = traindata_X_full.drop(traindata_X_full.columns[mask], axis=1)
testdata_X_full_important = testdata_X_full.drop(testdata_X_full.columns[mask], axis=1)

In [11]:
from sklearn.model_selection import RandomizedSearchCV

rs_accs = np.empty((n_iter))

rs = RandomizedSearchCV(
    rf_pipe,
    param_distributions=rf_params,
    n_iter=n_iter,
    scoring=scoring,
    n_jobs=-1,
    random_state = random_state
)

rs.fit(traindata_X_full_important, traindata_y_full)
rs_accs = rs.cv_results_['mean_test_score']

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Najlepszy wynik

In [12]:
rs_accs.max()

0.8654999999999999

In [13]:
best_params_index = np.argmax(rs_accs)

Najlepsze znalezione wartości dla badanych hiperparametrów

In [14]:
best_params = rs.cv_results_['params'][best_params_index]
best_params

{'rf__n_estimators': 162,
 'rf__min_samples_split': 2,
 'rf__max_depth': 45,
 'rf__criterion': 'gini'}

### Utworzenie modelu z powyższymi wartościami hiperparametrów

In [15]:
del clf

clf = RandomForestClassifier(n_estimators=best_params['rf__n_estimators'],
                                max_depth=best_params['rf__max_depth'],
                                min_samples_split=best_params['rf__min_samples_split'],
                                criterion=best_params['rf__criterion'],
                                random_state=random_state).fit(traindata_X_full_important, traindata_y_full)

  random_state=random_state).fit(traindata_X_full_important, traindata_y_full)


In [16]:
clf.classes_

array([-1,  1], dtype=int64)

### Zapisanie wyniku do pliku

In [17]:
np.savetxt("313288_313368_artifical_model_prediction.txt", clf.predict_proba(testdata_X_full_important)[:,1],header='313288_313368', comments='')