In [1]:
import pandas as pd
from itertools import chain, combinations
from sklearn.model_selection import StratifiedKFold
import numpy as np
from scoring import scoring_function
from tqdm.auto import tqdm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
# Read x_train.txt
X = pd.read_csv('../data/x_train.txt', sep=' ', header=None)

# Read y_train.txt
y = pd.read_csv('../data/y_train.txt', sep=' ', header=None)


# Display the data
print("x:")
print(X.head())

print("\ny:")
print(y.head())

x:
        0         1         2         3         4         5         6    \
0 -2.619773 -2.619533 -1.199350 -1.083335 -1.000910 -0.366967 -2.164037   
1 -1.415579 -1.782544 -2.880270 -1.958863  1.159968  0.273030 -1.628728   
2 -2.745092 -1.382945 -1.626015 -1.282560 -0.663146  0.052349 -2.403322   
3  0.618998  0.455364 -0.115081  0.649040 -0.862207  2.308504  0.526114   
4 -0.070694 -0.550509 -0.565556 -0.693065 -0.573089 -0.395862  0.003170   

        7         8         9    ...        490        491        492  \
0 -1.210001 -0.658311 -1.489539  ...  10.849925  10.343346  10.717519   
1 -0.175813 -0.916857 -0.570166  ...  11.489417   5.195818   3.494627   
2 -0.765073 -0.394354 -0.806624  ...  13.934934   9.267515   4.705604   
3 -1.094852  1.088656 -0.481210  ...  12.021328   3.852231  11.059702   
4 -0.981609 -0.505775 -0.758430  ...   7.537788  11.229665  11.318915   

        493        494        495        496        497        498        499  
0  7.709295   5.894554  12.

## Boruta

In [7]:
idc2 = [0,   1,   2,   3,   4,   5,   6,   7,   8,   9, 100, 101, 102,
       103, 104, 105]

In [12]:
combs = chain(*map(lambda x: combinations(idc2, x), range(1, 6)))
print(len(list(combs)))

6884


In [13]:
rarr=[]

combs = chain(*map(lambda x: combinations(idc2, x), range(1, 6)))

for i, c in tqdm(enumerate(combs), total=6884):
    selected_features = list(c)
    X_selected = X.iloc[:, selected_features]

    # Stratified KFold
    skf = StratifiedKFold(n_splits=5, shuffle=True)
    accuracies = []
    scores = []

    for train_index, test_index in skf.split(X_selected, y):
        X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        y_train = y_train.to_numpy().flatten()
        y_test = y_test.to_numpy().flatten()

        X_train = X_train.to_numpy()
        X_test = X_test.to_numpy()

        # Standardize the data
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        for model in [GaussianNB()]:

            model.fit(X_train, y_train)

            y_pred = model.predict(X_test)
            accuracy = balanced_accuracy_score(y_test, y_pred)
            score = scoring_function(model, X_test, y_test)

            rarr.append({'i': i, 'accuracy': accuracy, 'score': score,
                         'extra': str(selected_features), 'model':model.__class__.__name__})

  0%|          | 0/6884 [00:00<?, ?it/s]

In [14]:
resdf = pd.DataFrame(rarr, columns=['i', 'extra', 'accuracy', 'score'])
resdf.groupby('extra').mean()['score'].sort_values(ascending=False)[:10]

extra
[101, 102, 103, 105]    6990.0
[101, 102, 103]         6950.0
[102, 103, 105]         6940.0
[100, 102, 103]         6930.0
[102, 103, 104, 105]    6930.0
[101, 102, 105]         6930.0
[100, 101, 102, 103]    6890.0
[100, 102, 105]         6880.0
[100, 101, 102, 105]    6870.0
[100, 101, 103, 104]    6870.0
Name: score, dtype: float64

## XGBClassifier feature importance

In [15]:
idc2 = [100, 101, 102, 103, 104, 105, 0, 285, 391, 182]

In [17]:
combs = chain(*map(lambda x: combinations(idc2, x), range(1, 6)))
print(len(list(combs)))

637


In [18]:
rarr=[]

combs = chain(*map(lambda x: combinations(idc2, x), range(1, 6)))

for i, c in tqdm(enumerate(combs), total=637):
    selected_features = list(c)
    X_selected = X.iloc[:, selected_features]

    # Stratified KFold
    skf = StratifiedKFold(n_splits=5, shuffle=True)
    accuracies = []
    scores = []

    for train_index, test_index in skf.split(X_selected, y):
        X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        y_train = y_train.to_numpy().flatten()
        y_test = y_test.to_numpy().flatten()

        X_train = X_train.to_numpy()
        X_test = X_test.to_numpy()

        # Standardize the data
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        for model in [GaussianNB()]:

            model.fit(X_train, y_train)

            y_pred = model.predict(X_test)
            accuracy = balanced_accuracy_score(y_test, y_pred)
            score = scoring_function(model, X_test, y_test)

            rarr.append({'i': i, 'accuracy': accuracy, 'score': score,
                         'extra': str(selected_features), 'model':model.__class__.__name__})

  0%|          | 0/637 [00:00<?, ?it/s]

In [19]:
resdf = pd.DataFrame(rarr, columns=['i', 'extra', 'accuracy', 'score'])
resdf.groupby('extra').mean()['score'].sort_values(ascending=False)[:10]

extra
[101, 102, 103, 105]    7090.0
[102, 103, 105]         7000.0
[101, 102, 105]         6970.0
[101, 102, 103]         6950.0
[100, 102, 103, 105]    6930.0
[100, 102, 103]         6920.0
[100, 102, 105]         6910.0
[100, 101, 102, 103]    6890.0
[102, 103, 104, 105]    6880.0
[102, 104, 105]         6870.0
Name: score, dtype: float64

## Boruta + SequenceFeatureSelector

In [20]:
idc2 = [101, 403, 285, 155, 337, 471, 412, 131, 241, 335]

In [21]:
combs = chain(*map(lambda x: combinations(idc2, x), range(1, 6)))
print(len(list(combs)))

637


In [22]:
rarr=[]

combs = chain(*map(lambda x: combinations(idc2, x), range(1, 6)))

for i, c in tqdm(enumerate(combs), total=637):
    selected_features = list(c)
    X_selected = X.iloc[:, selected_features]

    # Stratified KFold
    skf = StratifiedKFold(n_splits=5, shuffle=True)
    accuracies = []
    scores = []

    for train_index, test_index in skf.split(X_selected, y):
        X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        y_train = y_train.to_numpy().flatten()
        y_test = y_test.to_numpy().flatten()

        X_train = X_train.to_numpy()
        X_test = X_test.to_numpy()

        # Standardize the data
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        for model in [GaussianNB()]:

            model.fit(X_train, y_train)

            y_pred = model.predict(X_test)
            accuracy = balanced_accuracy_score(y_test, y_pred)
            score = scoring_function(model, X_test, y_test)

            rarr.append({'i': i, 'accuracy': accuracy, 'score': score,
                         'extra': str(selected_features), 'model':model.__class__.__name__})

  0%|          | 0/637 [00:00<?, ?it/s]

In [23]:
resdf = pd.DataFrame(rarr, columns=['i', 'extra', 'accuracy', 'score'])
resdf.groupby('extra').mean()['score'].sort_values(ascending=False)[:10]

extra
[101]              6220.0
[101, 155]         6130.0
[101, 335]         6080.0
[101, 471]         6070.0
[101, 285]         6040.0
[101, 241]         6020.0
[101, 131]         6010.0
[101, 337]         6000.0
[101, 285, 337]    5990.0
[101, 155, 337]    5970.0
Name: score, dtype: float64

## Boruta + RecursiveFeatureElimination

In [25]:
idc2 = [458, 131, 215, 316, 63, 360, 328, 133, 75, 412]

In [26]:
combs = chain(*map(lambda x: combinations(idc2, x), range(1, 6)))
print(len(list(combs)))

637


In [27]:
rarr=[]

combs = chain(*map(lambda x: combinations(idc2, x), range(1, 6)))

for i, c in tqdm(enumerate(combs), total=637):
    selected_features = list(c)
    X_selected = X.iloc[:, selected_features]

    # Stratified KFold
    skf = StratifiedKFold(n_splits=5, shuffle=True)
    accuracies = []
    scores = []

    for train_index, test_index in skf.split(X_selected, y):
        X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        y_train = y_train.to_numpy().flatten()
        y_test = y_test.to_numpy().flatten()

        X_train = X_train.to_numpy()
        X_test = X_test.to_numpy()

        # Standardize the data
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        for model in [GaussianNB()]:

            model.fit(X_train, y_train)

            y_pred = model.predict(X_test)
            accuracy = balanced_accuracy_score(y_test, y_pred)
            score = scoring_function(model, X_test, y_test)

            rarr.append({'i': i, 'accuracy': accuracy, 'score': score,
                         'extra': str(selected_features), 'model':model.__class__.__name__})

  0%|          | 0/637 [00:00<?, ?it/s]

In [28]:
resdf = pd.DataFrame(rarr, columns=['i', 'extra', 'accuracy', 'score'])
resdf.groupby('extra').mean()['score'].sort_values(ascending=False)[:10]

extra
[412]         5090.0
[360]         5070.0
[215]         5070.0
[328]         5030.0
[316, 328]    5030.0
[458, 412]    5010.0
[75]          5000.0
[458, 360]    4970.0
[75, 412]     4960.0
[316, 412]    4950.0
Name: score, dtype: float64

## Boruta + Mutual Info score

In [29]:
idc2 = [101, 296, 328, 103, 412, 105,   0, 131, 351, 323]

In [30]:
combs = chain(*map(lambda x: combinations(idc2, x), range(1, 6)))
print(len(list(combs)))

637


In [31]:
rarr=[]

combs = chain(*map(lambda x: combinations(idc2, x), range(1, 6)))

for i, c in tqdm(enumerate(combs), total=637):
    selected_features = list(c)
    X_selected = X.iloc[:, selected_features]

    # Stratified KFold
    skf = StratifiedKFold(n_splits=5, shuffle=True)
    accuracies = []
    scores = []

    for train_index, test_index in skf.split(X_selected, y):
        X_train, X_test = X_selected.iloc[train_index], X_selected.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        y_train = y_train.to_numpy().flatten()
        y_test = y_test.to_numpy().flatten()

        X_train = X_train.to_numpy()
        X_test = X_test.to_numpy()

        # Standardize the data
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        for model in [GaussianNB()]:

            model.fit(X_train, y_train)

            y_pred = model.predict(X_test)
            accuracy = balanced_accuracy_score(y_test, y_pred)
            score = scoring_function(model, X_test, y_test)

            rarr.append({'i': i, 'accuracy': accuracy, 'score': score,
                         'extra': str(selected_features), 'model':model.__class__.__name__})

  0%|          | 0/637 [00:00<?, ?it/s]

In [32]:
resdf = pd.DataFrame(rarr, columns=['i', 'extra', 'accuracy', 'score'])
resdf.groupby('extra').mean()['score'].sort_values(ascending=False)[:10]

extra
[101, 103, 105]         6740.0
[101, 105]              6670.0
[101, 296, 103, 105]    6620.0
[101, 103, 105, 351]    6610.0
[101, 103, 105, 323]    6600.0
[103, 105]              6600.0
[101, 103, 105, 0]      6600.0
[101, 103]              6600.0
[101, 328, 103, 105]    6590.0
[101, 103, 412, 105]    6530.0
Name: score, dtype: float64