In [1]:
import pandas as pd
import numpy as np
from boruta import BorutaPy
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
x_train = pd.read_csv('../../../data/x_train.txt', header=None, sep=' ')
y_train = pd.read_csv('../../../data/y_train.txt', header=None, sep=' ')
df_train = pd.concat([x_train, y_train], axis=1)
df_train.columns = [ 'x'+str(i) for i in range(1, df_train.shape[1]) ] + ['y']    

initial_features =  ['x1', 'x10', 'x101', 'x102', 'x103', 'x104', 'x105', 'x106', 'x132', 'x140', 'x149', 'x153', 'x156', 'x176', 'x191', 'x2', 'x22', 'x221', 'x229', 'x253', 'x286', 'x3', 'x304', 'x322', 'x323', 'x324', 'x329', 'x336', 'x35', 'x352', 'x36', 'x4', 'x40', 'x404', 'x413', 'x423', 'x459', 'x463', 'x499', 'x5', 'x58', 'x6', 'x65', 'x7', 'x74', 'x8', 'x81', 'x9', 'x99']

df_train = df_train[initial_features + ['y']]


In [3]:
import concurrent.futures

# Assuming df_train is already defined
iterations = 100
num_cores = 16

def boruta_iteration(i, df_train):
    x_train, x_test, y_train, y_test = train_test_split(df_train.drop('y', axis=1), df_train['y'], test_size=0.2, random_state=i)
    bt = BorutaPy(n_estimators='auto', verbose=1, random_state=i, estimator=RandomForestClassifier())
    bt.fit(x_train.values, y_train.values)
    x_train_filtered = x_train.iloc[:, bt.support_]
    return x_train_filtered.columns

columns_selected = []

with concurrent.futures.ThreadPoolExecutor(max_workers=num_cores) as executor:
    futures = [executor.submit(boruta_iteration, i, df_train) for i in range(iterations)]
    for future in tqdm(concurrent.futures.as_completed(futures), total=iterations):
        columns_selected.append(future.result())



  0%|          | 0/100 [00:00<?, ?it/s]

Iteration: 1 / 100
Iteration: 1 / 100
Iteration: 1 / 100
Iteration: 1 / 100
Iteration: 1 / 100
Iteration: 1 / 100
Iteration: 1 / 100
Iteration: 1 / 100
Iteration: 1 / 100
Iteration: 1 / 100
Iteration: 1 / 100
Iteration: 1 / 100
Iteration: 1 / 100
Iteration: 1 / 100
Iteration: 1 / 100
Iteration: 1 / 100
Iteration: 2 / 100
Iteration: 2 / 100
Iteration: 2 / 100
Iteration: 2 / 100
Iteration: 2 / 100
Iteration: 2 / 100
Iteration: 2 / 100
Iteration: 2 / 100
Iteration: 2 / 100
Iteration: 2 / 100
Iteration: 2 / 100
Iteration: 2 / 100
Iteration: 2 / 100
Iteration: 2 / 100
Iteration: 2 / 100
Iteration: 2 / 100
Iteration: 3 / 100
Iteration: 3 / 100
Iteration: 3 / 100
Iteration: 3 / 100
Iteration: 3 / 100
Iteration: 3 / 100
Iteration: 3 / 100
Iteration: 3 / 100
Iteration: 3 / 100
Iteration: 3 / 100
Iteration: 3 / 100
Iteration: 3 / 100
Iteration: 3 / 100
Iteration: 3 / 100
Iteration: 3 / 100
Iteration: 3 / 100
Iteration: 4 / 100
Iteration: 4 / 100
Iteration: 4 / 100
Iteration: 4 / 100
Iteration: 4

In [4]:
columns_selected_ = [np.array(list(x)) for x in columns_selected]
columns_selected_ = [list(x) for x in columns_selected_]
columns_selected_ = pd.Series(columns_selected_)
columns_selected_

0     [x101, x102, x103, x104, x105, x106, x3, x4, x...
1      [x101, x102, x103, x104, x105, x106, x4, x6, x9]
2          [x101, x102, x103, x104, x105, x106, x4, x9]
3     [x10, x101, x102, x103, x104, x105, x106, x4, ...
4      [x101, x102, x103, x104, x105, x106, x4, x6, x9]
                            ...                        
95    [x1, x10, x101, x102, x103, x104, x105, x106, ...
96    [x101, x102, x103, x104, x105, x106, x3, x4, x...
97    [x10, x101, x102, x103, x104, x105, x106, x3, ...
98         [x101, x102, x103, x104, x105, x106, x4, x9]
99    [x10, x101, x102, x103, x104, x105, x106, x3, ...
Length: 100, dtype: object

In [5]:
list_all = []
for list_ in columns_selected_:
    for column in list_:
        list_all.append(column)
list_all = pd.Series(list_all)
list_all.value_counts()

# get the most common columns (the ones that were selected in most of the iterations)
list_all = list_all.value_counts()

# get most frequent column
max_freq = list_all[0]

# get the columns that were selected max_freq times
columns_selected = list_all[list_all == max_freq].index
columns_selected

  max_freq = list_all[0]


Index(['x101', 'x102', 'x103', 'x104', 'x105', 'x106', 'x9'], dtype='object')

In [8]:
list_all

x101    100
x102    100
x103    100
x104    100
x105    100
x106    100
x9      100
x4       86
x6       74
x3       70
x10      60
x7       18
x1       17
x221      4
x404      2
x5        2
x2        1
x352      1
x286      1
x463      1
Name: count, dtype: int64

In [12]:
# save the selected columns to json

import json

dict_columns = { 'columns': list(columns_selected),
                'method': f'boruta: {iterations} iterations of boruta, then pick only the columns, which were selected the most times (max_freq) (say a the most frequent column had selection count 95 and there were 4 such columns, then we only pick those 4)', 
                'iterations': iterations,
                'max_freq': int(max_freq) }

with open('results/3_boruta_columns_selected.json', 'w') as f:
    json.dump(dict_columns, f)
                

In [7]:
# # old code withotu multithreading
# #----------------------------------------------------------------------------------------------------------------

# iterations = 100
# columns_selected = []
# for i in tqdm(range(iterations)):
#     x_train, x_test, y_train, y_test = train_test_split(df_train.drop('y', axis=1), df_train['y'], test_size=0.2)
#     print(f'Iteration {i+1}/{iterations}')
#     bt = BorutaPy(n_estimators='auto', verbose=1, random_state=i, estimator=RandomForestClassifier())
#     print('Fitting Boruta...')
#     bt.fit(x_train.values, y_train.values)
#     print('Boruta fitted.')
#     x_train_filtered = x_train.iloc[:, bt.support_]
#     columns = x_train_filtered.columns
#     columns_selected.append(columns)