In [41]:
import pandas as pd
from sklearn.svm import SVR
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
import time
from boruta import boruta_py
import numpy as np
from sklearn.model_selection import cross_val_score
import winsound
import matplotlib.pyplot as plt
from boruta import boruta_py

In [42]:
settings = {
                "dataset":"data_after_cleaning.csv",
                "columns_dataset": "scores_after_multicol.csv",
                "wrapper_tech": {
                        'rfc':{},
                        'ga':{},
                        'Boruta':{
                            "n_estimators":'auto',
                            "verbose":2
                            },
                        'xgboost':{}
                },
                "models":
                    {
                        'svm':
                             {
                                  'kernel':"linear"
                             },
                        'rf':
                             {
                                     'random_state':101,
                                     'max_depth':10
                             }
                    },
                "scoring_type":'neg_mean_absolute_error',
                "cross_val_times":2,
                "datapoints":60000, #fill in all if you want to use all datapoints,
                "allow_creating_new_results_df":True, #only allow making new results df when True (risky to keep on true after testing)
                "threshold_xgboost": 0.1#0.006535948

}

In [31]:
data = pd.read_csv('data/' + settings["dataset"])


# columns = [column for column in pd.read_csv('data/'+settings["columns_dataset"])["features"].values if column != 'Intercept']

columns = list(data.columns)
columns.append("close_price_next_min") 
final_columns = [column for column in columns if column!="close_price_next_min"]
X = data[final_columns].values
y = data["close_price_next_min"].values

In [32]:
#reduce dataset if needed
if settings['datapoints'] is not 'all':
    X = X[:settings['datapoints']]
    y = y[:settings['datapoints']]
        

In [33]:
len(list(data.columns))

345

In [None]:
class Datahandling():
    def __init__(self,name_dataframe):
        self.name_dataframe = name_dataframe
        self.columns = ['iteration','wrapper','optimal_number_of_features','score_optimal_features','scoring_type',
                                            'cross_val_times','datapoints','params','running_time']
        try:
            self.current_df = pd.read_csv('results/'+name_dataframe)
            self.highest_iteration = max(self.current_df['iteration'].values)
        except Exception as e:
            #print(f"Error {e} occured")
            print('New dataframe will be made')
            if not settings["allow_creating_new_results_df"]:
                if not input("Type YES if this is oke") == "YES": 
                    raise TypeError("User decided to cancel process, so canceling. Program will crash bc no highest iter and df")
                else: #not pretty to have 2 times same else, but very tired at the moment
                    self.current_df = pd.DataFrame(columns=self.columns)
                    self.highest_iteration = 0
            
            else:
                self.current_df = pd.DataFrame(columns=self.columns)
                self.highest_iteration = 0
            
    def save_after_iter(self,final_model):
        final_model.to_csv('results/'+self.name_dataframe,index=False)
        
        
        
            
        
        
                                       
        


<h1> Method 2: Boruta

In [None]:
df_scores = pd.DataFrame(index=final_columns)
dataaa = Datahandling('scores_boruta_columns.csv')
highest_iter = dataaa.highest_iteration

df_models_scores = dataaa.current_df

if 'Boruta' in settings['wrapper_tech'].keys():
    t1 = time.time()
    type_model = 'rf'

    model = RandomForestRegressor(**settings['models']['rf'])


    borutaPY = boruta_py.BorutaPy(model,
                               n_estimators=settings['wrapper_tech']['Boruta']["n_estimators"],
                               verbose=settings['wrapper_tech']['Boruta']["verbose"])
    
    borutaPY.fit(X, y)     
    score =  np.mean(-1*cross_val_score(model,
                                       X[:,borutaPY.support_],
                                       y,
                                       cv=settings['cross_val_times'],
                                       scoring=settings['scoring_type']))
    scores = {column:{} for column in final_columns}
    for i in range(len(final_columns)):
        scores[final_columns[i]] = {
            f"selected_{type_model}":borutaPY.support_[i],
            f"ranking_{type_model}":borutaPY.ranking_[i], #selected features are always 1 
            f"tentative_{type_model}":borutaPY.support_weak_[i]} #Those for which Boruta could not justify whether they are relevant or not
           # "grid_score":borutaPY.grid_scores_[i],The cross-validation scores such that grid_scores_[i] corresponds to the CV score of the i-th subset of features.
            #"estimator":borutaPY.estimator_[i]
    
    summary = [{
        'wrapper':type_model,
        'iteration':highest_iter+1,
      #  'grid_scores': borutaPY.grid_scores_,
        'optimal_number_of_features':borutaPY.n_features_,
        'score_optimal_features':score,
        'scores':scores,

        'scoring_type':settings['scoring_type'],
         "cross_val_times":settings['cross_val_times'],
        "datapoints":settings['datapoints'],
        'params':settings['models'][type_model],
        'running_time':time.time() - t1
    }
    ]

    df_models_scores = df_models_scores.append(summary,ignore_index=True)
dataaa.save_after_iter(df_models_scores)
print(f"total running time: {time.time() - t1}")
        
        
    

    

# testing lex

In [None]:
from boruta import boruta_py

model = RandomForestRegressor(n_jobs=-1,  max_depth=10)

borutaPY = boruta_py.BorutaPy(model, n_estimators='auto',verbose=2,random_state=1)
    
borutaPY.fit(X, y)     

borutaPY.ranking_

In [43]:
data2 = data[:60000]
data2
data2['difference'] = data2['close_price_next_min']-data2['ETHBTC__ticker_info__close_price']
data2['dummy_next_start_time'] = data2['difference'].apply(lambda x: 1 if x > 0 else 0) 
data2 = data2.drop(columns=['difference','close_price_next_min','last_start_time'])

# y2= data2['close_price_next_min']
# X2 = data2.drop(columns=['close_price_next_min'])

# X_train, X_test = X2.loc[:int(0.8*len(data2))], X2.loc[int(0.8*len(data2)):]
# y_train, y_test = y2.loc[:int(0.8*len(data2))], y2.loc[int(0.8*len(data2)):]

train = data2.loc[:int(0.8*len(data2))-1]
test = data2.loc[len(train):]
test_results = pd.DataFrame(test['dummy_next_start_time'])
test = test.drop(columns=['dummy_next_start_time'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [44]:
data2

Unnamed: 0,ETHBTC__technical_analysis_candles__rsi,ETHBTC__technical_analysis_candles__macd,ETHBTC__technical_analysis_candles__signal,ETHBTC__technical_analysis_candles__macdhist,ETHBTC__technical_analysis_candles__sma_5,ETHBTC__technical_analysis_candles__sma_10,ETHBTC__technical_analysis_candles__sma_21,ETHBTC__technical_analysis_candles__sma_50,ETHBTC__technical_analysis_candles__sma_100,ETHBTC__technical_analysis_candles__sma_200,...,general_info__exchange_info__Turkish_Lira,general_info__exchange_info__New_Taiwan_Dollar,general_info__exchange_info__Ukrainian_hryvnia,general_info__exchange_info__Venezuelan_bolivar_fuerte,general_info__exchange_info__Vietnamese_dong,general_info__exchange_info__South_African_Rand,general_info__exchange_info__IMF_Special_Drawing_Rights,general_info__exchange_info__Silver_Troy_Ounce,general_info__exchange_info__Gold_Troy_Ounce,dummy_next_start_time
0,36.739428,-0.000003,-6.158753e-08,3.061318e-06,0.021819,0.021829,0.021828,0.021830,0.021792,0.021746,...,58304.408,292569.228,239251.021,2.420021e+09,2.266090e+08,144919.089,7082.754,546.108,6.221,1
1,36.364695,-0.000005,-1.014827e-06,3.812957e-06,0.021811,0.021827,0.021826,0.021829,0.021792,0.021746,...,58303.749,292539.523,239246.639,2.419976e+09,2.266805e+08,144894.981,7082.624,546.098,6.222,1
2,35.957189,-0.000006,-2.049515e-06,4.138754e-06,0.021808,0.021823,0.021824,0.021829,0.021793,0.021746,...,58309.177,292566.759,239268.913,2.420202e+09,2.267016e+08,144908.471,7083.283,546.148,6.223,1
3,30.947809,-0.000008,-3.283751e-06,4.936942e-06,0.021800,0.021817,0.021823,0.021828,0.021794,0.021747,...,58317.066,292606.338,239301.282,2.420529e+09,2.267322e+08,144928.074,7084.242,546.222,6.224,1
4,30.583641,-0.000010,-4.586828e-06,5.212308e-06,0.021794,0.021811,0.021820,0.021827,0.021795,0.021747,...,58328.985,292666.141,239350.191,2.421024e+09,2.267786e+08,144957.695,7085.690,546.334,6.225,1
5,28.358945,-0.000011,-5.950016e-06,5.452753e-06,0.021790,0.021805,0.021819,0.021826,0.021796,0.021748,...,58328.985,292666.141,239350.191,2.421024e+09,2.267786e+08,144957.695,7085.690,546.334,6.225,1
6,26.569413,-0.000013,-7.345631e-06,5.582460e-06,0.021785,0.021798,0.021816,0.021825,0.021797,0.021748,...,58337.416,292685.822,239366.279,2.421186e+09,2.267595e+08,144868.296,7086.166,546.251,6.224,1
7,22.472700,-0.000015,-8.879076e-06,6.133777e-06,0.021777,0.021792,0.021812,0.021823,0.021799,0.021748,...,58336.678,292682.119,239363.250,2.421156e+09,2.267567e+08,144866.463,7086.076,546.244,6.224,1
8,23.481108,-0.000016,-1.038233e-05,6.013023e-06,0.021772,0.021786,0.021809,0.021822,0.021800,0.021748,...,58340.526,292701.425,239379.039,2.421315e+09,2.267716e+08,144876.019,7086.544,546.280,6.225,0
9,31.411269,-0.000017,-1.163656e-05,5.016929e-06,0.021769,0.021782,0.021806,0.021821,0.021801,0.021749,...,58333.643,292666.890,239350.795,2.421030e+09,2.267449e+08,144858.925,7085.707,546.216,6.224,0


In [45]:
columns = list(data2.columns)
columns.append("dummy_next_start_time") 
final_columns = [column for column in columns if column!="dummy_next_start_time"]
X2 = data2[final_columns].values
y2 = data2["dummy_next_start_time"].values

In [46]:
#reduce dataset if needed
if settings['datapoints'] is not 'all':
    X2 = X2[:settings['datapoints']]
    y2 = y2[:settings['datapoints']]
        

In [47]:

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

In [None]:
# def load_data():
#     # URLS for dataset via UCI
#     train_data_url='https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.data'
#     train_label_url='https://archive.ics.uci.edu/ml/machine-learning-databases/madelon/MADELON/madelon_train.labels'

#     X_data = pd.read_csv(train_data_url, sep=" ", header=None)
#     y_data = pd.read_csv(train_label_url, sep=" ", header=None)
#     data = X_data.loc[:, :499]
#     data['target'] = y_data[0]
#     return data

In [None]:
# data = load_data()

In [None]:
# data2

In [None]:
# y = data.pop('target')
# X = data.copy().values

In [49]:
rf = RandomForestClassifier(n_jobs=-1, max_depth=10, random_state=0)
feat_selector = BorutaPy(rf, n_estimators=100, verbose=2, random_state=0)

In [50]:
feat_selector.fit(X2, y2)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	343
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	343
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	343
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	343
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	343
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	343
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	343
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	49
Rejected: 	294
Iteration: 	9 / 100
Confirmed: 	16
Tentative: 	33
Rejected: 	294
Iteration: 	10 / 100
Confirmed: 	16
Tentative: 	33
Rejected: 	294
Iteration: 	11 / 100
Confirmed: 	16
Tentative: 	33
Rejected: 	294
Iteration: 	12 / 100
Confirmed: 	18
Tentative: 	23
Rejected: 	302
Iteration: 	13 / 100
Confirmed: 	18
Tentative: 	23
Rejected: 	302
Iteration: 	14 / 100
Confirmed: 	18
Tentative: 	23
Rejected: 	302
Iteration: 	15 / 100
Confirmed: 	18
Tentative: 	23
Rejected: 	302
Iteration: 	16 / 100
Confirmed: 	1

BorutaPy(alpha=0.05,
         estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                          class_weight=None, criterion='gini',
                                          max_depth=10, max_features='auto',
                                          max_leaf_nodes=None, max_samples=None,
                                          min_impurity_decrease=0.0,
                                          min_impurity_split=None,
                                          min_samples_leaf=1,
                                          min_samples_split=2,
                                          min_weight_fraction_leaf=0.0,
                                          n_estimators=100, n_jobs=-1,
                                          oob_score=False,
                                          random_state=RandomState(MT19937) at 0x2010DE68378,
                                          verbose=0, warm_start=False),
         max_iter=100, n_estimators=100,

In [55]:
feat_selector

AttributeError: 'BorutaPy' object has no attribute 'ImpHistory'

In [22]:

# Check selected features
print(feat_selector.support_)
# Select the chosen features from our dataframe.
selected = X2[:, feat_selector.support_]


[ True False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False Fa

In [23]:
feat_selector.ranking_

array([  1,  70, 180,  10,  78,  40,  48,  81, 179, 149, 123, 119,  87,
       103, 185, 165, 171, 136, 226,   8,  84, 153,  25,  14, 143,  57,
        70,   2,   5,  25,  75,  17,  47,  81, 191,  70, 273, 315,  48,
       110, 112,  78, 239, 219, 315, 315, 281, 315, 315, 315, 281, 315,
       315, 315, 267, 315, 315, 315, 267, 315, 315, 315, 315, 315, 315,
       315, 281, 315,  51,  16,  65,  39,  97,  57,   7,  43,  30, 237,
       281, 244, 259, 221, 214, 190, 211, 236, 226, 234, 232, 230, 248,
       249, 217, 247, 258, 241, 252, 267, 225, 223, 239,  74,  92,  19,
        34,   3,  93,  35,  20,  27, 315, 243, 315, 238, 254, 130, 165,
       219, 231, 267, 267, 140, 250, 281, 262, 200, 260, 267, 262, 257,
       315, 281, 255, 273, 315, 267, 256, 197, 251, 273, 260, 253, 273,
       315, 315,  54, 159, 315, 315, 315, 315, 315, 315, 315, 315, 120,
       315, 315, 104,  76, 152, 151, 315, 315, 315, 315, 315, 315, 315,
       113, 116, 173, 315, 315, 315, 221,  91,  93,  44,  68,  5

In [24]:
data2.columns

Index(['ETHBTC__technical_analysis_candles__rsi',
       'ETHBTC__technical_analysis_candles__macd',
       'ETHBTC__technical_analysis_candles__signal',
       'ETHBTC__technical_analysis_candles__macdhist',
       'ETHBTC__technical_analysis_candles__sma_5',
       'ETHBTC__technical_analysis_candles__sma_10',
       'ETHBTC__technical_analysis_candles__sma_21',
       'ETHBTC__technical_analysis_candles__sma_50',
       'ETHBTC__technical_analysis_candles__sma_100',
       'ETHBTC__technical_analysis_candles__sma_200',
       ...
       'general_info__exchange_info__Turkish_Lira',
       'general_info__exchange_info__New_Taiwan_Dollar',
       'general_info__exchange_info__Ukrainian_hryvnia',
       'general_info__exchange_info__Venezuelan_bolivar_fuerte',
       'general_info__exchange_info__Vietnamese_dong',
       'general_info__exchange_info__South_African_Rand',
       'general_info__exchange_info__IMF_Special_Drawing_Rights',
       'general_info__exchange_info__Silver_Troy_Ou