In [1]:
import numpy as np
import pandas as pd
import scipy
from sys import getsizeof

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

In [2]:
meta_train = pd.read_csv('./VSB_unpacked/metadata_train.csv')
df_train = pd.read_parquet('VSB_unpacked/train.parquet').T

In [3]:
df_train.index = df_train.index.map(int)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    df_train.index, 
    meta_train['target'], 
    stratify = meta_train['target'], 
    test_size = .2,
    random_state = 510
)

In [5]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6969,), (1743,), (6969,), (1743,))

In [6]:
y_train_resamp = y_train[y_train == 1]
X_train_resamp = X_train[y_train == 1]

y_test_resamp = y_test[y_test == 1]
X_test_resamp = X_test[y_test == 1]

In [7]:
X_train_resamp = np.concatenate([X_train_resamp, np.random.choice(X_train[y_train == 0], y_train.sum(), replace=False)])
y_train_resamp = np.concatenate([y_train_resamp, [0]* y_train.sum()])

In [8]:
X_test_resamp = np.concatenate([X_test_resamp, np.random.choice(X_test[y_test == 0], y_test.sum(), replace=False)])
y_test_resamp = np.concatenate([y_test_resamp, [0]* y_test.sum()])

X_train, X_test represent indices corresponding to rows in df_train data file. 

In [9]:
meta_train.head()

Unnamed: 0,signal_id,id_measurement,phase,target
0,0,0,0,0
1,1,0,1,0
2,2,0,2,0
3,3,1,0,1
4,4,1,1,1


In [10]:
df_train.shape

(8712, 800000)

In [11]:
X_train_data = df_train.iloc[X_train, :]
X_test_data = df_train.iloc[X_test, :]
X_train_resamp_data = df_train.iloc[X_train_resamp, :]
X_test_resamp_data = df_train.iloc[X_test_resamp, :]

In [12]:
X_test_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,799990,799991,799992,799993,799994,799995,799996,799997,799998,799999
4874,16,14,12,12,14,16,17,16,14,13,...,14,15,16,17,16,15,14,15,16,17
1222,-22,-23,-23,-22,-21,-21,-22,-21,-22,-22,...,-21,-22,-21,-21,-21,-21,-20,-21,-21,-20
6121,18,17,17,17,17,17,18,18,17,17,...,18,19,19,19,18,18,17,18,18,18
8065,-12,-12,-11,-12,-12,-13,-13,-13,-12,-12,...,-12,-12,-13,-13,-12,-11,-11,-12,-12,-12
7689,16,16,16,16,18,17,17,17,15,17,...,16,15,18,17,15,17,14,17,16,16


In [13]:
type(X_train_data), type(X_test_data), type(X_train_resamp_data), type(X_test_resamp_data), 

(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame)

In [14]:
type(y_test), type(y_train), type(y_train_resamp), type(y_test_resamp)

(pandas.core.series.Series,
 pandas.core.series.Series,
 numpy.ndarray,
 numpy.ndarray)

In [15]:
X_train_data = np.asarray(X_train_data)
X_test_data = np.asarray(X_test_data)
X_train_resamp_data = np.asarray(X_train_resamp_data)
X_test_resamp_data = np.asarray(X_test_resamp_data)
y_test = np.asarray(y_test)
y_train = np.asarray(y_train)
y_train_resamp = np.asarray(y_train_resamp)
y_test_resamp = np.asarray(y_test_resamp)

In [17]:
X_train_data.shape
X_train_resamp_data.shape

(840, 800000)

In [26]:
%%time
#np.save('./X_train_data', X_train_data)

CPU times: user 915 ms, sys: 27.8 s, total: 28.7 s
Wall time: 42.5 s


In [24]:
#np.save('./X_test_data', X_test_data)

In [25]:
%%time
#np.save('./X_train_resamp_data', X_train_resamp_data)

CPU times: user 120 ms, sys: 4.16 s, total: 4.28 s
Wall time: 6.42 s


In [27]:
%%time
#np.save('./y_test', y_test)

CPU times: user 881 µs, sys: 2.86 ms, total: 3.74 ms
Wall time: 4.21 ms


In [28]:
#np.save('./y_train', y_train)

In [29]:
#np.save('./y_train_resamp', y_train_resamp)
#np.save('./y_test_resamp', y_test_resamp)

In [18]:
y_test_resamp.shape

(210,)

In [17]:
X_test_resamp_data.shape

(210, 800000)

In [19]:
X_train_resamp_data.shape

(840, 800000)

In [20]:
840+210

1050

In [16]:
%%time
np.save('./X_test_resamp_data', X_test_resamp_data)

CPU times: user 25.4 ms, sys: 764 ms, total: 789 ms
Wall time: 1.14 s


In [63]:
params = {
    'min_samples_split': [2,3,5,7,10,13]
}

In [66]:
gs = GridSearchCV(RandomForestClassifier(n_estimators=100), n_jobs = 6, verbose = 2,
                  param_grid = params, 
                  return_train_score = True,
                  cv = 3)

In [None]:
gs.fit(X_train_data, y_train)
gs.best_params_

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
