In [1]:
from exp.features import create_train_features
from exp.run import run_experiment
from exp.mappings import alg_map
from exp.train import train_model
import pandas as pd
import json
import numpy as np
import os

### Create Training Features

In [2]:
X_save = "X_tr.csv"
y_save = "y_tr.csv"
X_save_scaled = "X_tr_scaled.csv"
scale_params_pickle = "scale_params.pickle"
other_params_json = "other.json"
tr_scaler = None
classic_sta_lta5_mean_fill = None
classic_sta_lta7_mean_fill = None

if not (os.path.exists(X_save_scaled) and os.path.exists(y_save)):
    if os.path.exists(X_save) and os.path.exists(y_save):
        X_tr = pd.read_csv(X_save, index_col=0)
        y_tr = pd.read_csv(y_save, index_col=0)

        scale_params_pickle_on = open(scale_params_pickle, "rb")
        tr_scaler = pickle.load(scale_params_pickle_on)
        scale_params_pickle_on.close()
        
        X_train_scaled = pd.DataFrame(tr_scaler.transform(X_tr), columns=X_tr.columns)
        X_train_scaled.to_csv(X_save_scaled)
    else:
        X_tr, X_train_scaled, y_tr, tr_scaler, classic_sta_lta5_mean_fill, classic_sta_lta7_mean_fill  = create_train_features(r'C:\Users\arvin\dev\lanl\train.csv')
        X_tr.to_csv(X_save)
        y_tr.to_csv(y_save)
        X_train_scaled.to_csv(X_save_scaled)

        scale_params_pickle_on = open(scale_params_pickle, "wb")
        pickle.dump(tr_scaler, scale_params_pickle_on)
        scale_params_pickle_on.close()

        with open(other_params_json, 'w') as fp:
            json.dump({"classic_sta_lta5_mean_fill": classic_sta_lta5_mean_fill,
                       "classic_sta_lta7_mean_fill": classic_sta_lta7_mean_fill}, fp)
else:
    X_train_scaled = pd.read_csv(X_save_scaled, index_col=0)
    y_tr = pd.read_csv(y_save, index_col=0)

In [3]:
print(X_train_scaled.head())

       mean       std       max       min  mean_change_abs  mean_change_rate  \
0  1.424140 -0.170214 -0.218194  0.193218        -1.326420         -1.569265   
1  0.805716  0.004734  0.063936 -0.018037         0.002747         -1.040206   
2  1.511155  0.049252 -0.086289  0.163039        -0.218781          0.949925   
3  1.494934  0.043950  0.122560 -0.187796         0.002747         -0.634909   
4  1.520242  0.088495 -0.067969  0.087590        -0.108017          0.094279   

    abs_max  abs_min  std_first_50000  std_last_50000  ...  \
0 -0.222567      0.0         0.052067       -0.285579  ...   
1  0.036797      0.0         0.153858       -0.076987  ...   
2 -0.101306      0.0         0.004241        0.277885  ...   
3  0.097427      0.0         0.020852       -0.058805  ...   
4 -0.084464      0.0        -0.093104        0.174161  ...   

   std_roll_mean_1000  max_roll_mean_1000  min_roll_mean_1000  \
0            0.268470           -0.004742            0.178278   
1           -0.1

### hyper-parameter experiments

In [4]:
"""
Example of Cartesian Product of Hyper-parameters for Linear Regression

"lr": {"fit_intercept": [False, True], "normalize": [False, True]}

Cartesian Product: {fit_intercept} x {normalize}

Hyper-parameter choices:
"fit_intercept": False, "normalize": False
"fit_intercept": True, "normalize": False
"fit_intercept": False, "normalize": True
"fit_intercept": True, "normalize": True
"""

params={"lr": {"fit_intercept": [False, True], "normalize": [False, True]},
       "ridge": {"alpha": [.000001, .00001, .0001, .001, .01, .1, 1.0, 10, 100],
                 "fit_intercept": [False, True], "normalize": [False, True]},
       "lasso": {"alpha": [.000001, .00001, .0001, .001, .01, .1, 1.0, 10, 100],
                 "fit_intercept": [False, True], "normalize": [False, True],
                 "positive": [False, False, False, False, False, True],
                 "selection": ["cyclic", "cyclic", "cyclic", "cyclic", "cyclic", "random"]},
       "mtlasso": {"alpha": [.000001, .00001, .0001, .001, .01, .1, 1.0, 10, 100],
                   "fit_intercept": [False, True], "normalize": [False, True],
                   "selection": ["cyclic", "cyclic", "cyclic", "cyclic", "cyclic", "random"]},
       "elastic": {"alpha": [.000001, .00001, .0001, .001, .01, .1, 1.0, 10, 100],
                   "fit_intercept": [False, True], "normalize": [False, True], 
                   "positive": [False, False, False, False, False, True],
                   "l1_ratio": [.01, .99, .2, .4, .6, .8], 
                   "selection": ["cyclic", "cyclic", "cyclic", "cyclic", "cyclic", "random"]},
       "lars": {"fit_intercept": [False, True], "normalize": [False, True],
                "fit_path": [False], "n_nonzero_coefs": [10, 100, 500, 1000, 10000, np.inf]},
       "llars": {"alpha": [.000001, .00001, .0001, .001, .01, .1, 1.0, 10, 100],
                 "fit_intercept": [False, True], "normalize": [False, True],
                "fit_path": [False], "positive": [False, False, False, False, False, True]},
       "omp": {"fit_intercept": [False, True], "normalize": [False, True],
               "n_nonzero_coefs": [10, 100, None, None, None]},
       "sgdreg": {"loss": ["squared_loss", "squared_loss", "squared_loss", "huber", "epsilon_insensitive",
                           "squared_epsilon_insensitive"],
                  "penalty": ["none", "l2", "l1", "elasticnet"], 
                  "alpha": [.000001, .00001, .0001, .001, .01, .1, 1.0, 10, 100],
                  "l1_ratio": [.01, .99, .2, .4, .6, .8], "fit_intercept": [False, True],
                  "learning_rate": ["constant", "optimal", "optimal", "optimal", "invscaling", "adaptive"],
                  "eta0": [1.0, 10.0, .1, .01, .001, .0001],
                  "early_stopping": [False, False, False, False, True]},
       "pareg": {"C": [.001, .01, .1, 1.0, 1.0, 1.0, 10.0, 100.0],
                 "loss": ["epsilon_insensitive", "squared_epsilon_insensitive"],
                 "epsilon": [.01, .05, .1, .1, .1, .5],
                 "early_stopping": [False, False, False, False, True]},
        # "tsreg": {"fit_intercept": [False, True]},
        "hreg": {"epsilon": [1.1, 1.2, 1.35, 1.35, 1.35, 1.35, 1.5, 1.6, 1.8, 2.0, 2.5],
                 "alpha": [.000001, .00001, .0001, .001, .01, .1, 1.0, 10, 100],
                 "fit_intercept": [False, True]},
        "kreg": {"alpha": [.000001, .00001, .0001, .001, .01, .1, 1.0, 10, 100],
                 "kernel": ["linear", "linear", "poly", "rbf", "sigmoid"],
                 "gamma": [None, None, None, None, .001, .0001, .01, .1]}}

### PCA on feature vectors

In [5]:
print("X_train_scaled shape: ", X_train_scaled.shape)

#need to use numpy array for PCA
Ymat = y_tr.as_matrix()
Xmat = X_train_scaled.as_matrix()

from sklearn.decomposition import PCA

pca = PCA(n_components=50)
X_r = pca.fit(Xmat).transform(Xmat)
#print("X_r", X_r[1:5])
print(len(X_r))

print('PCA explained variance ratio (first 50 components): %s'
      % str(pca.explained_variance_ratio_))

print("cumulative variance PCA")
variance = pca.explained_variance_ratio_ #calculate variance ratios

var=np.cumsum(np.round(pca.explained_variance_ratio_, decimals=3)*100)
print(var) #cumulative sum of variance explained with [n] features


pca = PCA(n_components=40)
X_r = pca.fit(Xmat).transform(Xmat)
print("cumulative variance PCA - 40")
variance = pca.explained_variance_ratio_ #calculate variance ratios
var=np.cumsum(np.round(pca.explained_variance_ratio_, decimals=3)*100)
print(var) #cumulative sum of variance explained with [n] features

print("\nX_train_scaled.shape: ",X_train_scaled.shape)
print("(PCA) X_r.shape: ",X_r.shape)

X_train_scaled shape:  (4194, 138)
4194
PCA explained variance ratio (first 50 components): [4.67527491e-01 1.31896039e-01 8.03803322e-02 6.13857166e-02
 2.83457425e-02 2.18411289e-02 1.89761955e-02 1.74868597e-02
 1.61917596e-02 1.32777708e-02 1.24577647e-02 1.01964819e-02
 8.92283770e-03 7.74412458e-03 7.51586380e-03 7.06741373e-03
 6.64359994e-03 6.21740502e-03 5.73372613e-03 5.72723554e-03
 5.08934278e-03 4.68587039e-03 4.52714989e-03 4.22541710e-03
 3.97234346e-03 3.25515367e-03 3.07030383e-03 2.87927045e-03
 2.57634585e-03 2.40247232e-03 2.30163654e-03 2.21519737e-03
 2.04076390e-03 1.88802447e-03 1.68950822e-03 1.62597775e-03
 1.53856217e-03 1.28517069e-03 1.12919067e-03 1.09043990e-03
 9.21865509e-04 9.08570581e-04 8.58039676e-04 8.15359349e-04
 7.10127140e-04 6.64709283e-04 5.57369136e-04 5.42748514e-04
 4.54130642e-04 4.06653623e-04]
cumulative variance PCA
[46.8 60.  68.  74.1 76.9 79.1 81.  82.7 84.3 85.6 86.8 87.8 88.7 89.5
 90.3 91.  91.7 92.3 92.9 93.5 94.  94.5 95.  95.

#### first 40 principal components explains 98.9% of variance

### PCA on acoustic

In [7]:
train = pd.read_csv('train.csv', dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})

MemoryError: 

In [None]:
print(train.head())

In [2]:

#need to use numpy array for PCA
Ymat = y_tr.as_matrix()
Xmat = train.as_matrix()

from sklearn.decomposition import PCA

pca = PCA(n_components=50)
X_r = pca.fit(Xmat).transform(Xmat)
#print("X_r", X_r[1:5])
print(len(X_r))

print('PCA explained variance ratio (first 50 components): %s'
      % str(pca.explained_variance_ratio_))

print("cumulative variance PCA")
variance = pca.explained_variance_ratio_ #calculate variance ratios

var=np.cumsum(np.round(pca.explained_variance_ratio_, decimals=3)*100)
print(var) #cumulative sum of variance explained with [n] features


pca = PCA(n_components=40)
X_r = pca.fit(Xmat).transform(Xmat)
print("cumulative variance PCA - 40")
variance = pca.explained_variance_ratio_ #calculate variance ratios
var=np.cumsum(np.round(pca.explained_variance_ratio_, decimals=3)*100)
print(var) #cumulative sum of variance explained with [n] features

print("\nX_train_scaled.shape: ",X_train_scaled.shape)
print("(PCA) X_r.shape: ",X_r.shape)

NameError: name 'pd' is not defined

### K-Means

### SSA

### Scaling

In [10]:
#StandardScaler normalizes the features so that each feature has mean=0, std=1
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train_scaled)
y_tr = sc.fit_transform(y_tr)


from scipy.stats import boxcox
#can also normalize the data with the Box-Cox transformation
# transform training set
X_train_scaled = stats.boxcox(X_train_scaled)

NameError: name 'stats' is not defined

### Overlapping segments

In [4]:
#obtained from https://www.kaggle.com/alinealmeida/basic-feature-benchmark-with-quantiles-augmenting
train = pd.read_csv('train.csv', dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64})

print(train.head())
rows=150000
#stride of 75000
shift_step = int(np.floor(rows / 2))
segments = int(np.floor(train.shape[0] / rows))
segments_augmented = 2*segments - 1

X_train = pd.DataFrame(index=range(segments_augmented), dtype=np.float64,
                       columns=['ave', 'std', 'max', 'min','q95','q99', 'q05','q01'])
y_train = pd.DataFrame(index=range(segments_augmented), dtype=np.float64,
                       columns=['time_to_failure'])

for segment in tqdm(range(segments)):
    for do_shift in [False,True]:        
        if(do_shift):
            shift = shift_step
            idx = segments + segment            
            if(segment==segments-1): #last segment would be incomplete for the shifted version
                continue
        else:
            shift = 0
            idx = segment
        
        seg = train.iloc[segment*rows+shift:segment*rows+shift+rows]

        x = seg['acoustic_data'].values
        y = seg['time_to_failure'].values[-1]

        y_train.loc[idx, 'time_to_failure'] = y

        X_train.loc[idx, 'ave'] = x.mean()
        X_train.loc[idx, 'std'] = x.std()
        X_train.loc[idx, 'max'] = x.max()
        X_train.loc[idx, 'min'] = x.min()
        X_train.loc[idx, 'q95'] = np.quantile(x,0.95)
        X_train.loc[idx, 'q99'] = np.quantile(x,0.99)
        X_train.loc[idx, 'q05'] = np.quantile(x,0.05)
        X_train.loc[idx, 'q01'] = np.quantile(x,0.01)

NameError: name 'pd' is not defined

### Run Experiment

In [None]:
num_searches=20
n_fold=10
save_results= "exp.csv"

In [11]:
for alg in params.keys():
    print(alg)

lr
ridge
lasso
mtlasso
elastic
lars
llars
omp
sgdreg
pareg
hreg
kreg


In [None]:
for alg in params.keys():
    print(alg)
    score_df = run_experiment(X=X_train_scaled, Y=y_tr, n_fold=n_fold, alg=alg, alg_params=params[alg], search_type="random", num_searches=num_searches, save_results=save_results)

### Display models ranked by CV scores

In [None]:
score_df = score_df.sort_values(by="score", axis=0)
display(score_df)

### Load results from CSV File and re-produce models ranked by CV scores

In [None]:
score_df = pd.read_csv(save_results)
score_df = score_df.sort_values(by="score", axis=0)

In [None]:
display(score_df)

### Load best model from CSV File

In [None]:
# retrieve top scoring row
best = score_df.iloc[0]
display(best)

# retrieve model parameters from pandas row
alg = best["alg"]
params_json = best["params_json"]
print("alg: {}".format(alg))
print("params_json: {}".format(params_json))

# retrieve relevant values
alg_cls = alg_map[alg]
params = json.loads(params_json)

# initialize model
model = alg_cls(**params)

# train algorithm
train_model(X=X_train_scaled, Y=y_tr, n_fold=n_fold, model=model)