In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import ParameterSampler
from scipy.stats import randint as sp_randint
from scipy.stats import uniform
from pathlib import Path
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import re
import random
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from src.models.utils import milling_add_y_label_anomaly, under_over_sampler
from src.models.random_search_setup import general_params
from src.models.classifiers import (
    rf_classifier,
    xgb_classifier,
    knn_classifier,
    lr_classifier,
    sgd_classifier,
    ridge_classifier,
    svm_classifier,
    nb_classifier,
)

from src.models.random_search_setup import (
    rf_params,
    xgb_params,
    knn_params,
    lr_params,
    sgd_params,
    ridge_params,
    svm_params,
    nb_params,
)

%load_ext autoreload
%autoreload 2

In [2]:
path_data_folder = Path().cwd().parent.parent / 'data'
print(path_data_folder)

/home/tim/Documents/feat-store/data


In [3]:
folder_raw_data_milling = path_data_folder / "raw/milling"
folder_interim_data_milling = path_data_folder / "interim/milling"
folder_processed_data_milling = path_data_folder / "processed/milling"

In [4]:
# read in the milling features to a pandas dataframe
df_feat = pd.read_csv(
    folder_processed_data_milling / "milling.csv",
    )

df_feat.head()

Unnamed: 0,cut_id,ae_spindle__length,ae_spindle__kurtosis,ae_table__length,ae_table__kurtosis,vib_spindle__length,vib_spindle__kurtosis,vib_table__length,vib_table__kurtosis,smcdc__length,smcdc__kurtosis,smcac__length,smcac__kurtosis,cut_no,case,tool_class
0,0_0,64.0,-0.348446,64.0,-0.190215,64.0,-0.126833,64.0,2.144662,64.0,-0.256599,64.0,-1.367461,0,1,0
1,0_1,64.0,0.724161,64.0,-0.236114,64.0,0.504015,64.0,2.357173,64.0,0.384966,64.0,-1.248971,0,1,0
2,0_10,64.0,0.412827,64.0,0.031749,64.0,0.536186,64.0,2.901181,64.0,-0.167447,64.0,-1.327537,0,1,0
3,0_11,64.0,-0.087264,64.0,-0.376136,64.0,0.511172,64.0,2.578223,64.0,-0.367538,64.0,-1.441127,0,1,0
4,0_12,64.0,1.01206,64.0,1.804934,64.0,-0.108143,64.0,2.400641,64.0,-0.038878,64.0,-1.279095,0,1,0


# Putting It Together

In [5]:
def get_classifier_and_params(classifier_string):
    if classifier_string == "rf":
        return rf_classifier, rf_params
    elif classifier_string == "xgb":
        return classifier, xgb_params
    elif classifier_string == "knn":
        return classifier, knn_params
    elif classifier_string == "lr":
        return classifier, lr_params
    elif classifier_string == "sgd":
        return classifier, sgd_params
    elif classifier_string == "ridge":
        return classifier, ridge_params
    elif classifier_string == "svm":
        return classifier, svm_params
    elif classifier_string == "nb":
        return classifier, nb_params
    else:
        raise ValueError("Classifier string not recognized")

In [6]:
Y_LABEL_COL = 'y'

# identify if there is another column you want to 
# stratify on, besides the y label
STRATIFICATION_GROUPING_COL = 'cut_no'

# list of the columns that are not features columns
# (not including the y-label column)
META_LABEL_COLS = ['cut_id', 'cut_no', 'case', 'tool_class']

RAND_SEARCH_ITER = 10

# set a seed for the parameter sampler
sampler_seed = random.randint(0, 2 ** 16)

# generate the list of parameters to sample over
train_params = list(
    ParameterSampler(
        general_params, n_iter=RAND_SEARCH_ITER, random_state=sampler_seed
    )
)[0]

uo_method = train_params['uo_method']
scaler_method = train_params['scaler_method']
imbalance_ratio = train_params['imbalance_ratio']
classifier = train_params['classifier']
print(f"classifier: {classifier}, uo_method: {uo_method}, imbalance_ratio: {imbalance_ratio}")

# get classifier and its parameters
clf_function, params = get_classifier_and_params(classifier)

# instantiate the model
clf, param_dict_raw, param_dict_named = clf_function(sampler_seed, params)


# load feature dataframe
df = pd.read_csv(
    folder_processed_data_milling / "milling.csv",
    )

# add y label
df = milling_add_y_label_anomaly(df)

# create the x dataframe that has the META_LABEL_COLS drop, along with the `y` column dropped
dfx = df.drop(META_LABEL_COLS + [Y_LABEL_COL], axis=1).copy()

# create the y dataframe
dfy = df[[Y_LABEL_COL]].copy()


if STRATIFICATION_GROUPING_COL is not None and STRATIFICATION_GROUPING_COL is not Y_LABEL_COL:
    df_strat = df[[STRATIFICATION_GROUPING_COL, Y_LABEL_COL]].drop_duplicates()

    skfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for train_index, test_index in skfolds.split(df_strat[['cut_no']], df_strat[['y']]):
        train_strat_vals = df_strat.iloc[train_index][STRATIFICATION_GROUPING_COL].values
        test_strat_vals = df_strat.iloc[test_index][STRATIFICATION_GROUPING_COL].values

        x_train = df[df[STRATIFICATION_GROUPING_COL].isin(train_strat_vals)]
        y_train = x_train[Y_LABEL_COL].values
        x_train = x_train.drop(META_LABEL_COLS + [Y_LABEL_COL], axis=1).values

        x_test = df[df[STRATIFICATION_GROUPING_COL].isin(train_strat_vals)]
        y_test = x_test[Y_LABEL_COL].values
        x_test = x_test.drop(META_LABEL_COLS + [Y_LABEL_COL], axis=1).values

        # scale the data (TO DO: create function)
        if scaler_method == "standard":
            scaler = StandardScaler()
            scaler.fit(x_train)
            x_train = scaler.transform(x_train)
            x_test = scaler.transform(x_test)
        elif scaler_method == "min_max":
            scaler = MinMaxScaler()
            scaler.fit(x_train)
            x_train = scaler.transform(x_train)
            x_test = scaler.transform(x_test)
        else:
            pass

        # under-over-sample the data
        x_train, y_train = under_over_sampler(
            x_train, y_train, method=uo_method, ratio=imbalance_ratio
        )

        


classifier: ridge, uo_method: random_under, imbalance_ratio: 1.0


TypeError: 'str' object is not callable

In [97]:
x_train.shape

(2738, 12)

In [25]:
train_params

{'uo_method': 'random_over',
 'scaler_method': 'standard',
 'imbalance_ratio': 0.8,
 'classifier': <function src.models.classifiers.gaussian_classifier(sampler_seed, gaussian_params)>}

In [56]:
test_strat_vals

array([116,  11, 125, 126, 134, 135, 136, 139,  13, 140, 152, 155, 164,
         1,  21,  24,  29,  44,  45,  47,  48,  51,  52,  57,  68,   6,
        75,  80,  81,  83,  86,  92,  93])

In [47]:
df_strat

Unnamed: 0,cut_no,y
0,0,0
70,100,0
136,101,0
202,102,0
268,103,0
...,...,...
11256,96,0
11318,97,0
11380,98,0
11434,99,0


In [41]:
train_strat_vals

array([  0, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,  10, 110,
       111, 112, 113, 114, 115, 116, 117, 118, 119,  11, 120, 121, 122,
       123, 124, 125, 126, 127, 128, 129,  12, 130, 131, 132, 133, 134,
       135, 136, 137, 138, 139,  13, 140, 141, 142, 143, 144, 145, 146,
       147, 148, 149,  14, 150, 151, 152, 153, 154, 155, 156, 157, 158,
       159,  15, 160, 161, 162, 163, 164, 165, 166,  16,  18,  19,   1,
        20,  21,  22,  23,  24,  25,  26,  27,  28,  29,   2,  30,  31,
        32,  33,  34,  35,  36,  37,  38,  39,   3,  40,  41,  42,  43,
        44,  45,  46,  47,  48,  49,   4,  50,  51,  52,  53,  54,  55,
        56,  57,  58,  59,   5,  60,  61,  62,  63,  64,  65,  66,  67,
        68,  69,   6,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
         7,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,   8,  90,
        91,  92,  93,  95,  96,  97,  98,  99,   9])

In [42]:
test_strat_vals

array([  0, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,  10, 110,
       111, 112, 113, 114, 115, 116, 117, 118, 119,  11, 120, 121, 122,
       123, 124, 125, 126, 127, 128, 129,  12, 130, 131, 132, 133, 134,
       135, 136, 137, 138, 139,  13, 140, 141, 142, 143, 144, 145, 146,
       147, 148, 149,  14, 150, 151, 152, 153, 154, 155, 156, 157, 158,
       159,  15, 160, 161, 162, 163, 164, 165, 166,  16,  18,  19,   1,
        20,  21,  22,  23,  24,  25,  26,  27,  28,  29,   2,  30,  31,
        32,  33,  34,  35,  36,  37,  38,  39,   3,  40,  41,  42,  43,
        44,  45,  46,  47,  48,  49,   4,  50,  51,  52,  53,  54,  55,
        56,  57,  58,  59,   5,  60,  61,  62,  63,  64,  65,  66,  67,
        68,  69,   6,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
         7,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,   8,  90,
        91,  92,  93,  95,  96,  97,  98,  99,   9])

In [15]:
skfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_index, cut_no_val  in skfolds.split(df[['cut_no']], df[['y']]):
    # print(cut_no_train)
    print(cut_no_val)

[  4   8   9  10  11  12  26  27  34  37  41  47  49  50  55  57  68  69
  72  75  76  92 101 110 111 114 115 119 123 142 148 159 162]
[ 13  14  23  25  30  35  51  60  63  67  70  73  78  81  85  91  95  96
  99 102 116 125 127 129 131 133 135 137 139 140 143 146 149]
[  0   1   6   7  15  19  21  24  33  36  42  46  53  59  64  84  88  89
  90  97  98 103 106 109 117 124 134 136 141 154 155 160 163]
[  2   3   5  16  17  20  31  32  48  52  54  56  62  65  66  74  80  83
  86  93  94 100 120 121 122 126 128 151 152 153 156 161 164]
[ 18  22  28  29  38  39  40  43  44  45  58  61  71  77  79  82  87 104
 105 107 108 112 113 118 130 132 138 144 145 147 150 157 158]


In [None]:
class TrainAnomalyClassifier:
    """
    Class for setting up and training an anomaly detection classifier.
    Utilizes a random search.
    """

In [16]:
cut_no_val

array([ 18,  22,  28,  29,  38,  39,  40,  43,  44,  45,  58,  61,  71,
        77,  79,  82,  87, 104, 105, 107, 108, 112, 113, 118, 130, 132,
       138, 144, 145, 147, 150, 157, 158])

In [19]:
df.iloc[cut_no_val]

Unnamed: 0,cut_no,y
1161,116,0
1461,11,0
1887,125,0
1953,126,0
2567,134,0
2642,135,0
2717,136,0
2942,139,0
3017,13,0
3087,140,1


In [None]:
def k_fold_other_column()