In [1]:
# Imports
import pandas as pd
import numpy as np
from IPython.display import HTML, display
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection

# Definitions
# Show floats with three decimals in every pandas display.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Render matplotlib figures inside the notebook.
%matplotlib inline
# NOTE(review): this silences every Python warning for the whole session,
# including deprecation warnings raised by pandas / scikit-learn.
warnings.simplefilter('ignore')
# Not referenced in the visible cells; presumably the n_jobs value for later
# model fitting (-1 conventionally means "all cores") -- TODO confirm.
N_JOBS = -1
# Seed passed as random_state to the projections and to the hold-out sampling.
SEED = 420
# Offset added to y before log1p in the (currently disabled) target transform.
SHIFT = 5
# The hold-out validation set is 1 / FOLDS of the training rows.
FOLDS = 10

In [2]:
# Load the raw train / test files
train = pd.read_csv("raw_data/train.csv")
test = pd.read_csv("raw_data/test.csv")

# Sanity check: dimensions and first two rows, train first and then test
for frame in (train, test):
    display(frame.shape)
    display(frame.head(2))

# Keep the test IDs aside so predictions can be matched to them later
test_ids = test["ID"]

(4209, 378)

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0


(4209, 377)

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0


In [3]:
# Remove outliers?
# Two alternatives are kept in this cell; only METHOD 2 (no filtering) is active.

# METHOD 1 is disabled by wrapping it in a bare string literal: none of it
# runs. When enabled it keeps only the training rows with y < 250 and prints
# how many rows were removed.
'''# METHOD 1 : remove the one biggest outlier
oldNbRows = train.shape[0]
train = train[train.y < 250]
print("Removing the most extreme outliers : " + str(oldNbRows - train.shape[0]))
print("train : " + str(train.shape))'''

# METHOD 2 : do nothing


'# METHOD 1 : remove the one biggest outlier\noldNbRows = train.shape[0]\ntrain = train[train.y < 250]\nprint("Removing the most extreme outliers : " + str(oldNbRows - train.shape[0]))\nprint("train : " + str(train.shape))'

In [4]:
# Define target

# METHOD 1 : keep y on its original scale, as an independent one-column frame
y = train[["y"]].copy()

'''# METHOD 2 : apply a transfo on y to make it more "normal"
y = pd.DataFrame({"y": np.log1p(train.y + SHIFT)}, columns = ["y"])'''

'# METHOD 2 : apply a transfo on y to make it more "normal"\ny = pd.DataFrame({"y": np.log1p(train.y + SHIFT)}, columns = ["y"])'

In [5]:
# Add median/mean/min of y for X0-X8
#
# For each selected categorical column, add a feature "y_<col>" holding a
# statistic of y over all training rows sharing that category level.
#
# The former copy-pasted METHOD 1 / 2 / 3 variants are replaced by the two
# settings below:
#   TARGET_STAT       "mean" (METHOD 1), "median" (METHOD 2) or "min" (METHOD 3)
#   TARGET_STAT_COLS  any subset of ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"]
#
# NOTE(review): the statistic is computed on the full training set, so each
# training row's feature includes its own y.
TARGET_STAT = "median"
TARGET_STAT_COLS = ["X0"]

for cat_col in TARGET_STAT_COLS:
    feat_col = "y_" + cat_col.lower()   # e.g. "X0" -> "y_x0"

    # One value per category level, indexed by the level.
    stat_by_level = train.groupby(cat_col)["y"].agg(TARGET_STAT)

    # Series.map keeps the row order and the index of train / test untouched.
    # The previous pd.merge(..., how = "outer") sorted train by the join key
    # and reset its index, so train no longer lined up with the separate y
    # frame nor with the positional hold-out split done further down.
    # Assigning a column is also idempotent: re-running the cell overwrites
    # the feature instead of adding suffixed duplicates.
    train[feat_col] = train[cat_col].map(stat_by_level)
    test[feat_col] = test[cat_col].map(stat_by_level)

    # Levels that only exist in test have no training statistic and come out
    # as NaN. (The count below covers test only.)
    print("Nb of NA in train + test : " + str(test[feat_col].isnull().sum()))

    # Fill them with the same statistic taken over the test rows that did get
    # a value. Assign the result back: fillna(inplace = True) on a column
    # selection is a chained assignment and does not update the frame under
    # pandas Copy-on-Write.
    fill_value = test[feat_col].dropna().agg(TARGET_STAT)
    test[feat_col] = test[feat_col].fillna(fill_value)


Nb of NA in train + test : 6


'# METHOD 3\ntest["y_x0"].fillna(test["y_x0"].dropna().min(), inplace = True)\ntest["y_x1"].fillna(test["y_x1"].dropna().min(), inplace = True)\ntest["y_x2"].fillna(test["y_x2"].dropna().min(), inplace = True)\ntest["y_x3"].fillna(test["y_x3"].dropna().min(), inplace = True)\ntest["y_x4"].fillna(test["y_x4"].dropna().min(), inplace = True)\ntest["y_x5"].fillna(test["y_x5"].dropna().min(), inplace = True)\ntest["y_x6"].fillna(test["y_x6"].dropna().min(), inplace = True)\ntest["y_x8"].fillna(test["y_x8"].dropna().min(), inplace = True)'

In [10]:
# Spot-check the y_x0 value given to three X0 levels in the test set
for level in ("ae", "ag", "g"):
    print(test.loc[test.X0 == level]["y_x0"])


3281   92.340
Name: y_x0, dtype: float64
311   92.340
Name: y_x0, dtype: float64
1854   110.750
3058   110.750
3079   110.750
Name: y_x0, dtype: float64


In [None]:
# Handle cat features
# Three encodings are kept in this cell; METHOD 1 and METHOD 2 are disabled by
# wrapping them in bare string literals, so only METHOD 3 runs.

# METHOD 1 one-hot encodes train and test together, then splits them back by
# row position.
# NOTE(review): as written it rebuilds train from a frame that had "y" dropped,
# so later cells calling train.drop("y", ...) would fail if it were enabled.
'''# METHOD 1 : convert into dummy variables
alldata = pd.concat([train.drop("y", axis = 1), test], axis = 0)
alldata = pd.get_dummies(alldata).astype(int)
train = alldata.iloc[:train.shape[0], :]
test = alldata.iloc[train.shape[0]: , :]'''

# METHOD 2 fits one LabelEncoder per object column on the train + test values
# and replaces the column by the resulting integer labels.
'''# METHOD 2 : use LabelEncoder (assumes some order in values)
for c in train.drop("y", axis = 1).columns:
    if train[c].dtype == "object" :
        lbl = LabelEncoder() 
        lbl.fit(list(train[c].values) + list(test[c].values)) 
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))'''

# METHOD 3 : use LabelEncoder with other order (after a will be b, not aa)
def encodeLetters(charcode) :
    """Turn a lowercase letter code into its zero-based "spreadsheet column" rank.

    The code is read as a bijective base-26 number (a = 1 ... z = 26) and then
    shifted to start at zero, so the ordering is
    a, b, ..., z, aa, ab, ..., az, ba, ...

        "a" -> 0, "z" -> 25, "aa" -> 26, "ab" -> 27, "az" -> 51, "ba" -> 52

    The zero-based shift is applied once per code. It used to be applied once
    per character, which made every multi-letter code collide with a shorter
    one (for instance "aa" and "z" both gave 25).

    Parameters
    ----------
    charcode : value converted with str(); expected to contain only the
        letters a-z. Other characters are not validated.

    Returns
    -------
    int
        The zero-based rank; -1 for an empty string.
    """
    code = 0
    for letter in str(charcode) :
        # Horner form of sum(digit * 26 ** position), with digits a=1 ... z=26
        code = code * 26 + (ord(letter) - ord("a") + 1)
    # Shift to zero-based exactly once
    return(code - 1)

# Replace every categorical (object-dtype) column of train and test by its
# encodeLetters rank. The mapping is a pure function of the value, so train
# and test get consistent codes without fitting anything; the LabelEncoder
# that used to be fitted here was never used and has been dropped.
for c in train.drop("y", axis = 1).columns:
    if train[c].dtype == "object" :
        train[c] = train[c].apply(encodeLetters)
        test[c] = test[c].apply(encodeLetters)


In [None]:
# Remove features with no information (unique value in train set) ?

# METHOD 1 : Remove those
# Collect every feature that takes a single distinct value in train, then
# drop those columns from both frames.
candidate_cols = train.drop("y", axis = 1).columns
constant_vars = [name for name in candidate_cols if train[name].nunique() == 1]
print("Removing constant variables : " + str(constant_vars))
for frame in (train, test):
    frame.drop(constant_vars, axis = 1, inplace = True)

'''# METHOD 2 : do nothing'''


In [None]:
# Handle duplicate variables
def findDuplicateVars(df) :
    """List the columns of df whose values repeat an earlier column.

    Every pair (earlier, later) of columns is compared with np.array_equal;
    for each equal pair the later column's name is recorded. A column equal
    to several earlier ones therefore appears several times in the result.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    list
        Names of the later column of each equal pair, in scan order.
    """
    names = list(df.columns)
    duplicates = []
    for pos, earlier in enumerate(names[:-1]) :
        reference = df[earlier].values
        duplicates.extend(
            later for later in names[pos + 1:]
            if np.array_equal(reference, df[later].values)
        )
    return duplicates


# METHOD 1 : Drop variables which are duplicate in train + test
# Duplicates are searched on the stacked train + test features, so a column is
# only dropped when it repeats another one on every row of both sets.
old_nb_vars = train.shape[1]
alldata = pd.concat([train.drop("y", axis = 1), test], axis = 0)
removed_vars = list(set(findDuplicateVars(alldata)))
for frame in (train, test):
    frame.drop(removed_vars, axis = 1, inplace = True)
nb_removed = old_nb_vars - train.shape[1]
print("Removed " + str(nb_removed) + " duplicate variables")
print(sorted(removed_vars))

'''# METHOD 2 : Drop variables which are duplicate just in train
old_nb_vars = train.shape[1]
removed_vars = findDuplicateVars(train)
removed_vars = list(set(removed_vars))
train.drop(removed_vars, axis = 1, inplace = True)
test.drop(removed_vars, axis = 1, inplace = True)
print("Removed " + str(old_nb_vars - train.shape[1]) + " duplicate variables")
print(sorted(removed_vars))'''

'''# METHOD 3 : do nothing'''



In [None]:
# Add columns with count of 1s for each binary col
# Every column except the ID, the target and the eight categorical columns is
# treated as binary.
# NOTE(review): that includes any engineered column added earlier (such as
# y_x0); it only contributes to the count on rows where it equals exactly 1.
non_binary_cols = ["ID", "y", "X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"]
binary_vars = list(set(train.columns.drop(non_binary_cols)))
for frame in (train, test):
    frame["bin_ones"] = frame[binary_vars].eq(1).astype(int).sum(axis = 1)

In [None]:
# Add the top N_COMP components of five projections of the feature matrix:
# PCA, ICA, truncated SVD, Gaussian random projection, sparse random projection
N_COMP = 12

pca = PCA(n_components = N_COMP, random_state = SEED)
ica = FastICA(n_components = N_COMP, random_state = SEED)
tsvd = TruncatedSVD(n_components = N_COMP, random_state = SEED)
grp = GaussianRandomProjection(n_components = N_COMP, random_state = SEED)
srp = SparseRandomProjection(n_components = N_COMP, random_state = SEED)
projections = [("pca", pca), ("ica", ica), ("tsvd", tsvd), ("grp", grp), ("srp", srp)]

# Fit every projection before adding any component column, so that all of
# them see the same set of input features. Each model is fitted on train
# (without y) and then applied to test.
train_comps = {}
test_comps = {}
for prefix, model in projections :
    train_comps[prefix] = model.fit_transform(train.drop("y", axis = 1))
    test_comps[prefix] = model.transform(test)

# Append the components interleaved by rank: pca_1, ica_1, tsvd_1, grp_1,
# srp_1, pca_2, ... (column names are 1-based)
for rank in range(N_COMP) :
    for prefix, _ in projections :
        col_name = prefix + "_" + str(rank + 1)
        train[col_name] = train_comps[prefix][:, rank]
        test[col_name] = test_comps[prefix][:, rank]


In [None]:
# Handle duplicate rows
# Everything in this cell is disabled (wrapped in bare string literals);
# METHOD 4, doing nothing, is what currently applies.
#
# The disabled variants group the training rows on every column except ID and
# y, and collapse each group to one row whose y is the mean / median / minimum
# of the group.
# NOTE(review): enabling one of them changes the rows of train, but the
# separate y frame built in the "Define target" cell is not recomputed here.
'''cols_to_groupby = [k for k in train.columns if k not in ["ID", "y"]] 
cols_to_apply = cols_to_groupby + ["y"]'''

'''# METHOD 1 : take mean of duplicate rows
train = train[cols_to_apply].groupby(cols_to_groupby, as_index = False).mean()
train.columns = cols_to_apply'''

'''# METHOD 2 : take median of duplicate rows
train = train[cols_to_apply].groupby(cols_to_groupby, as_index = False).median()
train.columns = cols_to_apply'''

'''# METHOD 3 : keep the row with the lowest y 
train = train[cols_to_apply].groupby(cols_to_groupby, as_index = False).min()
train.columns = cols_to_apply'''

# METHOD 4 : do nothing


In [None]:
# Disabled cell (bare string literal, nothing runs): it would stack train on
# top of test into a single frame named alldata.
'''# Concatenate train and test for global preprocessing
alldata = pd.concat([train, test], axis = 0)'''

In [None]:
# Disabled cell (bare string literal, nothing runs): it would list the
# object-dtype columns of test as categorical features and all other columns
# as numerical features, printing the size of each group.
'''# Differentiate numerical features and categorical features
cat_feats = test.select_dtypes(include = ["object"]).columns
print("Categorical features : " + str(len(cat_feats)))
num_feats = test.select_dtypes(exclude = ["object"]).columns
print("Numerical features : " + str(len(num_feats)))'''

In [None]:
# Disabled cell (bare string literal, nothing runs): it would split alldata
# back into train and test by row position, using the row count of train as
# the boundary.
'''# Redifferentiate train and test sets
train = alldata.iloc[:train.shape[0], :]
test = alldata.iloc[train.shape[0]: , :]
cols = train.columns'''

In [None]:
# Add AND and OR operations on top/all binary features

In [None]:
# Add simpler groups of cat features (diminish cardinality)

In [None]:
# Create Validation set for ensembling
# Hold out 1 / FOLDS of the training rows. The features and the target are
# sampled separately with the same fraction and seed; this assumes train and
# y hold the same rows in the same order -- verify if rows were filtered,
# merged or grouped in an earlier cell.
holdout_frac = (1 / FOLDS)
features = train.drop("y", axis = 1)

X_test = features.sample(frac = holdout_frac, random_state = SEED)
X_train = features.drop(X_test.index, axis = 0)

X_test_y = y.sample(frac = holdout_frac, random_state = SEED)
X_train_y = y.drop(X_test_y.index, axis = 0)


In [None]:
# Disabled cell (bare string literal, nothing runs): it would drop y from
# train in place, then write the full training features and target, the
# train / hold-out split, the test features and the test IDs as CSV files
# under clean_data/, without the index.
'''# Write data in CSV files
train.drop("y", axis = 1, inplace = True)

train.to_csv("clean_data/train" + ".csv", index = False)
y.to_csv("clean_data/y.csv", index = False)

X_train.to_csv("clean_data/X_train" + ".csv", index = False)
X_train_y.to_csv("clean_data/X_train_y.csv", index = False)

X_test.to_csv("clean_data/X_test" + ".csv", index = False)
X_test_y.to_csv("clean_data/X_test_y.csv", index = False)

test.to_csv("clean_data/test" + ".csv", index = False)
test_ids.to_csv("clean_data/test_ids.csv", index = False)'''


In [None]:
'''dc1 = {
    "probe" : "false",
    "log_target" : -1,
    "encode_cats" : "LE+",
    "constant_vars" : "with",
    "dupli_vars" : "with",
    "binary_counts" : "without",
    "mean_x" : 0,
    "dupli_rows" : False,
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc2 = {
    "probe" : "false",
    "log_target" : -1,
    "encode_cats" : "LE",
    "constant_vars" : "with",
    "dupli_vars" : "with",
    "binary_counts" : "without",
    "mean_x" : 0,
    "dupli_rows" : False,
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc3 = {
    "probe" : "false",
    "log_target" : -1,
    "encode_cats" : "dummies",
    "constant_vars" : "with",
    "dupli_vars" : "with",
    "binary_counts" : "without",
    "mean_x" : 0,
    "dupli_rows" : False,
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc4 = {
    "probe" : "false",
    "log_target" : -1,
    "encode_cats" : "drop",
    "constant_vars" : "with",
    "dupli_vars" : "with",
    "binary_counts" : "without",
    "mean_x" : 0,
    "dupli_rows" : False,
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc5 = {
    "probe" : "false",
    "log_target" : 0,
    "encode_cats" : "LE",
    "constant_vars" : "with",
    "dupli_vars" : "with",
    "binary_counts" : "without",
    "mean_x" : 0,
    "dupli_rows" : False,
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc6 = {
    "probe" : "false",
    "log_target" : 0,
    "encode_cats" : "dummies",
    "constant_vars" : "with",
    "dupli_vars" : "with",
    "binary_counts" : "without",
    "mean_x" : 0,
    "dupli_rows" : False,
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc7 = {
    "probe" : "false",
    "log_target" : 0,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "with",
    "binary_counts" : "without",
    "mean_x" : 0,
    "dupli_rows" : False,
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc8 = {
    "probe" : "false",
    "log_target" : 0,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "with",
    "binary_counts" : "without",
    "mean_x" : 0,
    "dupli_rows" : False,
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc9 = {
    "probe" : "false",
    "log_target" : 0,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "without",
    "mean_x" : 0,
    "dupli_rows" : False,
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc10 = {
    "probe" : "false",
    "log_target" : 0,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "without",
    "mean_x" : 0,
    "dupli_rows" : False,
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc11 = {
    "probe" : "false",
    "log_target" : 0,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 0,
    "dupli_rows" : False,
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc12 = {
    "probe" : "false",
    "log_target" : 0,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 0,
    "dupli_rows" : False,
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc13 = {
    "probe" : "true",
    "log_target" : 0,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 0,
    "dupli_rows" : False,
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc14 = {
    "probe" : "true",
    "log_target" : 0,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 0,
    "dupli_rows" : False,
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc15 = {
    "probe" : "true",
    "log_target" : 5,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 0,
    "dupli_rows" : False,
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc16 = {
    "probe" : "true",
    "log_target" : 5,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 0,
    "dupli_rows" : False,
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc17 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 0,
    "dupli_rows" : False,
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc18 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 0,
    "dupli_rows" : False,
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc19 = {
    "probe" : "true",
    "log_target" : 50,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 0,
    "dupli_rows" : False,
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc20 = {
    "probe" : "true",
    "log_target" : 50,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 0,
    "dupli_rows" : False,
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc21 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 0,
    "dupli_rows" : "min",
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc22 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 0,
    "dupli_rows" : "min",
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc23 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 0,
    "dupli_rows" : "median",
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc24 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 0,
    "dupli_rows" : "median",
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc25 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 0,
    "dupli_rows" : "mean",
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc26 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 0,
    "dupli_rows" : "mean",
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc27 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 1,
    "dupli_rows" : "median",
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc28 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 1,
    "dupli_rows" : "median",
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc29 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc30 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc31 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 2,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc32 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 2,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc33 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 4,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc34 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 4,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc35 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 6,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc36 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 6,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc37 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 8,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc38 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 8,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc39 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc40 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc41 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 12,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc42 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 12,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc43 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 2,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc44 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 2,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc45 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 4,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc46 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 4,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc47 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 6,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc48 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 6,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc49 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 8,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc50 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 8,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc51 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 10,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc52 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 10,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc53 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 0,
    "tsvd" : 2,
    "grp" : 0,
    "srp" : 0,
}

dc54 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 0,
    "tsvd" : 2,
    "grp" : 0,
    "srp" : 0,
}

dc55 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 0,
    "tsvd" : 6,
    "grp" : 0,
    "srp" : 0,
}

dc56 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 0,
    "tsvd" : 6,
    "grp" : 0,
    "srp" : 0,
}

dc57 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 0,
    "tsvd" : 10,
    "grp" : 0,
    "srp" : 0,
}

dc58 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 0,
    "tsvd" : 10,
    "grp" : 0,
    "srp" : 0,
}

dc59 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 0,
    "tsvd" : 10,
    "grp" : 2,
    "srp" : 0,
}

dc60 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 0,
    "tsvd" : 10,
    "grp" : 2,
    "srp" : 0,
}

dc61 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 0,
    "tsvd" : 10,
    "grp" : 6,
    "srp" : 0,
}

dc62 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 0,
    "tsvd" : 10,
    "grp" : 6,
    "srp" : 0,
}

dc63 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 0,
    "tsvd" : 10,
    "grp" : 10,
    "srp" : 0,
}

dc64 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 0,
    "tsvd" : 10,
    "grp" : 10,
    "srp" : 0,
}

dc65 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 0,
    "tsvd" : 10,
    "grp" : 10,
    "srp" : 2,
}

dc66 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 0,
    "tsvd" : 10,
    "grp" : 10,
    "srp" : 2,
}

dc67 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 0,
    "tsvd" : 10,
    "grp" : 10,
    "srp" : 6,
}

dc68 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 0,
    "tsvd" : 10,
    "grp" : 10,
    "srp" : 6,
}

dc69 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 0,
    "tsvd" : 10,
    "grp" : 10,
    "srp" : 10,
}

dc70 = {
    "probe" : "true",
    "log_target" : 10,
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "mean_x" : 8,
    "dupli_rows" : "median",
    "pca" : 10,
    "ica" : 0,
    "tsvd" : 10,
    "grp" : 10,
    "srp" : 10,
}


data_configs = {
#    "dc1" : dc1,
#    "dc2" : dc2,
#    "dc3" : dc3,
#    "dc4" : dc4,
#    "dc5" : dc5,
#    "dc6" : dc6,
#    "dc7" : dc7,
#    "dc8" : dc8,
#    "dc9" : dc9,
#    "dc10" : dc10,
#    "dc11" : dc11,
#    "dc12" : dc12,
#    "dc13" : dc13,
#    "dc14" : dc14,
#    "dc15" : dc15,
#    "dc16" : dc16,
#    "dc17" : dc17,
#    "dc18" : dc18,
#    "dc19" : dc19,
#    "dc20" : dc20,
#    "dc21" : dc21,
#    "dc22" : dc22,
#    "dc23" : dc23,
#    "dc24" : dc24,
#    "dc25" : dc25,
#    "dc26" : dc26,
#    "dc27" : dc27,
#    "dc28" : dc28,
#    "dc29" : dc29,
#    "dc30" : dc30,
#    "dc31" : dc31,
#    "dc32" : dc32,
#    "dc33" : dc33,
#    "dc34" : dc34,
#    "dc35" : dc35,
#    "dc36" : dc36,
#    "dc37" : dc37,
#    "dc38" : dc38,
#    "dc39" : dc39,
#    "dc40" : dc40,
#    "dc41" : dc41,
#    "dc42" : dc42,
#    "dc43" : dc43,
#    "dc44" : dc44,
#    "dc45" : dc45,
#    "dc46" : dc46,
#    "dc47" : dc47,
#    "dc48" : dc48,
#    "dc49" : dc49,
#    "dc50" : dc50,
#    "dc51" : dc51,
#    "dc52" : dc52,
#    "dc53" : dc53,
#    "dc54" : dc54,
#    "dc55" : dc55,
#    "dc56" : dc56,
#    "dc57" : dc57,
#    "dc58" : dc58,
#    "dc59" : dc59,
#    "dc60" : dc60,
#    "dc61" : dc61,
#    "dc62" : dc62,
#    "dc63" : dc63,
#    "dc64" : dc64,
#    "dc65" : dc65,
#    "dc66" : dc66,
    "dc67" : dc67,
    "dc68" : dc68,
    "dc69" : dc69,
    "dc70" : dc70,
}'''