In [1]:
# Imports
import pandas as pd
import numpy as np
from IPython.display import HTML, display
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import time
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.linear_model import LassoCV, LassoLarsCV, RidgeCV
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor, DMatrix

# Visualization options
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option('display.max_columns', None)
%matplotlib inline
HTML("<style>.container { width: 90% !important; }</style>")
warnings.filterwarnings("ignore")

# Global variables
SEED = 420
N_JOBS = 18
FOLDS = 10



In [2]:
# Utilities functions
def encodeLetters(charcode) : 
    code = 0
    length = len(str(charcode))
    for i in range(length) :
        # example : AC = 1 * 26 ^ 1 + 3 * 26 ^ 0
        code += (ord(str(charcode)[i]) - ord("a") + 1) * (26 ** (length - i - 1)) - 1
    return(code)

def findDuplicateVars(df) :
    cols = df.columns
    removed_cols = []
    for i in range(len(cols) - 1) :
        v = df[cols[i]].values
        for j in range(i + 1, len(cols)):
            if np.array_equal(v, df[cols[j]].values) :
                #print("Dups : " + str(cols[i]) + " and " + str(cols[j]))
                removed_cols.append(cols[j])
    return(removed_cols)


In [3]:
def config_dataset(train, test, y, conf, conf_name, verbose = False) :
    print("***** CONFIG : " + conf_name + "\n" + str(conf))
    ##################################
    if (conf["encode_cats"] == "LE") :
        # Encode cat variables with LabelEncoder
        for c in train.drop("y", axis = 1).columns:
            if train[c].dtype == "object" :
                lbl = LabelEncoder() 
                lbl.fit(list(train[c].values) + list(test[c].values)) 
                train[c] = lbl.transform(list(train[c].values))
                test[c] = lbl.transform(list(test[c].values))
    elif (conf["encode_cats"] == "LE+") :
        # Encode cat variables with a custom LabelEncoder using the right letter order (i.e. "aa" is 26, not 2)
        for c in train.drop("y", axis = 1).columns:
            if train[c].dtype == "object" :
                lbl = LabelEncoder() 
                lbl.fit(list(train[c].values) + list(test[c].values)) 
                train[c] = train[c].apply(encodeLetters)
                test[c] = test[c].apply(encodeLetters)
    elif (conf["encode_cats"] == "dummies") :
        # Encode cat variables as dummy variables
        alldata = pd.concat([train.drop("y", axis = 1), test], axis = 0)
        alldata = pd.get_dummies(alldata).astype(int)
        train = alldata.iloc[:train.shape[0], :]
        test = alldata.iloc[train.shape[0]: , :]  
        train["y"] = y
    elif (conf["encode_cats"] == "drop") :
        # Drop cat variables
        binary_vars_train = list(set(train.columns.drop(["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"])))
        binary_vars_test = list(set(test.columns.drop(["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"])))
        train = train[binary_vars_train]
        test = test[binary_vars_test]     
        
    ##################################
    if (conf["constant_vars"] == "without") :
        # Remove variables constant in the train set
        constant_vars = []
        for col in train.drop("y", axis = 1).columns:
            if(train[col].nunique() == 1):
                constant_vars.append(col)
        if (verbose) :
            print("Removing constant variables : " + str(constant_vars))            
        train.drop(constant_vars, axis = 1, inplace = True)
        test.drop(constant_vars, axis = 1, inplace = True)
        
    ##################################
    if (conf["dupli_vars"] == "remove_train") :
        # Remove variables duplicate in train
        old_nb_vars = train.shape[1]
        removed_vars = findDuplicateVars(train)
        removed_vars = list(set(removed_vars))
        train.drop(removed_vars, axis = 1, inplace = True)
        test.drop(removed_vars, axis = 1, inplace = True)
        if (verbose) :
            print("Removed " + str(old_nb_vars - train.shape[1]) + " duplicate variables")
            print(sorted(removed_vars))
    elif (conf["dupli_vars"] == "remove_train+test") :
        # Remove variables duplicate in train+test
        old_nb_vars = train.shape[1]
        alldata = pd.concat([train.drop("y", axis = 1), test], axis = 0)
        removed_vars = findDuplicateVars(alldata)
        removed_vars = list(set(removed_vars))
        train.drop(removed_vars, axis = 1, inplace = True)
        test.drop(removed_vars, axis = 1, inplace = True)
        if (verbose) :
            print("Removed " + str(old_nb_vars - train.shape[1]) + " duplicate variables")
            print(sorted(removed_vars))
        
    ##################################
    if (conf["binary_counts"] == "with") :
        # Add columns with count of 1s for each binary col
        if ("X0" in train.columns) :
            binary_vars = list(set(train.columns.drop(["ID", "y", "X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"])))
        else : 
            binary_vars = list(set(train.columns.drop(["ID", "y"])))
        train["bin_ones"] = (train[binary_vars] == 1).astype(int).sum(axis = 1)
        test["bin_ones"] = (test[binary_vars] == 1).astype(int).sum(axis = 1)         
        
    ##################################
    vars_to_reduce = list(set(train.columns.drop(["y"])))
    
    if (conf["pca"] > 0) :
        pca = PCA(n_components = conf["pca"], random_state = SEED)
        pca_train = pca.fit_transform(train[vars_to_reduce])
        pca_test = pca.transform(test)
    if (conf["ica"] > 0) :
        ica = FastICA(n_components = conf["ica"], random_state = SEED)
        ica_train = ica.fit_transform(train[vars_to_reduce])
        ica_test = ica.transform(test)
    if (conf["tsvd"] > 0) :
        tsvd = TruncatedSVD(n_components = conf["tsvd"], random_state = SEED)
        tsvd_train = tsvd.fit_transform(train[vars_to_reduce])
        tsvd_test = tsvd.transform(test)
    if (conf["grp"] > 0) :
        grp = GaussianRandomProjection(n_components = conf["grp"], random_state = SEED)
        grp_train = grp.fit_transform(train[vars_to_reduce])
        grp_test = grp.transform(test)
    if (conf["srp"] > 0) :
        srp = SparseRandomProjection(n_components = conf["srp"], random_state = SEED)
        srp_train = srp.fit_transform(train[vars_to_reduce])
        srp_test = srp.transform(test)

    if (conf["pca"] > 0) :
        for i in range(1, conf["pca"] + 1) :
            train["pca_" + str(i)] = pca_train[:, i - 1]
            test["pca_" + str(i)] = pca_test[:, i - 1]
    if (conf["ica"] > 0) :
        for i in range(1, conf["ica"] + 1) :
            train["ica_" + str(i)] = ica_train[:, i - 1]
            test["ica_" + str(i)] = ica_test[:, i - 1]
    if (conf["tsvd"] > 0) :
        for i in range(1, conf["tsvd"] + 1) :
            train["tsvd_" + str(i)] = tsvd_train[:, i - 1]
            test["tsvd_" + str(i)] = tsvd_test[:, i - 1]
    if (conf["grp"] > 0) :
        for i in range(1, conf["grp"] + 1) :
            train["grp_" + str(i)] = grp_train[:, i - 1]
            test["grp_" + str(i)] = grp_test[:, i - 1]
    if (conf["srp"] > 0) :
        for i in range(1, conf["srp"] + 1) :
            train["srp_" + str(i)] = srp_train[:, i - 1]
            test["srp_" + str(i)] = srp_test[:, i - 1]

    train.sort_index(axis = 1, inplace = True)
    test.sort_index(axis = 1, inplace = True)
    print(train.shape)
    return (train, test)

# TODO : scalers?
# TODO : means of X0-X8?
# TODO : log transform? https://www.kaggle.com/eikedehling/stack-of-svm-elasticnet-xgboost-rf-0-55
# https://www.kaggle.com/qqgeogor/some-feature-engineering


In [4]:
def get_cv_score(model, X_train, X_train_y) :
    oof_preds = pd.DataFrame()
    oof_targets = pd.DataFrame()
    kf = KFold(n_splits = FOLDS, shuffle = True, random_state = SEED)
    for i, (train_index, test_index) in enumerate(kf.split(X_train)):
        #print("FOLD " + str(i + 1))
        X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_fold, y_test_fold = X_train_y.iloc[train_index], X_train_y.iloc[test_index]
        
        # Fit and predict
        model.fit(X_train_fold, y_train_fold.y)
        preds_X_test_fold = model.predict(X_test_fold)
        
        # Assemble OOF predictions and targets
        oof_preds = pd.concat([pd.Series(oof_preds), pd.Series(preds_X_test_fold)], axis = 0)
        oof_targets = pd.concat([pd.Series(oof_targets), pd.Series(y_test_fold.y)], axis = 0)
        
    # Compute error on concatenated OOF predictions
    cv_score = r2_score(oof_targets, oof_preds)
    print("Global OOF r2_score : " + str(cv_score))
                    
    return(cv_score)

In [5]:
def run_generator(models, data_configs, init_train, init_test) :
    # Iterate on each available model definition
    results = pd.DataFrame()
    for model_name, model  in models.items() :
        # Iterate on each data configuration :
        for conf_name, conf in data_configs.items() :
            print("***** MODEL : " + model_name + " *****")
            start = time.time()

            # Configure dataset
            train = init_train.copy()
            test = init_test.copy()
            y = pd.DataFrame({"y": train.y})
            train, test = config_dataset(train, test, y, conf, conf_name)
            train.drop("y", axis = 1, inplace = True)

            # Get CV score
            #display(train.head(2))
            cv_score = get_cv_score(model, train, y)

            # Fit, predict
            model.fit(train, y)
            preds_test = model.predict(test)

            # Handle format problems
            if isinstance(model, (RidgeCV, KNeighborsRegressor)) :
                preds_test = pd.DataFrame(preds_test).iloc[:, 0].values

            # Save preds
            file_name = "preds/" + model_name + "_" + conf_name + "_preds_test.csv"
            pd.DataFrame({"ID": init_test.ID.values, "y": preds_test}).to_csv(file_name, index = False)

            elapsed = time.time() - start
            print("Elapsed : " + str(elapsed))

            # Store results
            cur_result = pd.DataFrame(columns = ["name", "conf", "cv", "time"])
            cur_result.loc[0] = [model_name, conf_name, cv_score, elapsed] 
            results = pd.concat([results, cur_result], axis = 0)

    return(results)


In [6]:
models = {
    "la" : LassoCV(eps = 0.0001, 
                   n_alphas = 100, 
                   max_iter = 10000, 
                   tol = 0.0001,                     
                   normalize = True, 
                   precompute = True, 
                   random_state = SEED,
                   n_jobs = N_JOBS),
    "ll" : LassoLarsCV(max_n_alphas = 1000, 
                       max_iter = 10000,
                       normalize = True, 
                       precompute = True, 
                       n_jobs = N_JOBS),
    "ri" : RidgeCV(alphas = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6],
                   normalize = True),
    "rf" : RandomForestRegressor(n_estimators = 200,
                                 criterion = "mse", 
                                 max_features = 0.75,
                                 max_depth = 8,
                                 min_samples_split = 5,
                                 min_samples_leaf = 2, 
                                 bootstrap = True, 
                                 n_jobs = N_JOBS, 
                                 random_state = SEED),
    "et" : ExtraTreesRegressor(n_estimators = 200,
                               criterion = "mse", 
                               max_features = 0.75,
                               max_depth = 8,
                               min_samples_split = 5,
                               min_samples_leaf = 2, 
                               bootstrap = True, 
                               n_jobs = N_JOBS, 
                               random_state = SEED), 
#    "kn" : KNeighborsRegressor(n_neighbors = 100, 
#                               weights = "distance", 
#                               p = 3, 
#                               n_jobs = N_JOBS), 
#    "gb" : GradientBoostingRegressor(loss = "ls", 
#                                     learning_rate = 0.03, 
#                                     n_estimators = 200, 
#                                     max_depth = 8,
#                                     criterion = "friedman_mse",
#                                     min_samples_split = 5,
#                                     min_samples_leaf = 2,
#                                     subsample = 0.75,
#                                     max_features = 0.75, 
#                                     random_state = SEED),
    "xg" : XGBRegressor(objective = "reg:linear", 
                        learning_rate = 0.03, 
                        n_estimators = 200, 
                        max_depth = 8,
                        min_child_weight = 2, 
                        subsample = 0.75, 
                        colsample_bytree = 0.75, 
                        colsample_bylevel = 0.75, 
                        nthread = N_JOBS,
                        seed = SEED)
}

In [7]:
'''
data_configs = {
    "encode_cats" : ["LE", "LE+", "dummies", "drop"],
    "constant_vars" : ["with", "without"],
    "dupli_vars" : ["with", "remove_train", "remove_train+test"],
    "binary_counts" : ["with", "without"],
    "pca" : [0, 3, 6, 9, 12, 15],
    "ica" : [0, 3, 6, 9, 12, 15],
    "tsvd" : [0, 3, 6, 9, 12, 15],
    "grp" : [0, 3, 6, 9, 12, 15],
    "srp" : [0, 3, 6, 9, 12, 15],
    }
'''

'\ndata_configs = {\n    "encode_cats" : ["LE", "LE+", "dummies", "drop"],\n    "constant_vars" : ["with", "without"],\n    "dupli_vars" : ["with", "remove_train", "remove_train+test"],\n    "binary_counts" : ["with", "without"],\n    "pca" : [0, 3, 6, 9, 12, 15],\n    "ica" : [0, 3, 6, 9, 12, 15],\n    "tsvd" : [0, 3, 6, 9, 12, 15],\n    "grp" : [0, 3, 6, 9, 12, 15],\n    "srp" : [0, 3, 6, 9, 12, 15],\n    }\n'

In [8]:
dc1 = {
    "encode_cats" : "LE+",
    "constant_vars" : "with",
    "dupli_vars" : "with",
    "binary_counts" : "without",
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc2 = {
    "encode_cats" : "LE",
    "constant_vars" : "with",
    "dupli_vars" : "with",
    "binary_counts" : "without",
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc3 = {
    "encode_cats" : "dummies",
    "constant_vars" : "with",
    "dupli_vars" : "with",
    "binary_counts" : "without",
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc4 = {
    "encode_cats" : "drop",
    "constant_vars" : "with",
    "dupli_vars" : "with",
    "binary_counts" : "without",
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc5 = {
    "encode_cats" : "LE+",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "without",
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc6 = {
    "encode_cats" : "LE+",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc7 = {
    "encode_cats" : "LE",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc8 = {
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc9 = {
    "encode_cats" : "drop",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "pca" : 0,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc10 = {
    "encode_cats" : "LE+",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "pca" : 5,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc11 = {
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "pca" : 5,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc12 = {
    "encode_cats" : "drop",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "pca" : 5,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc13 = {
    "encode_cats" : "LE+",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "pca" : 10,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc14 = {
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "pca" : 10,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc15 = {
    "encode_cats" : "drop",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "pca" : 10,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc16 = {
    "encode_cats" : "LE+",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "pca" : 15,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc17 = {
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "pca" : 15,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc18 = {
    "encode_cats" : "drop",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "pca" : 15,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc19 = {
    "encode_cats" : "LE+",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "pca" : 20,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc20 = {
    "encode_cats" : "dummies",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "pca" : 20,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

dc21 = {
    "encode_cats" : "drop",
    "constant_vars" : "without",
    "dupli_vars" : "without",
    "binary_counts" : "with",
    "pca" : 20,
    "ica" : 0,
    "tsvd" : 0,
    "grp" : 0,
    "srp" : 0,
}

data_configs = {
    "dc1" : dc1,
    "dc2" : dc2,
    "dc3" : dc3,
    "dc4" : dc4,
    "dc5" : dc5,
    "dc6" : dc6,
    "dc7" : dc7,
    "dc8" : dc8,
    "dc9" : dc9,
    "dc10" : dc10,
    "dc11" : dc11,
    "dc12" : dc12,
    "dc13" : dc13,
    "dc14" : dc14,
    "dc15" : dc15,
    "dc16" : dc16,
    "dc17" : dc17,
    "dc18" : dc18,
    "dc19" : dc19,
    "dc20" : dc20,
    "dc21" : dc21,
}

<b>Script</b>

In [9]:
# Get data
init_train = pd.read_csv("raw_data/train.csv")
init_test = pd.read_csv("raw_data/test.csv")

display(init_train.shape)
display(init_train.head(2))


(4209, 378)

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23,X24,X26,X27,X28,X29,X30,X31,X32,X33,X34,X35,X36,X37,X38,X39,X40,X41,X42,X43,X44,X45,X46,X47,X48,X49,X50,X51,X52,X53,X54,X55,X56,X57,X58,X59,X60,X61,X62,X63,X64,X65,X66,X67,X68,X69,X70,X71,X73,X74,X75,X76,X77,X78,X79,X80,X81,X82,X83,X84,X85,X86,X87,X88,X89,X90,X91,X92,X93,X94,X95,X96,X97,X98,X99,X100,X101,X102,X103,X104,X105,X106,X107,X108,X109,X110,X111,X112,X113,X114,X115,X116,X117,X118,X119,X120,X122,X123,X124,X125,X126,X127,X128,X129,X130,X131,X132,X133,X134,X135,X136,X137,X138,X139,X140,X141,X142,X143,X144,X145,X146,X147,X148,X150,X151,X152,X153,X154,X155,X156,X157,X158,X159,X160,X161,X162,X163,X164,X165,X166,X167,X168,X169,X170,X171,X172,X173,X174,X175,X176,X177,X178,X179,X180,X181,X182,X183,X184,X185,X186,X187,X189,X190,X191,X192,X194,X195,X196,X197,X198,X199,X200,X201,X202,X203,X204,X205,X206,X207,X208,X209,X210,X211,X212,X213,X214,X215,X216,X217,X218,X219,X220,X221,X222,X223,X224,X225,X226,X227,X228,X229,X230,X231,X232,X233,X234,X235,X236,X237,X238,X239,X240,X241,X242,X243,X244,X245,X246,X247,X248,X249,X250,X251,X252,X253,X254,X255,X256,X257,X258,X259,X260,X261,X262,X263,X264,X265,X266,X267,X268,X269,X270,X271,X272,X273,X274,X275,X276,X277,X278,X279,X280,X281,X282,X283,X284,X285,X286,X287,X288,X289,X290,X291,X292,X293,X294,X295,X296,X297,X298,X299,X300,X301,X302,X304,X305,X306,X307,X308,X309,X310,X311,X312,X313,X314,X315,X316,X317,X318,X319,X320,X321,X322,X323,X324,X325,X326,X327,X328,X329,X330,X331,X332,X333,X334,X335,X336,X337,X338,X339,X340,X341,X342,X343,X344,X345,X346,X347,X348,X349,X350,X351,X352,X353,X354,X355,X356,X357,X358,X359,X360,X361,X362,X363,X364,X365,X366,X367,X368,X369,X370,X371,X372,X373,X374,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [10]:
# Generate preds
results = run_generator(models, data_configs, init_train, init_test)

***** MODEL : kn *****
***** CONFIG : dc9
{'ica': 0, 'binary_counts': 'with', 'tsvd': 0, 'pca': 0, 'constant_vars': 'without', 'grp': 0, 'srp': 0, 'encode_cats': 'drop', 'dupli_vars': 'without'}
(4209, 359)
Global OOF r2_score : -0.0146135586259
Elapsed : 2.743401527404785
***** MODEL : kn *****
***** CONFIG : dc11
{'ica': 0, 'binary_counts': 'with', 'tsvd': 0, 'pca': 5, 'constant_vars': 'without', 'grp': 0, 'srp': 0, 'encode_cats': 'dummies', 'dupli_vars': 'without'}
(4209, 559)
Global OOF r2_score : -0.0127943933937
Elapsed : 4.450928211212158
***** MODEL : kn *****
***** CONFIG : dc2
{'ica': 0, 'binary_counts': 'without', 'tsvd': 0, 'pca': 0, 'constant_vars': 'with', 'grp': 0, 'srp': 0, 'encode_cats': 'LE', 'dupli_vars': 'with'}
(4209, 378)
Global OOF r2_score : 0.00178897350364
Elapsed : 2.525865077972412
***** MODEL : kn *****
***** CONFIG : dc1
{'ica': 0, 'binary_counts': 'without', 'tsvd': 0, 'pca': 0, 'constant_vars': 'with', 'grp': 0, 'srp': 0, 'encode_cats': 'LE+', 'dupli_var

In [11]:
# Show ordered results
results.sort_values(["cv"], ascending = False, inplace= True)
results.reset_index(drop = True, inplace = True)
display(results)

Unnamed: 0,name,conf,cv,time
0,rf,dc8,0.56730,12.75608
1,et,dc14,0.56703,20.27883
2,et,dc8,0.56675,11.36553
3,rf,dc3,0.56670,9.61692
4,et,dc11,0.56609,8.80150
5,et,dc3,0.56607,17.22338
6,rf,dc1,0.56598,8.12535
7,rf,dc5,0.56592,15.61269
8,rf,dc6,0.56572,8.12773
9,rf,dc2,0.56571,9.03722


In [12]:
'''
for cur_model in model_list :
    for cur_data_config in data_config_list :
        create cur_data(cur_data_config)
        get_cvscore(cur_model(cur_data))
        fit(cur_model(cur_data))
        predict(cur_model(cur_data))
        
optimize parameters for most promising models
'''

'\nfor cur_model in model_list :\n    for cur_data_config in data_config_list :\n        create cur_data(cur_data_config)\n        get_cvscore(cur_model(cur_data))\n        fit(cur_model(cur_data))\n        predict(cur_model(cur_data))\n        \noptimize parameters for most promising models\n'

In [13]:
'''import itertools
c = list(itertools.product(data_configs["constant_vars"], data_configs["encode_cats"]))
print(c)'''

'import itertools\nc = list(itertools.product(data_configs["constant_vars"], data_configs["encode_cats"]))\nprint(c)'