# MIT-GSL Uruguay 

## January 2020

-----

# Lesson - 2: Introduction to ML

# StumbleUpon - Hands On Exercise

**Overview**

1. The following hands on exercise focuses on the application of techniques discussed during the lecture series and the hands on programming session.
2. The exercise aims to bolster the understanding of use of algorithms for model building

Instructions
----------------------


1. We have the following datasets created:
    1. TFIDF
    2. TFIDF - with important features selected
    3. TF (Count)
    4. PCA on TFIDF
    5. Base features (can be combined with any of the four above)
2. We have the following algorithms to choose from:
    1. GBM
    2. RF
    3. ERT
    4. SGD
    5. XGBoost (we will discuss this in detail)
3. The task is to use pick a dataset and an algorithm, cross-validate it, and generate predictions for out-of-fold and test data
4. These predictions will be included in the ensemble with the remaining models to generate the final probabilities

Points to note:
-----------------------

1. Don't pick a data-algo combination already used
2. The ultimate objective is to build a better ensemble and **not** a strong individual model
3. You can choose any other algorithm from outside the five mentioned above
4. We are going to test the results on the separate hold out 25% sample 
5. There is no right answer to this question
6. Your answer will depend on how you twist and turn the **given** data 
7. We will collectively discuss why we chose a particular algorithm and critically appraise each other's responses

In [1]:
# Collective code till now
# ==============================================
# 0. Module imports
# ==============================================

# working directory
import os
os.chdir("/pool001/madhavk/gsl-uruguay/W-01-IntroML/")

# data manipulation
import pandas as pd
import numpy as np
import scipy.stats as st

# plots
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import pylab as pl

# classification algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

# dimension reduction
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

# cross-validation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn import model_selection

# parallel processing
from joblib import Parallel, delayed  
import multiprocessing

# model evaluation
from sklearn.metrics import roc_auc_score
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.ensemble.partial_dependence import partial_dependence

# text mining
import re
from nltk import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings("ignore")



# ==============================================
# 1. Data import and explore
# ==============================================

train = pd.read_table("stumble-data/stumbleupon/train.tsv", sep= "\t")
train = train.replace("?", np.nan)
train[["alchemy_category_score", "is_news", "news_front_page"]] = train[["alchemy_category_score",
                                                                         "is_news", "news_front_page"]].astype(float)

# ==============================================
# 3. Prep for modeling
# ==============================================

# =============================
# Train-val-test split
# =============================

train, val = train_test_split(train, test_size= 0.5, train_size= 0.5, random_state= 234)
val, test = train_test_split(val, test_size= 0.2, train_size= 0.3, random_state= 675)
print("Train data size: " + str(train.shape))
print("Validation data size: " + str(val.shape))
print("Test data size: " + str(test.shape))

# ==============================================
# 4. Explore
# ==============================================


# ==========================================
# 5. Feature engineering
# ==========================================

# =============================
# 5.1 Variables with missing 
#     values
# =============================

train["alchemy_category_score"] = train["alchemy_category_score"].fillna(np.mean(train["alchemy_category_score"]))
val["alchemy_category_score"] = val["alchemy_category_score"].fillna(np.mean(train["alchemy_category_score"]))
test["alchemy_category_score"] = test["alchemy_category_score"].fillna(np.mean(train["alchemy_category_score"]))
train["is_news"] = train["is_news"].fillna(0)
train["news_front_page"] = train["news_front_page"].fillna(2)

# =============================
# 5.2.1 Categorical variables
# =============================

# impute missing values as separate category
train["alchemy_category"] = train["alchemy_category"].fillna("_M")
val["alchemy_category"] = val["alchemy_category"].fillna("_M")
test["alchemy_category"] = test["alchemy_category"].fillna("_M")

# dummy variables for all categories
alch_train = pd.get_dummies(train["alchemy_category"], prefix= "al_cat")
alch_val = pd.get_dummies(val["alchemy_category"], prefix= "al_cat")
alch_test = pd.get_dummies(test["alchemy_category"], prefix= "al_cat")
train = train.join(alch_train)
val = val.join(alch_val)
test= test.join(alch_test)


# =============================
# 5.2.2 Categorical variables
# =============================

# Function to create n cross validation folds
def createFolds(df, nfolds):
    rows = df.shape[0]
    folds = list(range(0, nfolds))*int(np.ceil(float(rows)/nfolds))
    folds = folds[0:rows]
    np.random.shuffle(folds)
    folds = pd.Series(folds)
    return folds


# cross-fold category average
def cat_avg_cv(df, target, var, var_out, idcol, nfolds= 4, r1= 0.6, r2= 0.4):
    df = df[[idcol, target, var]]
    df_out = pd.DataFrame({idcol:[], var_out: []})
    folds = createFolds(df= df, nfolds= nfolds)
    for f in range(0, nfolds):
        print("Fold", f+1, "of", nfolds)
        tr = df.ix[folds.values == f, [idcol, var]]
        va = df.ix[folds.values != f, [var, target]]
        fold_mean = va[target].mean()
        va = va.groupby(var).agg({target: np.mean})
        va = pd.DataFrame({var: va.index, var_out: va[target]})
        va[var_out] = r1*va[var_out] + r2*fold_mean
        tr = pd.merge(tr, va, on= var, how= "left")
        tr = tr[[idcol, var_out]]
        df_out = df_out.append(tr)

    df_out = df_out.fillna(df[target].mean())
    return df_out

# for training data
alch_cat_cv = cat_avg_cv(train, "label", "alchemy_category", "alch_avg", 
                         idcol= "urlid", nfolds= 4, r1= 0.6, r2= 0.4)
# use direct means for validation and testing data
alch_avg = train.groupby("alchemy_category").label.mean()
alch_avg = pd.DataFrame({"alchemy_category": alch_avg.index,
                         "alch_avg": alch_avg.values})
# merge with train and val
train = pd.merge(train, alch_cat_cv, on= "urlid", how= "left")
val = pd.merge(val, alch_avg, on= "alchemy_category", how= "left")
test = pd.merge(test, alch_avg, on= "alchemy_category", how= "left")

  from numpy.core.umath_tests import inner1d


Train data size: (3697, 27)
Validation data size: (1109, 27)
Test data size: (740, 27)
Fold 1 of 4
Fold 2 of 4
Fold 3 of 4
Fold 4 of 4


In [2]:
# =============================
# 5.3.1 Text features
# =============================

# TF - COUNT
count_dtm = CountVectorizer(min_df= 10,  max_features= 10000, strip_accents= 'unicode',
                            analyzer= "word", token_pattern= r"\w{1,}", ngram_range=(1, 2), 
                            binary= True)
count_dtm.fit(train["boilerplate"])
train_cnt_dtm = count_dtm.transform(train["boilerplate"])
val_cnt_dtm = count_dtm.transform(val["boilerplate"])
test_cnt_dtm = count_dtm.transform(test["boilerplate"])

# TFIDF
idf_dtm = TfidfVectorizer(min_df= 10,  max_features= None, strip_accents= "unicode",
                          analyzer= "word", token_pattern= r"\w{1,}", ngram_range=(1, 2), 
                          use_idf= 1, smooth_idf= 1, sublinear_tf= 1)
idf_dtm.fit(train["boilerplate"])
train_idf_dtm = idf_dtm.transform(train["boilerplate"])
val_idf_dtm = idf_dtm.transform(val["boilerplate"])
test_idf_dtm = idf_dtm.transform(test["boilerplate"])

In [3]:
# =============================
# 5.4 Feature selection
# =============================

def textFeatureSelect(dtm, target, model, tf_object, nfolds= 4, nreps= 2):
    feat_imp = pd.DataFrame({"feat":[], "imp": []})
    for r in range(0, nreps):
        print("Run", r+1, "of", nreps)
        folds = createFolds(df= dtm, nfolds= nfolds)
        for f in range(0, nfolds):
            print("Fold", f+1, "of", nfolds)
            tr = np.where(folds.values != f)[0]
            va = np.where(folds.values == f)[0]
            model.fit(dtm[tr,:], target[tr])
            imp = pd.DataFrame({"feat": tf_object.get_feature_names(), "imp": model.coef_[0,:]})
            imp["imp"] = imp["imp"].abs()
            feat_imp = feat_imp.append(imp)

    feat_imp = feat_imp.groupby("feat")
    feat_imp = feat_imp.agg({"imp": np.mean})
    feat_imp = pd.DataFrame({"feat": feat_imp.index, "imp": feat_imp.imp})
    feat_imp = feat_imp.sort_values(by = ["imp"], ascending= False)
    
    return feat_imp

# Select best features from text data using logistic regression
model = LogisticRegression()
imp_text_feat = textFeatureSelect(train_idf_dtm, target= train["label"].values, model= model, 
                                  tf_object= idf_dtm, nfolds= 4, nreps= 2)

# keep the top 1000 ones
text_feat = imp_text_feat.ix[0:1000, :]
idf_dtm_words = pd.DataFrame({"feat": idf_dtm.get_feature_names()})
idf_dtm_words = pd.merge(idf_dtm_words, text_feat, on= "feat", how= "inner", left_index= False, right_index= True)

# subset dtm with important features only
train_idf_dtm_sub = train_idf_dtm[:, idf_dtm_words.index]
val_idf_dtm_sub = val_idf_dtm[:, idf_dtm_words.index]
test_idf_dtm_sub = test_idf_dtm[:, idf_dtm_words.index]

Run 1 of 2
Fold 1 of 4
Fold 2 of 4
Fold 3 of 4
Fold 4 of 4
Run 2 of 2
Fold 1 of 4
Fold 2 of 4
Fold 3 of 4
Fold 4 of 4


In [4]:
# =============================
# 5.5 Dimension reduction -- PCA
# =============================

# fit with 100 components
rpca = PCA(n_components = 100, random_state = 8795, svd_solver = "randomized")
rpca.fit(train_idf_dtm.toarray())

# tranform dtm
train_idf_dtm_pca = rpca.transform(train_idf_dtm.toarray())
val_idf_dtm_pca = rpca.transform(val_idf_dtm.toarray())
test_idf_dtm_pca = rpca.transform(test_idf_dtm.toarray())

In [5]:
# ==========================================
# 6. Cross-validation
# ==========================================


# =============================
# 6.1 CV function - 1 
# =============================

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

def cross_val_model(train, target, model, nfolds= 4, classify= True):
        folds = KFold(n_splits = nfolds, shuffle= True, random_state = np.random.randint(1e4))
        if classify:
            preds = [np.array([model.fit(train[train_indices], 
                                         target[train_indices]).predict_proba(train[test_indices])[:,1], 
                               target[test_indices], test_indices])
                     for train_indices, test_indices in folds.split(train)]
        else:
            preds = [np.array([model.fit(train[train_indices], target[train_indices]).predict(train[test_indices]), 
                               target[test_indices], test_indices])
                     for train_indices, test_indices in folds.split(train)]
        
        preds = [preds[x].transpose() for x in range(len(preds))]
        preds = np.concatenate(preds, axis= 0)
        preds = pd.DataFrame(preds, columns= ["pred", "target", "index"])
        
        return preds

    
# serial version
# nreps = 4
# preds = [cross_val_model(tr_dtm, target, model= model, nfolds= 10, classify= True) for r in range(nreps)]

def cross_val_model_nrep(train, target, model, nfolds, nreps= 4, classify= True,
                         num_cores= 2, parallel= False, verbose= 0):
    if parallel:
        preds = Parallel(n_jobs= num_cores, 
                         verbose= verbose)(delayed(cross_val_model)(train, 
                                                                    target, model, nfolds= 10,
                                                                    classify= True) for r in range(nreps))
    else:
        preds = [cross_val_model(train, target, model= model, nfolds= 10, classify= True) for r in range(nreps)]
    preds = pd.DataFrame(np.concatenate(preds, axis= 0), columns= ["pred", "target", "index"])
    preds = preds.groupby(["index", "target"]).agg({"pred": np.mean})
    preds = preds.reset_index().sort_values(by= ["index"])
    return preds


# =============================
# 6.3 CV function - 2
# =============================

def cvModel(train, test, target, feat, model, idcol, nfolds= 2, nreps= 2, classify= True):
    ''' Train a model using k-fold cross validation
        and return cross-validated predictions on 
        training and test data sets
    '''
    val_pred = pd.DataFrame({idcol: [], 
                             'target': [],
                            'repeat':[], 
                            'fold': [], 
                            'pred': []})
    test_pred = pd.DataFrame(test[idcol])
    
    for r in range(0, nreps):
        folds = createFolds(train, nfolds)
        print("Run", r+1, "of", nreps)
        for f in range(0, nfolds):
            print("Fold", f+1, "of", nfolds)
            tr = folds.values != f
            va = folds.values == f
            model.fit(train.ix[tr, feat], target[tr])
            scored = pd.DataFrame({idcol: train.ix[va, idcol], 'target': target[va],
                                   'repeat': r, 'fold': f})
            tmp = 'pred' + str(r) + str(f)                    
            if classify:
                scored["pred"] = model.predict_proba(train.ix[va, feat])[:,1]
                test_pred[tmp] = model.predict_proba(test[feat])[:,1]
            else:
                scored["pred"] = model.predict(train.ix[va, feat])
                test_pred[tmp] = model.predict(test[feat])
            val_pred = val_pred.append(scored)
            
    val_pred = val_pred.groupby(idcol)
    val_pred = val_pred.agg({'target': np.mean,
                             'pred': np.mean})
    val_pred = val_pred.reset_index()
    val_pred = val_pred.sort_values(by= [idcol])
    test_pred["pred"] = test_pred.ix[:,1:(test_pred.shape[1]+1)].mean(axis= 1)
    test_pred = test_pred[[idcol, "pred"]]
    return val_pred, test_pred


# merge pca components with main
train_idf_dtm_pca = pd.DataFrame(train_idf_dtm_pca)
val_idf_dtm_pca = pd.DataFrame(val_idf_dtm_pca)
test_idf_dtm_pca = pd.DataFrame(test_idf_dtm_pca)

# give string names to columns
train_idf_dtm_pca.columns = ["pca" + str(i) for i in train_idf_dtm_pca.columns]
val_idf_dtm_pca.columns = ["pca" + str(i) for i in val_idf_dtm_pca.columns]
test_idf_dtm_pca.columns = ["pca" + str(i) for i in test_idf_dtm_pca.columns]

# generate index to merge on
train_idf_dtm_pca.index = train.index
val_idf_dtm_pca.index = val.index
test_idf_dtm_pca.index = test.index

# join
train_pca = train.join(train_idf_dtm_pca)
val_pca = val.join(val_idf_dtm_pca)
test_pca = test.join(test_idf_dtm_pca)

In [6]:
# ==========================================
# 6.x Cross-validated models
# ==========================================

# Logistic regression
logReg7_cv = LogisticRegression()
logReg7_train = cross_val_model_nrep(train= train_idf_dtm, target= train.label.values, model= logReg7_cv, 
                                     nfolds= 10, nreps= 4, classify= True, num_cores= 4, parallel= True,
                                     verbose= 5)
logReg7_val= pd.Series(logReg7_cv.fit(train_idf_dtm, train.label.values).predict_proba(test_idf_dtm)[:,1])


# gbm
feat = list(train_idf_dtm_pca.columns.values)
gbm4 = GradientBoostingClassifier(loss = "deviance", n_estimators= 100,
                                  max_depth= 2, min_samples_split= 10, min_samples_leaf= 10,
                                  subsample= 0.75, max_features= None, verbose= 0)
gbm4_train, gbm4_val = cvModel(train_pca, test_pca, target= train_pca["label"], model= gbm4, feat= feat, 
                               idcol= "urlid", nfolds= 4, nreps= 2, classify= True)


# random forests
feat = ["avglinksize", "commonlinkratio_1", "commonlinkratio_2", "commonlinkratio_3", "commonlinkratio_4", 
        "compression_ratio", "embed_ratio", "framebased", "frameTagRatio", "hasDomainLink", "html_ratio",
        "image_ratio", "lengthyLinkDomain", "linkwordscore", "non_markup_alphanum_characters", "numberOfLinks", 
        "numwords_in_url", "parametrizedLinkRatio","spelling_errors_ratio", "alchemy_category_score", 
        "alch_avg"]
feat.extend(list(train_idf_dtm_pca.columns.values))

rf2 = RandomForestClassifier(n_estimators= 100, min_samples_split= 5, random_state= 9876,
                             max_features= 15, verbose= 0)
rf2_train, rf2_val = cvModel(train_pca, test_pca, target= train_pca["label"], model= rf2, feat= feat, 
                               idcol= "urlid", nfolds= 4, nreps= 2, classify= True)

# ERT
ert1_cv = ExtraTreesClassifier(n_estimators= 25, min_samples_split= 10, random_state= 134,
                               max_features= 55, min_samples_leaf= 5, verbose= 0)

ert1_train = cross_val_model_nrep(train= train_idf_dtm_sub, target= train.label.values, model= ert1_cv, 
                                     nfolds= 3, nreps= 8, classify= True, num_cores= 4, parallel= True,
                                     verbose= 5)
# fit on entire training data to check on validation data
ert1_val= pd.Series(ert1_cv.fit(train_idf_dtm_sub, train.label.values).predict_proba(test_idf_dtm_sub)[:,1])

[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    2.4s remaining:    2.4s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    2.4s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    2.4s finished


Run 1 of 2
Fold 1 of 4
Fold 2 of 4
Fold 3 of 4
Fold 4 of 4
Run 2 of 2
Fold 1 of 4
Fold 2 of 4
Fold 3 of 4
Fold 4 of 4
Run 1 of 2
Fold 1 of 4
Fold 2 of 4
Fold 3 of 4
Fold 4 of 4
Run 2 of 2
Fold 1 of 4
Fold 2 of 4
Fold 3 of 4
Fold 4 of 4


[Parallel(n_jobs=4)]: Done   3 out of   8 | elapsed:    5.1s remaining:    8.5s
[Parallel(n_jobs=4)]: Done   5 out of   8 | elapsed:   10.1s remaining:    6.1s
[Parallel(n_jobs=4)]: Done   8 out of   8 | elapsed:   10.1s finished


In [7]:
# Current Scores
# CV scores
print("Logistic cv score:" + str(np.round(roc_auc_score(train["label"], logReg7_train.pred.values), 5)))
print("GBM cv score:" + str(np.round(roc_auc_score(gbm4_train.target.values, gbm4_train.pred.values), 5)))
print("RF cv score: " + str(np.round(roc_auc_score(rf2_train.target.values, rf2_train.pred.values), 5)))
print("ERT cv score: " + str(np.round(roc_auc_score(train["label"], ert1_train.pred.values), 5)))

print(" --- ")

# Test scores
print("Logistic val score:"  + str(np.round(roc_auc_score(test["label"], logReg7_val.values), 5)))
print("GBM val score:" + str(np.round(roc_auc_score(test_pca["label"], gbm4_val.pred.values), 5)))
print("RF val score: " + str(np.round(roc_auc_score(test_pca["label"], rf2_val.pred.values), 5)))
print("ERT val score: "  + str(np.round(roc_auc_score(test["label"], ert1_val.values), 5)))

Logistic cv score:0.87528
GBM cv score:0.86904
RF cv score: 0.86915
ERT cv score: 0.87807
 --- 
Logistic val score:0.88871
GBM val score:0.89272
RF val score: 0.89157
ERT val score: 0.88481


In [8]:
# Ensemble
train["sort_col"] = range(0, train.shape[0])
gbm4_train = pd.merge(gbm4_train, train[["urlid", "sort_col"]], on= "urlid", how= 'left').sort_values(by= ["sort_col"])
rf2_train = pd.merge(rf2_train, train[["urlid", "sort_col"]], on= "urlid", how= 'left').sort_values(by= ["sort_col"])

In [9]:
# Ensemble
train_ens = pd.DataFrame({"urlid": gbm4_train.urlid.values, "label": gbm4_train.target.values, 
                          "logReg7": logReg7_train.pred.values,
                          "gbm4": gbm4_train.pred.values, "rf2": rf2_train.pred.values,
                         "ert1": ert1_train.pred.values})
val_ens = pd.DataFrame({"urlid": gbm4_val.urlid.values, "gbm4": gbm4_val.pred.values,
                        "logReg7": logReg7_val.values, "rf2": rf2_val.pred.values,
                       "ert1": ert1_val.values})
val_ens = pd.merge(val_ens, test[["urlid", "label"]], on= "urlid")


# logistic regression
logReg8_ens = LogisticRegression()
feat = ["gbm4", "logReg7", "rf2", "ert1"]
logReg8_ens_cv, logReg8_ens_val = cvModel(train_ens, val_ens, target= train_ens["label"], model= logReg8_ens, 
                                          feat= feat, idcol= "urlid", nfolds= 4, nreps= 4, classify= True)

# auc score
print(" --- ")
print("Ensemble cv score: " + str(np.round(roc_auc_score(logReg8_ens_cv.target.values, logReg8_ens_cv.pred.values), 5)))
print("Ensemble val score: " + str(np.round(roc_auc_score(val_ens["label"], logReg8_ens_val.pred.values), 5)))

Run 1 of 4
Fold 1 of 4
Fold 2 of 4
Fold 3 of 4
Fold 4 of 4
Run 2 of 4
Fold 1 of 4
Fold 2 of 4
Fold 3 of 4
Fold 4 of 4
Run 3 of 4
Fold 1 of 4
Fold 2 of 4
Fold 3 of 4
Fold 4 of 4
Run 4 of 4
Fold 1 of 4
Fold 2 of 4
Fold 3 of 4
Fold 4 of 4
 --- 
Ensemble cv score: 0.88024
Ensemble val score: 0.89459


In [10]:
simple_avg_val = np.mean([val_ens.gbm4.values, val_ens.rf2.values, val_ens.logReg7.values,
                         val_ens.ert1.values], axis= 0)
print("Simple avg val: " + str(np.round(roc_auc_score(val_ens["label"], simple_avg_val), 5)))

Simple avg val: 0.89574


In [11]:
# Exercise

# sgd
sgd1_cv = SGDClassifier(loss= "log", penalty= "l2", n_iter= 30, random_state= 34)
sgd1_cv_train = cross_val_model_nrep(train= train_idf_dtm_sub, target= train.label.values, model= sgd1_cv, 
                                     nfolds= 10, nreps= 4, classify= True, num_cores= 4, parallel= True,
                                     verbose= 5)
sgd1_cv_val= pd.Series(logReg7_cv.fit(train_idf_dtm_sub, train.label.values).predict_proba(test_idf_dtm_sub)[:,1])

print("SGD cv score:" + str(np.round(roc_auc_score(train["label"], sgd1_cv_train.pred.values), 5)))
print("SGD val score:"  + str(np.round(roc_auc_score(test["label"], sgd1_cv_val.values), 5)))

SGD cv score:0.90495
SGD val score:0.8864


[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    0.4s remaining:    0.4s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.4s finished


In [12]:
# Ensemble
train_ens = pd.DataFrame({"urlid": gbm4_train.urlid.values, "label": gbm4_train.target.values, 
                          "logReg7": logReg7_train.pred.values,
                          "gbm4": gbm4_train.pred.values, "rf2": rf2_train.pred.values,
                         "ert1": ert1_train.pred.values, "x1":sgd1_cv_train.pred.values})
val_ens = pd.DataFrame({"urlid": gbm4_val.urlid.values, "gbm4": gbm4_val.pred.values,
                        "logReg7": logReg7_val.values, "rf2": rf2_val.pred.values,
                       "ert1": ert1_val.values, "x1":sgd1_cv_val.values})
val_ens = pd.merge(val_ens, test[["urlid", "label"]], on= "urlid")


# logistic regression
logReg8_ens = LogisticRegression()
feat = ["gbm4", "logReg7", "rf2", "ert1", "x1"]
logReg8_ens_cv, logReg8_ens_val = cvModel(train_ens, val_ens, target= train_ens["label"], model= logReg8_ens, 
                                          feat= feat, idcol= "urlid", nfolds= 4, nreps= 4, classify= True)

# auc score
print(" --- ")
print("Ensemble cv score: " + str(np.round(roc_auc_score(logReg8_ens_cv.target.values, logReg8_ens_cv.pred.values), 5)))
print("Ensemble val score: " + str(np.round(roc_auc_score(val_ens["label"], logReg8_ens_val.pred.values), 5)))

Run 1 of 4
Fold 1 of 4
Fold 2 of 4
Fold 3 of 4
Fold 4 of 4
Run 2 of 4
Fold 1 of 4
Fold 2 of 4
Fold 3 of 4
Fold 4 of 4
Run 3 of 4
Fold 1 of 4
Fold 2 of 4
Fold 3 of 4
Fold 4 of 4
Run 4 of 4
Fold 1 of 4
Fold 2 of 4
Fold 3 of 4
Fold 4 of 4
 --- 
Ensemble cv score: 0.91225
Ensemble val score: 0.88463


In [13]:
simple_avg_val = np.mean([val_ens.gbm4.values, val_ens.rf2.values, val_ens.logReg7.values,
                         val_ens.ert1.values, val_ens.x1.values], axis= 0)
print("Simple avg val: " + str(np.round(roc_auc_score(val_ens["label"], simple_avg_val), 5)))

Simple avg val: 0.89507
