In [56]:
"""
OOF stack regime
Thanks for the great kernel from https://www.kaggle.com/hhstrand/oof-stacking-regime

Explanation of the OOF stack regime:
This is a method of stacking different models,
with the idea of "combine the decisions from multiple models to improve the overall performance".

The first step is similar to cross-validation, but the prediction for each fold is kept as an out-of-fold (OOF) prediction.
These predictions (from the different models) are then used as X_train, the true labels from the original
dataset as y_train, and the test-set predictions (from the different models) as X_test,
to predict the final results. The main reason for using OOF predictions is to avoid data leakage.


Changes:
Train a new LGB model on the OOF (out-of-fold) prediction files
1. Add one more strategy
First model: fit a new model first, then predict the final result
Second model: predict the final result with the model from each fold, then average

2. Tune hyperparameters for LGB
""" 

#oof stack regime 1

import pandas as pd
import numpy as np
import re
import lightgbm as lgb
import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning, module='sklearn')
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score


#######################
# FEATURE ENGINEERING #
#######################
"""
Main function
Input: pandas Series and a feature engineering function
Output: pandas Series
"""
def engineer_feature(series, func, normalize=True):
    """Apply one feature function to every element of a Series.

    Parameters
    ----------
    series : pandas.Series
        Raw values to derive the feature from.
    func : callable
        Maps a single element to a number. Its ``__name__`` becomes the
        name of the returned Series.
    normalize : bool, default True
        When true, the computed column is passed through ``z_normalize``.

    Returns
    -------
    pandas.Series
        One value per input element, named ``func.__name__`` and indexed
        like ``series``.
    """
    feature = series.apply(func)

    if normalize:
        # z_normalize is fed a 2-D column, hence the reshape round-trip.
        scaled = z_normalize(feature.values.reshape(-1, 1)).reshape(-1,)
        # Re-attach the original index. A bare pd.Series(scaled) would fall
        # back to 0..n-1, so the normalized and un-normalized paths would
        # return differently indexed results for a non-default index.
        feature = pd.Series(scaled, index=feature.index)
    feature.name = func.__name__
    return feature

"""
Engineer features
Input: pandas Series and a list of feature engineering functions
Output: pandas DataFrame
"""
def engineer_features(series, funclist, normalize=True):
    """Build a feature table by running several feature functions.

    Each function in ``funclist`` is handed to ``engineer_feature`` together
    with ``series`` and ``normalize``; the resulting Series becomes one
    column, keyed by the Series' name.

    Input: pandas Series and a list of feature engineering functions
    Output: pandas DataFrame (empty when ``funclist`` is empty)
    """
    table = pd.DataFrame()
    # Lazily produce the columns so each one is computed right before it
    # is inserted, in the order the functions were given.
    columns = (engineer_feature(series, fn, normalize) for fn in funclist)
    for column in columns:
        table[column.name] = column
    return table

"""
Normalizer
Input: NumPy array
Output: NumPy array
"""
# Single module-level scaler instance reused (and refitted) by z_normalize.
scaler = StandardScaler()
def z_normalize(data):
    """Standardize ``data`` with the shared ``scaler``.

    The scaler is refitted on every call, so the result depends only on the
    array passed in, not on any earlier call.

    Input: NumPy array (engineer_feature passes an (n, 1) column)
    Output: NumPy array
    """
    return scaler.fit(data).transform(data)
    
"""
Feature functions
"""
def asterix_freq(x):
    """Return the fraction of characters in ``x`` that are ``'!'``.

    Note that, despite the name, the character counted is the exclamation
    mark. The name is left unchanged because ``engineer_feature`` uses it as
    the feature name.

    Input: string
    Output: float in [0, 1]; 0.0 for an empty string (the plain division
    would raise ZeroDivisionError there).
    """
    if not x:
        return 0.0
    return x.count('!') / len(x)

def uppercase_freq(x):
    """Return the fraction of characters in ``x`` that are ASCII capitals.

    Only ``A``-``Z`` are counted; accented or non-Latin uppercase letters
    do not match the pattern.

    Input: string
    Output: float in [0, 1]; 0.0 for an empty string (the plain division
    would raise ZeroDivisionError there).
    """
    if not x:
        return 0.0
    return len(re.findall(r'[A-Z]', x)) / len(x)
    
"""
Import submission and OOF files
"""
def get_subs(nums, model_dir="../input/trained-models", labels=None):
    """Load and horizontally stack submission and OOF prediction files.

    For every number in ``nums`` the files ``sub<num>.csv`` and
    ``oof<num>.csv`` are read from ``model_dir`` and the ``labels`` columns
    are extracted.

    Parameters
    ----------
    nums : iterable
        Identifiers of the base models to load. This argument is what is
        iterated; the previous version looped over the global ``subnums``
        and silently ignored it.
    model_dir : str, default "../input/trained-models"
        Directory containing the prediction files.
    labels : list-like of column names, optional
        Columns to extract. Defaults to the module-level ``LABELS``.

    Returns
    -------
    (subs, oofs) : tuple of numpy.ndarray
        Each array has one block of ``len(labels)`` columns per model,
        in the order given by ``nums``.
    """
    if labels is None:
        # Resolved at call time, so LABELS only has to exist when used.
        labels = LABELS

    def _stack(prefix):
        # One (rows, len(labels)) block per model, joined side by side.
        return np.hstack([
            np.array(pd.read_csv("{}/{}{}.csv".format(model_dir, prefix, num))[labels])
            for num in nums
        ])

    subs = _stack("sub")
    oofs = _stack("oof")
    return subs, oofs

# Strategy 1: cross-validate a LightGBM stacker on OOF predictions + text
# features, then refit it on the whole training set to predict the test set.
# Kept at module level on purpose: get_subs reads LABELS as a global.
if __name__ == "__main__":

    train = pd.read_csv('../input/train.csv').fillna(' ')
    test = pd.read_csv('../input/test.csv').fillna(' ')
    sub = pd.read_csv('../input/sample_submission.csv')
    INPUT_COLUMN = "comment_text"
    # Every column after the first two is treated as a target label.
    LABELS = train.columns[2:]

    # Import submissions and OOF files
    # 29: LightGBM trained on Fasttext (CV: 0.9765, LB: 0.9620)
    # 51: Logistic regression with word and char n-grams (CV: 0.9858, LB: ?)
    # 52: LSTM trained on Fasttext (CV: ?, LB: 0.9851)
    subnums = [1,2,3,4,5,29,51,52]
    subs, oofs = get_subs(subnums)

    # Engineer features
    feature_functions = [len, asterix_freq, uppercase_freq]
    features = [f.__name__ for f in feature_functions]
    F_train = engineer_features(train[INPUT_COLUMN], feature_functions)
    F_test = engineer_features(test[INPUT_COLUMN], feature_functions)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same NumPy array and is available in old and new pandas alike.
    X_train = np.hstack([F_train[features].values, oofs])
    X_test = np.hstack([F_test[features].values, subs])

    # NOTE(review): feature_fraction and colsample_bytree are documented
    # LightGBM aliases of one setting; both are given the same value here.
    stacker = lgb.LGBMClassifier(max_depth=3, metric="auc", n_estimators=125, num_leaves=10,
                                 boosting_type="gbdt", learning_rate=0.1, feature_fraction=0.45,
                                 colsample_bytree=0.45, bagging_fraction=0.8, bagging_freq=5, reg_lambda=0.2)

    # Fit and submit: one independent binary stacker per label.
    scores = []
    for label in LABELS:
        print(label)
        # 5-fold AUC is reported only; the model used for the submission is
        # refitted on the full training matrix right below.
        score = cross_val_score(stacker, X_train, train[label], cv=5, scoring='roc_auc')
        print("AUC:", score)
        scores.append(np.mean(score))
        stacker.fit(X_train, train[label])
        # Column 1 of predict_proba is the positive-class probability.
        sub[label] = stacker.predict_proba(X_test)[:,1]
    print("CV score:", np.mean(scores))

    sub.to_csv("oof_regime.csv", index=False)



toxic
AUC: [0.98751373 0.98812735 0.98757516 0.98793401 0.98705573]
severe_toxic
AUC: [0.99173013 0.99183782 0.99092373 0.99139372 0.99257303]
obscene
AUC: [0.99506502 0.99498896 0.99563831 0.99543938 0.99482957]
threat
AUC: [0.9942114  0.99602995 0.99263837 0.99198302 0.99488323]
insult
AUC: [0.9894157  0.9888453  0.98990449 0.98941499 0.98867792]
identity_hate
AUC: [0.99096652 0.98860013 0.99067486 0.98913574 0.99272776]
CV score: 0.9913578346912558


In [55]:
#oof stack regime 2

import pandas as pd
import numpy as np
import re
import lightgbm as lgb
import warnings
from sklearn.model_selection import KFold
import gc
warnings.filterwarnings(action='ignore', category=DeprecationWarning, module='sklearn')
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from collections import defaultdict
import os
from sklearn.metrics import roc_auc_score


#######################
# FEATURE ENGINEERING #
#######################
"""
Main function
Input: pandas Series and a feature engineering function
Output: pandas Series
"""
def engineer_feature(series, func, normalize=True):
    """Compute a single engineered feature for every element of ``series``.

    Input: pandas Series, a feature engineering function and a flag saying
    whether the result should be passed through ``z_normalize``.
    Output: pandas Series named ``func.__name__`` that keeps the index of
    ``series`` on both the normalized and the un-normalized path.
    """
    feature = series.apply(func)

    if normalize:
        # The normalizer receives a 2-D column and the result is flattened
        # back to 1-D afterwards.
        column = feature.values.reshape(-1, 1)
        flattened = z_normalize(column).reshape(-1,)
        # Keep the caller's index; constructing the Series without it would
        # reset the index to 0..n-1 only when normalize=True.
        feature = pd.Series(flattened, index=feature.index)
    feature.name = func.__name__
    return feature

"""
Engineer features
Input: pandas Series and a list of feature engineering functions
Output: pandas DataFrame
"""
def engineer_features(series, funclist, normalize=True):
    """Run every function in ``funclist`` over ``series`` and collect the results.

    Input: pandas Series and a list of feature engineering functions
    Output: pandas DataFrame with one column per function, named after the
    Series returned by ``engineer_feature``
    """
    frame = pd.DataFrame()
    position = 0
    # Index-driven walk over the function list; columns are added in order.
    while position < len(funclist):
        computed = engineer_feature(series, funclist[position], normalize)
        frame[computed.name] = computed
        position += 1
    return frame

"""
Normalizer
Input: NumPy array
Output: NumPy array
"""
# Shared scaler object; z_normalize refits it on each call.
scaler = StandardScaler()
def z_normalize(data):
    """Fit the module-level ``scaler`` on ``data`` and return the scaled copy.

    Input: NumPy array
    Output: NumPy array
    """
    fitted = scaler.fit(data)
    return fitted.transform(data)
    
"""
Feature functions
"""
def asterix_freq(x):
    """Share of ``'!'`` characters in the string ``x``.

    The function counts exclamation marks even though its name suggests
    asterisks; the name is kept because it doubles as the feature name.
    An empty string yields 0.0 rather than a ZeroDivisionError.
    """
    total = len(x)
    if total == 0:
        return 0.0
    return x.count('!') / total

def uppercase_freq(x):
    """Share of ASCII uppercase letters (``A``-``Z``) in the string ``x``.

    An empty string yields 0.0 rather than a ZeroDivisionError.
    """
    total = len(x)
    if total == 0:
        return 0.0
    return len(re.findall(r'[A-Z]', x)) / total
    
"""
Import submission and OOF files
"""
def get_subs(nums, model_dir="../input/trained-models", labels=None):
    """Read the test-set and out-of-fold predictions of several base models.

    Parameters
    ----------
    nums : iterable
        Model identifiers; ``sub<num>.csv`` and ``oof<num>.csv`` are read
        for each of them.
    model_dir : str, default "../input/trained-models"
        Directory that holds the prediction files (previously hard-coded).
    labels : list-like of column names, optional
        Columns to keep from every file. Falls back to the module-level
        ``LABELS`` when omitted, which matches the earlier behaviour.

    Returns
    -------
    (subs, oofs) : tuple of numpy.ndarray
        Column-wise concatenation of the selected columns of all
        ``sub`` files and all ``oof`` files respectively, in ``nums`` order.
    """
    if labels is None:
        labels = LABELS

    sub_blocks = []
    oof_blocks = []
    for num in nums:
        sub_path = "{}/sub{}.csv".format(model_dir, num)
        oof_path = "{}/oof{}.csv".format(model_dir, num)
        sub_blocks.append(np.array(pd.read_csv(sub_path)[labels]))
        oof_blocks.append(np.array(pd.read_csv(oof_path)[labels]))
    return np.hstack(sub_blocks), np.hstack(oof_blocks)

# Strategy 2: train one LightGBM booster per fold, keep its out-of-fold
# predictions for scoring, and average the per-fold test predictions.
# Kept at module level on purpose: get_subs reads LABELS as a global.
if __name__ == "__main__":

    train = pd.read_csv('../input/train.csv').fillna(' ')
    test = pd.read_csv('../input/test.csv').fillna(' ')
    submission = pd.read_csv('../input/sample_submission.csv')
    INPUT_COLUMN = "comment_text"
    # Every column after the first two is treated as a target label.
    LABELS = train.columns[2:]

    # Import submissions and OOF files
    # 29: LightGBM trained on Fasttext (CV: 0.9765, LB: 0.9620)
    # 51: Logistic regression with word and char n-grams (CV: 0.9858, LB: ?)
    # 52: LSTM trained on Fasttext (CV: ?, LB: 0.9851)
    subnums = [1,2,3,4,5,29,51,52]
    subs, oofs = get_subs(subnums)

    # Engineer features
    feature_functions = [len, asterix_freq, uppercase_freq]
    features = [f.__name__ for f in feature_functions]
    F_train = engineer_features(train[INPUT_COLUMN], feature_functions)
    F_test = engineer_features(test[INPUT_COLUMN], feature_functions)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same NumPy array and is available in old and new pandas alike.
    X_train = np.hstack([F_train[features].values, oofs])
    X_test = np.hstack([F_test[features].values, subs])
    params = {
        "objective": "binary",
        'metric': {'auc'},
        "boosting_type": "gbdt",
        "verbosity": -1,
        "num_threads": 4,
        "max_depth":3,
        "bagging_fraction": 0.8,
        "bagging_freq":5,
        #"colsample_bytree":0.45,
        "feature_fraction": 0.45,
        "learning_rate": 0.1,
        "num_leaves": 3,
        "verbose": -1,
        #"min_split_gain": .1,
        "reg_alpha": .3
    }
    # Now go through folds
    # I use K-Fold for reasons described here :
    # https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/49964
    train.drop(INPUT_COLUMN, axis=1, inplace=True)
    scores = []          # AUC over the pooled out-of-fold predictions, per label
    scores_false = []    # plain mean of the per-fold AUCs, per label
    folds = KFold(n_splits=10, shuffle=True, random_state=233)
    # free_raw_data=False keeps the raw matrix reachable as trn_lgbset.data,
    # which is what the validation rows are predicted from further down.
    trn_lgbset = lgb.Dataset(X_train, free_raw_data=False)
    del X_train
    gc.collect()
    for class_name in LABELS:
        print("Class %s scores : " % class_name)
        class_pred = np.zeros(len(train))
        train_target = train[class_name]
        trn_lgbset.set_label(train_target.values)
        submission[class_name] = np.zeros(len(X_test))
        lgb_rounds = 1000
        score_temp = 0
        for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train, train_target)):
            watchlist = [
                trn_lgbset.subset(trn_idx),
                trn_lgbset.subset(val_idx)
            ]
            # Train lgb l1
            # NOTE(review): early_stopping_rounds / verbose_eval are keyword
            # arguments of older LightGBM releases; LightGBM >= 4.0 expects
            # callbacks (lgb.early_stopping, lgb.log_evaluation) instead.
            # Left unchanged to stay compatible with the version in use.
            model = lgb.train(
                params=params,
                train_set=watchlist[0],
                num_boost_round=lgb_rounds,
                valid_sets=watchlist,
                early_stopping_rounds=50,
                verbose_eval=0
            )
            class_pred[val_idx] = model.predict(trn_lgbset.data[val_idx], num_iteration=model.best_iteration)
            # sum all the predictions from each fold
            submission[class_name] = submission[class_name] + model.predict(X_test, num_iteration=model.best_iteration)
            score = roc_auc_score(train_target.values[val_idx], class_pred[val_idx])
            score_temp += score
            print("\t Fold %d : %.6f in %3d rounds" % (n_fold + 1, score, model.best_iteration))
        # Turn the summed test predictions into the per-fold average.
        submission[class_name] = submission[class_name] / folds.n_splits
        # Pooled OOF AUC, computed once and reused for printing and storing.
        full_score = roc_auc_score(train_target, class_pred)
        print("full score : %.6f" % full_score)
        scores.append(full_score)
        # Divide by the actual number of folds rather than a literal 10 so
        # the average stays correct if n_splits is changed.
        scores_false.append(score_temp / folds.n_splits)
        train[class_name + "_oof"] = class_pred


    print('Total CV score is {}'.format(np.mean(scores)))
    print("total false CV score is {}".format(np.mean(scores_false)))
    submission.to_csv("result.csv", index=False)



Class toxic scores : 
	 Fold 1 : 0.988278 in 333 rounds
	 Fold 2 : 0.986089 in 338 rounds
	 Fold 3 : 0.988153 in 333 rounds
	 Fold 4 : 0.986216 in 291 rounds
	 Fold 5 : 0.988151 in 341 rounds
	 Fold 6 : 0.986373 in 245 rounds
	 Fold 7 : 0.988396 in 242 rounds
	 Fold 8 : 0.988238 in 355 rounds
	 Fold 9 : 0.986904 in 138 rounds
	 Fold 10 : 0.988947 in 511 rounds
full score : 0.987564
Class severe_toxic scores : 
	 Fold 1 : 0.992248 in  73 rounds
	 Fold 2 : 0.993084 in 140 rounds
	 Fold 3 : 0.991702 in  50 rounds
	 Fold 4 : 0.992143 in 158 rounds
	 Fold 5 : 0.991564 in  45 rounds
	 Fold 6 : 0.992328 in 205 rounds
	 Fold 7 : 0.992173 in 278 rounds
	 Fold 8 : 0.992309 in  36 rounds
	 Fold 9 : 0.991183 in 196 rounds
	 Fold 10 : 0.989662 in  46 rounds
full score : 0.989639
Class obscene scores : 
	 Fold 1 : 0.994383 in  98 rounds
	 Fold 2 : 0.995753 in  39 rounds
	 Fold 3 : 0.995501 in 302 rounds
	 Fold 4 : 0.994613 in  38 rounds
	 Fold 5 : 0.995866 in  57 rounds
	 Fold 6 : 0.994576 in  43 ro

In [49]:
# from os import listdir
# from os.path import isfile, join
# mypath = '../input/single_model_predictions_03092018/'
# classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
#        'identity_hate']
# test_files = [f for f in listdir(mypath) if isfile(join(mypath, f)) and f.endswith('test_oof.csv')]
# test_df = []
# for i, file in enumerate(test_files):
#     df = pd.read_csv(mypath+file)
#     temp = df[df['fold_id']==0].drop('fold_id', axis = 1).copy()
#     temp[classes] = 0
#     for i in np.arange(0, df['fold_id'].max()):
#         temp[classes] += df.loc[df['fold_id']==i, classes].values/df['fold_id'].max()
#     test_df.append(temp)
# print('Finished to load test predictions.')
# train_files = [f for f in listdir(mypath) if isfile(join(mypath, f)) and f.endswith('train_oof.csv')]
# train_oof = []
# for i, file in enumerate(train_files):
#     train_oof.append(pd.read_csv(mypath+file))
# print('Finished to load OOF predictions.')

Finished to load test predictions.
Finished to load OOF predictions.
