**XGboost Intro**

XGBoost is a tree-based model and one of the most powerful machine learning techniques; it can be applied to pattern, numeric and text problems. However, RNN models are more common for text problems.

**Models with different structures and designs produce better ensemble or stacking results.**

This model can be used as an ensemble or stacking component alongside RNN models to produce better results than either model alone.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

# kaggle standard imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
print(os.listdir("../input"))  # list the files available in the input directory

# extra imports
np.random.seed(235)  # seed NumPy's global random generator before any other work
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import gc
import re
from sklearn.metrics import f1_score  # NOTE(review): duplicate of the f1_score import above

# XGboost related
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import xgboost as xgb
from xgboost import XGBClassifier
from scipy.sparse import csr_matrix, hstack

# Any results you write to the current directory are saved as output.

**Prepare Data**

references:
* Data preparation and processing inspired by Shujian Liu's kernels

In [None]:
print('load data')
# Read the labelled questions and split them straight away: 90% stay in
# train_df, the remaining 10% become val_df (fixed seed for repeatability).
train_df, val_df = train_test_split(
    pd.read_csv("../input/train.csv"),
    train_size=0.9,
    random_state=235,
)
# Unlabelled questions that will be scored for the submission.
test_df = pd.read_csv("../input/test.csv")

print("train_df shape= ", train_df.shape)
print("load data success !")

In [None]:
print('fill missing and get the values')
# Question strings for each split as NumPy arrays; a missing question is
# replaced by the placeholder "na_".
X_train, X_val, X_test = (
    frame["question_text"].fillna("na_").values
    for frame in (train_df, val_df, test_df)
)

# Targets are taken only from the two labelled splits.
y_train, y_val = train_df['target'].values, val_df['target'].values

In [None]:
# Sanity check on the number of training questions extracted above.
print('size of training data: ', X_train.shape)  # recorded output of an earlier run: (1175509,)

**Prepare Vectors For XGboost input**

In [None]:
# Character-level TF-IDF over 2- to 4-character n-grams taken inside word
# boundaries ('char_wb'), keeping at most 20,000 n-grams.
#
# `stop_words` and `token_pattern` are intentionally NOT passed here:
# scikit-learn only applies them when analyzer == 'word', so with 'char_wb'
# they never had any effect, and newer scikit-learn releases emit a
# UserWarning about the unused parameters when fitting.
char_vector = TfidfVectorizer(
    ngram_range=(2, 4),        # TF-IDF is computed over character n-grams
    max_features=20000,
    analyzer='char_wb',
    strip_accents='unicode',
    sublinear_tf=True,         # use 1 + log(tf) instead of the raw term frequency
    max_df=0.98,               # ignore n-grams found in more than 98% of documents
    min_df=2,                  # ignore n-grams found in fewer than 2 documents
)

In [None]:
print('fit char vector')
# Only the first 85,000 training questions are used to fit the vectorizer.
# NOTE(review): the notebook does not state why a subset is used — presumably
# to bound fitting time/memory; confirm before changing.
char_vector.fit(X_train[:85000])
print("fit success !")

In [None]:
def _char_tfidf(texts, split_label):
    """Announce the split being processed and return its char TF-IDF matrix in CSR form."""
    print('transfer ' + split_label)
    return char_vector.transform(texts).tocsr()


print('transfer data based on char vector')
# Each call yields the sparse TF-IDF matrix of one split, stored as CSR.
train_char_vector = _char_tfidf(X_train, 'train')
valid_char_vector = _char_tfidf(X_val, 'validation')
test_char_vector = _char_tfidf(X_test, 'test')

print("finished !")

In [None]:
# Training questions followed by test questions in one Python list.
# The validation questions are not part of this corpus.
all_text = [*X_train, *X_test]

In [None]:
# Word-level TF-IDF over single words (unigrams), keeping at most 9,000 terms.
word_vector = TfidfVectorizer(
    ngram_range=(1, 1),        # TF-IDF is computed for each individual word
    max_features=9000,
    sublinear_tf=True,         # use 1 + log(tf) instead of the raw term frequency
    strip_accents='unicode',
    analyzer='word',
    # Raw string: "\w" in a normal string literal is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning on recent Python); the pattern value
    # itself is unchanged.
    token_pattern=r"\w{1,}",
    stop_words="english",
    max_df=0.95,               # ignore words found in more than 95% of documents
    min_df=2,                  # ignore words found in fewer than 2 documents
)

In [None]:
print('fit word vector')
# Fit the word-level vectorizer on the all_text corpus.
word_vector.fit(all_text)
print("finished!")

In [None]:
print('transfer data based on word vector')
# transform() produces the TF-IDF matrix of each split; tocsr() stores it as a
# compressed sparse row matrix.
train_word_vector, valid_word_vector, test_word_vector = (
    word_vector.transform(texts).tocsr()
    for texts in (X_train, X_val, X_test)
)
print("finished!")

**Features Engineering**

In [None]:
# Drop the raw text containers in one statement, then ask the garbage
# collector to reclaim whatever became unreachable.
del all_text, X_train, X_val, X_test
gc.collect()

In [None]:
# Group the three DataFrames so they can be processed together in one pass.
data = [train_df, val_df, test_df]
print("finished!")

In [None]:
# Reference: https://www.kaggle.com/theoviel/improve-your-score-with-some-text-preprocessing
# Spelling variants / misspellings looked for in the question text.
# Entries are kept exactly as given; note that 'youtu ' and 'youtube ' end
# with a space.
mistake_list = [
    'colour', 'centre', 'favourite', 'travelling', 'counselling',
    'theatre', 'cancelled', 'labour', 'organisation', 'wwii',
    'citicise', 'youtu ', 'youtube ', 'Qoura', 'sallary',
    'Whta', 'narcisist', 'howdo', 'whatare', 'howcan',
    'howmuch', 'howmany', 'whydo', 'doI', 'theBest',
    'howdoes', 'mastrubation', 'mastrubate', 'mastrubating', 'pennis',
    'Etherium', 'narcissit', 'bigdata', '2k17', '2k18',
    'qouta', 'exboyfriend', 'airhostess', 'whst', 'watsapp',
    'demonitisation', 'demonitization', 'demonetisation',
]

In [None]:
# Character sets behind the three punctuation/symbol count features.
# The rare-symbol set contains a literal backslash (written as "\\").
_RARE_SYMBOLS = '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²'
_COMMON_PUNCTUATION = '.,;:^_`'
_MISC_SYMBOLS = '*&$%'


def _count_chars(text, chars):
    """Return how many times the characters of ``chars`` occur in ``text`` in total."""
    return sum(text.count(ch) for ch in chars)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator element-wise as float64, using 0.0 where the denominator is 0."""
    denominator = denominator.astype('float64')
    # where() turns zero denominators into NaN so the division never divides by zero.
    return numerator.astype('float64').div(denominator.where(denominator > 0)).fillna(0.0)


def get_features(data, mistakes=None):
    """Add hand-crafted text statistics to every DataFrame in ``data``, in place.

    Each frame must have a ``question_text`` column. Thirteen columns are added:
    ``text_size``, ``capital_size``, ``capital_rate``, ``exc_count``,
    ``quetion_count``, ``unq_punctuation_count``, ``punctuation_count``,
    ``symbol_count``, ``words_count``, ``unique_words``, ``unique_rate``,
    ``word_max_length`` and ``mistake_count``. Counts are stored as uint16 and
    ``capital_rate`` as float16.

    Missing, empty and whitespace-only questions get 0 for every feature
    instead of raising or producing NaN.

    Args:
        data: iterable of pandas DataFrames; each one is modified in place.
        mistakes: optional sequence of substrings counted for
            ``mistake_count``. Defaults to the notebook-level ``mistake_list``.

    Returns:
        The ``data`` object that was passed in.
    """
    if mistakes is None:
        mistakes = mistake_list

    shapes = []
    for dataframe in data:
        # Missing questions are treated as empty text for feature purposes;
        # the stored question_text column itself is left untouched.
        text = dataframe["question_text"].fillna("")

        dataframe["text_size"] = text.apply(len).astype('uint16')  # characters per question
        dataframe["capital_size"] = text.apply(lambda x: sum(1 for c in x if c.isupper())).astype('uint16')  # upper-case letters
        # Share of upper-case characters; 0.0 for empty text instead of a ZeroDivisionError.
        dataframe["capital_rate"] = _safe_ratio(dataframe["capital_size"], dataframe["text_size"]).astype('float16')
        dataframe["exc_count"] = text.apply(lambda x: x.count("!")).astype('uint16')  # exclamation marks
        dataframe["quetion_count"] = text.apply(lambda x: x.count("?")).astype('uint16')  # question marks (column name kept as-is)
        dataframe["unq_punctuation_count"] = text.apply(lambda x: _count_chars(x, _RARE_SYMBOLS)).astype('uint16')  # rare symbols
        dataframe["punctuation_count"] = text.apply(lambda x: _count_chars(x, _COMMON_PUNCTUATION)).astype('uint16')  # common punctuation
        dataframe["symbol_count"] = text.apply(lambda x: _count_chars(x, _MISC_SYMBOLS)).astype('uint16')  # * & $ %
        dataframe["words_count"] = text.apply(lambda x: len(x.split())).astype('uint16')  # whitespace-separated tokens
        # Distinct tokens. The previous set(1 for w in ...) collapsed every
        # token to the constant 1, so the feature was always 0 or 1.
        dataframe["unique_words"] = text.apply(lambda x: len(set(x.split()))).astype('uint16')
        # Share of distinct tokens; 0.0 (not NaN) when there are no tokens.
        dataframe["unique_rate"] = _safe_ratio(dataframe["unique_words"], dataframe["words_count"])
        # Longest token; default=0 covers text without any token, where max() would raise.
        dataframe["word_max_length"] = text.apply(lambda x: max((len(word) for word in x.split()), default=0)).astype('uint16')
        dataframe["mistake_count"] = text.apply(lambda x: sum(x.count(w) for w in mistakes)).astype('uint16')  # substring hits from `mistakes`

        shapes.append(dataframe.shape)

    # Report per-frame shapes. Converting the frames with np.array(data) copies
    # every frame into an object array and raises on NumPy >= 1.24 when the
    # frames have different shapes.
    print("data shape = ", shapes)
    return data

In [None]:
print('generate the features')
# `data` is [train_df, val_df, test_df]; get_features adds the feature columns
# to each of those frames and hands the same list back.
data = get_features(data)
print("finished!")

In [None]:
# Engineered columns that are fed to the model, in the order they are stacked.
feature_cols = [
    "text_size",
    "capital_size",
    "capital_rate",
    "exc_count",
    "quetion_count",
    "unq_punctuation_count",
    "punctuation_count",
    "symbol_count",
    "words_count",
    "unique_words",
    "unique_rate",
    "word_max_length",
    "mistake_count",
]

**Input Final Format**

In [None]:
print('final preparation for input')
# Only the feature_cols columns are kept (qid, question_text and target are
# left out); each split is wrapped as a CSR matrix.
X_train, X_val, X_test = (
    csr_matrix(frame[feature_cols].values)
    for frame in (train_df, val_df, test_df)
)

# NOTE(review): the `data` list created earlier still references these frames,
# so their memory is only released once that list is dropped as well.
del val_df, train_df, test_df

gc.collect()

In [None]:
# Stack the blocks column-wise for each split: engineered features first, then
# the word TF-IDF block, then the char TF-IDF block.
# (An earlier variant of this cell placed the char block before the word block.)
input_train = hstack([X_train, train_word_vector, train_char_vector])
input_valid = hstack([X_val, valid_word_vector, valid_char_vector])
input_test = hstack([X_test, test_word_vector, test_char_vector])

# Rebind the per-vectorizer names to None so the separate matrices are no
# longer referenced through them.
train_word_vector = train_char_vector = None
valid_word_vector = valid_char_vector = None
test_word_vector = test_char_vector = None
print("finished!")

**Build The model**

In [None]:
# Reference: some settings inspired by Toxic competition kernels.
def build_xgb(train_X, train_y, valid_X, valid_y=None, subsample=0.75):
    """Wrap the inputs as DMatrix objects and assemble the booster parameters.

    Args:
        train_X: training feature matrix.
        train_y: training labels.
        valid_X: validation feature matrix; only used when ``valid_y`` is given.
        valid_y: validation labels, or None to skip the validation DMatrix.
        subsample: value used for both ``subsample`` (row sampling) and
            ``colsample_bytree`` (column sampling per tree).

    Returns:
        ``(xgtrain, xgvalid, model_params)`` where ``xgvalid`` is None when no
        validation labels were supplied and ``model_params`` is a list of
        ``(name, value)`` pairs.
    """
    xgtrain = xgb.DMatrix(train_X, label=train_y)
    xgvalid = xgb.DMatrix(valid_X, label=valid_y) if valid_y is not None else None

    # Parameters as (name, value) pairs, in a fixed order.
    model_params = [
        ('objective', 'binary:logistic'),   # binary 0 or 1
        ('eta', 0.3),                       # learning rate [default=0.3]
        ('max_depth', 6),                   # depth of the tree, deeper is more complex
        ('silent', 1),                      # 1 means silent mode, 0 prints running messages
        ('eval_metric', 'auc'),
        ('min_child_weight', 1),            # stop partitioning below this weight [default=1]
        ('subsample', subsample),           # subsample ratio for the training instances
        ('colsample_bytree', subsample),    # subsample ratio of columns per tree
        ('seed', 2018),                     # random seed
        # 'scale_pos_weight' (imbalance ratio) is intentionally left unset
    ]

    return xgtrain, xgvalid, model_params

**Train The Model**

In [None]:
def train_xgboost(xgtrain, xgvalid, model_params, num_rounds=500, patience=20):
    """Train a booster, with early stopping when a validation DMatrix is given.

    Args:
        xgtrain: training DMatrix.
        xgvalid: validation DMatrix, or None to train without monitoring.
        model_params: booster parameters passed to ``xgb.train``.
        num_rounds: number of boosting rounds requested.
        patience: ``early_stopping_rounds`` value, i.e. stop if performance
            does not improve for this many rounds (only with validation data).

    Returns:
        The model returned by ``xgb.train``.
    """
    if xgvalid is None:
        # Nothing to monitor: run all requested rounds.
        return xgb.train(model_params, xgtrain, num_rounds)

    # The watchlist names the datasets whose metric is reported while training.
    watchlist = [(xgtrain, 'train'), (xgvalid, 'test')]
    return xgb.train(
        model_params,
        xgtrain,
        num_rounds,
        watchlist,
        early_stopping_rounds=patience,
    )

In [None]:
print('train the model')
# Build the DMatrix objects and parameter list, then train; validation labels
# are supplied, so train_xgboost takes its early-stopping branch.
xgtrain, xgvalid, model_params = build_xgb(input_train, y_train ,input_valid, y_val)
model = train_xgboost(xgtrain, xgvalid, model_params)
print("finished!")

**Predict And Export Results**

In [None]:
print('predict validation')
# Validation predictions as a float64 column vector (one row per validation
# sample), using the tree limit recorded on the model as best_ntree_limit.
validate_hat = (
    model.predict(xgb.DMatrix(input_valid), ntree_limit=model.best_ntree_limit)
    .astype(np.float64)
    .reshape(-1, 1)
)

In [None]:
# Sweep decision thresholds from 0.10 to 0.50 in steps of 0.01 and record the
# validation F1 score obtained at each one.
scores_list = []
for threshold in np.arange(0.1, 0.501, 0.01):
    score = f1_score(y_val, (validate_hat > threshold).astype(int))
    scores_list.append([threshold, score])
    print('F1 score: {} for threshold: {}'.format(score, threshold))

# Rank once, after the sweep has finished. This summary used to sit inside the
# loop, which re-sorted the list and printed an intermediate "best" on every
# iteration. The sort is stable, so on ties the lowest threshold still wins.
scores_list.sort(key=lambda x: x[1], reverse=True)
best_threshold = scores_list[0][0]
print('best threshold to generate predictions: ', best_threshold)
print('best score: ', scores_list[0][1])

In [None]:
print('predict results')
# Test-set predictions as a float64 column vector (one row per test sample),
# using the tree limit recorded on the model as best_ntree_limit.
predictions = (
    model.predict(xgb.DMatrix(input_test), ntree_limit=model.best_ntree_limit)
    .astype(np.float64)
    .reshape(-1, 1)
)

In [None]:
def save_results(submit, y_hat, name, threshold=0.35):
    """Binarise scores at ``threshold`` and write the submission CSV.

    Args:
        submit: DataFrame that receives the ``prediction`` column (modified in place).
        y_hat: array of scores; values strictly above ``threshold`` become 1, others 0.
        name: output path without extension; ``.csv`` is appended.
        threshold: decision threshold applied to ``y_hat``.
    """
    print('threshold is: ', threshold)

    binary_labels = (y_hat > threshold).astype(int)
    # Preview of the first 100 labels.
    print(binary_labels[:100])

    submit['prediction'] = binary_labels
    submit.to_csv(name + '.csv', index=False)

In [None]:
print('save results')
# Load the sample submission as a template, then write 'submission.csv' using
# the threshold selected on the validation split.
submission = pd.read_csv('../input/sample_submission.csv')
save_results(submission, predictions, 'submission', threshold=best_threshold)