In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

## Getting Started
Thanks to wonderful python libraries and packages for making our life easier.

In [None]:
# nlp
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from gensim.models.word2vec import Word2Vec
import spacy
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence

#utils
from collections import Counter, defaultdict
import gc, time
from tqdm import tqdm

#visualization
%matplotlib inline
import matplotlib.pyplot as plt
from tabulate import tabulate
import seaborn as sns

# some basic ml models and metrics evaluation
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import validation_curve
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from scikitplot.metrics import plot_confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.utils.fixes import signature
from sklearn.metrics import average_precision_score
from sklearn.metrics import mean_squared_error

#ensembles
import xgboost as xgb
import lightgbm as lgb

Let's get some idea about the data that we are dealing with.

In [None]:
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
# Quick look at the data: sizes first, then the class balance.
print("Train shape : ", train.shape)
print("Test shape : ", test.shape)
# sort_index() orders the counts by label and the bars are labelled from the
# counts' own index, so each bar is guaranteed to show the class it counts
# (value_counts() orders by frequency, not by label).
dist = train['target'].value_counts().sort_index()
sns.barplot(x=dist.index, y=dist.values)
plt.title("Distribution of positive and negative labels")
plt.xlabel("target")
plt.ylabel("Count")
plt.show()
train.head()

In [None]:
# keep some validation set
train_df, val_df = train_test_split(train, test_size=0.1, random_state=33)

In [None]:
# Question text of the full training file (rows later used for validation included)
# and of the test file.
train_text = train['question_text']
test_text = test['question_text']
# Train and test text stacked into one Series.
all_text = pd.concat([train_text, test_text])
# Labels as numpy arrays, taken from train_df and val_df respectively.
y = train_df.target.values
yde = val_df.target.values

## Utils

In [None]:
def evaluatePredictions(y, pred, silent=False):
    """Sweep decision thresholds and report the one with the best F1 score.

    Thresholds 0.10, 0.11, ..., 0.90 are tried; a prediction counts as positive
    when its score is strictly greater than the threshold. A confusion matrix
    for the best threshold is drawn with scikit-plot's plot_confusion_matrix.

    Args:
        y: true binary labels.
        pred: predicted positive-class scores, one per label (any array-like;
            it is flattened to 1-D).
        silent: when False, print the F1 score at every threshold.

    Returns:
        (best, score): the best threshold (2 decimals) and its F1 score.
        On ties the lowest threshold wins.
    """
    pred = np.asarray(pred).reshape(-1)
    # Round once up front so the very same threshold values are used for
    # scoring, for the confusion matrix and for the reported result
    # (np.arange results can carry floating-point error).
    thre_list = np.round(np.arange(0.1, 0.901, 0.01), 2)
    f1_list = list()
    for thresh in thre_list:
        f1 = f1_score(y, (pred > thresh).astype(int))
        f1_list.append(f1)
        if not silent:
            print("F1 score at threshold {0} is {1}".format(thresh, f1))
    best_idx = int(np.argmax(f1_list))
    best = thre_list[best_idx]
    score = f1_list[best_idx]
    plot_confusion_matrix(y, (pred > best).astype(int))
    print('Best Threshold: ', best)
    print('Best F1 Score: ', score)
    return best, score

def plotPrecisionRecall(y, pred):
    """Draw the precision-recall curve of scores `pred` against labels `y`.

    The curve is drawn as a shaded step plot with pyplot and titled with the
    average precision. The figure is not shown here; the caller calls plt.show().
    """
    prec, rec, _thresholds = precision_recall_curve(y, pred)
    # In matplotlib < 1.5, plt.fill_between does not have a 'step' argument,
    # so it is only passed when the installed version accepts it.
    fill_kwargs = {}
    if 'step' in signature(plt.fill_between).parameters:
        fill_kwargs['step'] = 'post'
    plt.step(rec, prec, where='post', color='b', alpha=0.2)
    plt.fill_between(rec, prec, alpha=0.2, color='b', **fill_kwargs)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    ap = average_precision_score(y, pred)
    plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(ap))

def performKfCV(kf, models, X, y, Xde):
    """Run K-fold cross validation for several models at once.

    In every fold each model is re-fit on the training part. Its positive-class
    probabilities are stored for the held-out part (out-of-fold predictions),
    and its predictions on the development set are averaged over the folds.

    Args:
        kf: splitter whose ``split(X)`` yields (train_index, val_index) pairs.
        models: dict mapping a name to an estimator with ``fit`` and
            ``predict_proba``.
        X: training features, indexable by an integer index array.
        y: training labels, indexable the same way as X.
        Xde: development-set features.

    Returns:
        dict mapping each model name to ``(cv_pred, de_pred)``: the out-of-fold
        predictions for X and the fold-averaged predictions for Xde.
    """
    folds = list(kf.split(X))
    # Equal weight per fold; for 5 folds this is the 0.2 that used to be hard-coded.
    fold_weight = 1.0 / len(folds) if folds else 0.0
    # Each model gets its OWN out-of-fold buffer. Sharing a single array between
    # the models lets the model fitted last overwrite everyone else's predictions.
    preds = {name: (np.zeros([X.shape[0],]), 0) for name in models.keys()}
    for train_index, val_index in tqdm(folds):
        Xtrain, Xval = X[train_index], X[val_index]
        Ytrain = y[train_index]
        for name, model in models.items():
            model.fit(Xtrain, Ytrain)
            cv_pred, de_pred = preds[name]
            cv_pred[val_index] = model.predict_proba(Xval)[:, 1]
            # Not in-place: de_pred starts as the int 0 and becomes an array here.
            de_pred = de_pred + fold_weight * model.predict_proba(Xde)[:, 1]
            preds[name] = (cv_pred, de_pred)
    return preds
    
stop_words = stopwords.words('english')

## Feature extraction and baseline techniques
Let's start by very simple feature extraction Bags of Words (with count) and TFIDF and see how some popular methods for text classification like Naive Bayes, Linear models and Decision trees perform. Then I will experiment with embeddings and see how some approaches work.

>Although SVMs are considered state of the art for text classification, the number of examples here is too high (a big-data problem): there would be a lot of support vectors, training would take a long time, and the model would also be large.


### Naive Bayes with Bag of words
> According to documentation of Naive Bayes it performs best for bag of words like features.

In [None]:
# Binary bag-of-words: binary=True records each term as present/absent per document.
# The preprocessor re-joins Keras' text_to_word_sequence tokens with spaces, a token is
# a run of 2+ ASCII letters, and terms in fewer than 5 documents or in more than 99% of
# documents are dropped (min_df / max_df).
cntVect = CountVectorizer(binary=True, stop_words=stop_words,
                             preprocessor=lambda x: " ".join(text_to_word_sequence(x)),
                             token_pattern="[a-zA-Z]{2,}",
                             min_df=5, max_df=0.99)
# The two Naive Bayes variants compared on these features.
mult_nb = MultinomialNB()
bern_nb = BernoulliNB()

In [None]:
# Learn the vocabulary from train + test text, then vectorize each split with it.
cntVect.fit(all_text)
X = cntVect.transform(train_df.question_text)  # training split
Xde = cntVect.transform(val_df.question_text)  # development (validation) split
Xte = cntVect.transform(test_text)  # test set

**Running 5-fold cross validation to obtain the F1 score.** Since the data is highly imbalanced, the F1 score is a better measure than accuracy.

In [None]:
# Models to cross-validate, keyed by the name used in the result dicts.
nb_models = {
    "mult_nb": mult_nb,
    "bern_nb": bern_nb
}
# Shuffled 5-fold splitter with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=33)
# name -> (out-of-fold predictions on X, averaged predictions on Xde)
nb_preds = performKfCV(kf, nb_models, X, y, Xde)

In [None]:
# Best threshold and best F1 on the development set, keyed by model name.
thresholds = {}
scores = {}
for name,(cv_pred, de_pred) in nb_preds.items():
    print(name)
    print("Cross validation F1 score")
    # Run for its printed/plotted output; the returned tuple is overwritten just below.
    thresholdNb = evaluatePredictions(y, cv_pred, silent=True)
    print("Development set F1 score")
    thresholdNb, scoreNb = evaluatePredictions(yde, de_pred, silent=True)
    thresholds[name] = thresholdNb
    scores[name] = scoreNb

**Combining predictions of two naive bayes models**

In [None]:
# combining
weights_nb = {
    "mult_nb": 0.5,
    "bern_nb": 0.5
}
com_cv_pred=0
com_de_pred=0
for name,(cv_pred, de_pred) in nb_preds.items():
    com_cv_pred += weights_nb[name]*cv_pred
    com_de_pred += weights_nb[name]*de_pred
print("Cross validation F1 score")
thresholdNb = evaluatePredictions(y, com_cv_pred, silent=True)
print("Development set F1 score")
thresholdNb,scoreNb = evaluatePredictions(yde, com_de_pred, silent=True)


In [None]:
# Model names and their development-set F1 scores, with the blended model last.
barx = list(scores.keys()) + ["combined_nb"]
bary = list(scores.values()) + [scoreNb]
sns.lineplot(x=barx, y=bary)
plt.xlabel("Model")
plt.ylabel("F1 Score")
plt.title("Naive Bayes Models F1 Score using BOW for validation set data")
plt.show()

# One precision-recall curve per model, then one for the blend.
pr_inputs = [(name, de_pred) for name, (_, de_pred) in nb_preds.items()]
pr_inputs.append(("combined_nb", com_de_pred))
for label, dev_scores in pr_inputs:
    print(label)
    plotPrecisionRecall(yde, dev_scores)
    plt.show()

**Trying Linear models**

In [None]:
# Text-processing settings shared by the count and the TF-IDF vectorizer
# (binary=False keeps term frequencies instead of presence flags).
vect_kwargs = dict(
    binary=False,
    stop_words=stop_words,
    preprocessor=lambda x: " ".join(text_to_word_sequence(x)),
    token_pattern="[a-zA-Z]{2,}",
    min_df=5,
    max_df=0.99,
)
cntVect = CountVectorizer(**vect_kwargs)
tfidfVect = TfidfVectorizer(**vect_kwargs)

# One logistic regression per regularisation strength C and per feature type;
# everything except C is identical.
Cs = [0.1, 1, 10]
lr_kwargs = dict(random_state=333, class_weight="balanced", verbose=0,
                 solver='lbfgs', n_jobs=1)
ln_cnt_models = {}
ln_tfidf_models = {}
for c in Cs:
    ln_cnt_models["cnt_log_"+str(c)] = LogisticRegression(C=c, **lr_kwargs)
    ln_tfidf_models["tfidf_log_"+str(c)] = LogisticRegression(C=c, **lr_kwargs)

In [None]:
# Vocabulary is learned from train + test text; each split is then vectorized with it.
cntVect.fit(all_text) #exploiting all given data
X = cntVect.transform(train_df.question_text)  # training split
Xde = cntVect.transform(val_df.question_text)  # development (validation) split
Xte = cntVect.transform(test_text)  # test set

In [None]:
# Parameter reference: see the scikit-learn LogisticRegression documentation (C, class_weight, solver).

In [None]:
ln_cnt_preds = performKfCV(kf, ln_cnt_models, X, y, Xde)

In [None]:
# Record the best development-set threshold and F1 of every count-feature model.
for name,(cv_pred, de_pred) in ln_cnt_preds.items():
    print(name)
    print("Cross validation F1 score")
    # Run for its printed/plotted output; the returned tuple is overwritten just below.
    thresholdNb = evaluatePredictions(y, cv_pred, silent=True)
    print("Development set F1 score")
    thresholdNb, scoreNb = evaluatePredictions(yde, de_pred, silent=True)
    thresholds[name] = thresholdNb
    scores[name] = scoreNb

In [None]:
# Unweighted average of all count-feature logistic regressions.
com_cv_pred=0
com_de_pred=0
for name,(cv_pred, de_pred) in ln_cnt_preds.items():
    com_cv_pred += 1/len(ln_cnt_preds)*cv_pred
    com_de_pred += 1/len(ln_cnt_preds)*de_pred
print("Cross validation F1 score")
# Run for its printed/plotted output; the returned tuple is overwritten just below.
thresholdNb = evaluatePredictions(y, com_cv_pred, silent=True)
print("Development set F1 score")
thresholdNb,scoreNb = evaluatePredictions(yde, com_de_pred, silent=True)

In [None]:
barx = [name for name,_ in scores.items()]
bary = [score for _,score in scores.items()]
sns.lineplot(x=barx, y=bary, markers=True)
plt.xlabel("Model")
plt.ylabel("F1 Score")
plt.title("F1 Score using different models for validation set data")
plt.show()
scoresObj = [(name,score) for name,score in scores.items()]
print (tabulate(scoresObj, floatfmt=".4f", headers=("model", 'F1 score')))
# for name, (cv_pred, de_pred) in ln_cnt_preds.items():
#     print(name)
#     plotPrecisionRecall(yde, de_pred)
#     plt.show()
# print("combined_ln_cnt")
# plotPrecisionRecall(yde, com_de_pred)
# plt.show()

In [None]:
# IDF weights and vocabulary are learned from train + test text; X, Xde and Xte are
# rebound to the TF-IDF features from here on.
tfidfVect.fit(all_text) #exploiting all given data
X = tfidfVect.transform(train_df.question_text)  # training split
Xde = tfidfVect.transform(val_df.question_text)  # development (validation) split
Xte = tfidfVect.transform(test_text)  # test set


In [None]:
ln_tfidf_preds = performKfCV(kf, ln_tfidf_models, X, y, Xde)

In [None]:
# Record the best development-set threshold and F1 of every TF-IDF model.
for name,(cv_pred, de_pred) in ln_tfidf_preds.items():
    print(name)
    print("Cross validation F1 score")
    # Run for its printed/plotted output; the returned tuple is overwritten just below.
    thresholdNb = evaluatePredictions(y, cv_pred, silent=True)
    print("Development set F1 score")
    thresholdNb, scoreNb = evaluatePredictions(yde, de_pred, silent=True)
    thresholds[name] = thresholdNb
    scores[name] = scoreNb

In [None]:
# Unweighted average of all TF-IDF logistic regressions.
com_cv_pred=0
com_de_pred=0
for name,(cv_pred, de_pred) in ln_tfidf_preds.items():
    com_cv_pred += 1/len(ln_tfidf_preds)*cv_pred
    com_de_pred += 1/len(ln_tfidf_preds)*de_pred
print("Cross validation F1 score")
# Run for its printed/plotted output; the returned tuple is overwritten just below.
thresholdNb = evaluatePredictions(y, com_cv_pred, silent=True)
print("Development set F1 score")
thresholdNb,scoreNb = evaluatePredictions(yde, com_de_pred, silent=True)

In [None]:
barx = [name for name,_ in scores.items()]
bary = [score for _,score in scores.items()]
sns.lineplot(x=barx, y=bary)
plt.xlabel("Model")
plt.ylabel("F1 Score")
plt.title("F1 Score using different models for validation set data")
plt.show()
scoresObj = [(name,score) for name,score in scores.items()]
print (tabulate(sorted(scoresObj, key=lambda x: -x[1]), floatfmt=".4f", headers=("model", 'F1 score')))
# for name, (cv_pred, de_pred) in ln_cnt_preds.items():
#     print(name)
#     plotPrecisionRecall(yde, de_pred)
#     plt.show()
# print("combined_ln_cnt")
# plotPrecisionRecall(yde, com_de_pred)
# plt.show()

**Decision trees**

In [None]:

# Binary (presence/absence) bag-of-words features for the trees; same tokenisation
# and document-frequency cut-offs as the earlier vectorizers.
cntVect = CountVectorizer(binary=True, stop_words=stop_words,
                             preprocessor=lambda x: " ".join(text_to_word_sequence(x)),
                             token_pattern="[a-zA-Z]{2,}",
                             min_df=5, max_df=0.99)

# One class-weighted decision tree per maximum depth, keyed "dt_<depth>".
tree_models = {}
depths = [3,5,7]
for depth in depths:
    tree_models["dt_"+str(depth)] = DecisionTreeClassifier(random_state=333, class_weight="balanced", max_depth=depth)

In [None]:
# Vocabulary is learned from train + test text; X, Xde and Xte are rebound to the
# binary bag-of-words features.
cntVect.fit(all_text) #exploiting all given data
X = cntVect.transform(train_df.question_text)  # training split
Xde = cntVect.transform(val_df.question_text)  # development (validation) split
Xte = cntVect.transform(test_text)  # test set

In [None]:
dt_preds = performKfCV(kf, tree_models, X, y, Xde)

In [None]:
# Record the best development-set threshold and F1 of every decision tree.
for name,(cv_pred, de_pred) in dt_preds.items():
    print(name)
    print("Cross validation F1 score")
    # Run for its printed/plotted output; the returned tuple is overwritten just below.
    thresholdNb = evaluatePredictions(y, cv_pred, silent=True)
    print("Development set F1 score")
    thresholdNb, scoreNb = evaluatePredictions(yde, de_pred, silent=True)
    thresholds[name] = thresholdNb
    scores[name] = scoreNb

**Let's try to combine all the independent predictions (Naive ensemble)**

In [None]:
# Every base model contributes one feature column to the stacked matrices:
# Naive Bayes, both logistic-regression families and the decision trees.
all_preds = [nb_preds, ln_cnt_preds, ln_tfidf_preds, dt_preds]
train_cols = []
val_cols = []
for preds in all_preds:
    for name, (cv_preds, de_preds) in preds.items():
        print("stacking ", name)
        train_cols.append(cv_preds.reshape(-1,1))
        val_cols.append(de_preds.reshape(-1,1))
# Columns are joined once at the end; the matrices stay None when nothing was stacked.
train_stack = np.hstack(train_cols) if train_cols else None
val_stack = np.hstack(val_cols) if val_cols else None

            
        

**Finding best C**

In [None]:
# Candidate regularisation strengths for the logistic-regression combiner.
Cs = [0.0001, 0.001, 0.01]
ensembled_models = {}
for c in Cs:
    ensembled_models["combiner_ln_"+str(c)] = LogisticRegression(class_weight = "balanced", C=c, solver='lbfgs', verbose=1)
# Cross-validate each combiner on the stacked base-model predictions.
en_preds = performKfCV(kf, ensembled_models, train_stack, y, val_stack)


In [None]:
# Record the best development-set threshold and F1 of every combiner.
for name,(cv_pred, de_pred) in en_preds.items():
    print(name)
    print("Cross validation F1 score")
    # Run for its printed/plotted output; the returned tuple is overwritten just below.
    thresholdNb = evaluatePredictions(y, cv_pred, silent=True)
    print("Development set F1 score")
    thresholdNb, scoreNb = evaluatePredictions(yde, de_pred, silent=True)
    thresholds[name] = thresholdNb
    scores[name] = scoreNb

**Using C determined from above to combine all predictions**

In [None]:
# Fit one combiner on all stacked training predictions, with C fixed to 0.0001.
classifier = LogisticRegression(class_weight = "balanced", C=0.0001, solver='lbfgs', verbose=1)
classifier.fit(train_stack, y)
# Positive-class probabilities. tr_preds are predictions on the very rows the
# combiner was fit on.
tr_preds = classifier.predict_proba(train_stack)[:,1]
val_preds = classifier.predict_proba(val_stack)[:,1]
evaluatePredictions(y, tr_preds, silent=True)
evaluatePredictions(yde, val_preds, silent=True)

# F1 score is < 0.6 for all baseline methods and their combinations

 **Let's see if XGBOOST, a sophisticated boosting algorithm,  will work better?**

X = train_df.question_text
y = train_df.target
cntVectBin = CountVectorizer(binary=False, stop_words=stop_words,
                             preprocessor=lambda x: " ".join(text_to_word_sequence(x)),
                             token_pattern="[a-zA-Z]{2,}",
                             min_df=5, max_df=0.99)
Xt = cntVectBin.fit_transform(X)
Xv = cntVectBin.transform(val_df.question_text)
Yt = y
Yv = val_df.target

Using native xgb library instead of Sklearn wrapper as it has more documentation and more features

# custom evaluation metric
def f1score(pred_probs, dtrain):
    labels = dtrain.get_label() # obtain true labels
    thresh = 0.8 # threshold for f1 score, previous observations
    f1 = f1_score(labels, (pred_probs>thresh).astype(int))
    return 'F1_Score', f1

xgbTrain = xgb.DMatrix(Xt, label=Yt)
xgbVal = xgb.DMatrix(Xv, label=Yv)
ratio = np.count_nonzero(Yt==0)/np.count_nonzero(Yt==1) # we will not have divide by zero error
print(ratio)
# specify training parameters
params = {
    'booster': 'gbtree',
    'objective':'binary:logistic', # this is classification
    'max_depth':5, # max depth of decision trees stubs (we have large data so 5 may be reasonable)
    'silent':1, # don't want any noise on console
    'eta':0.9, # aggressive learning rate, decided by seeing progress for data in verbose mode 
    'scale_pos_weight': ratio, # handling imbalanced data
    'lambda': 3, # L2 regularization, need tuning
    'alpha':1,
    'eval_metric': ['logloss'],
    # hyperparameters to be tuned. Gives some taste similar to bagging
    'max_delta_step': 1,
    'subsample':0.9,
    'colsample_bytree': 0.9,
    'colsample_bylevel': 0.9,
    'seed':300
}
num_rounds = 1000
watchlist  = [(xgbVal,'test'), (xgbTrain,'train')]

xgbModel1 = xgb.train(params, xgbTrain, num_rounds, watchlist, verbose_eval=10, feval=f1score, maximize=True)


xgb_predicted = xgbModel1.predict(xgbVal)
print(xgb_predicted)
evaluatePredictions(Yv, xgb_predicted)

importances = xgbModel1.get_fscore()
terms = np.array(list(cntVectBin.vocabulary_.keys()))
indices = np.array(list(cntVectBin.vocabulary_.values()))
inverse_vocabulary = terms[np.argsort(indices)]
real_word = []
for key in list(importances.keys()):
    nkey = key.replace("f","")
    real_word.append(inverse_vocabulary[int(nkey)])

importance_df = pd.DataFrame({
        'Splits': list(importances.values()),
        'Feature': real_word
    })
importance_df.sort_values(by='Splits', inplace=True, ascending=False)
# importance_df.head()
importance_df[:20].plot(kind='barh', x='Feature', figsize=(8,6), color='green')
plt.title("Top 20 words that are used in split")
plt.show()

featmap_df = pd.DataFrame({
    'index': np.arange(0,len(inverse_vocabulary)),
        'words': inverse_vocabulary
    })
featmap_df["type"] = "i"
np.savetxt("featmap.txt", featmap_df.values, fmt='%s')

xgb.to_graphviz(xgbModel1, num_trees=10, fmap="featmap.txt")

**LightGBM alternative to XGBoost?**

In [None]:
# NOTE: X and y are rebound here to pandas Series (raw text and labels of train_df).
X = train_df.question_text
y = train_df.target
# Binary bag-of-words stored as float32 (dtype=np.float32).
cntVectBin = CountVectorizer(binary=True, stop_words=stop_words,
                             preprocessor=lambda x: " ".join(text_to_word_sequence(x)),
                             token_pattern="[a-zA-Z]{2,}",
                             min_df=5, max_df=0.99, dtype=np.float32)
# Vocabulary is learned from train + test text.
cntVectBin.fit(all_text)
Xt = cntVectBin.transform(X)  # training features
Xv = cntVectBin.transform(val_df.question_text)  # validation features
Yt = y  # training labels
Yv = val_df.target  # validation labels

In [None]:
terms = np.array(list(cntVectBin.vocabulary_.keys()))
indices = np.array(list(cntVectBin.vocabulary_.values()))
inverse_vocabulary = terms[np.argsort(indices)]
len(inverse_vocabulary)

In [None]:
# custom evaluation metric
def f1score(pred_probs, dtrain):
    """Custom evaluation metric: F1 score at a fixed 0.23 probability cut-off.

    Returns the triple ('F1_Score', value, True); the value is 0 when there are
    no predictions to score.
    """
    truth = dtrain.get_label() # obtain true labels
    if len(pred_probs) <= 0:
        return 'F1_Score', 0, True
    cutoff = 0.23 # threshold for f1 score, previous observations
    hard_preds = (pred_probs > cutoff).astype(int)
    return 'F1_Score', f1_score(truth, hard_preds), True

In [None]:
# LightGBM datasets for the training and validation splits.
lgbTrain = lgb.Dataset(Xt, label=Yt, silent=1)
lgbVal = lgb.Dataset(Xv, label=Yv, silent=1)
# negatives / positives in the training labels; raises ZeroDivisionError if there
# were no positive label at all.
ratio = np.count_nonzero(Yt==0)/np.count_nonzero(Yt==1) # we will not have divide by zero error
print(ratio)
params = {
    'objective':'xentropy', 
#     'objective':'binary', # this is classification
#     'boosting':'random_forest', #type of boosting (default: gbdt)
    'num_leaves': 16, # hyperparameter to tune
    'num_threads':4,
    'min_data_in_leaf': 25, # support for decision (generalization), (to be tuned)
    'max_depth':10, # max depth of each tree, (to be tuned)
    'eta':0.9, # aggressive learning rate, decided by seeing progress for data in verbose mode 
    'scale_pos_weight': ratio, # handling imbalanced data
    'lambda': 3, # L2 regularization, need tuning
    'metric': ['auc','xentropy'], # metrics reported on the validation sets
    'max_delta_step': 1, # hyperparameter to be tuned
    'subsample':0.9, # bagging fraction
    'bagging_freq':5, # how often to do bagging
    'colsample_bytree': 0.9, # NOTE(review): presumably the per-tree feature fraction - confirm against the LightGBM parameter docs
    'seed':300
}
# Number of boosting rounds.
num_rounds = 500

In [None]:
# Filled by lgb.train with the metric history of each validation set.
evals_result = {}
# Both the training and the validation data are evaluated, under the names "train"
# and "test"; features are named after the vocabulary terms.
lgbModel = lgb.train(params, lgbTrain, num_rounds,valid_sets=[lgbTrain, lgbVal],
                     valid_names=["train", "test"],
                     verbose_eval=10,evals_result=evals_result,
                     feature_name=list(inverse_vocabulary))


In [None]:
def showEvals(evals_result):
    """Plot the cross-entropy and AUC histories recorded during training.

    `evals_result` must hold the "xentropy" and "auc" series under both the
    "test" and the "train" key. The number of recorded rounds is printed for
    each, then one figure per metric is shown with the "test" series labelled
    "validation".
    """
    val_xent = evals_result["test"]["xentropy"]
    trn_xent = evals_result["train"]["xentropy"]
    val_auc = evals_result["test"]["auc"]
    trn_auc = evals_result["train"]["auc"]
    print(len(val_xent))
    print(len(trn_xent))

    curves = [
        ("Cross Entropy", val_xent, trn_xent),
        ("AUC", val_auc, trn_auc),
    ]
    for ylabel, val_series, trn_series in curves:
        plt.xlabel("Number of estimators")
        plt.ylabel(ylabel)
        # Rounds are numbered from 1.
        plt.plot(np.arange(1, len(val_series) + 1), val_series, label="validation")
        plt.plot(np.arange(1, len(trn_series) + 1), trn_series, label="train")
        plt.legend()
        plt.show()
showEvals(evals_result)

In [None]:
# Score the validation features with the trained booster, then sweep thresholds
# (silent is left at its default, so the F1 at every threshold is printed).
lgb_predicted = lgbModel.predict(Xv)
print(lgb_predicted)
evaluatePredictions(Yv, lgb_predicted)

In [None]:
lgb.plot_importance(lgbModel, max_num_features=20, title="Feature importance by split")
lgb.plot_importance(lgbModel, max_num_features=20, importance_type='gain', precision=1, title="Feature importance by gain")

In [None]:
lgb.create_tree_digraph(lgbModel, precision=1)

## Analysis
**LightGBM is very similar to XGBoost and performing similar (or better). The advantage with LightGBM is that it is faster. Using cross validation with LightGBM may be good idea. It seems that varying max_depth of tree (num_leaves) and min_data_in_leaf may give some interesting trees providing insights about data.**

Before that, let's summarize the observations.

1. The data is imbalanced, so the accuracy paradox applies, i.e. a higher accuracy does not necessarily mean a better classifier. Thus, we need to rely on measures like AUC and F1 score.
2. Naive Bayes with BOW models had F1 scores around 54% to 55%.
3. Logistic Regression Classifier (Linear Models) had F1 scores of about 59% for BOW with counts and 58% for TFIDF features.
4. Shallow decision trees (max depth 3 to 7) had F1 scores of about 32%.
5. XGBoost (using gradient boosted DTs) was slow to train and had many hyperparameters. The settings gave F1 score of about 59%.
6. Light GBM seems to be better alternative to XGBoost, given its similarity in performance with XGBoost but with faster speed. A specific settings of hyperparameters touched 60% F1 Score.

In [None]:
# X and y are pandas Series here (raw text and labels of train_df).
X = train_df.question_text
y = train_df.target
# Binary bag-of-words stored as float32 (dtype=np.float32).
cntVectBin = CountVectorizer(binary=True, stop_words=stop_words,
                             preprocessor=lambda x: " ".join(text_to_word_sequence(x)),
                             token_pattern="[a-zA-Z]{2,}",
                             min_df=5, max_df=0.99, dtype=np.float32)
# Vocabulary is learned from train + test text.
cntVectBin.fit(all_text)
Xt = cntVectBin.transform(X)  # training features
Xv = cntVectBin.transform(val_df.question_text)  # validation features
Yt = y  # training labels
Yv = val_df.target  # validation labels
Xtest = cntVectBin.transform(test_text)  # test features

In [None]:
terms = np.array(list(cntVectBin.vocabulary_.keys()))
indices = np.array(list(cntVectBin.vocabulary_.values()))
inverse_vocabulary = terms[np.argsort(indices)]
len(inverse_vocabulary)

In [None]:
# Class imbalance of the training labels: negatives per positive.
n_negative = np.count_nonzero(Yt==0)
n_positive = np.count_nonzero(Yt==1)
ratio = n_negative/n_positive
print(ratio)

# Settings shared by every LightGBM model; num_leaves and min_data_in_leaf are
# not set here and come from hyParams.
default_params = {
    'objective': 'xentropy',
    'num_threads': 4,
    'max_depth': 20,  # max depth of each tree (to be tuned)
    'eta': 0.9,  # aggressive learning rate
    'scale_pos_weight': ratio,  # handling imbalanced data
    'lambda': 3,  # L2 regularization, needs tuning
    'metric': ['auc', 'xentropy'],
    'max_delta_step': 1,  # to be tuned
    'subsample': 0.9,  # bagging fraction
    'bagging_freq': 2,  # how often to do bagging
    'colsample_bytree': 0.9,
    'seed': 333
}
num_rounds = 200

# num_leaves -> min_data_in_leaf: the fewer leaves a tree has, the more
# support (rows per leaf) is required.
hyParams = {
    8: 30,   # most general things
    16: 25,  # a bit more specific and so on
    32: 20,
    64: 10,
    100: 5
}
# One placeholder per configuration, keyed "lgb_<num_leaves>_<min_data_in_leaf>".
lgb_models = {
    "lgb_"+str(num_leaves)+"_"+str(min_data_in_leaf): None
    for num_leaves, min_data_in_leaf in hyParams.items()
}


In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=333)
evals_result = {}

def performKfCVLGB(kf, models, X, y, Xde):
    """K-fold cross validation for the LightGBM configurations in `models`.

    A model name has the form "lgb_<num_leaves>_<min_data_in_leaf>"; both numbers
    are parsed from the name and added to a copy of the global `default_params`.
    In every fold one booster per name is trained for the global `num_rounds`
    rounds, with features named after the global `inverse_vocabulary`.

    Side effects:
        * the trained boosters are appended to ``models[name]`` (a ``None``
          placeholder is replaced by a list first);
        * the global ``evals_result[name]`` is set to the metric history of the
          most recent fold.

    Args:
        kf: splitter whose ``split(X)`` yields (train_index, val_index) pairs.
        models: dict keyed by model name; values are None or a list of boosters.
        X: training features, indexable by an integer index array.
        y: training labels (pandas Series or array-like).
        Xde: development-set features.

    Returns:
        dict mapping each name to ``(cv_pred, de_pred)``: out-of-fold predictions
        for X and fold-averaged predictions for Xde.
    """
    y = np.asarray(y)
    folds = list(kf.split(X))
    # Equal weight per fold; for 5 folds this is the 0.2 that used to be hard-coded.
    fold_weight = 1.0 / len(folds) if folds else 0.0
    # Each configuration gets its OWN out-of-fold buffer. Sharing a single array
    # lets the configuration trained last overwrite everyone else's predictions.
    preds = {name: (np.zeros([X.shape[0],]), 0) for name in models.keys()}
    for train_index, val_index in tqdm(folds):
        Xtrain, Xval = X[train_index], X[val_index]
        Ytrain, Yval = y[train_index], y[val_index]
        lgbTrain = lgb.Dataset(Xtrain, label=Ytrain, silent=1)
        lgbVal = lgb.Dataset(Xval, label=Yval, silent=1)
        # Iterate over a snapshot of the names because models[name] is assigned below.
        for name in list(models.keys()):
            ev_res = {}
            print(name)
            parts = name.split("_", -1)
            par = default_params.copy()
            par["num_leaves"] = int(parts[1])
            par["min_data_in_leaf"] = int(parts[2])
            print(par)
            if models[name] is None:
                models[name] = []
            model = lgb.train(par, lgbTrain, num_rounds,valid_sets=[lgbTrain, lgbVal],
                     valid_names=["train", "test"],
                     verbose_eval=50, evals_result=ev_res,
                     feature_name=list(inverse_vocabulary))
            models[name].append(model)
            evals_result[name] = ev_res
            cv_pred, de_pred = preds[name]
            cv_pred[val_index] = model.predict(Xval)
            # Not in-place: de_pred starts as the int 0 and becomes an array here.
            de_pred = de_pred + fold_weight * model.predict(Xde)
            preds[name] = (cv_pred, de_pred)
    return preds

lgb_preds = performKfCVLGB(kf, lgb_models, Xt, Yt, Xv)

In [None]:
# Start fresh: from here on scores/thresholds only hold the LightGBM configurations.
scores={}
thresholds={}
for name,(cv_pred, de_pred) in lgb_preds.items():
    print(name)
    print("Cross validation F1 score")
    # Run for its printed/plotted output; the returned tuple is overwritten just below.
    thresholdNb = evaluatePredictions(y, cv_pred, silent=True)
    print("Development set F1 score")
    thresholdNb, scoreNb = evaluatePredictions(yde, de_pred, silent=True)
    thresholds[name] = thresholdNb
    scores[name] = scoreNb

In [None]:
# Save every fold's booster as "<config>_kf<fold>.lgb" and keep a tree digraph
# for each, together with the name of the configuration it belongs to.
graphs = []
names = []
for name, boosters in lgb_models.items():
    for fold_no, booster in enumerate(boosters, start=1):
        booster.save_model(name+"_kf"+str(fold_no)+".lgb", num_iteration=num_rounds)
        graphs.append(lgb.create_tree_digraph(booster, precision=1))
        names.append(name)

In [None]:
print(names[0])
graphs[0]

In [None]:
print(names[24])
graphs[24]

In [None]:
# Unweighted average over all LightGBM configurations.
com_cv_pred=0
com_de_pred=0
for name,(cv_pred, de_pred) in lgb_preds.items():
    com_cv_pred += 1/len(lgb_preds)*cv_pred
    com_de_pred += 1/len(lgb_preds)*de_pred
print("Cross validation F1 score")
# Run for its printed/plotted output; the returned tuple is overwritten just below.
thresholdNb = evaluatePredictions(y, com_cv_pred, silent=True)
print("Development set F1 score")
thresholdNb,scoreNb = evaluatePredictions(yde, com_de_pred, silent=True)
## Loading saved models

In [None]:
# Rebuild lgb_models from disk: every file in the working directory whose name
# starts with "lgb" is loaded as a booster, keyed by its file name.
lgb_models = {
    fname: lgb.Booster(model_file=fname)
    for fname in os.listdir(".")
    if fname.startswith("lgb")
}
print(len(lgb_models))

In [None]:
# Each loaded booster contributes one column of predictions for the training,
# validation and test features.
train_cols, val_cols, test_cols = [], [], []
for name, model in lgb_models.items():
    print(name)
    train_cols.append(model.predict(Xt).reshape(-1,1))
    val_cols.append(model.predict(Xv).reshape(-1,1))
    test_cols.append(model.predict(Xtest).reshape(-1,1))
# Columns are joined once at the end; the matrices stay None when no model was loaded.
train_stack = np.hstack(train_cols) if train_cols else None
val_stack = np.hstack(val_cols) if val_cols else None
test_stack = np.hstack(test_cols) if test_cols else None

In [None]:
# Logistic-regression combiner fit on the boosters' training-set predictions.
classifier = LogisticRegression(class_weight = "balanced", C=0.001, solver='lbfgs', verbose=1)
classifier.fit(train_stack, y)
# Positive-class probabilities. tr_preds are predictions on the very rows the
# combiner was fit on.
tr_preds = classifier.predict_proba(train_stack)[:,1]
val_preds = classifier.predict_proba(val_stack)[:,1]

In [None]:
test_preds = classifier.predict_proba(test_stack)[:,1]
print(test_preds)
test_text.tail()

In [None]:
# Threshold sweep on the training and on the development predictions.
evaluatePredictions(y, tr_preds, silent=True)
evaluatePredictions(yde, val_preds, silent=True)
# Accuracy at hand-picked cut-offs (0.86 for train, 0.7 for development).
# NOTE(review): presumably read off the sweeps above - confirm, they are not derived in code.
print(accuracy_score(y, tr_preds>=0.86))
print(accuracy_score(yde, val_preds>0.7))

In [None]:
# Hard 0/1 predictions at a 0.68 cut-off; "+ 0" turns the boolean mask into integers.
pred_test_y = (test_preds > 0.68) + 0
# Positions of the test rows predicted positive.
ones = np.where(pred_test_y > 0)[0]
print(pred_test_y.dtype)
submit_df = pd.DataFrame({"qid": test["qid"], "prediction": pred_test_y})
print(submit_df['prediction'].value_counts())
print(ones)
# Text of the questions predicted positive, and a slice of them for manual inspection.
onesPred = test_text.iloc[ones]
onesPred[330:380]

In [None]:
onesPred[4434]

In [None]:
submit_df.to_csv("submission.csv", index=False)

 ## Using given embeddings as features

Loading the embeddings file (glove) as embedding index.

In [None]:
# em_file = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
# def get_coefs(word, *arr):
#     return word, np.asarray(arr, dtype='float32')
# embedding_index = dict(get_coefs(*d.split(' ')) for d in open(em_file))


**Generating Word2Vector from data we have**

In [None]:
# def processWords(X):
#     return [text_to_word_sequence(row) for row in X]

# res = processWords(train_df['question_text'])
# print(res[:10])
# train_df.head()

In [None]:
# w2vModel = Word2Vec(res, size=300, window=5, min_count=5, workers=4)
# w2v = {w: vec for w, vec in zip(w2vModel.wv.index2word, w2vModel.wv.vectors)}
# # Word2Vec?
# del w2vModel
# gc.collect()

In [None]:
# h1 = embedding_index.get("hello")
# h2 = w2v.get("hello")
# print(h1)
# print(h2)
# np.corrcoef(h1,h2)[0,1]

## Linear model with embeddings
**Using a transformer to vectorize text with the embeddings, using the *mean* embeddings approach.**

In [None]:
#Using stop words filter along with keras text to word sequence along with embeddings
class MeanEmbeddingVectorizer(object):
    """Represent each text by the mean embedding of its distinct, non-stop-word tokens.

    Exposes ``fit`` / ``transform``; nothing is learned from the data, so
    ``fit`` is a no-op that returns ``self``.

    Parameters
    ----------
    embedding_index : mapping
        Word -> embedding vector lookup. Only ``.get(word)`` is used and it must
        return ``None`` for unknown words.
    stop_words : iterable of str, optional
        Words removed on the Keras tokenizer path. Defaults to NLTK's English
        stop-word list when ``None``.
    debug : bool, default False
        When true, print every token that has no entry in ``embedding_index``.
    useSpacy : bool, default False
        Tokenize and lemmatize with spaCy's ``en_core_web_sm`` model instead of
        Keras' ``text_to_word_sequence`` (noted in the original code as slow).
    dim : int, default 300
        Length of the embedding vectors; also the length of the zero vector
        returned for texts without any known token.
    """

    def __init__(self, embedding_index, stop_words=None, debug=False, useSpacy=False, dim=300):
        self.embedding_index = embedding_index
        # Fix: a caller-supplied stop-word list used to be dropped silently, leaving
        # self.stop_words undefined and making analyzer() raise AttributeError.
        if stop_words is None:
            stop_words = stopwords.words('english')
        self.stop_words = set(stop_words)
        self.dim = dim
        self.debug = debug
        self.useSpacy = useSpacy
        self.nlp = None
        if self.useSpacy:
            self.nlp = spacy.load('en_core_web_sm')

    def analyzer(self, X):
        """Return the set of distinct tokens of the text ``X`` that survive filtering."""
        if self.useSpacy:  # took long time
            # Fix: use the pipeline loaded in __init__; the bare name `nlp` is not
            # defined by this class and raised NameError.
            doc = self.nlp(X)
            filtered_sentence = [
                token.lemma_
                for token in doc
                if not token.is_stop
                and token.is_alpha
                and not token.dep_ == 'punct'
                and not token.lemma_ == '-PRON-'
            ]
        else:
            word_tokens = text_to_word_sequence(X)  # tokenize given text into words
            filtered_sentence = [w for w in word_tokens if w not in self.stop_words]  # filter stop words
        # A set: every distinct token contributes once to the mean.
        return set(filtered_sentence)

    def process(self, X):
        """Return the mean embedding of one text, or a zero vector if no token is known."""
        filtered_sentence = self.analyzer(X)
        vectors = np.zeros(self.dim)
        count = 0
        for word in filtered_sentence:
            w2v = self.embedding_index.get(word)
            if w2v is not None:
                vectors = vectors + w2v
                count += 1
            elif self.debug:
                print("Word not found on embeddings: ", word)
        if count > 0:
            return vectors / count
        # No token had an embedding: keep the all-zero vector (avoids dividing by zero).
        return vectors

    def fit(self, X, y=None):
        """No-op; present for the fit/transform interface. ``y`` is accepted and ignored."""
        return self

    def transform(self, X):
        """Return an array with one mean-embedding row per text in the iterable ``X``."""
        return np.array([self.process(words) for words in X])
    

        

In [None]:
# meVect = MeanEmbeddingVectorizer(embedding_index, debug=False)
# meVect.fit(train_df.question_text,train_df.target)
# Xm = meVect.transform(train_df.question_text)

In [None]:
# Ym = train_df.target
# valYm = val_df.target
# valXm = meVect.transform(val_df.question_text)

In [None]:
# logistic_me = LogisticRegression(random_state=333, class_weight="balanced", verbose=1, C=0.01)
# logistic_me.fit(Xm, Ym)

In [None]:
# prob_predicted_me = logistic_me.predict_proba(valXm)
# print(prob_predicted_me[:,1])

In [None]:
# evaluatePredictions(valYm, prob_predicted_me[:,1])

In [None]:
# trprob_predicted_me = logistic_me.predict_proba(Xm)
# print(trprob_predicted_me[:,1])
# evaluatePredictions(Ym, trprob_predicted_me[:,1])

In [None]:
# plotPrecisionRecall(trainYm, trprob_predicted_me[:,1])

### TFIDF with Embeddings

In [None]:
class TfidfEmbeddingVectorizer(object):
    """Represent each text by the IDF-weighted mean embedding of its distinct tokens.

    ``fit`` learns one IDF weight per token with scikit-learn's ``TfidfVectorizer``;
    ``transform`` then averages the embeddings of a text's known tokens, weighting
    each by its IDF.

    Parameters
    ----------
    embedding_index : mapping
        Word -> embedding vector lookup. Only ``.get(word)`` is used and it must
        return ``None`` for unknown words.
    stop_words : iterable of str, optional
        Words removed after tokenization. Defaults to NLTK's English stop-word
        list when ``None``.
    debug : bool, default False
        When true, print every token that has no entry in ``embedding_index``.
    dim : int, default 300
        Length of the embedding vectors; also the length of the zero vector
        returned for texts without any known token.
    """

    def __init__(self, embedding_index, stop_words=None, debug=False, dim=300):
        self.embedding_index = embedding_index
        # Fix: a caller-supplied stop-word list used to be dropped silently, leaving
        # self.stop_words undefined and making analyzer() raise AttributeError.
        if stop_words is None:
            stop_words = stopwords.words('english')
        self.stop_words = set(stop_words)
        self.dim = dim
        self.debug = debug
        self.word2idf = None  # populated by fit()

    def analyzer(self, X):
        """Return the distinct non-stop-word tokens of the text ``X`` as a list."""
        word_tokens = text_to_word_sequence(X)  # tokenize given text into words
        # De-duplicate in first-occurrence order rather than via set(): string set
        # order can change between interpreter runs, which would change the order
        # of the floating-point sums in process().
        unique_tokens = dict.fromkeys(word_tokens)
        return [w for w in unique_tokens if w not in self.stop_words]  # filter stop words

    def process(self, X):
        """Return the IDF-weighted mean embedding of one text, or zeros if no token is known."""
        filtered_sentence = self.analyzer(X)
        vectors = np.zeros(self.dim)
        total_idf = 0
        for word in filtered_sentence:
            w2v = self.embedding_index.get(word)
            if w2v is not None:
                # word2idf is a defaultdict: tokens unseen during fit get the maximum IDF.
                idf = self.word2idf[word]
                vectors = vectors + (idf * w2v)
                total_idf += idf
            elif self.debug:  # todo see what happens if we use word2vec here
                print("Word not found on embeddings: ", word)
        if total_idf > 0:
            return vectors / total_idf
        # No weighted token: keep the all-zero vector (avoids dividing by zero).
        return vectors

    def fit(self, X, y=None):
        """Learn per-token IDF weights from the texts in ``X``; returns ``self``.

        ``y`` is optional and only forwarded to ``TfidfVectorizer.fit``.
        """
        tfidfVect = TfidfVectorizer(analyzer=self.analyzer)
        tfidfVect.fit(X, y)
        max_idf = np.max(tfidfVect.idf_)  # for unseen words i.e. they are rare and should have high weight
        self.word2idf = defaultdict(
            lambda: max_idf,
            ((w, tfidfVect.idf_[i]) for w, i in tfidfVect.vocabulary_.items()))
        return self

    def transform(self, X):
        """Return an array with one weighted-mean-embedding row per text in the iterable ``X``."""
        return np.array([self.process(words) for words in X])
    

In [None]:
# test_array = [
#     "This is a sample sentence.",
#     "This is another demo sentence.",
#     "Well, how many more dummy sentence should I add?",
#     "This is a entirely uncorrelated sentence."
# ]
# target = [1,1,0,0]
# tfidfEmbVect = TfidfEmbeddingVectorizer(embedding_index, debug=False)
# tfidfEmbVect = tfidfEmbVect.fit(test_array, target)
# xdemo = tfidfEmbVect.transform(test_array)
# from sklearn.preprocessing import StandardScaler
# # scaler = StandardScaler()
# # xdemo = scaler.fit_transform(xdemo)
# print(xdemo)
# print(np.corrcoef(xdemo[0],xdemo[1])[0,1])
# print(np.corrcoef(xdemo[0],xdemo[2])[0,1])
# print(np.corrcoef(xdemo[0],xdemo[3])[0,1])
# print(np.corrcoef(xdemo[1],xdemo[2])[0,1])
# print(np.corrcoef(xdemo[1],xdemo[3])[0,1])
# print(np.corrcoef(xdemo[2],xdemo[3])[0,1])
# print(tfidfEmbVect.word2idf)

In [None]:
# tfidfEmbVect = TfidfEmbeddingVectorizer(embedding_index, debug=False)
# tfidfEmbVect = tfidfEmbVect.fit(train.question_text, train.target)

In [None]:
# X = tfidfEmbVect.transform(train.question_text)

In [None]:
# # del embedding_index # freeing memory once we have loaded 
# del meVect, embedding_index
# gc.collect()

In [None]:
# Y = train.target
# trainX, valX, trainY, valY = train_test_split(X, Y, test_size=0.1, random_state=333)

In [None]:
# logistic_te = LogisticRegression(random_state=333, class_weight="balanced", verbose=1, C=0.01)
# logistic_te.fit(trainX, trainY)

In [None]:
# prob_predicted_te = logistic_te.predict_proba(valX)
# evaluatePredictions(valY, prob_predicted_te[:,1])