In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from pandas_profiling import ProfileReport

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import nltk
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import *

from wordcloud import WordCloud,STOPWORDS
import textblob as tb

import re
import string

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import RidgeClassifierCV, RidgeClassifier, LogisticRegression
from sklearn.model_selection import KFold

from sklearn.metrics import roc_auc_score, precision_recall_curve
from sklearn.metrics import f1_score


from tqdm import tqdm
import time
import os
import gc
import psutil

from contextlib import contextmanager
from collections import defaultdict

from scipy.sparse import hstack
from scipy.sparse import csr_matrix

import lightgbm as lgb
import xgboost as xgb

In [None]:
print(os.listdir("../input"))

In [None]:
path = '../input/'

### Data import

In [None]:
# Load train and test data
train = pd.read_csv(path+'train.csv', index_col='id')
test = pd.read_csv(path+'test.csv', index_col='id')

# Concatenate train and test in one dataframe
df = pd.concat([train.drop('label', axis=1), test], axis=0)

# Separate labels
label = train.label

submission = pd.read_csv(path+'sample_submission.csv')

In [None]:
# Print the dimensions of each split and of the combined frame.
print(
    'Train data shape:{}\nTest data shape:{}\nFull set shape{}'.format(
        train.shape, test.shape, df.shape
    )
)

### Exploration

In [None]:
# Check number of 0/1 labels in the dataset
label.value_counts()/label.shape[0]

In [None]:
# Take a glimpse at the data
df.head()

In [None]:
train[train.label==1].head()

In [None]:
train[train.label==0].head()

### WordCloud

In [None]:
# Word cloud built from every training tweet whose label is 1.
neg_tweets = train[train.label == 1]
# Concatenate all tweets into one space-separated string.
neg_string = neg_tweets.tweet.str.cat(sep=' ')

wordcloud = WordCloud(width=1600, height=800, max_font_size=200).generate(neg_string)
fig, ax = plt.subplots(figsize=(12, 10))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
plt.show()

In [None]:
# Word cloud built from every training tweet whose label is 0.
# Dedicated names are used so this cell no longer overwrites the label-1
# variables (`neg_tweets`, `neg_string`) with label-0 data.
label0_tweets = train[train.label == 0]
# Concatenate all tweets into one space-separated string.
label0_string = label0_tweets.tweet.str.cat(sep=' ')

wordcloud = WordCloud(width=1600, height=800, max_font_size=200).generate(label0_string)
plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

## Feature Engineering

In [None]:
def count_regexp_occ(regexp="", text=None):
    """Return the number of non-overlapping matches of ``regexp`` in ``text``.

    Parameters
    ----------
    regexp : str or compiled pattern
        Regular expression to search for.
    text : str or None
        Text to scan. ``None`` (the default) is treated as "no text" and
        yields 0 instead of raising a ``TypeError`` inside ``re.findall``.

    Returns
    -------
    int
        Length of the list returned by ``re.findall(regexp, text)``.
    """
    if text is None:
        return 0
    return len(re.findall(regexp, text))

In [None]:
# Get length in words and characters
df["raw_word_len"] = df["tweet"].apply(lambda x: len(x.split()))
df["raw_char_len"] = df["tweet"].apply(lambda x: len(x))
# Check number of upper case, if you're angry you may write in upper case
df["nb_upper"] = df["tweet"].apply(lambda x: count_regexp_occ(r"[A-Z]", x))
df["caps_vs_len"] = (df["nb_upper"]/df["raw_char_len"])*100
# Check for punctuation
df['num_punctuation'] = df['tweet'].apply(lambda comment: sum(comment.count(w) for w in '.,;:')) 
df["punc_vs_len"] = (df["num_punctuation"]/df["raw_char_len"])*100
# Check for http links
df["has_http"] = df["tweet"].apply(lambda x: count_regexp_occ(r"http[s]{0,1}://\S+", x))
df["has_http"] = df["has_http"] > 0
df["has_http"] = df["has_http"].astype(int)

In [None]:
df.columns

In [None]:
# Count/ratio columns that get the log1p transform (has_http is left out).
numcols = [
    'raw_word_len',
    'raw_char_len',
    'nb_upper',
    'caps_vs_len',
    'num_punctuation',
    'punc_vs_len',
]

In [None]:
# Replace each numeric column by log(1 + x).
# The columns are overwritten in place, so running this cell twice applies
# the transform twice.
df[numcols] = df[numcols].apply(np.log1p)

## Clean text

### Remove twitter handles (@user)

In [None]:
def remove_pattern(text, pattern):
    """Return ``text`` with every match of the regex ``pattern`` removed.

    The previous version collected the matches with ``re.findall`` and fed
    each matched string back into ``re.sub`` as if it were a regex. Any match
    containing a metacharacter (``+``, ``(``, ``.`` ...) was then
    misinterpreted. Substituting with ``pattern`` itself avoids this.

    Parameters
    ----------
    text : str
        Input text.
    pattern : str or compiled pattern
        Regular expression whose matches are deleted.

    Returns
    -------
    str
        ``text`` without the matched spans.
    """
    return re.sub(pattern, '', text)

In [None]:
# Delete every @handle (letters, digits, underscore) from the raw tweet.
df['tidy_tweet'] = df['tweet'].map(lambda tweet: remove_pattern(tweet, r'@[A-Za-z0-9_]+'))

### Replace english contractions

In [None]:
# (regex, replacement) pairs that expand English contractions.
# Raw strings are used so that `\w` and `\g<1>` are not parsed as (invalid)
# string escape sequences; the resulting pattern text is unchanged.
# Order matters: the specific forms come before the generic `(\w+)n't` rule.
cont_patterns = [
    (r"(W|w)on't", 'will not'),
    (r"(C|c)an't", 'can not'),
    (r"(I|i)'m", 'i am'),
    (r"(A|a)in't", 'is not'),
    (r"(\w+)'ll", r'\g<1> will'),
    (r"(\w+)n't", r'\g<1> not'),
    (r"(\w+)'ve", r'\g<1> have'),
    (r"(\w+)'s", r'\g<1> is'),
    (r"(\w+)'re", r'\g<1> are'),
    (r"(\w+)'d", r'\g<1> would'),
]
# Compile once; clean_text applies the pairs in list order.
patterns = [(re.compile(regex), repl) for (regex, repl) in cont_patterns]

In [None]:
def clean_text(text):
    """Lower-case ``text``, drop digits, normalise spaces, expand contractions.

    Steps, in order: lower-casing; every run of digits becomes a space; every
    run of whitespace collapses to one space; trailing whitespace is removed;
    then each (compiled regex, replacement) pair of the module-level
    ``patterns`` list is applied in sequence.

    The regex literals are raw strings so that ``\\d`` / ``\\s`` are not
    parsed as invalid string escape sequences.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    clean = text.lower()
    # Drop numbers
    clean = re.sub(r"\d+", " ", clean)
    # Remove extra spaces
    clean = re.sub(r'\s+', ' ', clean)
    # Remove ending space if any
    clean = re.sub(r'\s+$', '', clean)
    for (pattern, repl) in patterns:
        clean = pattern.sub(repl, clean)
    return clean

In [None]:
# Run clean_text over every handle-free tweet.
df["tidy_tweet"] = df["tidy_tweet"].map(clean_text)

In [None]:
# remove http links
# regex=True is explicit: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave every link in place.
df['tidy_tweet'] = df['tidy_tweet'].str.replace(r'https?://[^ ]+', "", regex=True)
# The dot is escaped so only a literal "www." prefix starts a match.
df['tidy_tweet'] = df['tidy_tweet'].str.replace(r'www\.[^ ]+', "", regex=True)

In [None]:
# remove hashtags
#df['tidy_tweet'] = df['tidy_tweet'].str.replace(r"#(\w+)", " ")

In [None]:
# remove special characters, numbers, punctuations
# Everything except ASCII letters and '#' becomes a space. regex=True is
# required: with the pandas >= 2.0 default (literal match) the character
# class would never match and nothing would be removed.
df['tidy_tweet'] = df['tidy_tweet'].str.replace(r"[^a-zA-Z#]", " ", regex=True)

In [None]:
# Length of the cleaned tweet, in words and in characters.
df["clean_word_len"] = df["tidy_tweet"].map(lambda tweet: len(tweet.split()))
df["clean_char_len"] = df["tidy_tweet"].map(len)
# Number of distinct characters used in the cleaned tweet
# (a tweet repeating one word uses few different letters).
df["clean_chars"] = df["tidy_tweet"].map(lambda tweet: len(set(tweet)))
# Distinct characters divided by 1 + length, with the length capped at 99.
df["clean_chars_ratio"] = df["clean_chars"] / (1 + df["clean_char_len"].clip(upper=99))

## TF-IDF

In [None]:
def char_analyzer(text):
    """Split ``text`` into the character trigrams of each of its words.

    Words are obtained with ``str.split()``; every word contributes all of its
    3-character windows, so "talk" and "talking" share "tal" and "alk".
    Words shorter than three characters contribute nothing.
    """
    trigrams = []
    for word in text.split():
        for start in range(len(word) - 2):
            trigrams.append(word[start:start + 3])
    return trigrams

In [None]:
# Get TF-IDF features
# df holds the train rows first and the test rows after, so split by position.
train_text = df['tidy_tweet'].iloc[:len(train)]
test_text = df['tidy_tweet'].iloc[len(train):]
all_text = pd.concat([train_text, test_text])

In [None]:
# Second TF-IDF, built on the character trigrams returned by char_analyzer.
# A plain char-level TF-IDF would run across word boundaries; char_analyzer
# only slices characters inside each word.
char_vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=char_analyzer,
    ngram_range=(1, 3),
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=50000,
)
# The vocabulary and IDF weights are learnt on train and test text together.
char_vectorizer.fit(all_text)
train_char_features, test_char_features = (
    char_vectorizer.transform(texts) for texts in (train_text, test_text)
)

In [None]:
# Split every cleaned tweet into a list of whitespace-separated tokens.
tokenized_tweet = all_text.str.split()
tokenized_tweet.head()

In [None]:
# Reduce every token to its Porter stem.
stemmer = PorterStemmer()
tokenized_tweet = tokenized_tweet.map(lambda tokens: list(map(stemmer.stem, tokens)))
tokenized_tweet.head()

In [None]:
# Re-assemble every stemmed token list into one space-separated string.
# The previous loop ran over range(1, len(...)) and indexed with [i], which
# on this `id`-indexed Series is a label lookup: rows whose id fell outside
# 1..len-1 were left as lists. Mapping over the values covers every row
# whatever the index. Strings are passed through so a re-run is harmless.
tokenized_tweet = tokenized_tweet.apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)

In [None]:
tokenized_tweet.head()

In [None]:
# Coerce every entry to str, then split back into train / test by position.
all_text = tokenized_tweet.map(str)
train_text = all_text.iloc[:len(train)]
test_text = all_text.iloc[len(train):]
all_text.head()

In [None]:
# Word-level TF-IDF (unigrams and bigrams) on the stemmed text.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 2),
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=20000,
)

# The vocabulary and IDF weights are learnt on train and test text together.
word_vectorizer.fit(all_text)
train_word_features, test_word_features = (
    word_vectorizer.transform(texts) for texts in (train_text, test_text)
)

In [None]:
all_text = all_text.apply(lambda x: str(x))

## Prepare numerical features

In [None]:
# Scaling numerical features with MinMaxScaler though tree boosters don't need that
# Every engineered column is used, i.e. all columns except the two text ones.
num_features = [f_ for f_ in df.columns if f_ not in ["tweet", "tidy_tweet"]]

skl = MinMaxScaler()
# Learn each column's min/max on the training rows only, then apply that same
# mapping to the test rows. Re-fitting on the test rows (the previous
# fit_transform) put the two splits on different scales, so a given raw value
# mapped to different features in train and test.
train_num_features = csr_matrix(skl.fit_transform(df.iloc[:train.shape[0], :][num_features]))
test_num_features = csr_matrix(skl.transform(df.iloc[train.shape[0]:, :][num_features]))

## Stack TF-IDF Matrices

In [None]:
# Training design matrix: char TF-IDF | word TF-IDF | scaled numeric features.
csr_trn = hstack((train_char_features, train_word_features, train_num_features)).tocsr()

In [None]:
# Test design matrix, with the same column layout as csr_trn.
csr_sub = hstack((test_char_features, test_word_features, test_num_features)).tocsr()

## Logistic Regression

In [None]:
scores = []
folds = KFold(n_splits=8, shuffle=True, random_state=42)
train_pred = np.zeros(train.shape[0])
prediction = np.zeros(test.shape[0])
train_target = train.label

In [None]:
# Keyword arguments passed to LogisticRegression.
all_parameters = dict(
    C=1,
    tol=0.1,
    solver='lbfgs',
    fit_intercept=True,
    penalty='l2',
    class_weight='balanced',
    verbose=0,
)

In [None]:
classifier = LogisticRegression(**all_parameters)

In [None]:
trn_idx = list(enumerate(folds.split(train, train_target)))[0][1][0]

In [None]:
train_target = train.label
# train_pred collects out-of-fold probabilities; prediction accumulates the
# test probabilities averaged over the folds.
train_pred = np.zeros(train.shape[0])
prediction = np.zeros(test.shape[0])
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train, train_target)):
    # Train LR on this fold's training rows
    classifier.fit(csr_trn[trn_idx], train_target.values[trn_idx])
    # Keep the second predict_proba column (presumably label 1 — labels are 0/1)
    val_proba = classifier.predict_proba(csr_trn[val_idx])[:, 1]
    train_pred[val_idx] = val_proba
    prediction += classifier.predict_proba(csr_sub)[:, 1] / folds.n_splits
    rscore = roc_auc_score(train_target.values[val_idx], val_proba)

    print("\t Fold %d : %.6f ROC AUC" % (n_fold + 1, rscore))

In [None]:
print("full score : %.6f" % roc_auc_score(train_target, train_pred))

In [None]:
train_pred_lr = train_pred.copy()

In [None]:
# Pick the decision threshold with the best F1 on the out-of-fold predictions.
precision, recall, tresh = precision_recall_curve(train_target, train_pred)
# One F1 value (harmonic mean of precision and recall) per threshold; zip
# stops at the shortest input, i.e. after len(tresh) entries.
f1 = [2 / ((1 / p) + (1 / r)) for p, r, _ in zip(precision, recall, tresh)]
th = tresh[np.argmax(f1)]

In [None]:
train_prediction = train_pred >= th
train_prediction = train_prediction.astype(int)

In [None]:
print("full score : %.6f" % f1_score(train_target, train_prediction))

In [None]:
prediction_lr = prediction.copy()

In [None]:
# Min-max rescale the averaged test probabilities to [0, 1], then binarise.
# NOTE(review): `th` was tuned on the un-rescaled out-of-fold predictions, so
# it is applied here on a different scale — confirm this is intended.
prediction = prediction - prediction.min()
prediction = prediction / prediction.max()
prediction = (prediction >= th).astype(int)

In [None]:
submission['label'] = prediction

In [None]:
submission.to_csv('lr_submission.csv', index=False)

## LightGBM

In [None]:
folds = KFold(n_splits=8, shuffle=True, random_state=42)

In [None]:
# LightGBM training parameters.
# NOTE(review): both "verbosity" and "verbose" are set; they look like
# aliases of the same option — confirm and keep only one.
params = dict(
    objective="binary",
    metric={'auc'},
    boosting_type="gbdt",
    verbosity=-1,
    num_threads=4,
    bagging_fraction=0.8,
    feature_fraction=0.8,
    learning_rate=0.05,
    num_leaves=64,
    verbose=-1,
    min_split_gain=.1,
    reg_alpha=.1,
)

In [None]:
lgb_round_dict = defaultdict(int)
trn_lgbset = lgb.Dataset(csr_trn, free_raw_data=False)

In [None]:
# train_pred collects out-of-fold predictions; prediction accumulates the
# fold-averaged test predictions.
train_pred = np.zeros(train.shape[0])
prediction = np.zeros(test.shape[0])
lgb_rounds = 5000
trn_lgbset.set_label(train_target.values)
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train, train_target)):
    fold_train = trn_lgbset.subset(trn_idx)
    fold_valid = trn_lgbset.subset(val_idx)
    watchlist = [fold_train, fold_valid]
    # Train lgb l1
    # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments
    # of older lgb.train signatures — confirm the installed LightGBM accepts them.
    model = lgb.train(
        params=params,
        train_set=fold_train,
        num_boost_round=lgb_rounds,
        valid_sets=watchlist,
        early_stopping_rounds=50,
        verbose_eval=0
    )
    # Predict with the best iteration found by early stopping.
    val_pred = model.predict(trn_lgbset.data[val_idx], num_iteration=model.best_iteration)
    train_pred[val_idx] = val_pred
    prediction += model.predict(csr_sub, num_iteration=model.best_iteration) / folds.n_splits
    rscore = roc_auc_score(train_target.values[val_idx], val_pred)

    print("\t Fold %d : %.6f ROC AUC in %3d rounds" % (n_fold + 1, rscore, model.best_iteration))

In [None]:
train_pred_lgb = train_pred.copy()

In [None]:
print("full score : %.6f" % roc_auc_score(train_target, train_pred))

In [None]:
# Pick the decision threshold with the best F1 on the out-of-fold predictions.
precision, recall, tresh = precision_recall_curve(train_target, train_pred)
# One F1 value (harmonic mean of precision and recall) per threshold; zip
# stops at the shortest input, i.e. after len(tresh) entries.
f1 = [2 / ((1 / p) + (1 / r)) for p, r, _ in zip(precision, recall, tresh)]
th = tresh[np.argmax(f1)]

In [None]:
train_prediction = train_pred >= th
train_prediction = train_prediction.astype(int)

In [None]:
print("full score : %.6f" % f1_score(train_target, train_prediction))

In [None]:
prediction_lgb = prediction.copy()

In [None]:
# Min-max rescale the averaged test predictions to [0, 1], then binarise.
# NOTE(review): `th` was tuned on the un-rescaled out-of-fold predictions, so
# it is applied here on a different scale — confirm this is intended.
prediction = prediction - prediction.min()
prediction = prediction / prediction.max()
prediction = (prediction >= th).astype(int)

In [None]:
submission['label'] = prediction

In [None]:
submission.to_csv('lgbm_stem_submission.csv', index=False)

## XGBoost

In [None]:
folds = KFold(n_splits=8, shuffle=True, random_state=42)

In [None]:
# XGBoost training parameters, also exposed as a list of (key, value) pairs.
param = {
    'objective': 'binary:logistic',
    'eta': 0.1,
    'max_depth': 6,
    'silent': 1,
    'eval_metric': 'auc',
    'min_child_weight': 1,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'seed': 23,
}

plst = list(param.items())

In [None]:
xgtrain = xgb.DMatrix(csr_trn, label=train.label.values)
xgtest = xgb.DMatrix(csr_sub)

In [None]:
# train_pred collects out-of-fold predictions; prediction accumulates the
# fold-averaged test predictions.
train_pred = np.zeros(train.shape[0])
prediction = np.zeros(test.shape[0])
num_rounds = 500
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(train, train_target)):
    # Slice each fold's DMatrix once and reuse it for training, evaluation and
    # prediction (previously the same slices were rebuilt several times per fold).
    dtrain_fold = xgtrain.slice(trn_idx)
    dvalid_fold = xgtrain.slice(val_idx)
    watchlist = [(dtrain_fold, 'train'), (dvalid_fold, 'test')]
    # Train xgb
    # NOTE(review): ntree_limit / best_ntree_limit belong to older XGBoost
    # APIs — confirm the installed version still provides them.
    model = xgb.train(
        params=plst,
        dtrain=dtrain_fold,
        # Use the constant defined above instead of a second hard-coded 500.
        num_boost_round=num_rounds,
        evals=watchlist,
        early_stopping_rounds=20,
        verbose_eval=50
    )

    train_pred[val_idx] = model.predict(dvalid_fold, ntree_limit=model.best_ntree_limit)
    prediction += model.predict(xgtest, ntree_limit=model.best_ntree_limit) / folds.n_splits
    rscore = roc_auc_score(train_target.values[val_idx], train_pred[val_idx])

    print("\t Fold %d : %.6f ROC AUC in %3d rounds" % (n_fold + 1, rscore, model.best_iteration))

In [None]:
train_pred_xgb = train_pred.copy()

In [None]:
print("full score : %.6f" % roc_auc_score(train_target, train_pred))

In [None]:
# Pick the decision threshold with the best F1 on the out-of-fold predictions.
precision, recall, tresh = precision_recall_curve(train_target, train_pred)
# One F1 value (harmonic mean of precision and recall) per threshold; zip
# stops at the shortest input, i.e. after len(tresh) entries.
f1 = [2 / ((1 / p) + (1 / r)) for p, r, _ in zip(precision, recall, tresh)]
th = tresh[np.argmax(f1)]

In [None]:
train_prediction = train_pred >= th
train_prediction = train_prediction.astype(int)

In [None]:
print("full score : %.6f" % f1_score(train_target, train_prediction))

In [None]:
prediction_xgb = prediction.copy()

In [None]:
# Min-max rescale the averaged test predictions to [0, 1], then binarise.
# NOTE(review): `th` was tuned on the un-rescaled out-of-fold predictions, so
# it is applied here on a different scale — confirm this is intended.
prediction = prediction - prediction.min()
prediction = prediction / prediction.max()
prediction = (prediction >= th).astype(int)

In [None]:
submission['label'] = prediction

In [None]:
submission.to_csv('xgb_submission.csv', index=False)

## Cray Ensemble

In [None]:
train_pred = np.power((train_pred_lr * train_pred_xgb * train_pred_lgb), 1/3)

In [None]:
# Pick the decision threshold with the best F1 on the blended out-of-fold predictions.
precision, recall, tresh = precision_recall_curve(train_target, train_pred)
# One F1 value (harmonic mean of precision and recall) per threshold; zip
# stops at the shortest input, i.e. after len(tresh) entries.
f1 = [2 / ((1 / p) + (1 / r)) for p, r, _ in zip(precision, recall, tresh)]
th = tresh[np.argmax(f1)]

In [None]:
train_prediction = train_pred >= th
train_prediction = train_prediction.astype(int)

In [None]:
print("full score : %.6f" % f1_score(train_target, train_prediction))

In [None]:
prediction = np.power((prediction_lgb * prediction_xgb * prediction_lr), 1/3)

In [None]:
# Min-max rescale the blended test predictions to [0, 1], then binarise.
# NOTE(review): `th` was tuned on the un-rescaled out-of-fold blend, so it is
# applied here on a different scale — confirm this is intended.
prediction = prediction - prediction.min()
prediction = prediction / prediction.max()
prediction = (prediction >= th).astype(int)

In [None]:
submission['label'] = prediction

In [None]:
submission.to_csv('ensemble_submission.csv', index=False)