In [1]:
import os
import re
import string
import time
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from contextlib import contextmanager
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score

In [2]:
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
print("train_original shape : ",train.shape)
print("test_original shape : ",test.shape)

train_original shape :  (1306122, 3)
test_original shape :  (375806, 2)


In [3]:
train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


## modeling

In [4]:
TOKENIZER = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(text):
    '''
    spacing punctuations
    '''
    # r' \1 ' - group 1 
    return TOKENIZER.sub(r' \1 ', text).split()

In [5]:
tfidf_vectorizer = TfidfVectorizer(
                    ngram_range=(1,4),
                    tokenizer=tokenize,
                    min_df=3,
                    max_df=0.9,
                    strip_accents='unicode',
                    use_idf=True,
                    smooth_idf=True,
                    sublinear_tf=True).fit(pd.concat([train['question_text'], test['question_text']]))


X = tfidf_vectorizer.transform(train['question_text'])
X_test = tfidf_vectorizer.transform(test['question_text'])
y = train.target.values

In [6]:
class NBTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, alpha=1):
        self.r = None
        self.alpha = alpha

    def fit(self, X, y):
        # store smoothed log count ratio
        p = self.alpha + X[y==1].sum(0)
        q = self.alpha + X[y==0].sum(0)
        self.r = csr_matrix(np.log(
            (p / (self.alpha + (y==1).sum())) /
            (q / (self.alpha + (y==0).sum()))
        ))
        return self

    def transform(self, X, y=None):
        return X.multiply(self.r)

In [7]:
def threshold_search(y_true, y_proba):
    best_threshold = 0
    best_score = 0
    for threshold in [i * 0.01 for i in range(100)]:
        score = f1_score(y_true=y_true, y_pred=y_proba > threshold)
        if score > best_score:
            best_threshold = threshold
            best_score = score
    search_result = {'threshold': best_threshold, 'f1': best_score}
    return search_result


In [8]:
nb_transformer = NBTransformer(alpha=1).fit(X, y)

X_nb = nb_transformer.transform(X)
X_test_nb = nb_transformer.transform(X_test)

In [9]:
models = []
train_meta = np.zeros(y.shape)
test_meta = np.zeros(X_test.shape[0])
splits = list(StratifiedKFold(n_splits=20, shuffle=True, random_state=42).split(train, y))

for idx, (train_idx, valid_idx) in enumerate(splits):
    #with timer("fitting on {}th split".format(idx)):
    X_train = X_nb[train_idx]
    y_train = y[train_idx]
    X_val = X_nb[valid_idx]
    y_val = y[valid_idx]
    model = LogisticRegression(solver='lbfgs', dual=False, class_weight='balanced', C=0.5, max_iter=40)
    model.fit(X_train, y_train)
    models.append(model)
    valid_pred = model.predict_proba(X_val)
    train_meta[valid_idx] = valid_pred[:,1]
    test_meta += model.predict_proba(X_test_nb)[:,1] / len(splits)

print(roc_auc_score(y, train_meta))
search_result = threshold_search(y, train_meta)
print(search_result)



0.9619835425126513
{'threshold': 0.7000000000000001, 'f1': 0.6558730909848857}


In [10]:
pred_test_y = (test_meta > search_result['threshold']).astype(int)
test_df = pd.read_csv("../input/test.csv", usecols=["qid"])
out_df = pd.DataFrame({"qid":test_df["qid"].values})
out_df['prediction'] = pred_test_y
out_df.to_csv("submission.csv", index=False)