In [22]:
# !kaggle competitions download -c quora-question-pairs
# !pip install diff_match_patch

In [22]:
import fasttext.util
import pandas as pd
from sklearn.metrics import classification_report, log_loss
from sklearn.model_selection import train_test_split

from utils.common import compress_read
from utils.shared_code import to_kaggle_submission

In [2]:
train_df = compress_read('./data/train.csv')
pred_df = compress_read('./data/test.csv')
sample_submission_df = compress_read('./data/sample_submission.csv')

In [17]:

def generate_text(a, b):
    question1 = str(a).replace('\n', ' ')
    question2 = str(b).replace('\n', ' ')
    if not question1 or not question2:
        return None
    split1 = {w.lower() for w in question1.split(' ')}
    split2 = {w.lower() for w in question2.split(' ')}
    text = set()
    text |= {t for t in split1 & split2}
    text |= {'$' + t for t in split1 ^ split2}
    return ' '.join(text)


def generate_texts_df(input_dicts):
    text_list = []
    for i in input_dicts:
        label = '__label__' + str(i['is_duplicate'])
        x = generate_text(i['question1'], i['question2'])
        if x: text_list.append({'label': label, 'text': x})
    return pd.DataFrame(text_list)


train_dict, test_dict = train_test_split(train_df.sample(frac=1.0).to_dict(orient='records'), test_size=0.2)
generate_texts_df(train_dict).to_csv('ft_train.txt', header=False, index=False, sep='\t')

In [18]:
model = fasttext.train_supervised(
    'ft_train.txt',
    label_prefix="__label__",
    lr=0.1,
    epoch=5,
    verbose=2,
    minCount=3,
)

In [19]:
y_true = []
y_pred = []
y_pred_score = []
for i in generate_texts_df(test_dict).to_dict(orient='records'):
    words = str(i['text'])
    label = 0 if str(i['label']) == '__label__0' else 1
    pred = model.predict(words, k=1)
    label_hat = 0 if pred[0][0] == '__label__0' else 1
    label_score = pred[1][0] if label_hat == 1 else 1 - pred[1][0]
    y_true.append(label)
    y_pred.append(label_hat)
    y_pred_score.append(label_score)

print(classification_report(y_true, y_pred, digits=4))
print(log_loss(y_true, y_pred_score))

              precision    recall  f1-score   support

           0     0.8405    0.8724    0.8562     50916
           1     0.7681    0.7185    0.7425     29942

    accuracy                         0.8154     80858
   macro avg     0.8043    0.7955    0.7993     80858
weighted avg     0.8137    0.8154    0.8141     80858

0.41794484548691613


In [20]:
submission = []
for i in pred_df.to_dict(orient='records'):
    words = generate_text(i['question1'], i['question2'])
    pred = model.predict(words, k=1)
    label_hat = 0 if pred[0][0] == '__label__0' else 1
    label_score = pred[1][0] if label_hat == 1 else 1 - pred[1][0]
    submission.append({
        'test_id': i['test_id'],
        'is_duplicate': label_score
    })

In [23]:
to_kaggle_submission(submission)