## Packages and data

In [1]:
# !kaggle competitions download -c quora-question-pairs
# !pip install diff_match_patch

In [2]:
import fasttext.util
import pandas as pd
from sklearn.metrics import classification_report, log_loss
from sklearn.model_selection import train_test_split

def compress_read(f) -> pd.DataFrame:
    import os.path as path
    parquet_file = f.replace('.csv', '.parquet')
    if str(f).endswith('.csv') and path.exists(f):
        pd.read_csv(f).to_parquet(parquet_file, compression='brotli')
    return pd.read_parquet(parquet_file)

In [3]:
train_df = compress_read('./data/train.csv')
pred_df = compress_read('./data/test.csv')
sample_submission_df = compress_read('./data/sample_submission.csv')

## Data Cleaning

In [4]:
from collections import Counter

question_counter = Counter()
question_counter.update(train_df['question1'].values)
question_counter.update(train_df['question2'].values)
pd.DataFrame(question_counter.most_common(100), columns=['question', 'count'])

Unnamed: 0,question,count
0,What are the best ways to lose weight?,161
1,How can you look at someone's private Instagra...,120
2,How can I lose weight quickly?,111
3,What's the easiest way to make money online?,88
4,Can you see who views your Instagram?,79
...,...,...
95,How can I improve fluency in English?,42
96,How will scraping of 500 and 1000 rupees notes...,42
97,How is black money curbed with the ban of 1000...,42
98,How do I really make money online?,42


In [5]:
def cleaning_sentence_1(text):
    import re
    from string import punctuation

    if text is None:
        text = ' '

    text = text.lower()
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"who's", "who is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"when's", "when is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"there's", "there is", text)
    text = re.sub(r"[^A-Za-z0-9^,!./'+-=]", " ", text)
    # Except for the above special cases, "\'s" can only represent possessive case and should be replaced with " "
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r" m ", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"60k", " 60000 ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e-mail", "email", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"quikly", "quickly", text)
    text = re.sub(r" usa ", " america ", text)
    text = re.sub(r" u s ", " america ", text)
    text = re.sub(r" uk ", " england ", text)
    text = re.sub(r"imrovement", "improvement", text)
    text = re.sub(r"intially", "initially", text)
    text = re.sub(r" dms ", "direct messages ", text)
    text = re.sub(r"demonitization", "demonetization", text)
    text = re.sub(r"actived", "active", text)
    text = re.sub(r"kms", " kilometers ", text)
    text = re.sub(r" cs ", " computer science ", text)
    text = re.sub(r" ds ", " data science ", text)
    text = re.sub(r" ee ", " electronic engineering ", text)
    text = re.sub(r" upvotes ", " up votes ", text)
    text = re.sub(r" iphone ", " phone ", text)
    text = re.sub(r"\0rs ", " rs ", text)
    text = re.sub(r"calender", "calendar", text)
    text = re.sub(r"ios", "operating system", text)
    text = re.sub(r"programing", "programming", text)
    text = re.sub(r"bestfriend", "best friend", text)
    text = re.sub(r"III", "3", text)
    text = re.sub(r"the us", "america", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ", text)
    text = re.sub(r"/", " ", text)
    text = re.sub(r"\^", " ", text)
    text = re.sub(r"\+", " ", text)
    text = re.sub(r"-", " ", text)
    text = re.sub(r"=", " ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " ", text)
    text = re.sub(r"\0s", "0", text)
    text = "".join([c for c in text if c not in punctuation])
    return text

def cleaning_sentence_2(text):
    from nltk.corpus import stopwords
    from nltk import word_tokenize
    stops = set(stopwords.words("english"))
    text = word_tokenize(text)
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

In [10]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.appName("pandas-on-spark")\
    .config('spark.driver.memory', '8G')\
    .config('spark.driver.maxResultSize', '0')\
    .getOrCreate()

In [11]:
def clean_df_save(df, name):
    spark.createDataFrame(data=df)\
        .repartition(50)\
        .withColumn('question1_original', F.expr('question1'))\
        .withColumn('question2_original', F.expr('question2'))\
        .withColumn('question1_clean', F.udf(cleaning_sentence_1)(F.expr("question1")))\
        .withColumn('question2_clean', F.udf(cleaning_sentence_1)(F.expr("question2")))\
        .withColumn('question1_clean_stop', F.udf(cleaning_sentence_2)(F.expr("question1_clean")))\
        .withColumn('question2_clean_stop', F.udf(cleaning_sentence_2)(F.expr("question2_clean")))\
        .write.mode("overwrite").parquet(f"./fasttext_data/{name}")\

clean_df_save(train_df, 'train_df')
clean_df_save(pred_df, 'pred_df')

22/01/16 08:04:22 WARN TaskSetManager: Stage 9 contains a task of very large size (3348 KiB). The maximum recommended task size is 1000 KiB.
22/01/16 08:05:10 WARN TaskSetManager: Stage 12 contains a task of very large size (18920 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [12]:
train_df = spark.read.parquet('./fasttext_data/train_df').toPandas()
pred_df = spark.read.parquet('./fasttext_data/pred_df').toPandas()

                                                                                

## FastText training

In [25]:
def generate_text(a, b):
    question1 = str(a).replace('\n', ' ')
    question2 = str(b).replace('\n', ' ')
    if not question1 or not question2:
        return None
    split1 = {w.lower() for w in question1.split(' ')}
    split2 = {w.lower() for w in question2.split(' ')}
    text = set()
    text |= {t for t in split1 & split2}
    text |= {'$' + t for t in split1 ^ split2}
    return ' '.join(text)

train_dict, test_dict = train_test_split(train_df.sample(frac=1.0).to_dict(orient='records'), test_size=0.2)

In [26]:
# training
text_list = []
for i in train_dict:
    label = '__label__' + str(i['is_duplicate'])
    x = generate_text(i['question1'], i['question2'])
    if x: text_list.append({'label': label, 'text': x})
pd.DataFrame(text_list).to_csv('ft_train.txt', header=False, index=False, sep='\t')

model = fasttext.train_supervised(
    'ft_train.txt',
    label_prefix="__label__",
    lr=0.1,
    epoch=5,
    verbose=2,
    minCount=3,
)

Read 6M words
Number of words:  71235
Number of labels: 2
Progress: 100.0% words/sec/thread: 2502543 lr:  0.000000 avg.loss:  0.410507 ETA:   0h 0m 0s


In [27]:
# testing
def fast_text_pred(a, b):
    words = generate_text(a, b)
    pred = model.predict(words, k=1)
    pred_label = 0 if pred[0][0] == '__label__0' else 1
    pred_score = pred[1][0] if pred_label == 1 else 1 - pred[1][0]
    return pred_label, pred_score

y_true = []
y_pred = []
y_pred_score = []
for i in test_dict:
    label = int(i['is_duplicate'])
    pred_label, pred_score = fast_text_pred(i['question1'], i['question2'])
    y_true.append(label)
    y_pred.append(pred_label)
    y_pred_score.append(pred_score)

print(classification_report(y_true, y_pred, digits=4))
print('log_loss:', log_loss(y_true, y_pred_score))

              precision    recall  f1-score   support

           0     0.8413    0.8729    0.8568     51035
           1     0.7675    0.7181    0.7420     29823

    accuracy                         0.8158     80858
   macro avg     0.8044    0.7955    0.7994     80858
weighted avg     0.8140    0.8158    0.8144     80858

log_loss: 0.417073991886548


In [14]:
# submit directly using fasttext
# submission = []
# for i in pred_df.to_dict(orient='records'):
#     pred_label, pred_score = fast_text_pred(i['question1'], i['question2'])
#     submission.append({
#         'test_id': i['test_id'],
#         'is_duplicate': pred_score
#     })
# to_kaggle_submission(submission)