# Quora Question Pairs challenge

In [1]:
import nltk
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from nltk.stem import WordNetLemmatizer
import re
import json
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Functions for loading the data and preprocessing the questions by:
- Converting the text to lowercase
- Replacing symbols and special characters with their text form ($ = dollar)
- Expanding contractions (you're = you are)
- Lemmatization
- Removing leading and trailing whitespace

In [2]:
# Lazily-filled cache: the replacement table and the lemmatizer are built on
# the first call and reused afterwards, instead of re-reading the JSON file
# and re-creating the lemmatizer for every single question.
_PRE_PROCESS_CACHE = {}


def pre_process(txt):
    """Normalise one question string.

    Steps, in order:
      1. Convert the input to ``str`` and lowercase it.
      2. Apply every ``substring -> replacement`` pair found in
         ``data/replacechars.json`` (in the file's key order).
      3. Abbreviate trailing zero groups of numbers: nine zeros become
         ``b``, six become ``m`` and three become ``k``.
      4. Lemmatize every space-separated token with WordNet.
      5. Strip leading and trailing whitespace.

    Args:
        txt: The raw question. Any value is accepted; it is passed
            through ``str()`` first.

    Returns:
        The pre-processed question as a string.
    """
    def replace_all(t, dic):
        for i, j in dic.items():
            t = t.replace(i, j)
        return t

    if not _PRE_PROCESS_CACHE:
        with open('data/replacechars.json', 'r') as JSON:
            replacements = json.load(JSON)
        # Fill both entries in one step so a failure never leaves the
        # cache half-initialised.
        _PRE_PROCESS_CACHE.update(
            replacements=replacements,
            lemmatizer=WordNetLemmatizer(),
        )

    text = replace_all(str(txt).lower(), _PRE_PROCESS_CACHE['replacements'])

    # The negative lookahead makes sure the zeros really end the number, so
    # that e.g. "20001" is left alone instead of becoming "2k1".
    text = re.sub(r'([0-9]+)000000000(?![0-9])', r'\1b', text)
    text = re.sub(r'([0-9]+)000000(?![0-9])', r'\1m', text)
    text = re.sub(r'([0-9]+)000(?![0-9])', r'\1k', text)

    # The lemmatizer works on single words; feeding it the whole sentence
    # returns the sentence unchanged. Splitting and joining on a single
    # space keeps the original spacing intact.
    lemmatizer = _PRE_PROCESS_CACHE['lemmatizer']
    text = ' '.join(
        lemmatizer.lemmatize(token) if token else token
        for token in text.split(' ')
    )

    return text.strip()

In [3]:
def load_data(filename):
    """Read a question-pair CSV and pre-process both question columns.

    Args:
        filename: Path of the CSV file; its ``id`` column becomes the index.

    Returns:
        The loaded DataFrame with ``question1`` and ``question2`` replaced
        by their ``pre_process`` output.
    """
    frame = pd.read_csv(filename, index_col='id')
    for column in ('question1', 'question2'):
        frame[column] = frame[column].apply(pre_process)
    return frame

### (Partial) String Matching
Baseline method for detecting duplicate Quora questions

In [11]:
def string_matching(filename='data/Test set.csv'):
    """Baseline duplicate detection by exact or partial string matching.

    A pair is flagged as a duplicate when its two pre-processed questions
    are equal, or when one of them is contained in the other. The
    comparison is done row by row, so a question is only ever compared with
    its own partner and not with questions from other rows.

    Args:
        filename: CSV file to label. Defaults to the test set.

    Returns:
        The DataFrame produced by ``load_data`` with an additional integer
        ``is_duplicate`` column (1 = duplicate, 0 = not duplicate).
    """
    df = load_data(filename)

    def _is_match(q1, q2):
        # An empty string is a substring of every string, so empty
        # questions may only match through plain equality.
        if not q1 or not q2:
            return q1 == q2
        return q1 in q2 or q2 in q1

    flags = [
        _is_match(q1, q2)
        for q1, q2 in zip(df['question1'], df['question2'])
    ]
    df['is_duplicate'] = pd.Series(flags, index=df.index, dtype=bool).astype(int)
    return df

In [10]:
# string_matching().to_csv('data/string_match.csv')

Helper methods for extracting features from both the training set and the test set

In [5]:
def word_count(entry):
    """Compute word-overlap features for one question pair.

    Args:
        entry: Mapping (e.g. a DataFrame row) with string values under
            ``question1`` and ``question2``.

    Returns:
        A tuple ``(common, total, shared)``: the number of distinct
        space-separated words the questions have in common, the sum of
        both distinct-word counts, and ``common / total`` rounded to two
        decimals.
    """
    words_a, words_b = (
        set(entry[key].split(" ")) for key in ('question1', 'question2')
    )
    overlap = len(words_a.intersection(words_b))
    combined = len(words_a) + len(words_b)
    return overlap, combined, round(overlap / combined, 2)

In [6]:
def dataframe_features():
    """Build the feature matrices for the development and the test set.

    Both CSV files are loaded with ``load_data``, extended with the three
    ``word_count`` features and stripped of the id and text columns. For
    the development set, the first remaining column is used as the label
    and all following columns as features.

    Returns:
        A tuple ``(x_train, y_train, x_test, df_test)`` where the first
        three items are NumPy arrays and ``df_test`` is the test feature
        DataFrame that ``x_test`` was taken from.
    """
    feature_names = ['common', 'total', 'shared']
    raw_columns = ['qid1', 'qid2', 'question1', 'question2']

    def _featurize(path, extra_drop=()):
        # Load one file, append the overlap features, drop the raw columns.
        frame = load_data(path)
        frame[feature_names] = frame.apply(word_count, axis=1, result_type='expand')
        return frame.drop(columns=raw_columns + list(extra_drop))

    # training set
    train = _featurize('data/Development set.csv')
    labels = train.iloc[:, 0].values  # is_duplicated
    train_matrix = train.iloc[:, 1:].values

    # test set ('?' is dropped in addition to the raw columns)
    test = _featurize('data/Test set.csv', extra_drop=['?'])

    return train_matrix, labels, test.values, test

### Random Forest Classifier predictions

In [7]:
def rf_predictions():
    """Predict duplicates on the test set with a random forest.

    A ``RandomForestClassifier`` with default settings is fitted on the
    development-set features from ``dataframe_features``.

    Returns:
        The test feature DataFrame with the predictions stored in a new
        ``is_duplicate`` column.
    """
    train_x, train_y, test_x, result = dataframe_features()
    forest = RandomForestClassifier().fit(train_x, train_y)
    result['is_duplicate'] = forest.predict(test_x)
    return result

In [None]:
# rf_predictions().to_csv('data/random_forrest.csv')

### XGB Classifier predictions

In [8]:
def xgb_predictions():
    """Predict duplicates on the test set with gradient-boosted trees.

    An ``XGBClassifier`` with default settings is fitted on the
    development-set features from ``dataframe_features``.

    Returns:
        The test feature DataFrame with the predictions stored in a new
        ``is_duplicate`` column.
    """
    features, labels, test_features, result = dataframe_features()
    booster = XGBClassifier()
    booster.fit(features, labels)
    predicted = booster.predict(test_features)
    result['is_duplicate'] = predicted
    return result

In [None]:
# xgb_predictions().to_csv('data/xgb.csv')