# Information Retrieval - Quora Question Pairs challenge
by Kevin Nanhekhan (student-no. 4959094)

In [1]:
import nltk
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, ShuffleSplit
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from xgboost import XGBClassifier
from nltk.stem import WordNetLemmatizer
import re
import json
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Initial Solution
Baseline method of string matching and basic preprocessing (e.g. lower-casing, character replacement, lemmatization, stopword removal) for detecting duplicate Quora questions.

In [5]:
# Per-kernel cache so the replacement table and the lemmatizer are built once
# instead of once per row when pre_process is used with Series.apply.
_PRE_PROCESS_CACHE = {}


def pre_process(txt):
    """Normalise one question string for the string-matching baseline.

    Steps, in order:
      1. cast to ``str`` and lower-case;
      2. apply every ``old -> new`` substring replacement listed in
         ``data/replacechars.json``;
      3. abbreviate trailing zeros of numbers (``000`` -> ``k``,
         ``000000`` -> ``m``, ``000000000`` -> ``b``);
      4. lemmatize every word with the WordNet lemmatizer;
      5. strip leading/trailing whitespace.

    Args:
        txt: The raw question. Any value is accepted because it is passed
            through ``str`` first (e.g. a float NaN becomes ``"nan"``).

    Returns:
        The normalised question as a ``str``.

    Raises:
        OSError: If ``data/replacechars.json`` cannot be opened.
        json.JSONDecodeError: If that file is not valid JSON.
    """
    if not _PRE_PROCESS_CACHE:
        with open('data/replacechars.json', 'r') as handle:
            _PRE_PROCESS_CACHE['replacements'] = json.load(handle)
        _PRE_PROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    replacements = _PRE_PROCESS_CACHE['replacements']
    lemmatizer = _PRE_PROCESS_CACHE['lemmatizer']

    text = str(txt).lower()
    for old, new in replacements.items():
        text = text.replace(old, new)

    # Largest unit first. The negative lookahead stops a match in the middle
    # of a longer number, which previously turned e.g. "30005" into "3k5".
    text = re.sub(r'([0-9]+)000000000(?![0-9])', r'\1b', text)
    text = re.sub(r'([0-9]+)000000(?![0-9])', r'\1m', text)
    text = re.sub(r'([0-9]+)000(?![0-9])', r'\1k', text)

    # WordNetLemmatizer.lemmatize works on a single word; calling it on the
    # whole sentence left the text unchanged. Lemmatize each run of letters
    # in place so punctuation, digits and spacing are preserved.
    text = re.sub(r'[^\W\d_]+',
                  lambda match: lemmatizer.lemmatize(match.group()),
                  text)

    return text.strip()

In [6]:
def string_matching():
    """Label the test set with the string-matching baseline.

    Reads ``data/Test set.csv`` (indexed by ``id``), runs ``pre_process`` on
    both question columns and writes a 0/1 ``is_duplicate`` column.

    Returns:
        The DataFrame with pre-processed questions and ``is_duplicate``.
    """
    pairs = pd.read_csv('data/Test set.csv', index_col='id')
    for column in ('question1', 'question2'):
        pairs[column] = pairs[column].apply(pre_process)

    first, second = pairs['question1'], pairs['question2']
    same_text = first == second
    # NOTE(review): Series.isin tests membership in the *whole* other column,
    # not containment within the same row - confirm this is the intended rule.
    first_in_second_column = first.isin(second)
    second_in_first_column = second.isin(first)

    flagged = same_text | first_in_second_column | second_in_first_column
    pairs['is_duplicate'] = flagged.astype(int)
    return pairs

In [4]:
# string_matching().to_csv('data/string_match.csv')
# TODO: evaluate the baseline on labelled data. The bare `cross_val_score()`
# call that used to be here always raised TypeError (its required `estimator`
# and `X` arguments were missing) and aborted "Restart & Run All".

# Improvement: advanced solution

In [6]:
def word_count(entry):
    """Compute word-overlap features for one question pair.

    Args:
        entry: Mapping (e.g. a DataFrame row) with string values under
            ``'question1'`` and ``'question2'``.

    Returns:
        A tuple ``(common, total, shared)``: the number of distinct
        space-separated tokens present in both questions, the sum of the two
        distinct-token counts, and ``common / total`` rounded to 2 decimals.
    """
    first_tokens = set(entry['question1'].split(" "))
    second_tokens = set(entry['question2'].split(" "))

    n_common = len(first_tokens.intersection(second_tokens))
    # Tokens present in both questions are counted twice here.
    n_total = len(first_tokens) + len(second_tokens)

    return n_common, n_total, round(n_common / n_total, 2)

In [7]:
def dataframe_features():
    """Build the word-overlap feature arrays for the development and test sets.

    Each CSV is loaded with ``load_data``, extended with the three
    ``word_count`` columns and stripped of its id/text columns.

    Returns:
        ``(x_train, y_train, x_test, df_test)`` where ``y_train`` is the first
        remaining development-set column, ``x_train`` the columns after it,
        ``x_test`` the values of the reduced test frame and ``df_test`` that
        frame itself.
    """
    feature_names = ['common', 'total', 'shared']
    id_and_text = ['qid1', 'qid2', 'question1', 'question2']

    def featurised(path, unwanted):
        # Load one CSV, append the overlap features, drop the listed columns.
        frame = load_data(path)
        frame[feature_names] = frame.apply(word_count, axis=1, result_type='expand')
        return frame.drop(columns=unwanted)

    # Development (training) set.
    df_train = featurised('data/Development set.csv', id_and_text)
    # Presumably column 0 is `is_duplicate` after the drop - depends on what
    # load_data returns; TODO confirm.
    y_train = df_train.iloc[:, 0].values
    x_train = df_train.iloc[:, 1:].values

    # Test set: additionally drops its '?' column.
    df_test = featurised('data/Test set.csv', id_and_text + ['?'])
    x_test = df_test.values

    return x_train, y_train, x_test, df_test

### Random Forest Classifier predictions

In [15]:
# Only the development-set features/labels are needed for tuning.
train_x, train_y, _, _ = dataframe_features()

# One fixed 80/20 hold-out split, seeded so every candidate sees the same split.
shuffle = ShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
rf_param_grid = {
    'n_estimators':[25, 50, 100, 200, 500, 800],
    'min_samples_split':[5, 7, 10, 12, 15],
    'max_depth': [50, 75, 150, 200, None]
}

# HalvingGridSearchCV accepts a single metric only; the previous
# scoring=["accuracy", "f1"] list is rejected when fit() validates its
# parameters. Accuracy is the metric reported by the cell below.
# Seeds make the forest and the halving sub-sampling reproducible.
clf_search = HalvingGridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_param_grid,
    scoring="accuracy",
    factor=2,
    cv=shuffle,
    random_state=42,
    verbose=3,
)
clf_search.fit(train_x, train_y)

n_iterations: 6
n_required_iterations: 6
n_possible_iterations: 6
min_resources_: 12150
max_resources_: 388800
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 60
n_resources: 12150
Fitting 1 folds for each of 60 candidates, totalling 60 fits
[CV 1/1] END max_depth=50, min_samples_split=5, n_estimators=50;, score=(train=0.717, test=0.694) total time=   0.1s
[CV 1/1] END max_depth=50, min_samples_split=5, n_estimators=100;, score=(train=0.716, test=0.698) total time=   0.3s
[CV 1/1] END max_depth=50, min_samples_split=5, n_estimators=200;, score=(train=0.716, test=0.690) total time=   0.8s
[CV 1/1] END max_depth=50, min_samples_split=5, n_estimators=500;, score=(train=0.717, test=0.693) total time=   2.0s
[CV 1/1] END max_depth=50, min_samples_split=5, n_estimators=800;, score=(train=0.717, test=0.693) total time=   3.3s
[CV 1/1] END max_depth=50, min_samples_split=10, n_estimators=50;, score=(train=0.716, test=0.693) total time=   0.1s
[CV 1/1] END max_depth=50,

In [24]:
# Report the winning random-forest configuration and its hold-out score.
print(
    f"Best parameters found for RandomForestClassifier: {clf_search.best_params_}, "
    f"best accuracy score: {clf_search.best_score_:.5f}"
)

Best parameters found for RandomForestClassifier: {'max_depth': 150, 'min_samples_split': 10, 'n_estimators': 50}, best accuracy score: 0.70730


In [7]:
def rf_predictions(params=None):
    """Train a random forest on the development set and label the test set.

    Args:
        params: Optional dict of ``RandomForestClassifier`` keyword arguments,
            e.g. the ``best_params_`` of a finished search. ``None`` (the
            default) keeps the library defaults, so existing calls without
            arguments behave as before.

    Returns:
        The test DataFrame from ``dataframe_features`` with an added
        ``is_duplicate`` column holding the predictions.
    """
    x_train, y_train, x_test, df_test = dataframe_features()
    # Previously the tuned hyper-parameters could not be passed in at all.
    cf = RandomForestClassifier(**(params or {}))
    cf.fit(x_train, y_train)
    # x_test was materialised before this column is added, so the prediction
    # column never leaks into the features.
    df_test['is_duplicate'] = cf.predict(x_test)
    return df_test

In [None]:
# rf_predictions().to_csv('data/random_forrest.csv')

### Gradient Boosted Trees Classifier (XGB) predictions

In [25]:
# Only the development-set features/labels are needed for tuning.
train_x, train_y, _, _ = dataframe_features()

# One fixed 80/20 hold-out split, seeded so every candidate sees the same split.
shuffle = ShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
xgb_param_grid = {
    'n_estimators':[20, 50, 100, 200, 500, 800],
    'learning_rate': [0.01, 0.05, 0.1, 0.5],
    'min_child_weight':[4, 5, 6],
    'max_depth': [4, 5, 6, None]
}

# HalvingGridSearchCV accepts a single metric only; the previous
# scoring=["accuracy", "f1"] list is rejected when fit() validates its
# parameters. Accuracy is the metric reported by the cell below.
# random_state makes the halving sub-sampling reproducible.
clf_search = HalvingGridSearchCV(
    estimator=XGBClassifier(),
    param_grid=xgb_param_grid,
    scoring="accuracy",
    factor=2,
    cv=shuffle,
    random_state=42,
    verbose=3,
)
clf_search.fit(train_x, train_y)

n_iterations: 9
n_required_iterations: 9
n_possible_iterations: 9
min_resources_: 1518
max_resources_: 388800
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 288
n_resources: 1518
Fitting 1 folds for each of 288 candidates, totalling 288 fits
[CV 1/1] END learning_rate=0.01, max_depth=4, min_child_weight=4, n_estimators=20;, score=(train=0.727, test=0.726) total time=   0.0s
[CV 1/1] END learning_rate=0.01, max_depth=4, min_child_weight=4, n_estimators=50;, score=(train=0.727, test=0.726) total time=   0.0s
[CV 1/1] END learning_rate=0.01, max_depth=4, min_child_weight=4, n_estimators=100;, score=(train=0.727, test=0.723) total time=   0.0s
[CV 1/1] END learning_rate=0.01, max_depth=4, min_child_weight=4, n_estimators=200;, score=(train=0.733, test=0.726) total time=   0.0s
[CV 1/1] END learning_rate=0.01, max_depth=4, min_child_weight=4, n_estimators=500;, score=(train=0.738, test=0.733) total time=   0.2s
[CV 1/1] END learning_rate=0.01, max_depth=4, min_chil

In [26]:
# Report the winning XGBoost configuration; the message previously named
# RandomForestClassifier although clf_search wraps an XGBClassifier here.
print(f"Best parameters found for XGBClassifier: {clf_search.best_params_}, best accuracy score: {clf_search.best_score_}")

Best parameters found for RandomForestClassifier: {'learning_rate': 0.1, 'max_depth': 6, 'min_child_weight': 6, 'n_estimators': 20}, best accuracy score: 0.7049446095649824


In [8]:
def xgb_predictions(params=None):
    """Train an XGBoost classifier on the development set and label the test set.

    Args:
        params: Optional dict of ``XGBClassifier`` keyword arguments, e.g. the
            ``best_params_`` of a finished search. ``None`` (the default)
            keeps the library defaults, so existing calls without arguments
            behave as before.

    Returns:
        The test DataFrame from ``dataframe_features`` with an added
        ``is_duplicate`` column holding the predictions.
    """
    x_train, y_train, x_test, df_test = dataframe_features()
    # Previously the tuned hyper-parameters could not be passed in at all.
    xgb = XGBClassifier(**(params or {}))
    xgb.fit(x_train, y_train)
    # x_test was materialised before this column is added, so the prediction
    # column never leaks into the features.
    df_test['is_duplicate'] = xgb.predict(x_test)
    return df_test

In [None]:
# xgb_predictions().to_csv('data/xgb.csv')

### Neural Network


### Final solution: Ensemble XGB + Neural Network