In [58]:
# ONLY ON MY COMPUTER
import sys
import sys,os,os.path
python_path = ['', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python27.zip', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/plat-darwin', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/plat-mac', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/plat-mac/lib-scriptpackages', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/lib-tk', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/lib-old', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/lib-dynload', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages', '/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/xgboost-0.6-py2.7.egg', '/Users/Melancardie/Dropbox/Documents/My School/NYU/Spring 2017/DS-GA 1008/HW/hw3/ALI']

for p in python_path:
    if p not in sys.path:
        sys.path.append(p)


In [59]:
import pandas as pd
import numpy as np
import nltk
from sklearn import metrics
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import time
from sklearn.linear_model import LogisticRegression
import os
import spacy
nlp = spacy.load('en')


In [60]:
# DATA LOADING FUNCTIONS

# split dataset
def split_dataset(full_data, train_ratio, validation_ratio, test_ratio):
    """
    Function that splits the dataset into train, validation, and test
    """
    random_idx = np.random.permutation(len(full_data))
    train_threshold = int(round(train_ratio*len(full_data)))
    validation_threshold = int(round((train_ratio+validation_ratio)*len(full_data)))
    
    train_set = full_data.iloc[random_idx[:train_threshold]]
    validation_set = full_data.iloc[random_idx[train_threshold:validation_threshold]]
    test_set = full_data.iloc[random_idx[validation_threshold:]]
    
    return train_set, validation_set, test_set


# load dataset
def load_datasets(load_dir = "../data/kaggle_competition/", prefix="clean_kaggle_", post_fix=""):
    """
    Function that loads the dataset
    """
    train_set = pd.read_csv(os.path.join(load_dir, "{0}train{1}.csv".format(prefix,post_fix)), keep_default_na=False)
    validation_set = pd.read_csv(os.path.join(load_dir, "{0}validation{1}.csv".format(prefix,post_fix)), keep_default_na=False)
    test_set = pd.read_csv(os.path.join(load_dir, "{0}test{1}.csv".format(prefix,post_fix)), keep_default_na=False)
    return train_set, validation_set, test_set

def xy_split(df, label_col="is_duplicate"):
    """
    Function that splits a data frame into X and y
    """
    return df.drop(label_col, axis=1).as_matrix(), df[label_col]


In [61]:
# DATA CLEANING FUNCTIONS
def clean_str(input_str):
    """
    Helper function that converts string to ASCII
    """
    # trivial case
    if pd.isnull(input_str) or type(input_str)==np.float or type(input_str)==float:
        return ""
    # encoding
    input_str = input_str.decode('ascii', 'ignore').lower()
    return input_str

def clean_dataset(full_dataset):
    """
    Function that cleans the full dataset
    """
    full_dataset["clean_q1"] = full_dataset["question1"].apply(clean_str,1)
    full_dataset["clean_q2"] = full_dataset["question2"].apply(clean_str,1)
    col_need = ["clean_q1", "clean_q2"]
    if "is_duplicate" in full_dataset.columns:
        col_need += ["is_duplicate"]
    return full_dataset[col_need]

In [62]:
# FEATURE ENGINEERING FUNCTIONS
def word_overlap(row):
    """
    Function that calculates the percentage of word overlap
    """
    avg_length = float(len(row['token_1'])+len(row['token_2']))/2
    save_token_num = len(set(row['token_1']).intersection(set(row['token_2'])))
    return float(save_token_num)/avg_length

def sentence_similarity(row):
    """
    Function that returns the Spacy sentence similarity
    """
    return row["doc1"].similarity(row["doc2"])


def jaccard_sim(set1, set2):
    """
    Jaccard Similarity
    """
    if len(set1.union(set2)) == 0:
        return 0.0
    else:
        return float(len(set1.intersection(set2)))/len(set1.union(set2))

def jaccard_sim_unhashbale(set1, set2):
    """
    Jaccard Similarity
    """
    count = 0.0
    str_set2 = str(set2)
    for i in set1:
        if str(i) in str_set2:
            count += 1.0
    if (len(set1)+len(set2)-count) == 0:
        return 0.0
    else:
        return count/ (len(set1)+len(set2)-count)
    
    
def load_embedding(glove_file="/Users/Melancardie/Dropbox/Documents/My Research/NYU/Sundararajan/trust/lib/glove.6B/glove.6B.50d.txt",
                   line_to_load = 50000):
    """
    Function that populates a dictionary with word embedding vectors
    """
    ctr = 0
    word_emb = {}
    with open(glove_file, "r") as f:
        for i, line in enumerate(f):
            contents = line.split()
            word_emb[contents[0]]=np.asarray(contents[1:]).astype(float)
            ctr += 1
            if ctr >= line_to_load:
                break
    return word_emb
glove_emb = load_embedding()
 

    
def vectorize_tokens(token_list, word_emb, dim=50):
    """
    Function that vectorize phrases from a counter
    """
    ctr = 0.0
    vec = np.zeros(dim)
    for token in token_list:
        if token in word_emb:
            vec += word_emb[token].astype(float)
            ctr += 1
    if ctr == 0 :
        return vec
    else:
        return vec / float(ctr)
    
def emb_dist(row, embedding):
    """
    Function that calculates the euclidean distance among two embeddings
    """
    # embedding
    emb1 = vectorize_tokens(row["token_1"], embedding)
    emb2 = vectorize_tokens(row["token_2"], embedding)
    return np.linalg.norm(emb1-emb2)

def emb_diff(row, embedding, emb_mat):
    """
    Function that calculates the euclidean distance among two embeddings
    """
    # embedding
    emb1 = vectorize_tokens(row["token_1"], embedding)
    emb2 = vectorize_tokens(row["token_2"], embedding)
    emb_mat.append(np.abs(emb1-emb2))
     

def feature_engineering(df, embedding=glove_emb, normalize=False):
    """
    Feature engineering function
    """
    total_begin = time.time()
    
    # preprocessing #
    # tokenization
    df['token_1'] = df.apply(lambda x: nltk.word_tokenize(x["clean_q1"]), 1)
    df['token_2'] = df.apply(lambda x: nltk.word_tokenize(x["clean_q2"]), 1)
    # spacy rep
    df['doc1'] = df.apply(lambda x: nlp(unicode(x["clean_q1"], "utf-8")), 1)
    df['doc2'] = df.apply(lambda x: nlp(unicode(x["clean_q2"], "utf-8")), 1)
    # capitalized spacy rep
    df['cap_doc1'] = df.apply(lambda x: nlp(unicode(x["clean_q1"].upper(), "utf-8")), 1)
    df['cap_doc2'] = df.apply(lambda x: nlp(unicode(x["clean_q2"].upper(), "utf-8")), 1)
    # entity
    df['entity_set_1'] = df.apply(lambda x: x["cap_doc1"].ents, 1)
    df['entity_set_2'] = df.apply(lambda x: x["cap_doc2"].ents, 1)
    # name chunk
    df['noun_chunks_1'] = df.apply(lambda x: [chunk for chunk in x["cap_doc1"].noun_chunks], 1)
    df['noun_chunks_2'] = df.apply(lambda x: [chunk for chunk in x["cap_doc2"].noun_chunks], 1)

    preprocess_time = time.time()
    print("preprocessed  for {0} seconds".format(preprocess_time-total_begin))
    
    # length #
    df.loc[:,"len_1"] = df.apply(lambda x: len(x["token_1"]), 1)
    df.loc[:,"len_2"] = df.apply(lambda x: len(x["token_2"]), 1)
    df.loc[:,"len_diff"] = np.abs(df["len_1"]-df["len_2"])
    df.loc[:,"len_diff_percent"] = np.abs(df["len_1"]-df["len_2"]) /((df["len_1"]+df["len_2"])/2)
    after_length = time.time()
    print("length fueature loaded for {0} seconds".format(after_length-preprocess_time))
    
    # first words match #
    df.loc[:,"first_word_q1"] = df.apply(lambda x: x["clean_q1"].split(" ")[0], 1)
    df.loc[:,"first_word_q2"] = df.apply(lambda x: x["clean_q2"].split(" ")[0], 1)
    df.loc[:,"first_word_match"] = (df["first_word_q1"] == df["first_word_q2"])
    after_first = time.time()
    print("first word feature loaded for {0} seconds".format(after_first-after_length))
    
    # bag of words #
#     if tokenizer is None:
#         bag_of_word_tokenizer = CountVectorizer(stop_words="english", max_features=top_k_word)
#     else:
#         bag_of_word_tokenizer = tokenizer
#     q1_matrix = bag_of_word_tokenizer.fit_transform(df["clean_q1"]).astype(np.float)
#     q2_matrix = bag_of_word_tokenizer.fit_transform(df["clean_q2"]).astype(np.float)
#     df["vec_q1"] = [q1_matrix[i] for i in range(len(df))]
#     df["vec_q2"] = [q2_matrix[i] for i in range(len(df))]
#     print("question vectorized")

    
    # similarity measure #
    #cosine_sim = [cosine_similarity(q1_matrix[i], q2_matrix[i])[0][0] for i in range(len(df))]
    #df["cosine_sim"] = cosine_sim
    df.loc[:,"overlap_percent"] = df.apply(word_overlap, 1)
    # Spacy stentence similarity
    df.loc[:,"spacy_sentence_similarity"] = df.apply(sentence_similarity, 1)
    # edit distance
    df.loc[:,"edit_distance"] = df.apply(lambda x: nltk.edit_distance(x["token_1"], x["token_2"]), 1)
    # token Jaccard
    df.loc[:,"token_jaccard"] = df.apply(lambda x: jaccard_sim(set(x["token_1"]), set(x["token_2"])), 1)
    after_sim = time.time()
    print("similarity feature loaded for {0} seconds".format(after_sim-after_first))
    
    # embedding #
    # embedding diff -- UGLY
    dim_emb = embedding.values()[0].shape[0]
    emb_mat = []
    df.apply(lambda x: emb_diff(x, embedding, emb_mat), 1)
    emb_mat = np.array(emb_mat)
    for dim in range(dim_emb):
        df["emb_diff_dim_{0}".format(dim)] = emb_mat[:,dim]
    # euclidean distance - embedding
    df.loc[:,"emb_dist"] = df.apply(lambda x: emb_dist(x, embedding), 1)
    after_emb = time.time()
    print("embedding feature loaded for {0} seconds".format(after_emb-after_sim))
    
    # entity features #
    # entity same
    df.loc[:,"entity_same"] = df.apply(lambda x: x["entity_set_1"]==x["entity_set_2"], 1)
    # entity # same
    df.loc[:,"entity_len_same"] = df.apply(lambda x: len(x["entity_set_1"])==len(x["entity_set_2"]), 1)
    # entity # diff
    df.loc[:,"entity_len_diff"] = df.apply(lambda x: np.abs(len(x["entity_set_1"])-len(x["entity_set_2"])), 1)
    # entity Jaccard
    df.loc[:,"entity_jaccard"] = df.apply(lambda x: jaccard_sim_unhashbale(x["entity_set_1"], x["entity_set_2"]), 1)
    
    # noun chunk same
    df.loc[:, "chunk_same"] = df.apply(lambda x: x["noun_chunks_1"]==x["noun_chunks_2"], 1)
    # noun chunk # same
    df.loc[:,"chunk_len_same"] = df.apply(lambda x: len(x["noun_chunks_1"])==len(x["noun_chunks_2"]), 1)
    # noun chunk # diff
    df.loc[:,"chunk_len_diff"] = df.apply(lambda x: np.abs(len(x["noun_chunks_1"])-len(x["noun_chunks_2"])), 1)
    # noun chunk Jaccard
    df.loc[:,"chunk_jaccard"] = df.apply(lambda x: jaccard_sim_unhashbale(x["noun_chunks_1"], x["noun_chunks_2"]), 1)
    after_entity = time.time()
    print("entity feature loaded for {0} seconds".format(after_entity-after_emb))
    
    
    # filter columns
    ignore_columns = ["first_word_q1", "first_word_q2", "clean_q1", "clean_q2", "token_1", "token_2", 
                      "doc1", "doc2", "cap_doc1", "cap_doc2", "noun_chunks_1", "noun_chunks_2",
                     "entity_set_1", "entity_set_2"]
    col_normalize = ['len_1', 'len_2', 'len_diff', 'edit_distance', 'emb_dist', 'entity_len_diff', 'chunk_len_diff']
    #full_feature_df = df
    clean_feature_df = df.drop(ignore_columns, axis=1)
    if normalize:
        for col in clean_feature_df.columns:
            if str(col) in col_normalize:
                col_max = np.max(clean_feature_df[col])
                col_min = np.min(clean_feature_df[col])
                clean_feature_df[col] = (clean_feature_df[col]-col_min)/float(col_max-col_min)
    after_normalize = time.time()
    print("normalization time = {0}".format(time.time()-after_normalize))
    print("total time = {0}".format(time.time()-total_begin))
    return clean_feature_df
    


In [67]:
# DATA CREATION SCRIPTS
# Quora Dataset
#full_data = pd.read_csv("../data/questions.csv")
# Kaggle Dataset
# begin_time = time.time()
# kaggle_train = pd.read_csv("../data/kaggle_competition/origin/train.csv")
# kaggle_test = pd.read_csv("../data/kaggle_competition/origin/test.csv")
# print("data loaded, used {0} seconds".format(time.time()-begin_time))

# clean dataset
# begin_time = time.time()
# clean_train = clean_dataset(kaggle_train)
# clean_test = clean_dataset(kaggle_test)
#clean_train.to_csv("../data/kaggle_competition/clean_datasets/clean_train.csv", index=False)
#clean_test.to_csv("../data/kaggle_competition/clean_datasets/clean_test.csv", index=False)
#print("data cleaned, used {0} seconds".format(time.time()-begin_time))


# split and save dataset
#begin_time = time.time()
# since Kaggle has its own test set, test_ratio=0
# train_set, validation_set, _ = split_dataset(clean_train, 0.8, 0.2, 0)
# test_set = clean_test
# train_set.to_csv("../data/kaggle_competition/clean_kaggle_train.csv", index=False)
# validation_set.to_csv("../data/kaggle_competition/clean_kaggle_validation.csv", index=False)
# test_set.to_csv("../data/kaggle_competition/clean_kaggle_test.csv", index=False)
# load splitted dataset
#train_set, validation_set, test_set = load_datasets()
# print("data splitted, used {0} seconds".format(time.time()-begin_time))


# feature engineering
#begin_time = time.time()
#feature_train = feature_engineering(train_set)
#feature_train.to_csv("../data/kaggle_competition/feature_datasets/feature_train_v2.csv", index=False)
#feature_validation = feature_engineering(validation_set)
#feature_validation.to_csv("../data/kaggle_competition/feature_datasets/feature_validation_v2.csv", index=False)
#feature_test = feature_engineering(test_set)
#feature_test.to_csv("../data/kaggle_competition/feature_datasets/feature_test_v2.csv", index=False)
#print("data featurized, used {0} seconds".format(time.time()-begin_time))
#feature_train, feature_validation, feature_test= load_datasets(load_dir = "../data/kaggle_competition/feature_datasets", prefix="feature_", post_fix="_v2")
# load splitted dataset
# train_set, validation_set, test_set = load_datasets()

# split X, y
X_train, y_train = xy_split(feature_train)
X_validate, y_validate = xy_split(feature_validation)
#X_test=feature_test.as_matrix()

In [57]:
print(len(train_set))
print(len(validation_set))
print(len(test_set))
feature_train = feature_engineering(train_set.head(1000))

323432
80858
2345796


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


preprocessed  for 4.30770587921 seconds
length fueature loaded for 0.612040996552 seconds
first word feature loaded for 0.460093021393 seconds
similarity feature loaded for 1.04951405525 seconds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


embedding feature loaded for 3.29013299942 seconds
entity feature loaded for 1.28866410255 seconds
normalization time = 0.0
total time = 11.0092570782


In [73]:
feature_train.head()

Unnamed: 0,is_duplicate,len_1,len_2,len_diff,len_diff_percent,first_word_match,overlap_percent,spacy_sentence_similarity,edit_distance,token_jaccard,...,emb_diff_dim_49,emb_dist,entity_same,entity_len_same,entity_len_diff,entity_jaccard,chunk_same,chunk_len_same,chunk_len_diff,chunk_jaccard
0,1,14,17,3,0.193548,True,0.645161,0.972389,8,0.47619,...,0.083625,0.565029,True,True,0,0,False,False,2,0.142857
1,1,6,7,1,0.153846,True,0.769231,0.967301,2,0.625,...,0.016534,0.738428,True,True,0,0,False,False,2,0.0
2,1,8,8,0,0.0,False,0.875,0.987928,1,0.777778,...,0.151677,0.489465,False,True,0,0,False,False,1,0.5
3,0,9,10,1,0.105263,False,0.421053,0.942117,6,0.266667,...,0.345644,0.899277,False,True,0,0,False,False,2,0.0
4,0,8,29,21,1.135135,False,0.108108,0.76864,27,0.064516,...,0.072233,2.038899,False,False,3,0,False,False,5,0.0


In [21]:
print X_train.shape
print y_train.shape
print X_validate.shape
print y_validate.shape
print X_test.shape

(323432, 6)
(323432,)
(80858, 6)
(80858,)
(2345796, 6)


In [121]:
#mat1 = full_feature_df["vec_q1"].iloc[0]
#mat2 = full_feature_df["vec_q2"].iloc[0]

#cosine_similarity(mat1[5], mat2[5])
#a = time.time()
#cosine_similarity(full_feature_df["vec_q1"].iloc[10],full_feature_df["vec_q2"].iloc[10] )
#print time.time()-a
#len(full_feature_df)
#full_feature_df.to_csv("../data/full_feature_df.csv", index=False)
#np.sum(clean_feature_df["cosine_sim"]==0)/float(len(clean_feature_df))

0.65178520641719695

In [100]:
#mat1 = mat1.astype(np.float)
#mat2 = mat2.astype(np.float)
#cosine_similarity(mat1, mat2)

array([[ 0.]])

In [25]:
# PREDICTIVE MODEL FUNCTIONS

In [68]:
# MODEL ANALYTICS FUNCTIONS
def all_test_metrics(y_pred, y_test, metrics_list=["acc", "auc", "f1", "nll"]):
    score_dict = {}
    # acc
    if "acc" in metrics_list:
        y_pred_acc = np.round(y_pred).astype(np.int8)
        acc = metrics.accuracy_score(y_test, y_pred_acc, normalize=True)
        score_dict["acc"] = acc 
    # auc
    if "auc" in metrics_list:
        fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred, pos_label=1)
        auc = metrics.auc(fpr, tpr)
        score_dict["auc"] = auc
        #score_dict["fpr"] = fpr
        #score_dict["tpr"] = tpr
    # f1-measure
    if "f1" in metrics_list:
        y_pred_acc = np.round(y_pred).astype(np.int8)
        f1 = metrics.f1_score(y_test, y_pred_acc, labels=[0,1], pos_label=1)
        score_dict["f1"] = f1
    # nll
    if "nll" in metrics_list:
        nll = metrics.log_loss(y_test, y_pred)
        score_dict["nll"] = nll
    return score_dict


def test_model(model, X_test, y_test, verbose=True, model_name="", y_pred_test=None, pred_lambda=None):
    """
    Function that generate performance stats for a model
    """
    if y_pred_test is None:
        if pred_lambda is None:
            y_pred_test = model.predict(X_test)
        else:
            y_pred_test = pred_lambda(model, X_test)  
    scores = all_test_metrics(y_pred_test, y_test)    
    if verbose:
        print(model_name+":")
        print(scores)
    return y_pred_test, scores
    
    

In [70]:
# MODEL ANALYTICS SCRIPT
n_valid = len(validation_set)
n_test = len(test_set)

# baseline 1: majority class
y_pred_valid = [0 for i in range(n_valid)]
#y_pred_test = [0 for i in range(n_test)]
_, score_majority_class_valid = test_model(None, None, y_validate, 
                                        verbose=True, model_name="Baseline 1 - Majority Class (Validation):",
                                        y_pred_test=y_pred_valid)

# baseline 2: simple word overlap
y_pred_valid = X_validate[:,5].astype(np.double)
#y_pred_test = X_test[:,5].astype(np.double)
_, score_majority_class_valid = test_model(None, None, y_validate, 
                                        verbose=True, model_name="Baseline 2 - Simple Word Overlap (Validation):",
                                        y_pred_test=y_pred_valid)

# baseline 3: logistic regression
lr = LogisticRegression()
lr_lambda = lambda model, x: model.predict_proba(x)[:,1]
lr.fit(X_train, y_train)
lr_pred_valid, score_majority_class_valid = test_model(lr, X_validate, y_validate, verbose=True,
                                                       model_name="Baseline 3 - Simple Logistic Regression (Validation):",
                                                      pred_lambda=lr_lambda)


Baseline 1 - Majority Class (Validation)::
{'acc': 0.63063642434885847, 'f1': 0.0, 'auc': 0.5, 'nll': 12.757365947839453}
Baseline 2 - Simple Word Overlap (Validation)::
{'acc': 0.66991515990996564, 'f1': 0.61955127291387524, 'auc': 0.73065771989901296, 'nll': 0.66146289432171168}
Baseline 3 - Simple Logistic Regression (Validation)::
{'acc': 0.68349452125949195, 'f1': 0.55166252058442244, 'auc': 0.75714678542401015, 'nll': 0.54561463901481544}


In [None]:
#XG BOOST
import xgboost as xgb

# Set our parameters for xgboost
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.1
params['max_depth'] = 4

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_validate, label=y_validate)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 400, watchlist, early_stopping_rounds=50)

Will train until valid error hasn't decreased in 50 rounds.
[0]	train-logloss:0.667396	valid-logloss:0.668245
[1]	train-logloss:0.647427	valid-logloss:0.647754
[2]	train-logloss:0.630885	valid-logloss:0.630666
[3]	train-logloss:0.616394	valid-logloss:0.616429
[4]	train-logloss:0.604267	valid-logloss:0.604557
[5]	train-logloss:0.593869	valid-logloss:0.594216
[6]	train-logloss:0.585130	valid-logloss:0.584907
[7]	train-logloss:0.577106	valid-logloss:0.577148
[8]	train-logloss:0.570334	valid-logloss:0.570522
[9]	train-logloss:0.564192	valid-logloss:0.564602
[10]	train-logloss:0.558790	valid-logloss:0.559089
[11]	train-logloss:0.554301	valid-logloss:0.554600
[12]	train-logloss:0.550374	valid-logloss:0.550688
[13]	train-logloss:0.546638	valid-logloss:0.547129
[14]	train-logloss:0.543676	valid-logloss:0.544075
[15]	train-logloss:0.541156	valid-logloss:0.541324
[16]	train-logloss:0.538388	valid-logloss:0.538522
[17]	train-logloss:0.535970	valid-logloss:0.536354
[18]	train-logloss:0.534203	vali

In [None]:
# Add single entity feature 
def upper_str(input_str):
    input_str = input_str.upper()
    return input_str

def upper_dataset(full_dataset):
    """
    Function that cleans the full dataset
    """
    full_dataset["clean_q1"] = full_dataset["clean_q1"].apply(upper_str,1)
    full_dataset["clean_q2"] = full_dataset["clean_q2"].apply(upper_str,1)
    return full_dataset

X_train=upper_dataset(X_train)
X_valid=upper_dataset(X_validate)
X_test=upper_dataset(X_test)

def singleentity(row):
    """
    Function that calculates where there is any entity difference between the two questions
    """
    doc_1 = nlp(row["clean_q1"])
    doc_2 = nlp(row["clean_q2"])
    if len(doc_1.ents)!=len(doc_2.ents):
        return 1
    else:
        for ent in doc_2.ents:
            if ent not in doc_1.ents:
                return 1
    return 0

X_train["singleentity"]=X_train.apply(singleentity,1)
X_valid["singleentity"]=X_valid.apply(singleentity,1)
X_test["singleentity"]=X_test.apply(singleentity,1)



In [None]:
# Add edit distance feature
def distance(row):
    """
    Function that calculates the percentage of edit distance over the average length
    """
    token_1 = nltk.word_tokenize(row["clean_q1"])
    token_2 = nltk.word_tokenize(row["clean_q2"])
    avg_length = float(len(token_1)+len(token_2))/2
    return float(nltk.edit_distance(token_1,token_2))/avg_length

X_train["Edit_distance"]=X_train.apply(distance,1)
X_valid["Edit_distance"]=X_valid.apply(distance,1)
X_test["Edit_distance"]=X_test.apply(distance,1)


In [None]:
# Add % of length of the longest common sequence
#### May not use as it takes a long time to run ####
def lcs(xstr, ystr):
    if not xstr or not ystr:
        return 0
    x, xs, y, ys = xstr[0], xstr[1:], ystr[0], ystr[1:]
    if x == y:
        return 1 + lcs(xs, ys)
    else:
        return max(lcs(xstr, ys), lcs(xs, ystr))

def longestcommonsequence(row):
    token_1 = nltk.word_tokenize(row["clean_q1"])
    token_2 = nltk.word_tokenize(row["clean_q2"])
    avg_length = float(len(token_1)+len(token_2))/2
    return lcs(token_1,token_2)/avg_length  

X_train["LCS"]=X_train.apply(longestcommonsequence,1)
X_valid["LCS"]=X_valid.apply(longestcommonsequence,1)
X_test["LCS"]=X_test.apply(longestcommonsequence,1)

In [None]:
# Add similarity score 
def similarity(row):
    sent_1 = nlp(row["clean_q1"])
    sent_2 = nlp(row["clean_q2"])
    return sent_1.similarity(sent_2)  

X_train["Similarity"]=X_train.apply(similarity,1)
X_valid["Similarity"]=X_valid.apply(similarity,1)
X_test["Similarity"]=X_test.apply(similarity,1)

In [None]:
#XG BOOST
import xgboost as xgb

# Set our parameters for xgboost
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 4

d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_validate, label=y_validate)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 2000, watchlist, early_stopping_rounds=50)