In [None]:
from __future__ import print_function, division
import logging
# Emit log records at INFO level and above, each prefixed with its timestamp
# and level name.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
%matplotlib inline
# Do not cap the number of columns shown when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Set the default matplotlib font size to 22.
plt.rcParams.update({'font.size': 22})
from tqdm import tqdm
from numpy import random
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn import pipeline, preprocessing

from sklearn.model_selection import train_test_split, cross_val_predict,cross_val_score, StratifiedShuffleSplit,GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, \
                            accuracy_score, f1_score, roc_auc_score, roc_curve, \
                             precision_recall_curve,log_loss, confusion_matrix
from sklearn.model_selection import learning_curve
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
#from nltk import tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, wordpunct_tokenize, WhitespaceTokenizer
from nltk.chunk import ne_chunk
from nltk.tag import pos_tag
from nltk.stem.lancaster import LancasterStemmer
from xgboost import XGBClassifier
from fuzzywuzzy import fuzz
from sklearn.metrics.pairwise import pairwise_distances
#from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
import gensim
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from sklearn.manifold import TSNE

In [None]:
# Read the tab-separated question-pair file and replace every missing value
# with an empty string, so later string handling never sees NaN.
df = pd.read_csv('../data/quora_duplicate_questions.tsv', sep='\t').fillna('')

In [None]:
# Inputs are the two question texts; the target is the duplicate flag.
X = df[['question1', 'question2']]
y = df['is_duplicate']

In [None]:
# Hold out 30% of the pairs for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=89,
)

In [None]:
# Penn Treebank tags treated as nouns / verbs by get_token_features.
_NOUN_TAGS = frozenset(['NN', 'NNP', 'NNS', 'NNPS'])
_VERB_TAGS = frozenset(['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])


def _as_text(value):
    """Return ``value`` as text, decoding UTF-8 only when it is a byte string."""
    # On Python 2 ``bytes`` is ``str``, so plain strings are still decoded as
    # before. Text that is already unicode (always the case on Python 3) is
    # passed through instead of raising on ``.decode``.
    return value.decode('utf-8') if isinstance(value, bytes) else value


def _verb_stemmer():
    """Return the notebook-level ``stemmer`` if defined, else a cached LancasterStemmer."""
    # No visible cell creates ``stemmer``; honour one if it exists so existing
    # sessions behave the same, otherwise build the only stemmer this notebook
    # imports so a fresh kernel can run the cell.
    existing = globals().get('stemmer')
    if existing is not None:
        return existing
    if not hasattr(_verb_stemmer, 'fallback'):
        _verb_stemmer.fallback = LancasterStemmer()
    return _verb_stemmer.fallback


def _overlap_ratio(shared, len1, len2, empty_value):
    """Return ``shared`` divided by the mean of the two lengths, or ``empty_value`` if that mean is 0."""
    mean_len = (len1 + len2) / 2.0
    return shared / mean_len if mean_len != 0 else empty_value


def get_token_features(text1, text2, n_gram):
    """Compute token-, noun- and verb-overlap features for a pair of texts.

    Both texts are tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``. Nouns are compared as they appear; verbs are compared after
    stemming.

    Parameters
    ----------
    text1, text2 : str or bytes
        The two texts. Byte strings are decoded as UTF-8; text strings are
        used unchanged.
    n_gram : any
        Not used by the computation; kept so existing callers keep working.

    Returns
    -------
    pd.Series
        ``word_count_avg``, ``word_count_diff`` (half the absolute difference
        in token counts), ``word_overlap_count``, ``word_overlap_ratio``,
        ``noun_overlap_count``, ``noun_overlap_ratio``,
        ``verb_overlap_count`` and ``verb_overlap_ratio``. Each ratio is the
        number of distinct shared items divided by the mean list length.
        When both lists are empty the word ratio is 0 while the noun and verb
        ratios are 1.
    """
    q1_token = word_tokenize(_as_text(text1))
    q2_token = word_tokenize(_as_text(text2))

    q1_tags = pos_tag(q1_token)
    q2_tags = pos_tag(q2_token)

    q1_noun = [word for word, pos in q1_tags if pos in _NOUN_TAGS]
    q2_noun = [word for word, pos in q2_tags if pos in _NOUN_TAGS]

    stem = _verb_stemmer().stem
    q1_verb = [stem(word) for word, pos in q1_tags if pos in _VERB_TAGS]
    q2_verb = [stem(word) for word, pos in q2_tags if pos in _VERB_TAGS]

    word_count_avg = (len(q1_token) + len(q2_token)) / 2.0
    word_count_diff = abs(len(q1_token) - len(q2_token)) / 2.0

    # Overlap counts are over distinct items (set intersection), not occurrences.
    word_overlap_count = len(set(q1_token).intersection(q2_token))
    noun_overlap_count = len(set(q1_noun).intersection(q2_noun))
    verb_overlap_count = len(set(q1_verb).intersection(q2_verb))

    word_overlap_ratio = _overlap_ratio(
        word_overlap_count, len(q1_token), len(q2_token), 0)
    noun_overlap_ratio = _overlap_ratio(
        noun_overlap_count, len(q1_noun), len(q2_noun), 1)
    verb_overlap_ratio = _overlap_ratio(
        verb_overlap_count, len(q1_verb), len(q2_verb), 1)

    return pd.Series({'word_count_avg': word_count_avg,
                      'word_count_diff': word_count_diff,
                      'word_overlap_count': word_overlap_count,
                      'word_overlap_ratio': word_overlap_ratio,
                      'noun_overlap_count': noun_overlap_count,
                      'noun_overlap_ratio': noun_overlap_ratio,
                      'verb_overlap_count': verb_overlap_count,
                      'verb_overlap_ratio': verb_overlap_ratio
                     })

# Token-level features for every question pair (n_gram is passed as 1).
token_feature = X.apply(
    lambda row: get_token_features(row.question1, row.question2, 1), axis=1)

# Character-length features of the two questions.
question1_length = X.question1.apply(len)
question2_length = X.question2.apply(len)
length_avg = pd.Series(
    np.mean([question1_length, question2_length], axis=0), name='length_avg')
length_diff = pd.Series(
    abs(question1_length - question2_length) / 2, name='length_diff')
# A 0/0 ratio (both questions empty) comes out as NaN and is mapped to 0.
length_ratio = pd.Series(length_diff / length_avg, name='length_ratio').fillna(0)

# 1 when both questions end in the same character (an empty question
# contributes an empty string), else 0.
q1_last_char = X.question1.apply(lambda q: q[-1:])
q2_last_char = X.question2.apply(lambda q: q[-1:])
same_last_punct = pd.Series(
    (q1_last_char == q2_last_char).astype(int), name='same_last_punct')

df_raw = pd.concat(
    [length_avg, length_diff, length_ratio, same_last_punct, token_feature],
    axis=1,
)

df_raw.to_csv('../data/df_raw.csv', index=False)

In [None]:
def get_fuzz_features(text1, text2):
    """Score the similarity of two strings with four ``fuzz`` scorers.

    Parameters
    ----------
    text1, text2 : str
        The strings to compare.

    Returns
    -------
    pd.Series
        The results of ``fuzz.ratio``, ``fuzz.partial_ratio``,
        ``fuzz.token_sort_ratio`` and ``fuzz.token_set_ratio``, keyed by
        ``fuzz_ratio``, ``fuzz_partial_ratio``, ``fuzz_token_sort_ratio`` and
        ``fuzz_token_set_ratio``.
    """
    scorers = (
        ('fuzz_ratio', fuzz.ratio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
        ('fuzz_token_set_ratio', fuzz.token_set_ratio),
    )
    scores = {}
    for label, scorer in scorers:
        scores[label] = scorer(text1, text2)
    return pd.Series(scores)
# Fuzzy string-similarity scores for every question pair, saved as CSV.
df_fuzz = X.apply(
    lambda row: get_fuzz_features(row.question1, row.question2),
    axis=1,
)
df_fuzz.to_csv('../data/df_fuzz.csv', index=False)

In [None]:
# Both vectorizers are fitted on the same pooled text: every question1
# followed by every question2, so row i and row i + len(X) form one pair.
all_questions = np.concatenate([X.question1.values, X.question2.values])

# TF-IDF over the full vocabulary.
tfidf = TfidfVectorizer()
tf_matrix = tfidf.fit_transform(all_questions)

# TF-IDF with the built-in English stop-word list removed.
tfidf_stop = TfidfVectorizer(stop_words='english')
tf_stop_matrix = tfidf_stop.fit_transform(all_questions)

In [None]:
def get_reduced_matrix(matrix, prefix, pca_n=50, random_state=None):
    """Reduce a stacked feature matrix with TruncatedSVD and measure per-pair distances.

    ``matrix`` must hold the first item of every pair in its top half and the
    matching second item in its bottom half (row ``i`` pairs with row
    ``i + n_pairs``). The pair count is taken from ``matrix`` itself rather
    than from any notebook-level variable.

    Parameters
    ----------
    matrix : array-like or sparse matrix, shape (2 * n_pairs, n_features)
        Stacked feature rows for both halves of every pair.
    prefix : str
        Prepended (with an underscore) to each output column name.
    pca_n : int, default 50
        Number of components kept by ``TruncatedSVD``.
    random_state : optional, default None
        Forwarded to ``TruncatedSVD``; ``None`` keeps the previous behaviour.

    Returns
    -------
    pd.DataFrame
        One row per pair with columns ``<prefix>_cosine_dis``,
        ``<prefix>_manhattan_dis``, ``<prefix>_euclidean_dis`` and
        ``<prefix>_braycurtis_dis``.

    Raises
    ------
    ValueError
        If ``matrix`` has an odd number of rows and so cannot be split into
        two equal halves.
    """
    n_rows = matrix.shape[0]
    if n_rows % 2 != 0:
        raise ValueError(
            'matrix must stack two equally sized halves; got %d rows' % n_rows)
    n_pairs = n_rows // 2

    lsa = TruncatedSVD(pca_n, algorithm='arpack', random_state=random_state)
    reduced = lsa.fit_transform(matrix)
    first_half = reduced[:n_pairs, :]
    second_half = reduced[n_pairs:, :]

    metrics = ('cosine', 'manhattan', 'euclidean', 'braycurtis')
    new_column_name = [prefix + '_' + metric + '_dis' for metric in metrics]

    distances = []
    for i in tqdm(range(n_pairs)):
        # pairwise_distances expects 2-D input, so each row becomes (1, n_components).
        left = first_half[i, :].reshape(1, -1)
        right = second_half[i, :].reshape(1, -1)
        distances.append([pairwise_distances(left, right, metric=metric)[0][0]
                          for metric in metrics])

    # Reshape so that zero pairs still yields a (0, 4) frame with named columns.
    values = np.array(distances, dtype=float).reshape(-1, len(metrics))
    return pd.DataFrame(values, columns=new_column_name)
    

In [None]:
# 100-component reduction of the full-vocabulary TF-IDF matrix, then
# per-pair distances.
df_reduced_tf = get_reduced_matrix(
    matrix=tf_matrix,
    prefix='tf_reduced',
    pca_n=100,
)

In [None]:
# Same reduction, applied to the stop-word-filtered TF-IDF matrix.
df_reduced_stop_tf = get_reduced_matrix(
    matrix=tf_stop_matrix,
    prefix='tf_red_stop',
    pca_n=100,
)

In [None]:
# Save both reduced-distance tables without their index column.
for frame, path in ((df_reduced_tf, '../data/df_reduced_tf.csv'),
                    (df_reduced_stop_tf, '../data/df_reduced_stop_tf.csv')):
    frame.to_csv(path, index=False)

In [None]:
def get_tfidf_features(vecor_model, text1, text2, prefix):
    """Compute five distances between the vectorised forms of two texts.

    Each text is passed through ``vecor_model.transform`` once and the result
    is reused for every metric.

    Parameters
    ----------
    vecor_model : fitted vectorizer
        Object whose ``transform([text])`` returns a one-row sparse matrix.
    text1, text2 : str
        The texts to compare.
    prefix : str
        Prepended to every key of the returned Series.

    Returns
    -------
    pd.Series
        ``<prefix>_cosine_dis``, ``<prefix>_manhattan_dis``,
        ``<prefix>_euclidean_dis``, ``<prefix>_jaccard_dis`` and
        ``<prefix>_braycurtis_dis``.
    """
    vec1 = vecor_model.transform([text1])
    vec2 = vecor_model.transform([text2])

    # Cosine, manhattan and euclidean are computed on the sparse rows.
    sparse_distances = dict(
        (metric, pairwise_distances(vec1, vec2, metric=metric)[0][0])
        for metric in ('cosine', 'manhattan', 'euclidean'))

    # Jaccard and Bray-Curtis are given dense input. ``toarray`` yields a
    # plain ndarray, whereas ``todense`` yields ``np.matrix``.
    dense1 = vec1.toarray()
    dense2 = vec2.toarray()
    dense_distances = dict(
        (metric, pairwise_distances(dense1, dense2, metric=metric)[0][0])
        for metric in ('jaccard', 'braycurtis'))

    return pd.Series({prefix+'_cosine_dis': sparse_distances['cosine'],
                      prefix+'_manhattan_dis': sparse_distances['manhattan'],
                      prefix+'_euclidean_dis': sparse_distances['euclidean'],
                      prefix+'_jaccard_dis': dense_distances['jaccard'],
                      prefix+'_braycurtis_dis': dense_distances['braycurtis']
                     })

# Distances between the full-vocabulary TF-IDF vectors of each question pair.
df_tfidf = X.apply(
    lambda row: get_tfidf_features(tfidf, row.question1, row.question2, 'tfidf'),
    axis=1,
)

df_tfidf.to_csv('../data/df_tfidf.csv', index=False)

In [None]:
# Same distances, computed with the stop-word-filtered TF-IDF vectorizer.
df_tfidf_stop = X.apply(
    lambda row: get_tfidf_features(
        tfidf_stop, row.question1, row.question2, 'tfidf_stop'),
    axis=1,
)

df_tfidf_stop.to_csv('../data/df_tfidf_stop.csv', index=False)

In [None]:
# Load word vectors stored in the binary word2vec format.
# NOTE(review): the path is specific to one user's home directory; consider
# moving it to a configurable data-directory setting.
gs_model = gensim.models.KeyedVectors.load_word2vec_format(
    '~/Dropbox/DS/nlp_data/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

#'eury' in gs_model.vocab

def sentence_to_vec(sentence):
    """Embed a sentence as the mean of the vectors of its known tokens.

    Parameters
    ----------
    sentence : str or bytes
        Text to embed. Byte strings are decoded as UTF-8; text strings are
        used unchanged.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of ``gs_model[w]`` over every token ``w`` that
        ``gs_model`` contains, or an all-zero vector of length
        ``gs_model.vector_size`` when no token is known.
    """
    # Decode only real byte strings: text that is already unicode has no
    # usable ``.decode`` (Python 3) or would be re-encoded as ASCII first
    # (Python 2).
    text = sentence.decode('utf-8') if isinstance(sentence, bytes) else sentence

    # Membership is tested on the model itself rather than on its ``.vocab``
    # attribute, which newer gensim releases no longer provide.
    word_vectors = [gs_model[w] for w in word_tokenize(text) if w in gs_model]

    if not word_vectors:
        return np.zeros(gs_model.vector_size)
    return np.mean(np.array(word_vectors), axis=0)
    

def get_sentence2vec_features(text1, text2):
    """Compute five distances between the sentence vectors of two texts.

    Each text is embedded with ``sentence_to_vec`` once and the two vectors
    are reused for every metric.

    Parameters
    ----------
    text1, text2 : str
        The texts to compare.

    Returns
    -------
    pd.Series
        ``s2vec_cosine_dis``, ``s2vec_manhattan_dis``, ``s2vec_canberra_dis``,
        ``s2vec_euclidean_dis`` and ``s2vec_braycurtis_dis``.
    """
    vec1 = sentence_to_vec(text1)
    vec2 = sentence_to_vec(text2)

    return pd.Series({'s2vec_cosine_dis': cosine(vec1, vec2),
                      's2vec_manhattan_dis': cityblock(vec1, vec2),
                      's2vec_canberra_dis': canberra(vec1, vec2),
                      's2vec_euclidean_dis': euclidean(vec1, vec2),
                      's2vec_braycurtis_dis': braycurtis(vec1, vec2)
                     })

# Sentence-vector distances for every question pair.
df_s2vec = X.apply(
    lambda row: get_sentence2vec_features(row.question1, row.question2),
    axis=1,
)

# Any NaN distance is replaced by the largest value found in its own column.
df_s2vec = df_s2vec.fillna(df_s2vec.max())

df_s2vec.to_csv('../data/df_s2vec.csv', index=False)