In [None]:
# NOTE: the relative order of the original lines is kept on purpose — the
# star imports below can rebind names, so reordering could change which
# binding wins.
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords

import numpy as np
import pandas as pd
import scipy
import sklearn
from sklearn import *
import os
import re
from SimpleCountVectorizerAMC import *
from TFIDFVectorizer import *

from nltk.stem import WordNetLemmatizer, SnowballStemmer

import xgboost as xgb

In [None]:
def int_to_en(num):
    """Spell out a non-negative integer in English words.

    Numbers below one hundred are hyphenated ("twenty-one"), hundreds are
    joined with "and" ("one hundred and five"), and thousand-scale groups are
    separated by ", " ("one thousand, two hundred").  Values of a trillion or
    more are written as a recursively spelled count of trillions
    ("one thousand trillion"), so there is no upper bound.

    Args:
        num: Non-negative integer to convert.

    Returns:
        The English spelling of ``num`` as a string.

    Raises:
        AssertionError: If ``num`` is negative.
    """
    d = { 0 : 'zero', 1 : 'one', 2 : 'two', 3 : 'three', 4 : 'four', 5 : 'five',
          6 : 'six', 7 : 'seven', 8 : 'eight', 9 : 'nine', 10 : 'ten',
          11 : 'eleven', 12 : 'twelve', 13 : 'thirteen', 14 : 'fourteen',
          15 : 'fifteen', 16 : 'sixteen', 17 : 'seventeen', 18 : 'eighteen',
          19 : 'nineteen', 20 : 'twenty',
          30 : 'thirty', 40 : 'forty', 50 : 'fifty', 60 : 'sixty',
          70 : 'seventy', 80 : 'eighty', 90 : 'ninety' }
    # Largest scale first, so the first match is the leading group of num.
    scales = ((10 ** 12, 'trillion'),
              (10 ** 9, 'billion'),
              (10 ** 6, 'million'),
              (10 ** 3, 'thousand'))

    assert 0 <= num, 'num must be non-negative: %s' % str(num)

    if num < 20:
        return d[num]

    if num < 100:
        tens, units = num // 10 * 10, num % 10
        return d[tens] if units == 0 else d[tens] + '-' + d[units]

    for scale, name in scales:
        if num >= scale:
            head = int_to_en(num // scale) + ' ' + name
            rest = num % scale
            return head if rest == 0 else head + ', ' + int_to_en(rest)

    # Only 100 <= num < 1000 reaches this point.
    head = d[num // 100] + ' hundred'
    rest = num % 100
    return head if rest == 0 else head + ' and ' + int_to_en(rest)

# Count Vectorizer

In [None]:
# Load the Quora question-pair train and test splits from the local data folder.
train_df = pd.read_csv("./data/quora_train_data.csv")
test_df = pd.read_csv('./data/quora_test_data.csv')

# train_df, test_df = sklearn.model_selection.train_test_split(train_df, test_size=0.1, random_state=123)

In [None]:
# Sanity check: (rows, columns) of each split.
train_df.shape, test_df.shape

In [None]:
def cast_list_as_strings(mylist):
    """Return a new list holding ``str(item)`` for every item of ``mylist``.

    Raises:
        AssertionError: If ``mylist`` is not a ``list``.
    """
    assert isinstance(mylist, list), f"the input mylist should be a list it is {type(mylist)}"
    return list(map(str, mylist))

In [None]:
# Fitting corpus: every question1 followed by every question2 of the training split, coerced to str.
all_questions = cast_list_as_strings(list(train_df.loc[:, 'question1'])+list(train_df.loc[:, 'question2']))
# Sanity check: prints the set of element type names found in the corpus.
print(set(type(x).__name__ for x in all_questions))

In [None]:
def num_conv(s):
    try:
        return int_to_en(int(s)).replace(",","").replace(" ","_")
    except:
        return s


def my_doc_cleaner(doc,
                  pat=r"[^a-zA-Z0-9]"):
    """Replace every character matched by ``pat`` with a space and lower-case the text.

    With the default pattern only ASCII letters and digits are kept; every
    other character (punctuation and whitespace included) becomes one space.

    Args:
        doc: Raw document text.
        pat: Regular expression matching the characters to blank out.

    Returns:
        The cleaned, lower-cased document.
    """
    # The substituted text is what gets returned; previously the substitution
    # result was discarded and the raw document was lower-cased instead.
    return re.sub(pat, " ", doc).lower()


# Stop-word removal is currently disabled: the NLTK English list is commented
# out and an empty list is used instead.
# stpw = set(stopwords.words("english"))
stpw = []
# Interrogatives used as the default `duplicate_question_words` of my_tokenizer_func.
question_words = ['who','what','when','where','why','how']

# NLTK normalizers, created once at module level and used by my_tokenizer_func.
stemmer =  SnowballStemmer(language='english')
lemmatizer = WordNetLemmatizer()

def my_tokenizer_func(doc, 
                      ngrams=(1,3), 
                      numbers_to_words=True,
                      stop_words=stpw,
                      pat=r"(?u)\b\w\S*\w*\b",
                      duplicate_question_words=question_words,
                      lem=True,
                      stem=True,
                      add_num_tokens=True):
    """Split a document into unigram and n-gram features.

    Steps, in order: regex split, optional number spelling (``num_conv``),
    stop-word removal, duplication of question words, optional stemming,
    optional lemmatization, n-gram generation, optional token-count feature.

    Args:
        doc: Document text.
        ngrams: ``(min_n, max_n)`` range of n-gram sizes to emit.
        numbers_to_words: Spell out integer tokens when true.
        stop_words: Tokens to drop.
        pat: Regular expression whose matches are the raw tokens.
        duplicate_question_words: Tokens (compared lower-cased) that are
            appended a second time to the token list.
        lem: Lemmatize tokens when true.
        stem: Stem tokens when true (applied before lemmatization).
        add_num_tokens: Append one ``"<count>tokens"`` feature encoding the
            number of unigrams.  It accompanies the unigrams, so it is emitted
            whenever ``ngrams[0] == 1`` — including ``ngrams == (1, 1)``,
            where it used to be skipped by an early return.

    Returns:
        A list with the unigram strings (only if ``ngrams[0] == 1``), the
        optional count feature, and the n-grams for ``n >= 2`` as tuples of
        tokens.
    """
    # Split using a pattern
    lst = re.compile(pat).findall(doc)

    # Transform numbers into words
    if numbers_to_words:
        lst = [num_conv(tok) for tok in lst]

    # Drop stopwords
    lst = [tok for tok in lst if tok not in stop_words]

    # Duplicate key words (the right-hand side is built before lst is extended)
    if len(duplicate_question_words) > 0:
        lst += [tok for tok in lst if tok.lower() in duplicate_question_words]

    # Stemmer
    if stem:
        lst = [stemmer.stem(tok) for tok in lst]

    # Lemmatizer
    if lem:
        lst = [lemmatizer.lemmatize(tok) for tok in lst]

    # n-grams for n >= 2; unigrams are already in lst
    lstRet = []
    for n in range(max(2, ngrams[0]), ngrams[1] + 1):
        lstRet += zip(*[lst[i:] for i in range(n)])

    # Without unigrams only the n-grams are returned
    if ngrams[0] != 1:
        return lstRet

    # N-tokens: counted after the n-grams were built so it never enters one
    if add_num_tokens:
        lst.append(num_conv(str(len(lst))) + 'tokens')

    return lst + lstRet

In [None]:
# Count vectorizer wired to the custom cleaner and tokenizer, fitted on all training questions.
count_vect = SimpleCountVectorizerAMC(
    doc_cleaner_func=my_doc_cleaner,
    tokenizer_func=my_tokenizer_func
)
count_vect.fit(all_questions)

In [None]:
def get_features_from_df(df, vectorizer):
    """Vectorize both question columns of ``df`` and stack them side by side.

    Each output row holds the features of ``question1`` followed by the
    features of ``question2`` for the corresponding row of ``df``.

    Args:
        df: DataFrame with ``question1`` and ``question2`` columns.
        vectorizer: Fitted object exposing ``transform(list_of_str)``.

    Returns:
        The result of ``scipy.sparse.hstack`` over the two transformed columns.
    """
    texts_per_column = [
        cast_list_as_strings(list(df[column]))
        for column in ("question1", "question2")
    ]
    feature_blocks = tuple(vectorizer.transform(texts) for texts in texts_per_column)
    return scipy.sparse.hstack(feature_blocks)

In [None]:
# %time X_tr_q1q2 = get_features_from_df(train_df,count_vect)
# %time X_te_q1q2  = get_features_from_df(test_df, count_vect)

# X_tr_q1q2.shape, train_df.shape, test_df.shape, X_te_q1q2.shape
# ((323432, 6166022), (323432, 6), (80858, 6), (80858, 6166022))

In [None]:
# Target labels (the is_duplicate column) of each split, taken as arrays via .values.
y_train = train_df["is_duplicate"].values
y_test = test_df['is_duplicate'].values

## TFIDF

In [None]:
# The TF-IDF vectorizer reuses the fitted count vectorizer's vocabulary, index mapping and tokenizer.
tfidf_vectorizer = TFIDFVectorizer(count_vect.vocabulary, count_vect.word_to_ind, count_vect.tokenize)
tfidf_vectorizer.fit(all_questions)

# Feature matrices: question1 features stacked next to question2 features, one row per pair.
X_tfidf_tr_q1q2 = get_features_from_df(train_df, tfidf_vectorizer)
X_tfidf_te_q1q2  = get_features_from_df(test_df, tfidf_vectorizer)

# Sanity check: feature and frame shapes (the comment below records a previous run).
X_tfidf_tr_q1q2.shape, train_df.shape, test_df.shape, X_tfidf_te_q1q2.shape
# ((323432, 6595608), (323432, 6), (80858, 6), (80858, 6595608))

## XGBoost

In [None]:
import xgboost as xgb

# Upper bound on boosting rounds; early stopping is expected to end training before it is reached.
N = 10000 # With early stopping
xgb_model = xgb.XGBClassifier(n_estimators=N)
# Both splits are evaluated each round so their curves can be plotted afterwards.
# NOTE(review): the test split is the last eval set, and per the XGBoost docs
# early stopping monitors the last eval set / last metric (test logloss here) —
# confirm for the installed version. If so, the reported test scores are
# optimistic; a separate validation split would avoid that.
xgb_model.fit(X_tfidf_tr_q1q2, y_train, 
              verbose=10, 
              eval_set=[(X_tfidf_tr_q1q2, y_train),(X_tfidf_te_q1q2, y_test)], 
              early_stopping_rounds =10,
              eval_metric=['auc','logloss'],
              )

# Results recorded at round 80 of earlier runs (TF-IDF features vs. count-vectorizer features):
#TFIDF  # [80]	validation_0-auc:0.76871	validation_0-logloss:0.561218	validation_1-auc:0.677593	validation_1-logloss:0.615557
#COUNTV # [80]	validation_0-auc:0.739739	validation_0-logloss:0.581786	validation_1-auc:0.738985	validation_1-logloss:0.582385

## Plot

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline

results = xgb_model.evals_result()
epochs = len(results['validation_0']['logloss'])
x_axis = range(0, epochs)

fig = plt.figure(figsize=(20,6))

# plot log loss
ax = fig.add_subplot(121)
ax.plot(x_axis, results['validation_0']['logloss'], label='Train')
ax.plot(x_axis, results['validation_1']['logloss'], label='Test')
ax.legend()
ax.set_ylabel('Log Loss')
ax.set_title('XGBoost Log Loss')

# plot classification AUC
ax = fig.add_subplot(122)
ax.plot(x_axis, results['validation_0']['auc'], label='Train')
ax.plot(x_axis, results['validation_1']['auc'], label='Test')
ax.legend()
ax.set_ylabel('Classification AUC')
ax.set_title('XGBoost Classification AUC')
plt.show()

### Save the model

In [None]:
from datetime import datetime

In [None]:
# Timestamp (day.month_hour.minute) that makes each saved model's file name distinct.
now = datetime.now().strftime("%d.%m_%H.%M")
# Make sure the target folder exists before writing; a fresh checkout may not have it.
os.makedirs('models', exist_ok=True)
xgb_model.save_model('models/model_{}.dat'.format(now))

### GridSearch

In [None]:
# from sklearn.model_selection import GridSearchCV

# parameters = {'nthread':[4], 
#               'objective':['binary:logistic'],
#               'learning_rate': [0.1,0.5,1], 
#               'max_depth': [3,5],
#               'scale_pos_weight':[1,5,20],
#               'min_child_weight': [4,5],
#               'silent': [1],
#               'subsample': [0.7],
#               'colsample_bytree': [0.7, 0.9],
#               'n_estimators': [50]}

# N = 100
# xgb_model = xgb.XGBClassifier(n_estimators=N)
# grid = GridSearchCV(xgb_model, parameters,
#                     n_jobs = 5, scoring='roc_auc',
#                     verbose=True)
# grid.fit(X_tr_q1q2, y_train)

### Mistakes

In [None]:
def get_mistakes(clf, X_q1q2, y):
    """Find the rows a classifier gets wrong.

    Args:
        clf: Model exposing ``predict(X)``; its output is rounded to the
            nearest integer before comparison.
        X_q1q2: Feature matrix passed to ``clf.predict``.
        y: True labels, comparable element-wise with the predictions.

    Returns:
        ``(incorrect_indices, predictions)``: the positions where prediction
        and label differ, and the rounded integer predictions for all rows.
        The tuple is returned even when there are no mistakes (with an empty
        index array), so callers can always unpack the result.
    """
    predictions = np.around(clf.predict(X_q1q2)).astype(int)
    incorrect_indices = np.where(predictions != y)[0]

    if incorrect_indices.size == 0:
        print("no mistakes in this df")

    return incorrect_indices, predictions
    
def print_mistake_k(k, mistake_indices, predictions, df):
    """Print the k-th misclassified question pair with its true and predicted class.

    Args:
        k: Position within ``mistake_indices``.
        mistake_indices: Row positions (into ``df`` and ``predictions``) of the mistakes.
        predictions: Predicted class for every row.
        df: DataFrame with ``question1``, ``question2`` and ``is_duplicate`` columns.
    """
    row_pos = mistake_indices[k]
    row = df.iloc[row_pos]
    print(row.question1, row.question2, sep="\n")
    print("true class:", row.is_duplicate)
    print("prediction:", predictions[row_pos])
    
def print_mistake_k_and_tokens(k, mistake_indices, predictions,
                               X_q1q2, count_vect, clf, df):
    """Print the k-th misclassified pair with its tokens, labels and class probabilities.

    Args:
        k: Position within ``mistake_indices``.
        mistake_indices: Row positions of the mistakes.
        predictions: Predicted class for every row.
        X_q1q2: Feature matrix passed to ``clf.predict_proba``.
        count_vect: Object exposing ``tokenize(text)``.
        clf: Model exposing ``predict_proba(X)``.
        df: DataFrame with ``question1``, ``question2`` and ``is_duplicate`` columns.
    """
    row_pos = mistake_indices[k]
    row = df.iloc[row_pos]

    # Each question followed by its token list and a blank line.
    for question in (row.question1, row.question2):
        print(question)
        print(count_vect.tokenize(question))
        print()

    print("true class:", row.is_duplicate)
    print("prediction:", predictions[row_pos])
    print()
    print("Probability vector: [P(0|x), P(1|x)]:")
    print(clf.predict_proba(X_q1q2)[row_pos,:])
    
    
def hist_errors(mistake_indices, predictions,
                               X_q1q2, count_vect, clf, df):
    """Build a table of the misclassified question pairs.

    Args:
        mistake_indices: Row positions of the mistakes.
        predictions: Predicted class for every row.
        X_q1q2: Feature matrix passed to ``clf.predict_proba``.
        count_vect: Unused; kept so existing calls keep working.
        clf: Model exposing ``predict_proba(X)``.
        df: DataFrame with ``question1``, ``question2`` and ``is_duplicate`` columns.

    Returns:
        A new DataFrame, re-indexed from 0, with columns ``question1``,
        ``question2``, ``true_class``, ``prediction`` and ``P(1|x)``.
    """
    wrong_rows = df.iloc[mistake_indices]
    # .copy() makes qs an independent frame, so the column assignments below
    # do not write into a slice of df (SettingWithCopyWarning).
    qs = wrong_rows[['question1', 'question2']].copy()
    qs['true_class'] = wrong_rows['is_duplicate']
    qs['prediction'] = predictions[mistake_indices]
    qs['P(1|x)'] = clf.predict_proba(X_q1q2)[mistake_indices, 1]
    return qs.reset_index(drop=True)


In [None]:
# xgb_model was fit on the TF-IDF features, so it is evaluated on the TF-IDF test
# matrix (X_te_q1q2 is only built in a commented-out cell and does not exist).
mistake_indices, predictions = get_mistakes(xgb_model, X_tfidf_te_q1q2, y_test)

### Load the model

In [None]:
# now = 

In [None]:
# Reload the model file from disk; relies on `now` still holding the timestamp used when saving.
bst = xgb.Booster({'nthread': 4})  # init booster
bst.load_model('models/model_{}.dat'.format(now))  # load the saved model

In [None]:
# Wrap the raw Booster in the scikit-learn style classifier so it can be passed to the helper functions.
# NOTE(review): this assigns a private attribute; a classifier rebuilt this way may
# lack fitted state that predict/predict_proba expect — confirm, or consider
# loading through XGBClassifier.load_model instead.
xgb_loaded_model = xgb.XGBClassifier()
xgb_loaded_model._Booster=bst

In [None]:
# The saved model was trained on TF-IDF features, so evaluate it on the TF-IDF test
# matrix (X_te_q1q2 is only built in a commented-out cell and does not exist).
mistake_indices, predictions = get_mistakes(xgb_loaded_model, X_tfidf_te_q1q2, y_test)

In [None]:
# Number of misclassified rows.
len(mistake_indices)

In [None]:
# Show the first misclassified test pair.
print_mistake_k(0, mistake_indices, predictions, test_df)

In [None]:
# Probabilities must come from the TF-IDF test matrix the model was trained for
# (X_te_q1q2 is only built in a commented-out cell and does not exist).
print_mistake_k_and_tokens(0, mistake_indices, predictions,
                           X_tfidf_te_q1q2, count_vect, xgb_loaded_model, test_df)

In [None]:
# Table of all misclassified test pairs, using the TF-IDF test matrix the model was
# trained for (X_te_q1q2 is only built in a commented-out cell and does not exist).
qs = hist_errors(mistake_indices, predictions,
                           X_tfidf_te_q1q2, count_vect, xgb_loaded_model, test_df)

In [None]:
# Number of rows to show when reading the mistakes table.
N = 20

# IPython.display is the public home of these helpers; importing them from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
# Let the notebook use the full browser width so long questions fit.
display(HTML("<style>.container { width:100% !important; }</style>"))

pd.set_option('display.max_rows', N)
# Show full question text. None is the current way to say "no limit";
# older pandas versions only accept the legacy value -1.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [None]:
# Read mistakes: first N rows of the misclassified-pairs table.
qs.head(N)

# TFIDF

In [None]:
# Logistic regression fitted on the same TF-IDF training features.
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear", verbose=1, max_iter=100)
logistic.fit(X_tfidf_tr_q1q2, y_train)

In [None]:
# (train score, test score) of the logistic model.
logistic.score(X_tfidf_tr_q1q2, y_train), logistic.score(X_tfidf_te_q1q2, y_test)

In [None]:
# Misclassified test pairs for the logistic model; this rebinds mistake_indices/predictions.
mistake_indices, predictions = get_mistakes(logistic, X_tfidf_te_q1q2, y_test)

In [None]:
# print_mistake_k needs the frame the indices refer to; the mistakes were computed on the test split.
print_mistake_k(0, mistake_indices, predictions, test_df)

In [None]:
# The helper needs the source frame (test_df) and an object with .tokenize;
# count_vect provides the tokenizer that the TF-IDF vectorizer was built with.
print_mistake_k_and_tokens(0, mistake_indices, predictions,
                           X_tfidf_te_q1q2, count_vect, logistic, test_df)

In [None]:
######### SEARCH QUESTIONS WITH SAME TOKENS ###########

q1_casted =  cast_list_as_strings(list(train_df["question1"]))
q2_casted =  cast_list_as_strings(list(train_df["question2"]))

q1 = count_vect.transform(q1_casted)
q2 = count_vect.transform(q2_casted)

# (q1 != q2) stores an entry for every feature whose count differs between the
# two questions of a pair; a row without stored entries has identical features.
# Counting them per row in one sparse operation replaces a Python loop that
# sliced and compared every pair individually.
differing_per_row = (q1 != q2).tocsr().getnnz(axis=1)

# Integer row positions, valid for iloc even when no pair matches.
same_tokens_idxs = np.flatnonzero(differing_per_row == 0)


same_tokens = train_df.iloc[same_tokens_idxs]
duplicates = same_tokens['is_duplicate'] == 1

In [None]:
# Pairs with identical features that are labelled as duplicates.
same_tokens[duplicates]

In [None]:
# Pairs with identical features that are NOT labelled as duplicates.
same_tokens[~duplicates]