In [41]:
import pandas as pd
import scipy
from sklearn import *
from matplotlib import pyplot as plt
from SimpleCountVectorizer import *
from sklearn.metrics import log_loss

# Count Vectorizer

In [42]:
train_df = pd.read_csv("C:/Users/toti/OneDrive/Jordi/.Master - Data Science UB/2nd SEMESTER/Natural Language Processing/course2020-master/quora-question-pairs/input/train.csv")

train_df, test_df = sklearn.model_selection.train_test_split(train_df, test_size=0.1, random_state=123)

In [43]:
train_df.shape, test_df.shape

((363861, 6), (40429, 6))

In [44]:
def cast_list_as_strings(mylist):
    """
    return a list of strings
    """
    assert isinstance(mylist, list), f"the input mylist should be a list it is {type(mylist)}"
    
    # return list(map(lambda x: str(x), all_questions)) # Slower
    return [str(s) for s in mylist]

In [45]:
all_questions = cast_list_as_strings(list(train_df.loc[:, 'question1'])+list(train_df.loc[:, 'question2']))
print(set(type(x).__name__ for x in all_questions))

{'str'}


In [46]:
count_vect = SimpleCountVectorizer()
count_vect.fit(all_questions)

HBox(children=(IntProgress(value=0, max=727722), HTML(value='')))




SimpleCountVectorizer(doc_cleaner_func=None, doc_cleaner_pattern='[^a-zA-Z]',
           dtype=<class 'numpy.float32'>, min_word_counts=1,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer_func=None,
           word_transformer_func=None)

In [47]:
def get_features_from_df(df, count_vectorizer):
    """
    returns a sparse matrix containing the features build by the count vectorizer.
    Each row should contain features from question1 and question2.
    """
    q1_casted =  cast_list_as_strings(list(df["question1"]))
    q2_casted =  cast_list_as_strings(list(df["question2"]))
        
    q1 = count_vectorizer.transform(q1_casted)
    q2 = count_vectorizer.transform(q2_casted)
    
    X_q1q2 = scipy.sparse.hstack((q1,q2))
        
    return X_q1q2

In [48]:
X_tr_q1q2 = get_features_from_df(train_df,count_vect)
X_te_q1q2  = get_features_from_df(test_df, count_vect)

X_tr_q1q2.shape, train_df.shape, test_df.shape, X_te_q1q2.shape

((363861, 151608), (363861, 6), (40429, 6), (40429, 151608))

In [49]:
y_train = train_df["is_duplicate"].values
y_test = test_df["is_duplicate"].values
len(y_train)

363861

In [50]:
def get_mistakes(clf, df, X_q1q2, y):

    predictions = clf.predict(X_q1q2)    
    incorrect_predictions = predictions!=y
    incorrect_indices = np.where(incorrect_predictions)[0]
    
    if np.sum(incorrect_predictions)==0:
        print("no mistakes in this df")
    else:
        return incorrect_indices, predictions
    
def print_mistake_k(k, mistake_indices, predictions):
    print(train_df.iloc[mistake_indices[k]].question1)
    print(train_df.iloc[mistake_indices[k]].question2)
    print("true class:", train_df.iloc[mistake_indices[k]].is_duplicate)
    print("prediction:", predictions[mistake_indices[k]])
    
def print_mistake_k_and_tokens(k, mistake_indices, predictions,
                               X_q1q2, count_vect, clf):
    q1 = train_df.iloc[mistake_indices[k]].question1
    q2 = train_df.iloc[mistake_indices[k]].question2
    
    print(q1)
    print(sorted(count_vect.tokenize(q1)))
    print("")
    print(q2)
    print(sorted(count_vect.tokenize(q2)))
    print("")
    print("true class:", train_df.iloc[mistake_indices[k]].is_duplicate)
    print("prediction:", predictions[mistake_indices[k]])
    print("")
    print("Probability vector: [P(0|x), P(1|x)]:")
    print(clf.predict_proba(X_q1q2)[mistake_indices[k],:])


### Xgboost

In [75]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


def xgboost(X_train,y_train,X_test,y_test, estimations = 200):
    param_grid = {}
    param_grid['objective'] = 'binary:logistic'
    param_grid['eval_metric'] = ['auc', 'logloss']   #error: (wrong cases)/#(all cases)
    param_grid['eta'] = 0.02
    param_grid['max_depth'] = 4
    param_grid['n_estimators'] = estimations

    d_train = xgb.DMatrix(X_train, label=y_train)  # For sparse matrices
    d_valid = xgb.DMatrix(X_test, label=y_test)

    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    bst = xgb.train(param_grid, d_train, estimations, watchlist, early_stopping_rounds=50, verbose_eval=10)


In [76]:
xgboost(X_tr_q1q2,y_train,X_te_q1q2,y_test)

[0]	train-auc:0.55805	train-logloss:0.691318	valid-auc:0.55655	valid-logloss:0.691109
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-auc:0.598556	train-logloss:0.674358	valid-auc:0.595084	valid-logloss:0.67481
[20]	train-auc:0.626518	train-logloss:0.662253	valid-auc:0.624797	valid-logloss:0.663211
[30]	train-auc:0.650257	train-logloss:0.65358	valid-auc:0.647793	valid-logloss:0.654325
[40]	train-auc:0.667297	train-logloss:0.646349	valid-auc:0.664155	valid-logloss:0.647338
[50]	train-auc:0.675683	train-logloss:0.640345	valid-auc:0.672579	valid-logloss:0.641705
[60]	train-auc:0.68092	train-logloss:0.635421	valid-auc:0.678256	valid-logloss:0.636863
[70]	train-auc:0.68846	train-logloss:0.631581	valid-auc:0.686049	valid-logloss:0.633213
[80]	train-auc:0.693108	train-logloss:0.628203	valid-auc:0.690821	valid-logloss:0.629853
[90]	train-auc:0.698018	train-logloss:0.625086	valid-au

In [None]:
# mistake_indices, predictions = get_mistakes(xgb,train_df, X_tr_q1q2, y_train)

### Logistic

In [25]:
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear")
logistic.fit(X_tr_q1q2, y_train)

In [40]:
pred_X_te_q1q2 = logistic.predict(X_te_q1q2)
logloss = log_loss(pred_X_te_q1q2, y_test)
print('Valid-logloss:', logloss)

Valid-logloss: 8.573087287445738


In [28]:
mistake_indices, predictions = get_mistakes(logistic,train_df, X_tr_q1q2, y_train)

In [13]:
print_mistake_k(0, mistake_indices, predictions)

What are the best places to visit in or near Juneau, Alaska?
What are the best places to visit in Alaska and why?
true class: 0
prediction: 1


In [14]:
print_mistake_k_and_tokens(10, mistake_indices, predictions,
                           X_tr_q1q2, count_vect, logistic)

Why do men like women's feet?
['do', 'feet', 'like', 'men', 'why', 'women']

Why do men like womens feet?
['do', 'feet', 'like', 'men', 'why', 'womens']

true class: 1
prediction: 0

Probability vector: [P(0|x), P(1|x)]:
[0.55506132 0.44493868]


# TFIDF