In [3]:
import pandas as pd
import scipy
from sklearn import *
from matplotlib import pyplot as plt
from SimpleCountVectorizer import *
from sklearn.metrics import log_loss

# Count Vectorizer

In [4]:
train_df = pd.read_csv("./data/quora_train_data.csv")
test_df = pd.read_csv('./data/quora_test_data.csv')

train_df, test_df = sklearn.model_selection.train_test_split(train_df, test_size=0.1, random_state=123)

In [5]:
train_df.shape, test_df.shape

((291088, 6), (32344, 6))

In [6]:
train_df['question1'] = train_df['question1'].str.lower()
train_df['question2'] = train_df['question2'].str.lower()

In [7]:
def cast_list_as_strings(mylist):
    """
    return a list of strings
    """
    assert isinstance(mylist, list), f"the input mylist should be a list it is {type(mylist)}"
    
    # return list(map(lambda x: str(x), all_questions)) # Slower
    return [str(s) for s in mylist]

In [8]:
all_questions = cast_list_as_strings(list(train_df.loc[:, 'question1'])+list(train_df.loc[:, 'question2']))
print(set(type(x).__name__ for x in all_questions))

{'str'}


In [9]:
count_vect = SimpleCountVectorizer()
count_vect.fit(all_questions)

HBox(children=(IntProgress(value=0, max=582176), HTML(value='')))




SimpleCountVectorizer(doc_cleaner_func=None, doc_cleaner_pattern='[^a-zA-Z]',
           dtype=<class 'numpy.float32'>, min_word_counts=1,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer_func=None,
           word_transformer_func=None)

In [10]:
def get_features_from_df(df, count_vectorizer):
    """
    returns a sparse matrix containing the features build by the count vectorizer.
    Each row should contain features from question1 and question2.
    """
    q1_casted =  cast_list_as_strings(list(df["question1"]))
    q2_casted =  cast_list_as_strings(list(df["question2"]))
        
    q1 = count_vectorizer.transform(q1_casted)
    q2 = count_vectorizer.transform(q2_casted)
    
    X_q1q2 = scipy.sparse.hstack((q1,q2))
        
    return X_q1q2

In [11]:
X_tr_q1q2 = get_features_from_df(train_df,count_vect)
X_te_q1q2  = get_features_from_df(test_df, count_vect)

X_tr_q1q2.shape, train_df.shape, test_df.shape, X_te_q1q2.shape

((291088, 137980), (291088, 6), (32344, 6), (32344, 137980))

In [12]:
y_train = train_df["is_duplicate"].values
y_test = test_df["is_duplicate"].values
len(y_train)

291088

In [39]:
def get_mistakes(clf, df, X_q1q2, y):

    predictions = clf.predict(X_q1q2) 
    predictions = np.where(p_test > 0.5, 1, 0)
    incorrect_predictions = predictions!=y
    incorrect_indices = np.where(incorrect_predictions)[0]
    
    if np.sum(incorrect_predictions)==0:
        print("no mistakes in this df")
    else:
        return incorrect_indices, predictions
    
def print_mistake_k(k, mistake_indices, predictions):
    print(train_df.iloc[mistake_indices[k]].question1)
    print(train_df.iloc[mistake_indices[k]].question2)
    print("true class:", train_df.iloc[mistake_indices[k]].is_duplicate)
    print("prediction:", predictions[mistake_indices[k]])
    
def print_mistake_k_and_tokens(k, mistake_indices, predictions,
                               X_q1q2, count_vect, clf):
    q1 = train_df.iloc[mistake_indices[k]].question1
    q2 = train_df.iloc[mistake_indices[k]].question2
    
    print(q1)
    print(sorted(count_vect.tokenize(q1)))
    print("")
    print(q2)
    print(sorted(count_vect.tokenize(q2)))
    print("")
    print("true class:", train_df.iloc[mistake_indices[k]].is_duplicate)
    print("prediction:", predictions[mistake_indices[k]])
    print("")
    print("Probability vector: [P(0|x), P(1|x)]:")
    print(clf.predict_proba(X_q1q2)[mistake_indices[k],:])


### Xgboost

In [45]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

param_grid = {}
param_grid['objective'] = 'binary:logistic'
param_grid['eval_metric'] = ['auc', 'logloss']   #error: (wrong cases)/#(all cases)
param_grid['eta'] = 0.02
param_grid['max_depth'] = 4
param_grid['n_estimators'] = 50

d_train = xgb.DMatrix(X_tr_q1q2, label=y_train)  # For sparse matrices
d_valid = xgb.DMatrix(X_te_q1q2, label=y_test)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(param_grid, d_train, 50, watchlist, early_stopping_rounds=50, verbose_eval=10, )
#200 # train-auc:0.725745	train-logloss:0.604877	valid-auc:0.722053	valid-logloss:0.605704

[0]	train-auc:0.55769	train-logloss:0.691289	valid-auc:0.562277	valid-logloss:0.691048
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-auc:0.607363	train-logloss:0.67426	valid-auc:0.606802	valid-logloss:0.6742
[20]	train-auc:0.62609	train-logloss:0.662353	valid-auc:0.625992	valid-logloss:0.662287
[30]	train-auc:0.648365	train-logloss:0.653169	valid-auc:0.647757	valid-logloss:0.653149
[40]	train-auc:0.662526	train-logloss:0.646348	valid-auc:0.66187	valid-logloss:0.646229
[49]	train-auc:0.671251	train-logloss:0.641173	valid-auc:0.670221	valid-logloss:0.641062


In [47]:
mistake_indices, predictions = get_mistakes(bst, train_df, d_train, y_train)
print_mistake_k(1, mistake_indices, predictions)

is an all-out nuclear war survivable?
would all out nuclear war destroy all life on earth?
true class: 1
prediction: 0


In [53]:
print_mistake_k_and_tokens(10, mistake_indices, predictions,
                           X_tr_q1q2, count_vect, logistic)

can i hack any phone by just having his phone number?
['any', 'by', 'can', 'hack', 'having', 'his', 'just', 'number', 'phone', 'phone']

can someone hack into your iphone just by knowing your phone number?
['by', 'can', 'hack', 'into', 'iphone', 'just', 'knowing', 'number', 'phone', 'someone', 'your', 'your']

true class: 1
prediction: 0

Probability vector: [P(0|x), P(1|x)]:
[0.29346835 0.70653165]


### GradientBoostingClassifier
+ No funciona con matrices sparse. Warning, peta el ordenador.

In [None]:
# from sklearn.ensemble import GradientBoostingClassifier

# parameters = {
#     "loss":["deviance"],
#     'n_estimators':[32, 100],
#     "learning_rate": [0.010, 0.05, 0.1],
# #     "max_depth":[8,10,12, 15, 18, 20, 25],
# #     "max_features":[8, 12, 14, 15, 18, 20, 25, 30, 35],
# #     "criterion": ["friedman_mse",  "mae"],
# #     "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
#     }

# clf_gradboost2 = GridSearchCV(GradientBoostingClassifier(), parameters, cv=5, n_jobs=-1, verbose=1, scoring='accuracy')
# clf_gradboost2.fit(X_tr_q1q2, y_train)

# print("Best parameters found: ",clf_gradboost2.best_params_)
# print("Accuracy score (training): {0:.3f}".format(clf_gradboost2.score(X_tr_q1q2, y_train)))
# print("Accuracy score (validation): {0:.3f}".format(clf_gradboost2.score(X_te_q1q2,y_test)))

### Logistic

In [48]:
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear")
logistic.fit(X_tr_q1q2, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [49]:
pred_X_te_q1q2 = logistic.predict(X_te_q1q2)
logloss = log_loss(pred_X_te_q1q2, y_test)
print('Valid-logloss:', logloss)
# Valid-logloss: 8.501325433826551

Valid-logloss: 8.501325433826551


In [50]:
mistake_indices, predictions = get_mistakes(logistic,train_df, X_tr_q1q2, y_train)

In [51]:
print_mistake_k(1, mistake_indices, predictions)

is an all-out nuclear war survivable?
would all out nuclear war destroy all life on earth?
true class: 1
prediction: 0


In [52]:
print_mistake_k_and_tokens(10, mistake_indices, predictions,
                           X_tr_q1q2, count_vect, logistic)

can i hack any phone by just having his phone number?
['any', 'by', 'can', 'hack', 'having', 'his', 'just', 'number', 'phone', 'phone']

can someone hack into your iphone just by knowing your phone number?
['by', 'can', 'hack', 'into', 'iphone', 'just', 'knowing', 'number', 'phone', 'someone', 'your', 'your']

true class: 1
prediction: 0

Probability vector: [P(0|x), P(1|x)]:
[0.29346835 0.70653165]


# TFIDF