In [1]:
import pandas as pd
import scipy
from sklearn import *
from matplotlib import pyplot as plt
from SimpleCountVectorizer import *
from TFIDFVectorizer import *
from sklearn.metrics import log_loss
import sklearn.pipeline


# Count Vectorizer

In [2]:
train_df = pd.read_csv("./data/quora_train_data.csv")
test_df = pd.read_csv('./data/quora_test_data.csv')

train_df, test_df = sklearn.model_selection.train_test_split(train_df, test_size=0.1, random_state=123)

In [3]:
train_df.shape, test_df.shape

((291088, 6), (32344, 6))

In [4]:
train_df['question1'] = train_df['question1'].str.lower()
train_df['question2'] = train_df['question2'].str.lower()

In [5]:
def cast_list_as_strings(mylist):
    """
    return a list of strings
    """
    assert isinstance(mylist, list), f"the input mylist should be a list it is {type(mylist)}"
    
    # return list(map(lambda x: str(x), all_questions)) # Slower
    return [str(s) for s in mylist]

In [6]:
all_questions = cast_list_as_strings(list(train_df.loc[:, 'question1'])+list(train_df.loc[:, 'question2']))
print(set(type(x).__name__ for x in all_questions))

{'str'}


In [7]:
count_vect = SimpleCountVectorizer()
count_vect.fit(all_questions)

HBox(children=(IntProgress(value=0, max=582176), HTML(value='')))




SimpleCountVectorizer(doc_cleaner_func=None, doc_cleaner_pattern='[^a-zA-Z]',
           dtype=<class 'numpy.float32'>, min_word_counts=1,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer_func=None,
           word_transformer_func=None)

In [8]:
def get_features_from_df(df, count_vectorizer):
    """
    returns a sparse matrix containing the features build by the count vectorizer.
    Each row should contain features from question1 and question2.
    """
    q1_casted =  cast_list_as_strings(list(df["question1"]))
    q2_casted =  cast_list_as_strings(list(df["question2"]))
        
    q1 = count_vectorizer.transform(q1_casted)
    q2 = count_vectorizer.transform(q2_casted)
    
    X_q1q2 = scipy.sparse.hstack((q1,q2))
        
    return X_q1q2

In [9]:
X_tr_q1q2 = get_features_from_df(train_df,count_vect)
X_te_q1q2  = get_features_from_df(test_df, count_vect)

X_tr_q1q2.shape, train_df.shape, test_df.shape, X_te_q1q2.shape

((291088, 137980), (291088, 6), (32344, 6), (32344, 137980))

In [10]:
y_train = train_df["is_duplicate"].values
y_test = test_df["is_duplicate"].values
len(y_train)

291088

In [11]:
def get_mistakes(clf, df, X_q1q2, y):

    predictions = clf.predict(X_q1q2) 
    predictions = np.where(predictions > 0.5, 1, 0)
    incorrect_predictions = predictions!=y
    incorrect_indices = np.where(incorrect_predictions)[0]
    
    if np.sum(incorrect_predictions)==0:
        print("no mistakes in this df")
    else:
        return incorrect_indices, predictions
    
def print_mistake_k(k, mistake_indices, predictions):
    print(train_df.iloc[mistake_indices[k]].question1)
    print(train_df.iloc[mistake_indices[k]].question2)
    print("true class:", train_df.iloc[mistake_indices[k]].is_duplicate)
    print("prediction:", predictions[mistake_indices[k]])
    
def print_mistake_k_and_tokens(k, mistake_indices, predictions,
                               X_q1q2, count_vect, clf):
    q1 = train_df.iloc[mistake_indices[k]].question1
    q2 = train_df.iloc[mistake_indices[k]].question2
    
    print(q1)
    print(sorted(count_vect.tokenize(q1)))
    print("")
    print(q2)
    print(sorted(count_vect.tokenize(q2)))
    print("")
    print("true class:", train_df.iloc[mistake_indices[k]].is_duplicate)
    print("prediction:", predictions[mistake_indices[k]])
    print("")
    print("Probability vector: [P(0|x), P(1|x)]:")
    print(clf.predict(X_q1q2)[mistake_indices[k],:])


# TFIDF

In [37]:
tfidf_vectorizer = TFIDFVectorizer(count_vect.vocabulary, count_vect.word_to_ind, count_vect.tokenize)
tfidf_vectorizer.fit(all_questions)

TFIDF fit finished in 9.82 seconds


In [38]:
X_tfidf_tr_q1q2 = get_features_from_df(train_df, tfidf_vectorizer)
X_tfidf_te_q1q2  = get_features_from_df(test_df, tfidf_vectorizer)

X_tfidf_tr_q1q2.shape, train_df.shape, test_df.shape, X_tfidf_te_q1q2.shape

TFIDF transform finished in 6.57 seconds
TFIDF transform finished in 6.78 seconds
TFIDF transform finished in 0.74 seconds
TFIDF transform finished in 0.79 seconds


((291088, 137980), (291088, 6), (32344, 6), (32344, 137980))

### Xgboost

In [54]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# PARAM GRID
param_grid = {}
param_grid['objective'] = 'binary:logistic'
param_grid['eval_metric'] = ['auc', 'logloss']   #error: (wrong cases)/#(all cases)
param_grid['eta'] = 0.02
param_grid['max_depth'] = 10
param_grid['n_estimators'] = 300

d_train = xgb.DMatrix(X_tr_q1q2, label=y_train)  # For sparse matrices
d_valid = xgb.DMatrix(X_te_q1q2, label=y_test)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(param_grid, d_train, 300, watchlist, early_stopping_rounds=50, verbose_eval=25, )
#300 # train-auc:0.818362	train-logloss:0.527855	valid-auc:0.801083	valid-logloss:0.53815

[0]	train-auc:0.657194	train-logloss:0.690061	valid-auc:0.655265	valid-logloss:0.689945
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[25]	train-auc:0.711955	train-logloss:0.636014	valid-auc:0.707644	valid-logloss:0.636853
[50]	train-auc:0.741775	train-logloss:0.608835	valid-auc:0.734113	valid-logloss:0.611128
[75]	train-auc:0.760416	train-logloss:0.591464	valid-auc:0.750003	valid-logloss:0.595166
[100]	train-auc:0.772393	train-logloss:0.578186	valid-auc:0.760752	valid-logloss:0.583033
[125]	train-auc:0.781855	train-logloss:0.568135	valid-auc:0.769198	valid-logloss:0.573895
[150]	train-auc:0.790791	train-logloss:0.559518	valid-auc:0.777192	valid-logloss:0.56613
[175]	train-auc:0.797886	train-logloss:0.552163	valid-auc:0.783239	valid-logloss:0.559622
[200]	train-auc:0.802936	train-logloss:0.546385	valid-auc:0.787655	valid-logloss:0.554458
[225]	train-auc:0.807772	train-logloss:0.5404

In [55]:
mistake_indices, predictions = get_mistakes(bst, train_df, d_train, y_train)
print_mistake_k(0, mistake_indices, predictions)

[0.5083925  0.22813556 0.3240189  ... 0.41875947 0.4998089  0.19342262]
how do i get home tutors?
how can i trust a home tutor?
true class: 0
prediction: 1


In [58]:
print_mistake_k_and_tokens(3, mistake_indices, predictions,
                           d_train, count_vect, bst)

what are good ideas to help fall asleep quickly?
['are', 'asleep', 'fall', 'good', 'help', 'ideas', 'quickly', 'to', 'what']

what are some ways to fall asleep faster?
['are', 'asleep', 'fall', 'faster', 'some', 'to', 'ways', 'what']

true class: 1
prediction: 0

Probability vector: [P(0|x), P(1|x)]:


IndexError: too many indices for array

## Pipeline

+ Work in Progress

In [53]:
model_pipeline = sklearn.pipeline.Pipeline([("countvectorizer", count_vect),
                                            ("tfidf", tfidf_vectorizer),
                                            ("xgboost", bst)]
                                            )

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "xgbrg__n_estimators": [10, 50, 100, 500],
    "xgbrg__learning_rate": [0.1, 0.2],
}

fit_params = {"xgbrg__eval_set": [(d_valid)], 
              "xgbrg__early_stopping_rounds": 10, 
              "xgbrg__verbose": False} 

searchCV = GridSearchCV(model_pipeline, cv=5, param_grid=param_grid, fit_params=fit_params)
searchCV.fit(train_X, train_y)

In [None]:
def hist_errors(mistake_indices, predictions,
                               X_q1q2, count_vect, clf):
    qs = train_df.iloc[mistake_indices][['question1', 'question2']]
    qs['true_class']=train_df.iloc[mistake_indices].is_duplicate
    qs['prediction']=predictions[mistake_indices]
#     qs['P(1|x)']=clf.predict(X_q1q2)[mistake_indices,:][:,1]
    qs = qs.reset_index(drop=True)
    return qs


In [None]:
qs = hist_errors(mistake_indices, predictions,
                           X_tr_q1q2, count_vect, bst)
qs
# qs.sort_values(by='P(1|x)', ascending=False)

### GradientBoostingClassifier
+ No funciona con matrices sparse. Warning, peta el ordenador.

In [None]:
# from sklearn.ensemble import GradientBoostingClassifier

# parameters = {
#     "loss":["deviance"],
#     'n_estimators':[32, 100],
#     "learning_rate": [0.010, 0.05, 0.1],
# #     "max_depth":[8,10,12, 15, 18, 20, 25],
# #     "max_features":[8, 12, 14, 15, 18, 20, 25, 30, 35],
# #     "criterion": ["friedman_mse",  "mae"],
# #     "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
#     }

# clf_gradboost2 = GridSearchCV(GradientBoostingClassifier(), parameters, cv=5, n_jobs=-1, verbose=1, scoring='accuracy')
# clf_gradboost2.fit(X_tr_q1q2, y_train)

# print("Best parameters found: ",clf_gradboost2.best_params_)
# print("Accuracy score (training): {0:.3f}".format(clf_gradboost2.score(X_tr_q1q2, y_train)))
# print("Accuracy score (validation): {0:.3f}".format(clf_gradboost2.score(X_te_q1q2,y_test)))

### Logistic

In [None]:
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear")
logistic.fit(X_tr_q1q2, y_train)

In [None]:
pred_X_te_q1q2 = logistic.predict(X_te_q1q2)
logloss = log_loss(pred_X_te_q1q2, y_test)
print('Valid-logloss:', logloss)
# Valid-logloss: 8.501325433826551

In [None]:
mistake_indices, predictions = get_mistakes(logistic,train_df, X_tr_q1q2, y_train)

In [None]:
print_mistake_k(1, mistake_indices, predictions)

In [None]:
print_mistake_k_and_tokens(10, mistake_indices, predictions,
                           X_tr_q1q2, count_vect, logistic)