In [1]:
import pandas as pd
import scipy
from sklearn import *
from matplotlib import pyplot as plt
from SimpleCountVectorizer import *
from sklearn.metrics import log_loss

# Count Vectorizer

In [2]:
# Load the Quora question pairs and hold out 10% of the rows as a validation
# split. The fixed random_state makes the split reproducible.
# The former read of ./data/quora_test_data.csv into test_df was dropped: that
# frame was overwritten by the split below before anything could use it.
train_df = pd.read_csv("./data/quora_train_data.csv")

train_df, test_df = sklearn.model_selection.train_test_split(train_df, test_size=0.1, random_state=123)

In [3]:
# Sanity check: (rows, columns) of the training split and of the held-out split.
train_df.shape, test_df.shape

((291088, 6), (32344, 6))

In [4]:
# Lower-case both question columns of the training split.
# `assign` builds a new frame instead of writing columns into the frame that
# train_test_split returned, which can trigger pandas' SettingWithCopyWarning.
# NOTE(review): test_df is not lower-cased here — confirm the vectorizer
# normalises case itself, otherwise the two splits are preprocessed differently.
train_df = train_df.assign(
    question1=train_df['question1'].str.lower(),
    question2=train_df['question2'].str.lower(),
)

In [5]:
def cast_list_as_strings(mylist):
    """
    Convert every element of `mylist` to `str` and return them as a new list.

    Raises AssertionError when `mylist` is not a `list`.
    """
    is_a_list = isinstance(mylist, list)
    assert is_a_list, f"the input mylist should be a list it is {type(mylist)}"

    # str() is applied element by element; the input list is left untouched.
    return list(map(str, mylist))

In [6]:
# Pool question1 and question2 of the training split into one list of strings
# (question1 entries first), then show the set of element types as a check.
all_questions = cast_list_as_strings(
    list(train_df['question1']) + list(train_df['question2'])
)
print({type(question).__name__ for question in all_questions})

{'str'}


In [7]:
class Simple2CountVectorizer(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    """
    Bag-of-words count vectorizer driven by regular expressions.

    A document is cleaned, tokenized, and every token is passed through the
    word transformer. `fit` assigns one column index to each distinct token;
    `transform` counts token occurrences per document.

    Parameters
    ----------
    min_word_counts : int
        Stored on the instance; no method of this class reads it yet.
    doc_cleaner_pattern : str or compiled pattern
        Every match is replaced by a space by the default document cleaner.
    token_pattern : str
        Regular expression whose matches are the tokens of a cleaned document.
    dtype : type
        Stored on the instance; `transform` does not apply it to its output.
    doc_cleaner_func : callable or None
        If given, replaces the default cleaner entirely (including lower-casing).
    tokenizer_func : callable or None
        If given, replaces the regex tokenizer.
    word_transformer_func : callable or None
        If given, applied to every token (e.g. a stemmer); identity otherwise.
    """

    def __init__(self,
                 min_word_counts=1,
                 doc_cleaner_pattern=r"[^a-zA-Z]",
                 token_pattern=r"(?u)\b\w\w+\b",
                 dtype=np.float32,
                 doc_cleaner_func=None,
                 tokenizer_func=None,
                 word_transformer_func=None):

        # Type of a compiled regular expression (kept for backward compatibility).
        self._retype = type(re.compile('hello, world'))

        self.min_word_counts     = min_word_counts
        self.doc_cleaner_pattern = doc_cleaner_pattern
        self.token_pattern       = token_pattern
        self.dtype               = dtype

        self.doc_cleaner_func      = doc_cleaner_func
        self.tokenizer_func        = tokenizer_func
        self.word_transformer_func = word_transformer_func

        # Learned state, filled in by fit().
        self.vocabulary = set()
        self.word_to_ind = {}

    def build_doc_cleaner(self, lower=True):
        """
        Returns a function that cleans undesirable substrings in a string.
        It also lowers the input string if lower=True
        """
        if self.doc_cleaner_func:
            return self.doc_cleaner_func

        # re.compile accepts both a pattern string and an already compiled
        # pattern, so no type dispatch is needed here.
        clean_doc_pattern = re.compile(self.doc_cleaner_pattern)

        if lower:
            return lambda doc: clean_doc_pattern.sub(" ", doc).lower()
        return lambda doc: clean_doc_pattern.sub(" ", doc)

    def build_tokenizer(self):
        """Returns a function that splits a string into a sequence of tokens"""
        if self.tokenizer_func:
            return self.tokenizer_func

        token_pattern = re.compile(self.token_pattern)
        return lambda doc: token_pattern.findall(doc)

    def build_word_transformer(self):
        """Returns the stemmer or lemmatizer if the object has one, else the identity"""
        if self.word_transformer_func:
            return self.word_transformer_func
        return lambda word: word

    def _build_analyzer(self):
        """Returns a function mapping a raw document to its list of final tokens"""
        doc_cleaner      = self.build_doc_cleaner()
        doc_tokenizer    = self.build_tokenizer()
        word_transformer = self.build_word_transformer()
        return lambda doc: [word_transformer(word)
                            for word in doc_tokenizer(doc_cleaner(doc))]

    def tokenize(self, doc):
        """Cleans, tokenizes and word-transforms one document; returns its tokens"""
        return self._build_analyzer()(doc)

    def fit(self, X, y=None):
        """
        Learns the token -> column index mapping from the documents in X.

        Indices follow the order in which tokens are first seen. `y` is ignored;
        it is accepted so the estimator can be called as fit(X, y).
        Raises AssertionError if the vectorizer already holds a vocabulary or
        if X is not a list. Returns self.
        """
        assert self.vocabulary == set(), "self.vocabulary is not empty it has {} words".format(len(self.vocabulary))
        assert isinstance(X,list), "X is expected to be a list of documents"

        analyze = self._build_analyzer()
        word_to_ind = {}
        for doc in X:
            for word in analyze(doc):
                # The next free index is the current size of the mapping.
                word_to_ind.setdefault(word, len(word_to_ind))

        self.word_to_ind = word_to_ind
        self.n_features = len(word_to_ind)
        self.vocabulary = set(word_to_ind.keys())

        return self

    def transform(self, X, memory_efficient=False):
        """
        Encodes the documents in X as a (len(X), vocabulary size) count matrix.

        Tokens that were not seen during fit are skipped instead of raising
        KeyError, so unseen documents can be encoded. Returns a scipy CSR
        matrix when memory_efficient is True, a dense numpy array otherwise.
        """
        analyze = self._build_analyzer()
        word_to_ind = self.word_to_ind
        n_docs = len(X)
        n_features = len(word_to_ind)

        if memory_efficient:
            row_indices = []
            col_indices = []
            for m, doc in enumerate(X):
                for word in analyze(doc):
                    index = word_to_ind.get(word)
                    if index is None:
                        continue  # out-of-vocabulary token
                    row_indices.append(m)
                    col_indices.append(index)

            # One entry per occurrence; the CSR constructor sums duplicates.
            sp_data = [1] * len(col_indices)
            return sp.csr_matrix((sp_data, (row_indices, col_indices)),
                                 shape=(n_docs, n_features))

        encoded_X = np.zeros((n_docs, n_features))
        for m, doc in enumerate(X):
            for word in analyze(doc):
                index = word_to_ind.get(word)
                if index is not None:
                    encoded_X[m, index] += 1

        return encoded_X

    def fit_transform(self, X, y=None):
        """Fits the vocabulary on X and returns the dense encoding of X"""
        self.fit(X)
        encoded_X = self.transform(X)
        return encoded_X

    def _words_in_vocab(self, X):
        """
        Returns the tokens of X that belong to the fitted vocabulary.

        For a single string the result is a list of tokens; for an iterable of
        strings it is a list with one token list per string.
        """
        analyze = self._build_analyzer()
        vocabulary = self.vocabulary

        if isinstance(X, str):
            return [w for w in analyze(X) if w in vocabulary]

        return [[w for w in analyze(sentence) if w in vocabulary]
                for sentence in X]

In [12]:
# Fit the vectorizer on every question of the training split.
# NOTE(review): this instantiates SimpleCountVectorizer (presumably provided by
# the star import of the SimpleCountVectorizer module), not the
# Simple2CountVectorizer class defined in this notebook — confirm which one is intended.
count_vect = SimpleCountVectorizer()
count_vect.fit(all_questions)

HBox(children=(IntProgress(value=0, max=582176), HTML(value='')))




SimpleCountVectorizer(doc_cleaner_func=None, doc_cleaner_pattern='[^a-zA-Z]',
           dtype=<class 'numpy.float32'>, min_word_counts=1,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer_func=None,
           word_transformer_func=None)

In [13]:
def get_features_from_df(df, count_vectorizer):
    """
    Encode the question pairs of `df` with an already fitted count vectorizer.

    The "question1" and "question2" columns are cast to strings, transformed
    separately, and the two matrices are stacked side by side, so each row
    holds the question1 features followed by the question2 features.
    """
    casted_columns = [
        cast_list_as_strings(list(df[column]))
        for column in ("question1", "question2")
    ]
    encoded_columns = [count_vectorizer.transform(texts) for texts in casted_columns]

    return scipy.sparse.hstack(tuple(encoded_columns))

In [14]:
# Encode the training split first, then the held-out split, with the same
# fitted vectorizer; the last line displays the shapes for a quick check.
X_tr_q1q2, X_te_q1q2 = (
    get_features_from_df(split_df, count_vect) for split_df in (train_df, test_df)
)

(X_tr_q1q2.shape, train_df.shape, test_df.shape, X_te_q1q2.shape)

((291088, 137980), (291088, 6), (32344, 6), (32344, 137980))

In [15]:
# Target vectors: the is_duplicate column of each split as a numpy array.
y_train, y_test = (split_df["is_duplicate"].values for split_df in (train_df, test_df))
len(y_train)

291088

In [16]:
def get_mistakes(clf, df, X_q1q2, y):
    """
    Find the rows that `clf` classifies incorrectly.

    `clf.predict(X_q1q2)` may return hard labels or scores; values above 0.5
    are mapped to class 1, everything else to class 0. `df` is not used and is
    kept only so existing calls keep working.

    Returns a tuple (incorrect_indices, predictions): the positions where the
    thresholded prediction differs from `y`, and the thresholded predictions
    for every row. When there are no mistakes a message is printed and the
    index array is empty, so callers can always unpack the result.
    """
    raw_predictions = np.asarray(clf.predict(X_q1q2))
    predictions = np.where(raw_predictions > 0.5, 1, 0)
    incorrect_indices = np.where(predictions != y)[0]

    if incorrect_indices.size == 0:
        print("no mistakes in this df")

    return incorrect_indices, predictions
    
def print_mistake_k(k, mistake_indices, predictions, df=None):
    """
    Print the k-th misclassified question pair with its true and predicted class.

    `mistake_indices[k]` is the row position of the pair in `df` and in
    `predictions`. `df` defaults to the notebook-level `train_df`, which is
    what this function always used before the parameter existed; pass another
    frame to inspect mistakes made on a different split.
    """
    if df is None:
        df = train_df

    row_position = mistake_indices[k]
    row = df.iloc[row_position]  # look the row up once instead of three times

    print(row.question1)
    print(row.question2)
    print("true class:", row.is_duplicate)
    print("prediction:", predictions[row_position])
    
def print_mistake_k_and_tokens(k, mistake_indices, predictions,
                               X_q1q2, count_vect, clf, df=None):
    """
    Print the k-th misclassified pair, its sorted tokens, and the class probabilities.

    `mistake_indices[k]` is the row position of the pair in `df`, in
    `predictions` and in `X_q1q2`. `df` defaults to the notebook-level
    `train_df`, which this function always used before the parameter existed.

    The probability vector comes from `clf.predict_proba` when the model has
    it. Otherwise `clf.predict` is used: a 2-D result is indexed by row, and a
    1-D result is treated as P(1|x) and expanded to [1 - p, p] (for a model
    that returns hard labels this vector is degenerate). An xgboost model
    without `predict_proba` that is handed a scipy sparse matrix gets it
    wrapped in a DMatrix first.
    """
    if df is None:
        df = train_df

    row_position = mistake_indices[k]
    row = df.iloc[row_position]
    q1 = row.question1
    q2 = row.question2

    print(q1)
    print(sorted(count_vect.tokenize(q1)))
    print("")
    print(q2)
    print(sorted(count_vect.tokenize(q2)))
    print("")
    print("true class:", row.is_duplicate)
    print("prediction:", predictions[row_position])
    print("")
    print("Probability vector: [P(0|x), P(1|x)]:")

    if hasattr(clf, "predict_proba"):
        probability_vector = np.asarray(clf.predict_proba(X_q1q2))[row_position, :]
    else:
        features = X_q1q2
        is_xgboost_model = type(clf).__module__.split(".")[0] == "xgboost"
        if is_xgboost_model and scipy.sparse.issparse(features):
            import xgboost as xgb
            features = xgb.DMatrix(features)

        scores = np.asarray(clf.predict(features))
        if scores.ndim == 2:
            probability_vector = scores[row_position, :]
        else:
            positive_probability = scores[row_position]
            probability_vector = np.array([1 - positive_probability, positive_probability])

    print(probability_vector)


### Xgboost

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Booster parameters (a single configuration, despite the variable's name).
# Both AUC and log-loss are reported for every watched set; the 'error' metric
# would report (wrong cases)/(all cases) instead.
# NOTE(review): 'n_estimators' looks like a scikit-learn wrapper parameter; the
# number of rounds is already passed to xgb.train below — confirm it is needed.
param_grid = {
    'objective': 'binary:logistic',
    'eval_metric': ['auc', 'logloss'],
    'eta': 0.02,
    'max_depth': 10,
    'n_estimators': 200,
}

# DMatrix is built directly from the sparse feature matrices.
d_train = xgb.DMatrix(X_tr_q1q2, label=y_train)
d_valid = xgb.DMatrix(X_te_q1q2, label=y_test)

# Sets evaluated during training, reported under these names.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(
    param_grid,
    d_train,
    num_boost_round=200,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=10,
)
#200 # train-auc:0.802859	train-logloss:0.546541	valid-auc:0.787583	valid-logloss:0.554585

In [None]:
# Collect the training rows the booster gets wrong, then show the second one.
mistake_indices, predictions = get_mistakes(
    clf=bst,
    df=train_df,
    X_q1q2=d_train,
    y=y_train,
)
print_mistake_k(k=1, mistake_indices=mistake_indices, predictions=predictions)

In [None]:
# Inspect one booster mistake in detail: questions, tokens and probabilities.
print_mistake_k_and_tokens(
    k=11,
    mistake_indices=mistake_indices,
    predictions=predictions,
    X_q1q2=X_tr_q1q2,
    count_vect=count_vect,
    clf=bst,
)

In [None]:
def hist_errors(mistake_indices, predictions,
                X_q1q2, count_vect, clf, df=None):
    """
    Build a table of the misclassified question pairs.

    Returns a new DataFrame with columns question1, question2, true_class and
    prediction, one row per entry of `mistake_indices`, indexed 0..n-1.
    `df` defaults to the notebook-level `train_df`, which this function always
    used before the parameter existed. `X_q1q2`, `count_vect` and `clf` are not
    used; they are kept so existing calls keep working.
    """
    if df is None:
        df = train_df

    mistaken_rows = df.iloc[mistake_indices]

    # assign() returns a new frame, so nothing is written into a slice of `df`
    # (the column-by-column assignment used before can trigger pandas'
    # SettingWithCopyWarning). Values are passed positionally via numpy arrays.
    qs = mistaken_rows[['question1', 'question2']].assign(
        true_class=mistaken_rows['is_duplicate'].to_numpy(),
        prediction=np.asarray(predictions)[mistake_indices],
    )
    return qs.reset_index(drop=True)


In [None]:
# Table of every training pair the booster misclassified.
qs = hist_errors(
    mistake_indices=mistake_indices,
    predictions=predictions,
    X_q1q2=X_tr_q1q2,
    count_vect=count_vect,
    clf=bst,
)
qs
# qs.sort_values(by='P(1|x)', ascending=False)

### GradientBoostingClassifier
+ Does not work with sparse matrices. Warning: running this crashes the computer.

In [None]:
# from sklearn.ensemble import GradientBoostingClassifier

# parameters = {
#     "loss":["deviance"],
#     'n_estimators':[32, 100],
#     "learning_rate": [0.010, 0.05, 0.1],
# #     "max_depth":[8,10,12, 15, 18, 20, 25],
# #     "max_features":[8, 12, 14, 15, 18, 20, 25, 30, 35],
# #     "criterion": ["friedman_mse",  "mae"],
# #     "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
#     }

# clf_gradboost2 = GridSearchCV(GradientBoostingClassifier(), parameters, cv=5, n_jobs=-1, verbose=1, scoring='accuracy')
# clf_gradboost2.fit(X_tr_q1q2, y_train)

# print("Best parameters found: ",clf_gradboost2.best_params_)
# print("Accuracy score (training): {0:.3f}".format(clf_gradboost2.score(X_tr_q1q2, y_train)))
# print("Accuracy score (validation): {0:.3f}".format(clf_gradboost2.score(X_te_q1q2,y_test)))

### Logistic

In [None]:
# Logistic regression (liblinear solver) fitted on the stacked
# question1/question2 features of the training split.
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear")
logistic.fit(X_tr_q1q2, y_train)

In [None]:
# Hard 0/1 predictions on the held-out split (kept under the original name).
pred_X_te_q1q2 = logistic.predict(X_te_q1q2)

# log_loss takes (y_true, y_pred) and y_pred must hold probabilities.
# The columns of predict_proba follow logistic.classes_.
proba_X_te_q1q2 = logistic.predict_proba(X_te_q1q2)
logloss = log_loss(y_test, proba_X_te_q1q2)
print('Valid-logloss:', logloss)
# The previously recorded value (8.501325433826551) came from scoring the hard
# labels with the two arguments swapped, so it is not comparable to the
# valid-logloss reported for xgboost.

In [None]:
# Collect the training rows the logistic model gets wrong.
mistake_indices, predictions = get_mistakes(
    clf=logistic,
    df=train_df,
    X_q1q2=X_tr_q1q2,
    y=y_train,
)

In [None]:
# Show the second misclassified training pair of the logistic model.
print_mistake_k(k=1, mistake_indices=mistake_indices, predictions=predictions)

In [None]:
# Inspect one logistic-model mistake in detail: questions, tokens and probabilities.
print_mistake_k_and_tokens(
    k=10,
    mistake_indices=mistake_indices,
    predictions=predictions,
    X_q1q2=X_tr_q1q2,
    count_vect=count_vect,
    clf=logistic,
)

# TFIDF