In [1]:
import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('universal_tagset')

from nltk.corpus import stopwords

import pandas as pd
import scipy
from sklearn import *
import re

from SimpleCountVectorizer import *
from SimpleCountVectorizerAMC import *

from TFIDFVectorizer import *
from utils import *

from nltk.stem import WordNetLemmatizer, SnowballStemmer

import xgboost as xgb

# Count Vectorizer

In [2]:
train_df = pd.read_csv("./data/quora_train_data.csv")
test_df = pd.read_csv('./data/quora_test_data.csv')

In [3]:
train_df.shape, test_df.shape

((323432, 6), (80858, 6))

In [4]:
all_questions = cast_list_as_strings(list(train_df.loc[:, 'question1'])+list(train_df.loc[:, 'question2']))
print(set(type(x).__name__ for x in all_questions))

{'str'}


### Document cleaner

This function manages the cleaning of a document. After trying several things, we decided to simply filter by alphanumeric characters and replace the upper case with lower case.

In [5]:
def my_doc_cleaner(doc,
                  pat=r"[^a-zA-Z0-9]"):
    """
    Document cleaner. We allow alphanumeric characters.
    """
    
    # Allow alphanumeric characters
    doc_cleaner_pattern=pat
    clean_doc_pattern = re.compile(doc_cleaner_pattern)
    doc_clean = clean_doc_pattern.sub(" ", doc)
    return doc.lower()

### Tokenizer function

The tokenization function is the most important function of our CountVectorizer. It is in charge of deciding which tokens will represent a document (or phrase). As we can see, multiple functionalities have been added, which we will detail below:

* **Stopwords**: deactivated by default, it removes the most common English words. 
    This functionality made us reduce the evaluation metrics in that specific problem but it is a good functionality to take into account in future projects.


* **Numbers to words**: allows to solve problems like:
    * Q1: How much is 2+2?
    * Q2: What is the sum of two plus two?
    
    In this case the numbers are converted to their string representation thanks to a function implemented in the utils library.


* **Stemmer and Lemmatizer**: Two great allies of any text model, they serve to standardize the words by converting them to their root word, remove the 's' from the plurals...


* **N-grams**: To improve prediction and not use only tokens, we have introduced tuples of tokens. As in sklearn, we can specify the size of the N-grams with a function parameter.

* **N-tokens**: We added an extra field to indicate the number of tokens of that document. This feature helps to improve accuracy.

* **Duplicate question words**: In order to enhance the type of the question, we duplicate the keyword.

* **Duplicate verbs**: Verbs are extremely important in deciphering the underlying meaning of a sentence. Therefore, we attributed more importance to them via duplication. 

* **Duplicate nouns**: Nouns are extremely important in deciphering the underlying meaning of a sentence. Therefore, we attributed more importance to them via duplication.


In [6]:
# stpw = set(stopwords.words("english"))
stpw = []
question_words = ['who','what','when','where','why','how','which']
stemmer =  SnowballStemmer(language='english')
lemmatizer = WordNetLemmatizer()

In [7]:
def my_tokenizer_func(doc, 
                      ngrams=(1,3), 
                      numbers_to_words=True,
                      stop_words=stpw,
                      duplicate_question_words=question_words,
                      duplicate_verbs=False,
                      duplicate_nouns=True,
                      pat=r"(?u)\b\w\S*\w*\b",
                      lem=True,
                      stem=True,
                      add_num_tokens=True):
    
    # Split using a pattern
    # Notice that the pattern has been changed and now it accepts a wider
    # range of words. 
    
    # Example: V2.3.4 (A version of a program) will be transformed into 'v2.3.4'
    token_pattern = re.compile(pat)
    lst = token_pattern.findall(doc)
    
    # Transform numbers into words
    if numbers_to_words:
        lst = list(map(lambda x: num_conv(x), lst))
        
    # Drop stopwords 
    lst = list(filter(lambda x : x not in stop_words, lst))
    
    # Duplicate key_words
    if len(duplicate_question_words)>0:
        lst += [value for value in lst if value.lower() in duplicate_question_words]
    
     # Duplicate verbs
    if duplicate_verbs:
        lst += [x[0] for x in nltk.pos_tag(lst,tagset='universal') if x[1] == 'VERB' and len(x[0]) > 1]
        
    # Duplicate nouns
    if duplicate_nouns:
        lst += [x[0] for x in nltk.pos_tag(lst,tagset='universal') if x[1] == 'NOUN' and len(x[0]) > 1]
    
    #Stemmer
    if stem:
        lst = list(map(lambda x: stemmer.stem(x), lst))
    
    #Lemmatizer 
    if lem:
        lst = list(map(lambda x: lemmatizer.lemmatize(x), lst))
    
    # N-tokens
    ntoks = []
    if add_num_tokens:
        ntoks = [num_conv(str(len(lst))) + '_tokens']
    
    if ngrams==(1,1):
        return lst+ntoks
    
    # Generate ngrams
    lstRet = []
    for a in range(ngrams[0], ngrams[1]+1):
        if a!=1:
            lstRet+=list(zip(*[lst[i:] for i in range(a)]))
    return lstRet+ntoks if ngrams[0]!=1 else lst+lstRet+ntoks

### Fitting the improved SimpleCountVectorizer

In [12]:
count_vect = SimpleCountVectorizerAMC(
    doc_cleaner_func=my_doc_cleaner,
    tokenizer_func=my_tokenizer_func
)
count_vect.fit(all_questions)

HBox(children=(FloatProgress(value=0.0, max=646864.0), HTML(value='')))




SimpleCountVectorizerAMC(doc_cleaner_func=<function my_doc_cleaner at 0x000001FA8A018EE8>,
                         doc_cleaner_pattern='[^a-zA-Z]',
                         dtype=<class 'numpy.float32'>, min_word_counts=1,
                         token_pattern='(?u)\\b\\w\\w+\\b',
                         tokenizer_func=<function my_tokenizer_func at 0x000001FA87DD19D8>,
                         word_transformer_func=None)

### Transforming the datasets into sparse matrices

In [13]:
def get_features_from_df(df, vectorizer):
    """
    returns a sparse matrix containing the features build by the count vectorizer.
    Each row should contain features from question1 and question2.
    """
    q1_casted =  cast_list_as_strings(list(df["question1"]))
    q2_casted =  cast_list_as_strings(list(df["question2"]))
    
    q1 = vectorizer.transform(q1_casted)
    q2 = vectorizer.transform(q2_casted)
    
    X_q1q2 = scipy.sparse.hstack((q1,q2))
    
    return X_q1q2

In [14]:
%time X_tr_q1q2 = get_features_from_df(train_df,count_vect)
%time X_te_q1q2  = get_features_from_df(test_df, count_vect)

X_tr_q1q2.shape, train_df.shape, test_df.shape, X_te_q1q2.shape

Wall time: 16min 27s
Wall time: 4min 5s


((323432, 9425768), (323432, 6), (80858, 6), (80858, 9425768))

In [15]:
y_train = train_df["is_duplicate"].values
y_test = test_df['is_duplicate'].values

## Trying with a simple model (Linear Regression)

In [31]:
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear", verbose=1, max_iter=100)
logistic.fit(X_tr_q1q2, y_train)

[LibLinear]

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=1,
                   warm_start=False)

In [32]:
logistic.score(X_tr_q1q2, y_train), logistic.score(X_te_q1q2, y_test)

(0.998689059833288, 0.8136486185658809)

## Improving results (XGBoost)

In [6]:
import xgboost as xgb
xgb_model = xgb.Booster({'nthread': 4})  # init model
bst.load_model('model.bin')  # load data
# xgb_model = xgb.XGBClassifier(n_estimators=10000)
dir(xgb_model)
xgb

['__class__',
 '__copy__',
 '__deepcopy__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_validate_features',
 'attr',
 'attributes',
 'boost',
 'booster',
 'copy',
 'dump_model',
 'eval',
 'eval_set',
 'feature_names',
 'get_dump',
 'get_fscore',
 'get_score',
 'get_split_value_histogram',
 'handle',
 'load_config',
 'load_model',
 'load_rabit_checkpoint',
 'predict',
 'save_config',
 'save_model',
 'save_rabit_checkpoint',
 'save_raw',
 'set_attr',
 'set_param',
 'trees_to_dataframe',
 'update']

In [36]:
import xgboost as xgb

N = 10000 # With early stopping
xgb_model = xgb.XGBClassifier(n_estimators=N)
xgb_model.fit(X_tr_q1q2, y_train, 
              verbose=10, 
              eval_set=[(X_tr_q1q2, y_train),(X_te_q1q2, y_test)], 
              early_stopping_rounds =10,
              eval_metric=['auc','logloss'],
              )

## Training curves

In [37]:
from matplotlib import pyplot as plt
%matplotlib inline

results = xgb_model.evals_result()
epochs = len(results['validation_0']['logloss'])
x_axis = range(0, epochs)

fig = plt.figure(figsize=(20,6))

# plot log loss
ax = fig.add_subplot(121)
ax.plot(x_axis, results['validation_0']['logloss'], label='Train')
ax.plot(x_axis, results['validation_1']['logloss'], label='Test')
ax.legend()
ax.set_ylabel('Log Loss')
ax.set_title('XGBoost Log Loss')

# plot classification AUC
ax = fig.add_subplot(122)
ax.plot(x_axis, results['validation_0']['auc'], label='Train')
ax.plot(x_axis, results['validation_1']['auc'], label='Test')
ax.legend()
ax.set_ylabel('Classification AUC')
ax.set_title('XGBoost Classification AUC')
plt.show()

### Save the model

In [38]:
xgb_model.save_model('models/model_count.dat')

In [None]:
xgb.compat()

In [9]:
dir(xgb)

['Booster',
 'DMatrix',
 'RabitTracker',
 'VERSION_FILE',
 'XGBClassifier',
 'XGBModel',
 'XGBRFClassifier',
 'XGBRFRegressor',
 'XGBRanker',
 'XGBRegressor',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 'callback',
 'compat',
 'core',
 'cv',
 'dask',
 'f',
 'libpath',
 'os',
 'plot_importance',
 'plot_tree',
 'plotting',
 'rabit',
 'sklearn',
 'sys',
 'to_graphviz',
 'tracker',
 'train',
 'training',

# TFIDF

In our case, TFIDF has a lower performance than SimpleCountVectorizer (with the default parameters it already had it). We have managed to raise the score a little bit although we only use the CountVectorizer implemented at the beginning to predict.

In [22]:
tfidf_vectorizer = TFIDFVectorizer(count_vect.vocabulary, count_vect.word_to_ind, count_vect.tokenize)
tfidf_vectorizer.fit(all_questions)

HBox(children=(FloatProgress(value=0.0, description='Building corpus: ', max=646864.0, style=ProgressStyle(des…


TFIDF fit finished in 1004.63 seconds


In [23]:
X_tfidf_tr_q1q2 = get_features_from_df(train_df, tfidf_vectorizer)
X_tfidf_te_q1q2 = get_features_from_df(test_df, tfidf_vectorizer)

X_tfidf_tr_q1q2.shape, train_df.shape, test_df.shape, X_tfidf_te_q1q2.shape

TFIDF transform finished in 496.35 seconds
TFIDF transform finished in 519.39 seconds
TFIDF transform finished in 128.98 seconds
TFIDF transform finished in 129.89 seconds


((323432, 9425766), (323432, 6), (80858, 6), (80858, 9425766))

In [29]:
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear", verbose=1, max_iter=1000)
logistic.fit(X_tfidf_tr_q1q2, y_train)

logistic.score(X_tfidf_tr_q1q2, y_train), logistic.score(X_tfidf_te_q1q2, y_test)

[LibLinear]

(0.9039334388681393, 0.7883078977961364)

In [30]:
N = 10000 # With early stopping
xgb_model = xgb.XGBClassifier(n_estimators=N)
xgb_model.fit(X_tfidf_tr_q1q2, y_train, 
              verbose=10, 
              eval_set=[(X_tfidf_tr_q1q2, y_train),(X_tfidf_te_q1q2, y_test)], 
              early_stopping_rounds =10,
              eval_metric=['auc','logloss'],
              )

KeyboardInterrupt: 

In [43]:
xgb_model.save_model('models/model_tfidf.dat')

Cosas importantes:
He subido el notebook final 1: F1_Building_the_model
He separado algunas funciones en una libreria utils.py (int2num, cast2int)
He creado tambien una libreria mistakes.py con las cuatro funciones de mistakes. Quien escriba el notebook, que lo tenga en cuenta de no ponerlas en el notebook, solo importar 

from utils import *
from mistakes import *

He generado dos modelos finales model_count.dat, model_tfidf.dat, que pueden ser cargados y comparados con los de sklearn. El primero llega a una AUC de 88% i el segundo a 80%

In [10]:
import xgboost as xgb
N = 10000 # With early stopping
xgb_model = xgb.XGBClassifier(n_estimators=N)
dir(xgb_model)
xgb_model.load_model('model_tfidf.dat')

In [None]:
xgb_model.predict()