In [14]:
import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('universal_tagset')

from nltk.corpus import stopwords

import pandas as pd
import scipy
from sklearn import *
import re

from SimpleCountVectorizer import *
from SimpleCountVectorizerAMC import *

from TFIDFVectorizer import *
from utils import *

from nltk.stem import WordNetLemmatizer, SnowballStemmer
import pickle
import xgboost as xgb

# Count Vectorizer

In [15]:
# Read the train and test splits of the Quora question-pair data from ./data.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/quora_train_data.csv", './data/quora_test_data.csv')
)

In [16]:
# Display (rows, columns) for the train split followed by the test split.
tuple(frame.shape for frame in (train_df, test_df))

((323432, 6), (80858, 6))

In [17]:
# Pool every training question (question1 column first, then question2) into a
# single list and pass it through the cast_list_as_strings helper from utils.
all_questions = cast_list_as_strings(
    [question
     for column in ('question1', 'question2')
     for question in train_df.loc[:, column]]
)
# Sanity check: print the set of element type names found in the pooled list.
print({type(question).__name__ for question in all_questions})

{'str'}


### Tokenizer function

The tokenization function is the most important function of our CountVectorizer: it decides which tokens will represent a document (or phrase). Several functionalities have been added to it, which we detail below:

* **Stopwords**: disabled by default; when enabled, it removes the most common English words.
    Enabling this option lowered our evaluation metrics on this specific problem, but it remains a useful option to consider in future projects.


* **Numbers to words**: allows us to handle cases like:
    * Q1: How much is 2+2?
    * Q2: What is the sum of two plus two?
    
    In this case the numbers are converted to their word form (e.g. 2 becomes "two") by a function implemented in the utils library.


* **Stemmer and Lemmatizer**: Two great allies of any text model, they standardize words by reducing them to their root form, for example by removing the 's' from plurals.


* **N-grams**: To improve prediction and not use only tokens, we have introduced tuples of tokens. As in sklearn, we can specify the size of the N-grams with a function parameter.

* **N-tokens**: We added an extra field to indicate the number of tokens of that document. This feature helps to improve accuracy.

* **Duplicate question words**: To emphasize the type of question being asked, we duplicate its question word (what, why, when, who, etc.).

* **Duplicate verbs**: Verbs are extremely important in deciphering the underlying meaning of a sentence. Therefore, we attributed more importance to them via duplication. 

* **Duplicate nouns**: Nouns are extremely important in deciphering the underlying meaning of a sentence. Therefore, we attributed more importance to them via duplication.


### Fitting the improved SimpleCountVectorizer
+ Thanks to pickle, the fitted vectorizer can be loaded directly from disk instead of being refit. For more details on the process, see the notebook **F1_Building_the_model**.

In [18]:
# count_vect = pickle.load(open("models/CountVect.pkl", 'rb'))

### Transforming the datasets into sparse matrices

In [19]:
# Read the pre-computed sparse feature matrices for the train and test splits.
X_tr_q1q2, X_te_q1q2 = (
    scipy.sparse.load_npz(npz_path)
    for npz_path in ('models/X_tr_q1q2.npz', 'models/X_te_q1q2.npz')
)

# Show each feature matrix's shape next to its source frame's shape so the
# row counts can be compared at a glance.
tuple(obj.shape for obj in (X_tr_q1q2, train_df, X_te_q1q2, test_df))

((323432, 9425768), (323432, 6), (80858, 9425768), (80858, 6))

In [24]:
# Pull the is_duplicate column out of each split as an array of target labels.
y_train = train_df.loc[:, "is_duplicate"].values
y_test = test_df.loc[:, 'is_duplicate'].values

# Display both label-array shapes.
tuple(labels.shape for labels in (y_train, y_test))

((323432,), (80858,))

## Base model (Logistic Regression)

In [31]:
# Load the pre-trained baseline model from disk and score it on both splits.
# The file is opened in a `with` block so the handle is closed as soon as the
# model has been unpickled (the bare open() call previously leaked it).
# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only load model files that were produced by this project.
with open("models/model_lr_count.pkl", 'rb') as model_file:
    loaded_linear_reg = pickle.load(model_file)

# score() is evaluated on the sparse feature matrices against the true labels.
result_train = loaded_linear_reg.score(X_tr_q1q2, y_train)
result_test = loaded_linear_reg.score(X_te_q1q2, y_test)

print("Accuracy in training:", result_train)
print("Accuracy in testing:",result_test)
# Last expression: display the estimator's repr (its hyper-parameters).
loaded_linear_reg

Accuracy in training: 0.9990075193549185
Accuracy in testing: 0.8131662915234115


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=1,
                   warm_start=False)

## Improving results (XGBoost)

In [36]:
# Load the pickled XGBoost model and score it on both splits.
# The file is opened in a `with` block so the handle is closed as soon as the
# model has been unpickled (the bare open() call previously leaked it).
# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only load model files that were produced by this project.
with open("models/xgboost_model_countvect.pkl", 'rb') as model_file:
    xgb_model_countvect = pickle.load(model_file)

# score() is evaluated on the sparse feature matrices against the true labels.
result_train = xgb_model_countvect.score(X_tr_q1q2, y_train)
result_test = xgb_model_countvect.score(X_te_q1q2, y_test)

print("Accuracy in training:", result_train)
print("Accuracy in testing:",result_test)

In [34]:
# Build an XGBClassifier with N estimators, restore a saved model into it from
# 'models/model_tfidf.dat', and predict on the training feature matrix.
# NOTE(review): the recorded output of this cell is
# `ValueError: attempt to get argmax of an empty sequence`, so it does not run
# cleanly as written.
# NOTE(review): the file name suggests a model trained on TF-IDF features, while
# X_tr_q1q2 is presumably the count-vectorizer matrix -- confirm the features
# match what the model was trained on.
# NOTE(review): the other models in this notebook are restored with
# pickle.load; confirm that load_model() on a freshly constructed
# XGBClassifier restores everything predict() needs.
N=10000
xgb_model = xgb.XGBClassifier(n_estimators=N)
xgb_model.load_model('models/model_tfidf.dat')
xgb_model.predict(X_tr_q1q2)

ValueError: attempt to get argmax of an empty sequence

# Some comments

+ The TFIDF vectorizer underperformed the SimpleCountVectorizer in every attempt we made. We hypothesize that TFIDF vectors are less suited to this specific task. One possible reason is that question words (such as what, why, when, who, etc.) appear in many documents, so TFIDF gives them less weight in the feature vector. This could well cause a strong drop in the performance of any classifier.

