In [1]:
import numpy as np
import pandas as pd
import scipy
import scipy.sparse
import sklearn.linear_model
from sklearn import *

from SimpleCountVectorizer import *
from TFIDFVectorizer import *

# Count Vectorizer

In [2]:
# Load the train and test question-pair CSVs; paths are relative to the notebook's working directory.
train_df = pd.read_csv("./data/quora_train_data.csv")
test_df = pd.read_csv('./data/quora_test_data.csv')

# Disabled alternative: carve a 10% hold-out out of the training file instead of reading the test CSV.
# train_df, test_df = sklearn.model_selection.train_test_split(train_df, test_size=0.1, random_state=123)

In [3]:
# (rows, columns) of the train and test dataframes, shown as the cell output.
train_df.shape, test_df.shape

((323432, 6), (80858, 6))

In [4]:
def cast_list_as_strings(mylist):
    """
    Convert every element of ``mylist`` to ``str`` and return them as a new list.

    The input must be an actual ``list``; any other type trips the assertion
    below and raises ``AssertionError``.
    """
    assert isinstance(mylist, list), f"the input mylist should be a list it is {type(mylist)}"

    return list(map(str, mylist))

In [5]:
# Corpus used to fit the vectorizers: every question1 followed by every question2 of the training split.
all_questions = cast_list_as_strings(
    [*train_df.loc[:, 'question1'], *train_df.loc[:, 'question2']]
)
# Sanity check: after the cast only str instances should remain.
print({type(question).__name__ for question in all_questions})

{'str'}


In [6]:
# Fit the count vectorizer on the pooled training questions; later cells read
# count_vect.vocabulary, count_vect.word_to_ind and count_vect.tokenize from it.
count_vect = SimpleCountVectorizer()
count_vect.fit(all_questions)

HBox(children=(IntProgress(value=0, max=646864), HTML(value='')))




SimpleCountVectorizer(doc_cleaner_func=None, doc_cleaner_pattern='[^a-zA-Z]',
                      dtype=<class 'numpy.float32'>, min_word_counts=1,
                      token_pattern='(?u)\\b\\w\\w+\\b', tokenizer_func=None,
                      word_transformer_func=None)

In [7]:
def get_features_from_df(df, vectorizer):
    """
    Build the feature matrix for a dataframe of question pairs.

    The ``question1`` and ``question2`` columns are cast to lists of strings,
    each list is passed through ``vectorizer.transform``, and the two results
    are stacked side by side with ``scipy.sparse.hstack``. Every row therefore
    holds the question1 features followed by the question2 features.
    """
    question_columns = ("question1", "question2")

    # Cast both columns first, then transform them, in column order.
    texts_per_column = [cast_list_as_strings(list(df[name])) for name in question_columns]
    features_per_column = [vectorizer.transform(texts) for texts in texts_per_column]

    return scipy.sparse.hstack(tuple(features_per_column))

In [8]:
# Vectorize both splits with the count vectorizer that was fitted on the training questions.
X_tr_q1q2 = get_features_from_df(train_df,count_vect)
X_te_q1q2  = get_features_from_df(test_df, count_vect)

# Shape check: each feature matrix should have as many rows as its source dataframe.
X_tr_q1q2.shape, train_df.shape, test_df.shape, X_te_q1q2.shape

((323432, 144312), (323432, 6), (80858, 6), (80858, 144312))

In [9]:
# Target arrays: the is_duplicate column of each split as a NumPy array.
y_train, y_test = train_df["is_duplicate"].values, test_df["is_duplicate"].values

In [10]:
# Logistic regression trained on the count-vectorizer features.
# NOTE(review): the name `logistic` is rebound to the TF-IDF model in a later cell,
# so re-running cells out of order can silently evaluate the wrong model.
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear")
logistic.fit(X_tr_q1q2, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
# (train score, test score) of the count-feature model.
logistic.score(X_tr_q1q2, y_train), logistic.score(X_te_q1q2, y_test)

(0.8086862153404735, 0.7522323084914294)

In [12]:
def get_mistakes(clf, X_q1q2, y):
    """
    Find the rows that a fitted classifier labels incorrectly.

    Parameters
    ----------
    clf : object exposing ``predict(X)``.
    X_q1q2 : feature matrix handed to ``clf.predict``.
    y : array-like of true labels, aligned with the rows of ``X_q1q2``.

    Returns
    -------
    (incorrect_indices, predictions)
        ``incorrect_indices`` is a 1-D integer array with the row positions
        where the prediction differs from ``y`` (empty when there are none);
        ``predictions`` is the full output of ``clf.predict``.

    The pair is returned even when there are no mistakes, so callers that
    unpack two values never receive ``None``. The "no mistakes" message is
    still printed in that case.
    """
    predictions = clf.predict(X_q1q2)
    incorrect_indices = np.flatnonzero(predictions != np.asarray(y))

    if incorrect_indices.size == 0:
        print("no mistakes in this df")

    return incorrect_indices, predictions
    
def print_mistake_k(k, mistake_indices, predictions, df=None):
    """
    Print the k-th misclassified question pair with its true and predicted class.

    Parameters
    ----------
    k : position inside ``mistake_indices`` of the mistake to show.
    mistake_indices : row positions of the misclassified examples.
    predictions : predicted labels, indexed by row position.
    df : dataframe the row positions refer to. It must be the same split the
        predictions were computed on (e.g. ``test_df`` for test-set mistakes),
        otherwise the printed questions do not match the prediction.
        When omitted, the notebook-level ``train_df`` is used, which keeps
        existing three-argument calls working as before.
    """
    if df is None:
        df = train_df

    row_position = mistake_indices[k]
    row = df.iloc[row_position]

    print(row.question1)
    print(row.question2)
    print("true class:", row.is_duplicate)
    print("prediction:", predictions[row_position])
    
def print_mistake_k_and_tokens(k, mistake_indices, predictions,
                               X_q1q2, count_vect, clf, df=None):
    """
    Print the k-th misclassified pair together with its tokens and class probabilities.

    For each of the two questions the raw text and the sorted output of
    ``count_vect.tokenize`` are printed, followed by the true class, the
    predicted class and the row of ``clf.predict_proba(X_q1q2)`` for that example.

    Parameters
    ----------
    k : position inside ``mistake_indices`` of the mistake to show.
    mistake_indices : row positions of the misclassified examples.
    predictions : predicted labels, indexed by row position.
    X_q1q2 : feature matrix the predictions were computed from.
    count_vect : object exposing ``tokenize(text)``.
    clf : classifier exposing ``predict_proba(X)``.
    df : dataframe the row positions refer to. It must be the split that
        ``X_q1q2`` was built from, otherwise the printed questions belong to a
        different row than the probability vector. When omitted, the
        notebook-level ``train_df`` is used, which keeps existing calls working.
    """
    if df is None:
        df = train_df

    row_position = mistake_indices[k]
    row = df.iloc[row_position]

    # Show each question followed by its sorted tokens and a blank separator line.
    for question in (row.question1, row.question2):
        print(question)
        print(sorted(count_vect.tokenize(question)))
        print("")

    print("true class:", row.is_duplicate)
    print("prediction:", predictions[row_position])
    print("")
    print("Probability vector: [P(0|x), P(1|x)]:")
    # Probabilities are computed for the whole matrix and then indexed by row.
    print(clf.predict_proba(X_q1q2)[row_position, :])


In [13]:
# Misclassified row positions and all predictions of the count-feature model on the test split.
mistake_indices, predictions = get_mistakes(logistic, X_te_q1q2, y_test)

In [14]:
# NOTE(review): mistake_indices are row positions in the test split, but this call makes
# print_mistake_k read the rows from train_df, so the printed questions and "true class"
# may belong to a different pair than the prediction. Verify against test_df.
print_mistake_k(0, mistake_indices, predictions)

How do I study for Honeywell company recruitment?
How do I study for Honeywell company recruitments?
true class: 1
prediction: 0


In [15]:
# NOTE(review): the probability vector comes from the test features (X_te_q1q2) while the
# questions are looked up in train_df at the same row position; verify against test_df.
print_mistake_k_and_tokens(0, mistake_indices, predictions,
                           X_te_q1q2, count_vect, logistic)

How do I study for Honeywell company recruitment?
['company', 'do', 'for', 'honeywell', 'how', 'recruitment', 'study']

How do I study for Honeywell company recruitments?
['company', 'do', 'for', 'honeywell', 'how', 'recruitments', 'study']

true class: 1
prediction: 0

Probability vector: [P(0|x), P(1|x)]:
[0.69260781 0.30739219]


# TFIDF

In [16]:
# Build the TF-IDF vectorizer from the fitted count vectorizer's vocabulary, word index
# and tokenizer, then fit it on the same pooled training questions.
tfidf_vectorizer = TFIDFVectorizer(count_vect.vocabulary, count_vect.word_to_ind, count_vect.tokenize)
tfidf_vectorizer.fit(all_questions)

TFIDF fit finished in 10.22 seconds


In [17]:
# Vectorize both splits with the TF-IDF vectorizer (one transform per question column).
X_tfidf_tr_q1q2 = get_features_from_df(train_df, tfidf_vectorizer)
X_tfidf_te_q1q2  = get_features_from_df(test_df, tfidf_vectorizer)

# Shape check: each feature matrix should have as many rows as its source dataframe.
X_tfidf_tr_q1q2.shape, train_df.shape, test_df.shape, X_tfidf_te_q1q2.shape

TFIDF transform finished in 7.08 seconds
TFIDF transform finished in 7.78 seconds
TFIDF transform finished in 1.94 seconds
TFIDF transform finished in 1.79 seconds


((323432, 144312), (323432, 6), (80858, 6), (80858, 144312))

In [18]:
# Same model configuration as before, now trained on the TF-IDF features.
# NOTE(review): this rebinds `logistic`, discarding the model fitted on the count features.
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear")
logistic.fit(X_tfidf_tr_q1q2, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
# (train score, test score) of the TF-IDF model.
logistic.score(X_tfidf_tr_q1q2, y_train), logistic.score(X_tfidf_te_q1q2, y_test)

(0.7930878824606099, 0.7559672512305523)

In [20]:
# Misclassified row positions and all predictions of the TF-IDF model on the test split
# (overwrites the values computed for the count-feature model).
mistake_indices, predictions = get_mistakes(logistic, X_tfidf_te_q1q2, y_test)

In [21]:
# NOTE(review): mistake_indices are row positions in the test split, but this call makes
# print_mistake_k read the rows from train_df; verify the printed pair against test_df.
print_mistake_k(0, mistake_indices, predictions)

How do I study for Honeywell company recruitment?
How do I study for Honeywell company recruitments?
true class: 1
prediction: 0


In [22]:
# NOTE(review): the probability vector comes from the test features (X_tfidf_te_q1q2) while
# the questions are looked up in train_df at the same row position; verify against test_df.
print_mistake_k_and_tokens(0, mistake_indices, predictions,
                           X_tfidf_te_q1q2, tfidf_vectorizer, logistic)

How do I study for Honeywell company recruitment?
['company', 'do', 'for', 'honeywell', 'how', 'recruitment', 'study']

How do I study for Honeywell company recruitments?
['company', 'do', 'for', 'honeywell', 'how', 'recruitments', 'study']

true class: 1
prediction: 0

Probability vector: [P(0|x), P(1|x)]:
[0.68263501 0.31736499]


In [None]:
######### SEARCH QUESTIONS WITH SAME TOKENS ###########
# Disabled exploratory code (everything below is commented out and does not run).
# It collects the training rows whose question1 and question2 count vectors are
# identical, then displays those rows that are not labelled as duplicates.

# q1_casted =  cast_list_as_strings(list(train_df["question1"]))
# q2_casted =  cast_list_as_strings(list(train_df["question2"]))

# q1 = count_vect.transform(q1_casted)
# q2 = count_vect.transform(q2_casted)

# same_tokens_idxs = []
# for i in range(len(q1_casted)):
#     same_features = ( q1[i] != q2[i] ).nnz == 0
#     if same_features:
#         same_tokens_idxs.append(i)
#     if i % 500 == 0: print(i)

# same_tokens_idxs = np.array(same_tokens_idxs)


# same_tokens = train_df.iloc[same_tokens_idxs]
# duplicates = same_tokens['is_duplicate'] == 1

# same_tokens[~duplicates]