# ALTEGRAD challenge Fall 2017 - Feature Extraction
Can you predict whether two short texts have the same meaning?

https://www.kaggle.com/c/altegrad-challenge-fall-17

The goal of this competition is to predict which of the provided pairs of questions contain two questions with the same meaning.

The ground truth is a set of labels supplied by human experts. This is inherently subjective, as the true meaning of sentences cannot be known with certainty. Human labeling is a 'noisy' process, and different people would probably disagree. As a result, the ground truth labels on this dataset should be taken as indicative rather than 100% accurate, and may include incorrect labels.

In [22]:
# General
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
import re 
import itertools
import operator
import copy
import heapq
import spacy
import nltk
from nltk import pos_tag
from sklearn.model_selection import train_test_split
from collections import Counter
from collections import defaultdict
from scipy.spatial.distance import cosine, euclidean, jaccard

import os
import warnings
import csv

from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore
from nltk.stem import SnowballStemmer
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances

from fuzzywuzzy import fuzz
from jellyfish import jaro_distance, jaro_winkler

import xgboost as xgb
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score

from gensim.models.word2vec import Word2Vec

import spacy
from collections import Counter
# In the default models, the parser is loaded and enabled as part of the standard processing pipeline.
# If you don't need any of the syntactic information, you should disable the parser.
# Disabling the parser will make spaCy load and run much faster.
# NOTE(review): the 'en' shortcut name presumably requires an older spaCy release - confirm the pinned version.
nlp = spacy.load('en', disable=['parser'])

# Silences every Python warning for the rest of the notebook.
warnings.filterwarnings('ignore')

# Preparation of our data

In [3]:
# Load the labelled training pairs; column names are supplied explicitly through `names`.
train = pd.read_csv('data/train.csv', names=['row_ID', 'text_a_ID', 'text_b_ID', 'text_a_text', 'text_b_text', 'target'])
train.head()

Unnamed: 0,row_ID,text_a_ID,text_b_ID,text_a_text,text_b_text,target
0,0,199954,384085,What are the some of the best novels?,What are some of the greatest novels of all ti...,0
1,1,128681,237407,What are the pictures that made you look twice?,What are some amazing pictures one has to see ...,0
2,2,170846,240621,Have the ellectoral college members ever voted...,When has the electoral college voted against t...,1
3,3,55110,177468,Did Ravana really have 10 heads?,Why did Ravana have 10 heads?,1
4,4,425513,400256,What's a book that you feel helped you to impr...,What books or magazines should I read to impro...,0


In [4]:
# (rows, columns) of the training frame.
train.shape

(80100, 6)

In [6]:
# Load the unlabelled test pairs; same columns as the training file minus `target`.
test = pd.read_csv('data/test.csv', names=['row_ID', 'text_a_ID', 'text_b_ID', 'text_a_text', 'text_b_text'])
test.head()

Unnamed: 0,row_ID,text_a_ID,text_b_ID,text_a_text,text_b_text
0,0,245776,2705,What are the best sites to book a hotel online?,What is the best hotel booking service?
1,1,104796,48346,How can I stop masturbation?,How can I stop doing masturbation?
2,2,41770,383018,Which is the best way to control anger?,What is the best way to control your anger?
3,3,81132,401393,Why is my Miniature Pinscher/Chihuahua mix afr...,Why is my Black Lab/Pitbull mix puppy afraid o...
4,4,244572,7520,How do I get rid off from porn addiction?,What is the best way to overcome an porn addic...


In [7]:
# Placeholder label (not a real target): gives `test` the same columns as `train`
# so the same functions can be reused on both frames. Mutates `test` in place.
test['target'] = 7
test.shape

(20179, 6)

In [10]:
# Full dataset: training rows followed by test rows (original row indices are kept,
# so index labels repeat between the two parts).
df_all_texts = pd.concat([train, test])
df_all_texts.shape

(100279, 6)

# Cleaning

In [33]:
def clean_text_simple(text, remove_stopwords=True, pos_filtering=True, stemming=True):
    """Normalise a raw question into a space-joined string of cleaned tokens.

    The text is lower-cased, every character of ``string.punctuation`` is
    removed (hyphens included, so intra-word dashes are NOT preserved) and the
    result is tokenised on whitespace.

    Parameters
    ----------
    text : str
        Raw question text.
    remove_stopwords : bool
        Drop NLTK English stop words and tokens of length 1.
    pos_filtering : bool
        Keep only tokens tagged by ``nltk.pos_tag`` as nouns or adjectives.
    stemming : bool
        Apply the Porter stemmer to every remaining token.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' when nothing is left).
    """
    # Only single characters are ever tested against this set, so the
    # multi-character entries the previous version added could never match.
    punct = set(string.punctuation)
    text = ''.join(ch for ch in text.lower() if ch not in punct)

    # split() with no argument collapses any whitespace run and returns [] for
    # empty text, so no '' token is ever handed to the tagger or the stemmer.
    tokens = text.split()

    if pos_filtering:
        # POS tag and retain only nouns and adjectives
        kept_tags = {'NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJS', 'JJR'}
        tokens = [word for word, tag in pos_tag(tokens) if tag in kept_tags]

    if remove_stopwords:
        # The stop-word list is loaded on first use only and cached on the
        # function, instead of being rebuilt on every call.
        english_stopwords = getattr(clean_text_simple, '_stopword_cache', None)
        if english_stopwords is None:
            english_stopwords = frozenset(stopwords.words('english'))
            clean_text_simple._stopword_cache = english_stopwords
        tokens = [token for token in tokens
                  if token not in english_stopwords and len(token) > 1]

    if stemming:
        # apply Porter's stemmer
        stemmer = nltk.stem.PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]

    return ' '.join(tokens)

### Construction of the pairs and texts arrays

In [34]:
def construct_pairs(train, test, remove_stopwords= True, pos_filtering=False, stemming = True):
    """Clean every distinct question once and list the ID pairs of both frames.

    Rows are read positionally: columns 1/2 hold the two question IDs,
    columns 3/4 the matching raw texts and column 5 the label.

    Returns
    -------
    texts : dict
        question ID -> cleaned text (first occurrence wins).
    pairs_train, pairs_test : list of [id_a, id_b]
    y_train, y_true : list of int
        Column 5 of ``train`` / ``test`` cast to int.
    ids2ind : dict
        question ID -> position of that ID in ``texts`` (insertion order).
    """
    texts = {}

    def _collect(frame):
        # Walk one frame, filling `texts` as new IDs appear.
        pair_ids = []
        labels = []
        for row in frame.values:
            for id_pos, text_pos in ((1, 3), (2, 4)):
                question_id = row[id_pos]
                if question_id not in texts:
                    texts[question_id] = clean_text_simple(
                        row[text_pos],
                        remove_stopwords=remove_stopwords,
                        pos_filtering=pos_filtering,
                        stemming=stemming,
                    )
            pair_ids.append([row[1], row[2]])
            labels.append(int(row[5]))
        return pair_ids, labels

    pairs_train, y_train = _collect(train)
    pairs_test, y_true = _collect(test)

    # Row index of each unique text in the TF-IDF matrix.
    ids2ind = {qid: position for position, qid in enumerate(texts)}

    return texts, pairs_train, pairs_test, y_train, y_true, ids2ind

## Feature engineering

### Tf-Idf

In [35]:
def tfIdf(texts):
    """Fit a default ``TfidfVectorizer`` on the values of ``texts`` and return the matrix.

    Rows follow the iteration order of ``texts.values()``.
    """
    vectorizer = TfidfVectorizer()
    corpus = texts.values()
    return vectorizer.fit_transform(corpus)

### Spacy tags
https://spacy.io/usage/linguistic-features

In [36]:
def spacy_tag(sentence):
    """Run the module-level spaCy pipeline ``nlp`` on ``sentence`` and count its ``pos_`` tags."""
    doc = nlp(sentence)
    return Counter(token.pos_ for token in doc)

In [37]:
def similarity(count_tags1, count_tags2):
    """Normalised L1 distance between two tag-count mappings.

    Despite the name this is a distance: 0 means identical tag counts and
    1 means no tag in common.

    Parameters
    ----------
    count_tags1, count_tags2 : mapping of tag -> count (e.g. ``Counter``)

    Returns
    -------
    float
        ``sum(|c1[t] - c2[t]|) / (sum(c1) + sum(c2))`` over every tag present
        in either mapping, or ``np.nan`` when both mappings are empty (the
        value the former implicit 0/0 division produced).
    """
    total = sum(count_tags1.values()) + sum(count_tags2.values())
    if total == 0:
        return np.nan

    # One pass over the union of tags replaces the repeated list membership
    # and .index() scans of the previous version.
    all_tags = set(count_tags1.keys()) | set(count_tags2.keys())
    diff = sum(abs(count_tags1.get(tag, 0) - count_tags2.get(tag, 0)) for tag in all_tags)

    return diff / total  # We normalize

### Number of words in common and derived features (word operations)

In [38]:
def common_words(q1, q2):
    """Count the distinct items that appear in both ``q1`` and ``q2``."""
    shared = set(q1) & set(q2)
    return len(shared)

def diff_words(q1, q2):
    """Count the words that the two space-separated strings do not share.

    Every occurrence of a ``q1`` word missing from ``q2`` counts once; every
    distinct ``q2`` word missing from ``q1`` counts once.

    The previous version tested ``tag not in (everseen and q2)``, which
    evaluates to ``tag not in q2`` and is therefore always False inside the
    loop over ``q2``: words found only in ``q2`` were never counted.

    Parameters
    ----------
    q1, q2 : str
        Texts split on single spaces.

    Returns
    -------
    int
    """
    words1 = q1.split(' ')
    words2 = q2.split(' ')

    vocab1 = set(words1)
    vocab2 = set(words2)

    only_in_q1 = sum(1 for word in words1 if word not in vocab2)
    only_in_q2 = len(vocab2 - vocab1)
    return only_in_q1 + only_in_q2

def total_unique_words(q1, q2):
    """Size of the union of the distinct items of ``q1`` and ``q2``."""
    vocabulary = set(q1) | set(q2)
    return len(vocabulary)

def total_unq_words_stop(q1, q2):
    """Number of distinct items across ``q1`` and ``q2`` that are not NLTK English stop words."""
    stops = set(stopwords.words('english'))
    vocabulary = set(q1).union(q2)
    return sum(1 for word in vocabulary if word not in stops)

def wc_diff(q1, q2):
    """Absolute difference between the lengths of ``q1`` and ``q2``."""
    n1, n2 = len(q1), len(q2)
    return abs(n1 - n2)

def wc_ratio(q1, q2):
    """Return ``len(q2) / len(q1)``; 0.0 when ``q1`` is empty, NaN when ``q2`` is empty."""
    n1 = float(len(q1))
    n2 = len(q2)
    if n2 == 0:
        return np.nan
    # NOTE(review): the condition is a truthiness test on n1 / n2, so it only
    # separates n1 == 0 from n1 != 0; a comparison may have been intended - confirm.
    return n2 / n1 if n1 / n2 else n1 / n2
    
def wc_diff_unique(q1, q2):
    """Absolute difference between the numbers of distinct items in ``q1`` and ``q2``."""
    distinct1 = len(set(q1))
    distinct2 = len(set(q2))
    return abs(distinct1 - distinct2)

def wc_ratio_unique(q1, q2):
    """Same as ``wc_ratio`` but on the numbers of distinct items of ``q1`` and ``q2``."""
    n1 = float(len(set(q1)))
    n2 = len(set(q2))
    if n2 == 0:
        return np.nan
    # NOTE(review): truthiness test, only distinguishes n1 == 0 - confirm the intent.
    return n2 / n1 if n1 / n2 else n1 / n2

def wc_diff_unique_stop(q1, q2):
    """Absolute difference between the distinct non-stop-word counts of ``q1`` and ``q2``."""
    stops = set(stopwords.words('english'))
    kept1 = set(q1) - stops
    kept2 = set(q2) - stops
    return abs(len(kept1) - len(kept2))

def wc_ratio_unique_stop(q1, q2):
    """Same as ``wc_ratio`` but on the distinct non-stop-word counts of ``q1`` and ``q2``."""
    stops = set(stopwords.words('english'))
    n1 = float(len(set(q1) - stops))
    n2 = len(set(q2) - stops)
    if n2 == 0:
        return np.nan
    # NOTE(review): truthiness test, only distinguishes n1 == 0 - confirm the intent.
    return n2 / n1 if n1 / n2 else n1 / n2

def same_start_word(q1, q2):
    """1 if both sequences start with the same item, 0 otherwise; NaN if either is empty."""
    if q1 and q2:
        return int(q1[0] == q2[0])
    return np.nan

def same_last_word(q1, q2):
    """1 if both sequences end with the same item, 0 otherwise; NaN if either is empty."""
    if q1 and q2:
        return int(q1[-1] == q2[-1])
    return np.nan

def char_diff(q1, q2):
    """Absolute difference between the total character counts of the two word lists."""
    chars1 = sum(len(word) for word in q1)
    chars2 = sum(len(word) for word in q2)
    return abs(chars1 - chars2)

def char_ratio(q1, q2):
    """Return chars(q2) / chars(q1); 0.0 when ``q1`` has no characters, NaN when ``q2`` has none."""
    chars1 = sum(len(word) for word in q1)
    chars2 = sum(len(word) for word in q2)
    if chars2 == 0:
        return np.nan
    # NOTE(review): truthiness test, only distinguishes chars1 == 0 - confirm the intent.
    return chars2 / chars1 if chars1 / chars2 else chars1 / chars2

def char_diff_unique_stop(q1, q2):
    """Absolute difference in total characters of the distinct non-stop words of ``q1`` and ``q2``."""
    stops = set(stopwords.words('english'))
    chars1 = sum(len(word) for word in set(q1) if word not in stops)
    chars2 = sum(len(word) for word in set(q2) if word not in stops)
    return abs(chars1 - chars2)

def word_match_share(q1, q2):
    """Share of distinct non-stop words common to both questions.

    Returns ``2 * |shared| / (|q1 words| + |q2 words|)`` over distinct
    non-stop words, or 0 when either side has none.
    """
    stops = set(stopwords.words('english'))
    q1words = {word for word in q1 if word not in stops}
    q2words = {word for word in q2 if word not in stops}
    if not q1words or not q2words:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0
    shared = q1words & q2words
    return (len(shared) + len(shared)) / (len(q1words) + len(q2words))

### Fuzzy String Matching
Calculate edit distances between each question pair (Levenshtein, Jaro, Jaro-Winkler, ...).


In [39]:
def fuzzy(q1_text, q2_text):
    """Return seven string-similarity scores for a pair of texts.

    Positions 0-4 are the fuzzywuzzy scores (ratio, partial_ratio,
    token_sort_ratio, token_set_ratio, partial_token_sort_ratio) divided by
    100; positions 5-6 are jellyfish ``jaro_distance`` and ``jaro_winkler``
    computed on the raw strings.
    """
    q1_tokens = q1_text.split()
    q2_tokens = q2_text.split()

    # NOTE(review): the fuzzywuzzy scorers receive token lists rather than
    # strings - confirm this is intended.
    scorers = (
        fuzz.ratio,
        fuzz.partial_ratio,
        fuzz.token_sort_ratio,
        fuzz.token_set_ratio,
        fuzz.partial_token_sort_ratio,
    )
    fuzzy_distances = np.array(
        [scorer(q1_tokens, q2_tokens) for scorer in scorers],
        dtype='float',
    )
    # Normalize to [0 - 1] range.
    fuzzy_distances /= 100

    jelly_distances = np.array([
        jaro_distance(q1_text, q2_text),
        jaro_winkler(q1_text, q2_text),
    ])

    return np.concatenate([fuzzy_distances, jelly_distances])

### Character N-Gram Jaccard Index
Calculate Jaccard similarities between sets of character $n$-grams for different values of $n$.

In [40]:
# Character n-gram sizes used for the Jaccard features: n = 2, 3, 4, 5.
NGRAM_RANGE = range(2, 6)

def jaccard(q1, q2):
    """Jaccard index of the distinct items of ``q1`` and ``q2`` (0 when both are empty)."""
    left, right = set(q1), set(q2)
    union_size = len(left | right)
    if union_size == 0:
        # Avoid dividing by zero; the intersection is empty too, so the result is 0.
        union_size = 1
    return len(left & right) / union_size

def get_char_ngrams(doc, n):
    """List every contiguous slice of length ``n`` of ``doc``, in order (empty if ``doc`` is shorter)."""
    ngrams = []
    for start in range(len(doc) - n + 1):
        ngrams.append(doc[start:start + n])
    return ngrams

def get_jaccard_set_similarities(a, b):
    """Jaccard index of two sets plus the overlap normalised by each set's size.

    Parameters
    ----------
    a, b : set

    Returns
    -------
    tuple of 3 numbers
        ``(|a & b| / |a | b|, |a & b| / |a|, |a & b| / |b|)``. A ratio whose
        denominator is zero is reported as 0.0 instead of raising
        ``ZeroDivisionError``.
    """
    len_intersection = len(a.intersection(b))

    def _ratio(denominator):
        # An empty set shares nothing, so its ratio is 0 rather than an error.
        return len_intersection / denominator if denominator else 0.0

    jaccard_index = _ratio(len(a.union(b)))
    jaccard_index_norm_a = _ratio(len(a))
    jaccard_index_norm_b = _ratio(len(b))

    return jaccard_index, jaccard_index_norm_a, jaccard_index_norm_b

def get_jaccard_similarities(q1, q2, n):
    """Character ``n``-gram Jaccard scores for two strings.

    Strings shorter than ``max(NGRAM_RANGE)`` are not compared, whatever the
    value of ``n``: (1, 1, 1) is returned when both are that short and
    (0, 0, 0) when only one is.
    """
    min_length = max(NGRAM_RANGE)
    q1_short = len(q1) < min_length
    q2_short = len(q2) < min_length

    if q1_short and q2_short:
        return 1, 1, 1
    if q1_short or q2_short:
        return 0, 0, 0

    return get_jaccard_set_similarities(
        set(get_char_ngrams(q1, n)),
        set(get_char_ngrams(q2, n)),
    )

def get_question_pair_features(q1,q2):
    """Concatenate the three Jaccard scores of every ``n`` in ``NGRAM_RANGE`` into one flat list."""
    return [
        score
        for n in NGRAM_RANGE
        for score in get_jaccard_similarities(q1, q2, n)
    ]


### LDA Topic Distances
Train a Latent Dirichlet Allocation model with 300 topics on the question corpus and compute topic distances between the question pairs.

In [41]:
def stem_pair(pair):
    """Snowball-stem the tokens of both questions of ``pair``.

    ``pair`` holds two question IDs; their texts are read from the
    module-level ``texts`` dict. Returns ``[tokens_a, tokens_b]``.
    """
    stemmer = SnowballStemmer('english')
    stemmed = []
    for question_id in (pair[0], pair[1]):
        stemmed.append([stemmer.stem(token) for token in texts[question_id].split()])
    return stemmed

def compute_topic_distances(q1, q2, lda_dictionary, model):
    """Cosine and Euclidean distances between the topic vectors of two token lists.

    Each token list is converted with ``lda_dictionary.doc2bow`` and projected
    with ``model.get_document_topics(..., minimum_probability=0)``.

    Returns ``[cosine_distance, euclidean_distance]``.
    """
    def _topic_vector(tokens):
        # Keep only the probability column and shape it as a single row.
        bow = lda_dictionary.doc2bow(tokens)
        topics = model.get_document_topics(bow, minimum_probability=0)
        return np.array(topics)[:, 1].reshape(1, -1)

    q1_topic_vec = _topic_vector(q1)
    q2_topic_vec = _topic_vector(q2)

    cosine_dist = cosine_distances(q1_topic_vec, q2_topic_vec)[0][0]
    euclidean_dist = euclidean_distances(q1_topic_vec, q2_topic_vec)[0][0]
    return [cosine_dist, euclidean_dist]

In [42]:
def lda_init(pairs_train, pairs_test, df_all_texts):
    """Train an LDA model on all questions and return per-pair topic distances.

    Parameters
    ----------
    pairs_train, pairs_test : list of [id_a, id_b]
        Question-ID pairs; texts are looked up by ``stem_pair``.
    df_all_texts : unused
        Kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``lda_1`` (cosine distance) and ``lda_2`` (Euclidean distance),
        one row per pair: training pairs first, then test pairs.
    """
    NUM_TOPICS = 300
    RANDOM_SEED = 42

    lda_tokens = [stem_pair(pair) for pair in pairs_train]
    lda_tokens.extend(stem_pair(pair) for pair in pairs_test)

    # Flatten [[doc_a, doc_b], ...] into one list of documents in plain Python.
    # np.array(...).ravel() on these ragged token lists raises on recent NumPy,
    # and would flatten down to single tokens if every document had the same length.
    lda_documents = [document for pair in lda_tokens for document in pair]
    lda_dictionary = Dictionary(lda_documents)
    lda_corpus = [lda_dictionary.doc2bow(document) for document in lda_documents]

    model = LdaMulticore(
        lda_corpus,
        num_topics=NUM_TOPICS,
        id2word=lda_dictionary,
        random_state=RANDOM_SEED,
    )

    lda_distances = [
        compute_topic_distances(pair[0], pair[1], lda_dictionary, model)
        for pair in lda_tokens
    ]

    return pd.DataFrame(lda_distances, columns=['lda_1', 'lda_2'])

In [43]:
def get_lda(q1,q2, lda_df):
    """Return the rows of ``lda_df`` that describe the pair ``(q1, q2)``.

    The pair is looked up as (text_a_ID == q1, text_b_ID == q2) first and in
    the reversed orientation otherwise.

    The previous version returned the integer 0 when both orientations were
    present, which made the caller's ``d['lda_1']`` lookup fail; the forward
    match is returned in that case now.

    Returns
    -------
    pandas.DataFrame
        Matching rows (empty when the pair is absent in both orientations).
    """
    a_ids = lda_df['text_a_ID']
    b_ids = lda_df['text_b_ID']

    forward = lda_df[(a_ids == q1) & (b_ids == q2)]
    if not forward.empty:
        return forward
    return lda_df[(a_ids == q2) & (b_ids == q1)]

### POS/NER Tag Similarity
Derive bag-of-POS-tag (part of speech tagging) and bag-of-NER-tag (Named Entity Recognition) vectors from each question and calculate their vector distances.

(POS) https://spacy.io/usage/linguistic-features#pos-tagging

(NER) https://spacy.io/usage/linguistic-features#named-entities

(NER) labels sequences of words in a text which are the names of things, such as person and company names, or gene and protein names.

In [44]:
def create_counter(name, df_all_texts, num_raw_features, pos_tags_whitelist, ner_tags_whitelist):
    """Count whitelisted POS and NER tags for every text of column ``name``.

    Texts are streamed through the module-level spaCy pipeline ``nlp``.
    Returns an array of shape ``(len(df_all_texts), num_raw_features)`` whose
    columns are the POS counts followed by the NER counts, in whitelist order.
    """
    X1 = np.zeros((len(df_all_texts), num_raw_features))
    docs = nlp.pipe(df_all_texts[name].values, n_threads=os.cpu_count())

    for row, doc in enumerate(docs):
        pos_counter = Counter(token.pos_ for token in doc)
        ner_counter = Counter(ent.label_ for ent in doc.ents)

        counts = [pos_counter[tag] for tag in pos_tags_whitelist]
        counts.extend(ner_counter[tag] for tag in ner_tags_whitelist)
        X1[row, :] = np.array(counts)

    return X1

def get_vector_distances(i, X1, X2, pos_tags_whitelist, ner_tags_whitelist):
    """Distances between row ``i`` of the two tag-count matrices.

    The first ``len(pos_tags_whitelist)`` columns are treated as POS counts
    and the last ``len(ner_tags_whitelist)`` columns as NER counts.

    Returns ``[pos cosine, pos euclidean, ner euclidean, |ner total difference|]``.
    """
    n_pos = len(pos_tags_whitelist)
    n_ner = len(ner_tags_whitelist)

    pos_a, pos_b = X1[i, 0:n_pos], X2[i, 0:n_pos]
    ner_a, ner_b = X1[i, -n_ner:], X2[i, -n_ner:]

    pos_cosine = cosine(pos_a, pos_b)
    pos_euclidean = euclidean(pos_a, pos_b)
    ner_euclidean = euclidean(ner_a, ner_b)
    ner_count_diff = np.abs(np.sum(ner_a) - np.sum(ner_b))

    return [pos_cosine, pos_euclidean, ner_euclidean, ner_count_diff]

In [45]:
def pos_ner_tags(df_all_texts):
    """Compute POS/NER tag-count distances for every row of ``df_all_texts``.

    Tag counts are built separately for the ``text_a_text`` and
    ``text_b_text`` columns and then compared row by row.

    Returns
    -------
    pandas.DataFrame
        Columns ``pos_tag_cosine``, ``pos_tag_euclidean``,
        ``ner_tag_euclidean`` and ``ner_tag_count_diff``; one row per input
        row, in input order.
    """
    pos_tags_whitelist = ['ADJ', 'ADV', 'NOUN', 'PROPN', 'NUM', 'VERB']
    ner_tags_whitelist = ['GPE', 'LOC', 'ORG', 'NORP', 'PERSON', 'PRODUCT', 'DATE', 'TIME', 'QUANTITY', 'CARDINAL']

    num_raw_features = len(pos_tags_whitelist) + len(ner_tags_whitelist)

    X1 = create_counter('text_a_text', df_all_texts, num_raw_features, pos_tags_whitelist, ner_tags_whitelist)
    X2 = create_counter('text_b_text', df_all_texts, num_raw_features, pos_tags_whitelist, ner_tags_whitelist)

    # The four per-question count DataFrames the previous version built here
    # were never used, so they are no longer allocated.
    tags_distances = [
        get_vector_distances(i, X1, X2, pos_tags_whitelist, ner_tags_whitelist)
        for i in range(len(df_all_texts))
    ]

    tags_columns = [
        'pos_tag_cosine',
        'pos_tag_euclidean',
        'ner_tag_euclidean',
        'ner_tag_count_diff',
    ]

    return pd.DataFrame(tags_distances, columns=tags_columns)

In [46]:
def get_tags(q1, q2, df_tags):
    """Return the rows of ``df_tags`` that describe the pair ``(q1, q2)``.

    The pair is looked up as (text_a_ID == q1, text_b_ID == q2) first and in
    the reversed orientation otherwise.

    The previous version returned the integer 0 when both orientations were
    present, which made the caller's column lookup fail; the forward match is
    returned in that case now.

    Returns
    -------
    pandas.DataFrame
        Matching rows (empty when the pair is absent in both orientations).
    """
    a_ids = df_tags['text_a_ID']
    b_ids = df_tags['text_b_ID']

    forward = df_tags[(a_ids == q1) & (b_ids == q2)]
    if not forward.empty:
        return forward
    return df_tags[(a_ids == q2) & (b_ids == q1)]

### Frequency of questions

More frequent questions are more likely to be duplicates

In [47]:
def compute_question_freq(train, test):
    """Count how often each question text occurs across train and test.

    Every distinct question text (from either text column of either frame)
    gets an integer hash; a question's frequency is the number of rows where
    it appears as ``text_a_text`` plus the number where it appears as
    ``text_b_text``.

    The previous version renamed non-existent columns ('question2'), so
    questions that only ever appear in ``text_b_text`` were missing from the
    hash dictionary and received a frequency of 0. It also relied on
    ``DataFrame.append``, which was removed in pandas 2.0.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Need ``row_ID``, ``text_a_ID``, ``text_b_ID``, ``text_a_text`` and
        ``text_b_text``; ``train`` also needs a non-negative ``target``.
        Neither frame is modified.

    Returns
    -------
    (train_comb, test_comb) : tuple of pandas.DataFrame
        Columns ``row_ID, q1_hash, q2_hash, q1_freq, q2_freq`` (plus
        ``target`` for train), re-indexed from 0.
    """
    question_columns = ['text_a_text', 'text_b_text']

    # All question texts: train a, train b, test a, test b.
    all_questions = pd.concat(
        [frame[column] for frame in (train, test) for column in question_columns],
        ignore_index=True,
    )
    unique_questions = all_questions.drop_duplicates().reset_index(drop=True)
    questions_dict = pd.Series(
        unique_questions.index.values, index=unique_questions.values
    ).to_dict()

    train_cp = train.drop(['text_a_ID', 'text_b_ID'], axis=1)
    test_cp = test.drop(['text_a_ID', 'text_b_ID'], axis=1)
    test_cp['target'] = -1  # negative marker used to split the frames again below

    comb = pd.concat([train_cp, test_cp], ignore_index=True)
    comb['q1_hash'] = comb['text_a_text'].map(questions_dict)
    comb['q2_hash'] = comb['text_b_text'].map(questions_dict)

    q1_vc = comb.q1_hash.value_counts().to_dict()
    q2_vc = comb.q2_hash.value_counts().to_dict()

    def _total_count(question_hash):
        # Occurrences in either position; 0 for a hash that was never seen.
        return q1_vc.get(question_hash, 0) + q2_vc.get(question_hash, 0)

    comb['q1_freq'] = comb['q1_hash'].map(_total_count)
    comb['q2_freq'] = comb['q2_hash'].map(_total_count)

    train_comb = comb[comb['target'] >= 0][['row_ID', 'q1_hash', 'q2_hash', 'q1_freq', 'q2_freq', 'target']]
    test_comb = comb[comb['target'] < 0][['row_ID', 'q1_hash', 'q2_hash', 'q1_freq', 'q2_freq']]

    return train_comb.reset_index(drop=True), test_comb.reset_index(drop=True)

### Questions intersection

In [48]:
def compute_question_intersect(train, test):
    """Count, for each pair, the questions that were paired with both of its members.

    A neighbour map is built over train and test together: two question texts
    are neighbours when they appear in the same row. The feature for a row is
    the number of neighbours its two questions have in common.

    The previous version called ``DataFrame.apply(..., raw=True)``, which
    hands the callback a plain ndarray, so ``row['text_a_text']`` fails on
    current pandas; the counts are now computed directly from the two columns.

    Side effect: as before, a ``q1_q2_intersect`` column is added to both
    ``train`` and ``test`` in place.

    Returns
    -------
    (train_intersect, test_intersect) : tuple of pandas.Series
        Indexed like the corresponding input frame.
    """
    q_dict = defaultdict(set)
    for frame in (train, test):
        for text_a, text_b in zip(frame['text_a_text'], frame['text_b_text']):
            q_dict[text_a].add(text_b)
            q_dict[text_b].add(text_a)

    def _shared_neighbours(frame):
        # One count per row, aligned on the frame's own index.
        counts = [
            len(q_dict[text_a] & q_dict[text_b])
            for text_a, text_b in zip(frame['text_a_text'], frame['text_b_text'])
        ]
        return pd.Series(counts, index=frame.index, dtype='int64')

    train['q1_q2_intersect'] = _shared_neighbours(train)
    test['q1_q2_intersect'] = _shared_neighbours(test)

    return train['q1_q2_intersect'], test['q1_q2_intersect']

### K-cores

In [69]:
def compute_k_cores(texts, train, test):
    """Per-question positive rate in the training labels, looked up for every pair.

    For each question ID the function counts how often it appears as
    ``text_a_ID`` / ``text_b_ID`` in ``train`` and how many of those rows have
    ``target == 1``. The two ratios (0 when the ID never appears in that
    position) are then looked up for every row of ``train`` and ``test``.

    Rows are now read by position, so the frames no longer need a default
    0..n-1 index (the previous ``.loc[i]`` lookups raised KeyError otherwise).

    NOTE(review): despite the column names no graph k-core is computed here,
    and the training features are derived from the training labels
    themselves - confirm this is intended.

    Parameters
    ----------
    texts : dict
        Its keys are the known question IDs; every ID in ``train`` and
        ``test`` must be among them (KeyError otherwise).
    train : pandas.DataFrame with ``text_a_ID``, ``text_b_ID``, ``target``
    test : pandas.DataFrame with ``text_a_ID``, ``text_b_ID``

    Returns
    -------
    (train_cores, test_cores) : tuple of pandas.DataFrame
        Columns ``qid1_max_kcore`` and ``qid2_max_kcore``, indexed from 0.
    """
    # Per ID: [count as qid1, positives as qid1, count as qid2, positives as qid2]
    stat_dico = {an_id: [0, 0, 0, 0] for an_id in texts}

    for qid1, qid2, the_target in zip(train['text_a_ID'], train['text_b_ID'], train['target']):
        stat_dico[qid1][0] += 1  # occurrence count for this question as qid1
        stat_dico[qid2][2] += 1  # occurrence count for this question as qid2

        if the_target == 1:
            stat_dico[qid1][1] += 1  # positive count for this question as qid1
            stat_dico[qid2][3] += 1  # positive count for this question as qid2

    core_dico = {}
    for key, (count_q1, pos_q1, count_q2, pos_q2) in stat_dico.items():
        max_k_core_q1 = pos_q1 / count_q1 if count_q1 != 0 else 0
        max_k_core_q2 = pos_q2 / count_q2 if count_q2 != 0 else 0
        core_dico[key] = [max_k_core_q1, max_k_core_q2]

    columns = ['qid1_max_kcore', 'qid2_max_kcore']

    def _lookup(frame):
        # Same lookup for train and test: ratio as qid1 for the first ID,
        # ratio as qid2 for the second.
        cores = pd.DataFrame(columns=columns)
        cores['qid1_max_kcore'] = [core_dico[qid][0] for qid in frame['text_a_ID']]
        cores['qid2_max_kcore'] = [core_dico[qid][1] for qid in frame['text_b_ID']]
        return cores

    return _lookup(train), _lookup(test)

### Saving some features to CSV files

In [50]:
def save_lda_df(lda_df, name):
    """Write the LDA distance frame to ``name`` as comma-separated text.

    A fixed header line is written first, then one line per row with the
    ``str()`` of each value. Rows are addressed by the labels 0..len-1.
    """
    columns = ['lda_1', 'lda_2', 'text_a_ID', 'text_b_ID']
    with open(name, 'w') as out:
        out.write("lda_1,lda_2,text_a_ID,text_b_ID\n")
        for row in range(len(lda_df)):
            values = [str(lda_df[column][row]) for column in columns]
            out.write(','.join(values) + '\n')

In [51]:
def save_df_tags(df_tags, name):
    """Write the POS/NER distance frame to ``name`` as comma-separated text.

    A fixed header line is written first, then one line per row with the
    ``str()`` of each value. Rows are addressed by the labels 0..len-1.
    """
    columns = [
        'pos_tag_cosine',
        'pos_tag_euclidean',
        'ner_tag_euclidean',
        'ner_tag_count_diff',
        'text_a_ID',
        'text_b_ID',
    ]
    with open(name, 'w') as out:
        out.write("pos_tag_cosine,pos_tag_euclidean,ner_tag_euclidean,ner_tag_count_diff,text_a_ID,text_b_ID\n")
        for row in range(len(df_tags)):
            values = [str(df_tags[column][row]) for column in columns]
            out.write(','.join(values) + '\n')

### Feature Construction Function

In [52]:
# Feature names: 62 entries, in the same order as the columns filled by construct_data.
# They are used as the CSV header when the feature matrices are written out.
features=['cosine similarity (CS)',
          'total length (TL)', 
          'difference length (DL)',
          'sim pos tags (POS)', 
          'POS*DL',
          'POS*TL', 
          'POS*POS*TL',
          'POS*POS*DL',
          'common words (CW)', 
          'CW*POS','CW*POS*DL',
          'CW*POS*TL',
          'fuzz ratio',
          'fuzz partial_ratio',
          'fuzz token_sort_ratio',
          'fuzz token_set_ratio',
          'fuzz partial_token_sort_ratio',
          'jaro_distance',
          'jaro_winkler',
          'jaccard',
          'jaccard_index n=2',
          'jaccard_index_norm_a n=2',
          'jaccard_index_norm_b n=2',
          'jaccard_index n=3',
          'jaccard_index_norm_a n=3', 
          'jaccard_index_norm_b n=3',
          'jaccard_index n=4', 
          'jaccard_index_norm_a n=4', 
          'jaccard_index_norm_b n=4',
          'jaccard_index n=5', 
          'jaccard_index_norm_a n=5', 
          'jaccard_index_norm_b n=5',
          'pos_tag_cosine',
          'pos_tag_euclidean',
          'ner_tag_euclidean',
          'ner_tag_count_diff',
          'lda_1',
          'lda_2',
          'common_diff',
          'common_min',
          'common_max',
          'mean_len',
          'freq_q1',
          'freq_q2',
          'word_match_share',
          'diff_words',
          'total_unique_words',
          'total_unq_words_stop',
          'wc_diff',
          'wc_ratio',
          'wc_diff_unique',
          'wc_ratio_unique',
          'wc_diff_unique_stop',
          'wc_ratio_unique_stop',
          'same_start_word',
          'same_last_word',
          'char_diff',
          'char_ratio',
          'char_diff_unique_stop',
          'q1_q2_intersect',
          'qid1_max_kcore',
          'qid2_max_kcore'
         ]

In [72]:
# Vary how the pairs are built (stemming, ...) to produce alternative feature sets.
def construct_data(pairs_train, A, lda_df, df_tags, train_freq, train_intersect, train_cores):
    """Assemble the 62-column feature matrix for a list of question-ID pairs.

    Reads the module-level ``texts`` (ID -> cleaned text) and ``ids2ind``
    (ID -> row of ``A``) in addition to its arguments.

    The previous version filled column 38 (common words / differing words)
    before column 45 (differing words) was computed, so the divisor was
    always ``SAFE_DIV``. The differing-word count is now computed first.

    Parameters
    ----------
    pairs_train : list of [id_a, id_b]
    A : TF-IDF matrix, one row per question (see ``ids2ind``).
    lda_df, df_tags : frames searched with ``get_lda`` / ``get_tags``.
    train_freq : frame with ``q1_freq`` / ``q2_freq``, addressed by row label ``i``.
    train_intersect : series addressed by row label ``i``.
    train_cores : frame with ``qid1_max_kcore`` / ``qid2_max_kcore``, addressed by row label ``i``.

    Returns
    -------
    numpy.ndarray of shape ``(len(pairs_train), 62)``
    """
    N_train = len(pairs_train)
    X_train = np.zeros((N_train, 62))
    SAFE_DIV = 0.0001

    tags_columns = [
        'pos_tag_cosine',
        'pos_tag_euclidean',
        'ner_tag_euclidean',
        'ner_tag_count_diff',
    ]
    columns_lda = ['lda_1', 'lda_2']

    for i in range(N_train):

        q1 = pairs_train[i][0]
        q2 = pairs_train[i][1]

        # Look the texts up and tokenise them once per pair.
        text1, text2 = texts[q1], texts[q2]
        words1, words2 = text1.split(), text2.split()

        X_train[i, 0] = cosine_similarity(A[ids2ind[q1], :], A[ids2ind[q2], :])[0, 0]
        X_train[i, 1] = len(text1) + len(text2)
        X_train[i, 2] = abs(len(text1) - len(text2))

        # Similarity on spacy tag: noun, verb, ...
        X_train[i, 3] = similarity(spacy_tag(text1), spacy_tag(text2))
        X_train[i, 4] = X_train[i, 2] * X_train[i, 3]
        X_train[i, 5] = X_train[i, 1] * X_train[i, 3]
        X_train[i, 6] = X_train[i, 5] * X_train[i, 3]
        X_train[i, 7] = X_train[i, 4] * X_train[i, 3]

        X_train[i, 8] = common_words(words1, words2)
        X_train[i, 9] = X_train[i, 8] * X_train[i, 3]
        X_train[i, 10] = X_train[i, 8] * X_train[i, 4]
        X_train[i, 11] = X_train[i, 8] * X_train[i, 5]

        # Fuzzy distances (7 values)
        X_train[i, 12:19] = fuzzy(text1, text2)

        # Jaccard N-grams (1 word-level value + 12 character-level values)
        X_train[i, 19] = jaccard(words1, words2)
        X_train[i, 20:32] = get_question_pair_features(text1, text2)

        # POS / NER tags
        c = get_tags(q1, q2, df_tags)
        for j in range(4):
            X_train[i, 32 + j] = c[tags_columns[j]]

        # LDA
        d = get_lda(q1, q2, lda_df)
        for j in range(2):
            X_train[i, 36 + j] = d[columns_lda[j]]

        # Column 45 must exist before column 38 divides by it.
        X_train[i, 45] = diff_words(text1, text2)

        X_train[i, 38] = X_train[i, 8] / (X_train[i, 45] + SAFE_DIV) # ratio nb words in common, nb words not shared
        X_train[i, 39] = X_train[i, 8] / (min(len(words1), len(words2)) + SAFE_DIV) # common on min
        X_train[i, 40] = X_train[i, 8] / (max(len(words1), len(words2)) + SAFE_DIV) # common on max
        X_train[i, 41] = (len(words1) + len(words2)) / 2 # mean number of words

        # Question frequency
        X_train[i, 42] = train_freq.loc[i, 'q1_freq']
        X_train[i, 43] = train_freq.loc[i, 'q2_freq']

        # Word operation features
        X_train[i, 44] = word_match_share(words1, words2)
        X_train[i, 46] = total_unique_words(words1, words2)
        X_train[i, 47] = total_unq_words_stop(words1, words2)
        X_train[i, 48] = wc_diff(words1, words2)
        X_train[i, 49] = wc_ratio(words1, words2)
        X_train[i, 50] = wc_diff_unique(words1, words2)
        X_train[i, 51] = wc_ratio_unique(words1, words2)
        X_train[i, 52] = wc_diff_unique_stop(words1, words2)
        X_train[i, 53] = wc_ratio_unique_stop(words1, words2)
        X_train[i, 54] = same_start_word(words1, words2)
        X_train[i, 55] = same_last_word(words1, words2)
        X_train[i, 56] = char_diff(words1, words2)
        X_train[i, 57] = char_ratio(words1, words2)
        X_train[i, 58] = char_diff_unique_stop(words1, words2)

        # Question intersection
        X_train[i, 59] = train_intersect.loc[i]

        # K-cores
        X_train[i, 60] = train_cores.loc[i, 'qid1_max_kcore']
        X_train[i, 61] = train_cores.loc[i, 'qid2_max_kcore']

    return X_train

## Final Build

In [58]:
# Clean the texts and build the ID pairs (stop words removed, stemming on, no POS filtering).
print('Pairs building..')
texts, pairs_train, pairs_test, y_train, y_true, ids2ind = construct_pairs(train, 
                                                                          test, 
                                                                          remove_stopwords = True, 
                                                                          pos_filtering = False, 
                                                                          stemming = True)
print('tfIdf building..')
A = tfIdf(texts)

# LDA distances come back in pair order (train then test), the same row order as
# df_all_texts, so the ID columns can be attached positionally.
print('LDA building..')
lda_distances = lda_init(pairs_train, pairs_test, df_all_texts)
lda_df = pd.DataFrame([lda_distances.lda_1.values,lda_distances.lda_2.values, df_all_texts['text_a_ID'].values, df_all_texts['text_b_ID'].values]).T 
lda_df.columns=['lda_1', 'lda_2', 'text_a_ID','text_b_ID']

save_lda_df(lda_df, "lda_df_real.csv")
# To reuse a previously saved run instead of recomputing:
# lda_df = pd.read_csv('lda_df_real.csv')

# POS/NER tag distances, one row per row of df_all_texts, with the ID columns attached.
print('tags building..')
tags_distances = pos_ner_tags(df_all_texts)
df_tags = pd.DataFrame([tags_distances.pos_tag_cosine.values,
                tags_distances.pos_tag_euclidean.values, 
                tags_distances.ner_tag_euclidean.values,
                tags_distances.ner_tag_count_diff.values,
                df_all_texts['text_a_ID'].values,
                df_all_texts['text_b_ID'].values]).T 
df_tags.columns = ['pos_tag_cosine', 'pos_tag_euclidean','ner_tag_euclidean','ner_tag_count_diff', 'text_a_ID','text_b_ID']

save_df_tags(df_tags,"df_tags_real.csv")
# To reuse a previously saved run instead of recomputing:
# df_tags = pd.read_csv('df_tags_real.csv')

Pairs building..
tfIdf building..
LDA building..
tags building..


In [70]:
# Features derived from how questions co-occur across train and test.
print('Frequencies building..')
train_freq, test_freq = compute_question_freq(train, test)
print('Intersect building..')
# Note: this call also adds a 'q1_q2_intersect' column to train and test in place.
train_intersect, test_intersect = compute_question_intersect(train, test)
print('K-cores building..')
train_cores, test_cores = compute_k_cores(texts, train, test)

Frequencies building..
Intersect building..
K-cores building..


In [73]:
# Build the final feature matrices; the same function is applied to the train and test pairs.
print('Features construction..')
X_train = construct_data(pairs_train, A, lda_df, df_tags, train_freq, train_intersect, train_cores)
X_test = construct_data(pairs_test, A, lda_df, df_tags, test_freq,  test_intersect, test_cores)

Features construction..


## Final CSV writing

In [78]:
# Feature matrices are written with the names in `features` as header; the labels are written without a header.
pd.DataFrame(X_train).to_csv('data/X_train_processed.csv', index=False, header=features)
pd.DataFrame(X_test).to_csv('data/X_test_processed.csv', index=False, header=features)
pd.DataFrame(y_train).to_csv('data/y_train_processed.csv', index=False, header=False)