# IHLT Final Project: Semantinc Textual Similarity
Jordi Armengol - Joan LLop

## Data collection
We start by downloading the SemEval 2012 dataset.

In [223]:
!mkdir -p data
!wget https://gebakx.github.io/ihlt/sts/resources/train.tgz --directory-prefix=data
!wget https://gebakx.github.io/ihlt/sts/resources/test-gold.tgz --directory-prefix=data
%cd data
!tar zxvf train.tgz
!tar zxvf test-gold.tgz
%cd ..

--2019-12-08 17:06:23--  https://gebakx.github.io/ihlt/sts/resources/train.tgz
Resolving gebakx.github.io (gebakx.github.io)... 185.199.109.153, 185.199.111.153, 185.199.110.153, ...
Connecting to gebakx.github.io (gebakx.github.io)|185.199.109.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 125822 (123K) [application/octet-stream]
Saving to: ‘data/train.tgz.9’


2019-12-08 17:06:24 (1,92 MB/s) - ‘data/train.tgz.9’ saved [125822/125822]

--2019-12-08 17:06:24--  https://gebakx.github.io/ihlt/sts/resources/test-gold.tgz
Resolving gebakx.github.io (gebakx.github.io)... 185.199.111.153, 185.199.110.153, 185.199.108.153, ...
Connecting to gebakx.github.io (gebakx.github.io)|185.199.111.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118094 (115K) [application/octet-stream]
Saving to: ‘data/test-gold.tgz.9’


2019-12-08 17:06:24 (1,82 MB/s) - ‘data/test-gold.tgz.9’ saved [118094/118094]

/home/nhikia/Documents/AI/IHLT/IHLT-MAI/lab/

## Corpus assembly
Train and test sets. The test set will not be used for learning or model selection.

In [5]:
import os
import numpy as np
train_files = ['MSRpar', 'MSRvid', 'SMTeuroparl']
train_data = []
train_labels = []
for file in train_files:
    with open(os.path.join('data', 'train', 'STS.input.' + file + '.txt'), 'r') as f:
        train_data += [sent.split('\t') for sent in f.readlines()]
    with open(os.path.join('data', 'train', 'STS.gs.' + file + '.txt'), 'r') as f:
        train_labels += [float(num) for num in f.readlines()]
train_data = np.array(train_data)
train_labels = np.array(train_labels)

test_files = ['MSRpar', 'MSRvid', 'SMTeuroparl', 'surprise.OnWN', 'surprise.SMTnews']
test_data = []
test_labels = []
for file in test_files:
    with open(os.path.join('data', 'test-gold', 'STS.input.' + file + '.txt'), 'r') as f:
        test_data += [sent.split('\t') for sent in f.readlines()]
    with open(os.path.join('data', 'test-gold', 'STS.gs.'+ file + '.txt'), 'r') as f:
        test_labels += [float(num) for num in f.readlines()]
test_data = np.array(test_data)
test_labels = np.array(test_labels)

## General class/interface
There are so many things to try that we will start by defining a general class/interface for all the models that we will use. This class has, among other features, a cross-validation method (obviously, using only the train set, not the test set). The models will inherit the methods of this class and, essentially, they will only have to implement the a method from extracting the features from data.

In [11]:
import pickle
from sklearn.linear_model import LinearRegression
import numpy as np
from scipy import stats
import sklearn

class Model:
    def __init__(self,  x, y, regr=LinearRegression(),):
        self.regr = regr
        self.x_features = self._extract_features(x)
        self.y = y
        self.name = None
        self.description = None
    
    
    def save(self):
        pickle.dump(self, open(self.name + '.model', 'wb').write())
    
    @classmethod
    def load(cls, name):
        return pickle.load(open(self.name + '.model', 'rb').read())
        
    
    def _extract_features(self, x):
        raise NotImplementedError 
    
    def fit(self, x, y):
        self.x_features = self._extract_features(x)
        self.y = y
        self.regr.fit(self.x_features, self.y)
    
    
    def predict(self, new_x):
        new_x_features = self._extract_features(new_x)
        return self.regr.predict(new_x_features)
        
        
    def evaluate(self, true_labels, predicted_labels):
        pearson, p_value = stats.pearsonr(true_labels, predicted_labels)
        return pearson, p_value
    
    
    def cross_validate(self, n_folds=5, seed=1):
        assert self.x_features is not None
        kf = sklearn.model_selection.KFold(n_splits=n_folds, random_state=seed)
        average_pearson = 0
        for train_index, val_index in kf.split(self.x_features):
            X_train, X_val = self.x_features[train_index], self.x_features[val_index]
            y_train, y_val = self.y[train_index], self.y[val_index]
            self.regr.fit(X_train, y_train)
            predicted_labels = self.regr.predict(X_val)
            pearson, _ = self.evaluate(y_val, predicted_labels)
            average_pearson += abs(pearson)
        return average_pearson/n_folds

## Alternative 1: Linguistic feature engineering and classical machine learning
Firstly, we will try the classical NLP strategies that we have seen in class. Since there are so many possibilities to try, we will start by fixing the regression algorithm to linear regression, and just change the particular features we input to the model (preprocessing, text representation strategies, features and ways of aggregating such features such as distances). Once we have a general idea of the features that seem to be useful, we will aggregate them into a single model. Finally, we will try different regresion algorithms, specifically non-linear ones.

### Jaccard distance of some basic features

In [16]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from nltk.metrics import jaccard_distance
from nltk.corpus import stopwords

class JaccardModel(Model):
    
    
    def __init__(self, *kwargs):
        self.name = 'JaccardModel'
        self.description = 'Jaccard distance, some basic features'
        self.stop_words = set(stopwords.words('english')) 
        super().__init__(*kwargs)
        
    def _extract_features(self, x):
        def preprocess(sent):
            preprocessed = ""
            for char in sent:
                if char.isdigit():
                    preprocessed += char
                elif char.isalpha():
                    preprocessed += char.lower()
                elif char == ' ':
                    preprocessed += char

            return str(preprocessed)

        x = [[preprocess(sent1), preprocess(sent2)] for sent1, sent2 in x]
        
        def lemmatize(token, pos):
            if pos in {'N','V'}:
                return wnl.lemmatize(token.lower(), pos.lower())
            return token.lower()


        def nltk_pos_to_wordnet_pos(nltk_pos):
            mapping = {'NN': wn.NOUN, 'JJ': wn.ADJ, 'VB': wn.VERB, 'RB': wn.ADV}
            if nltk_pos in mapping:
                return mapping[nltk_pos]
            else:
                return None


        def get_synsets(sent):
            saved_synsets = []
            tokens = word_tokenize(sent)
            pos_tags = pos_tag(tokens)
            lemmas = [lemmatize(t, pos) for t, pos in zip(tokens, pos_tags)]
            for token, pos, lemma in zip(tokens, pos_tags, lemmas):
                wordnet_pos = nltk_pos_to_wordnet_pos(pos[1])
                if wordnet_pos is not None:
                    word_synsets = wn.synsets(lemma, wordnet_pos)
                    if len(word_synsets) > 0:
                        most_freq_synset = word_synsets[0] # The most frequent synset is the first one
                        saved_synsets.append(most_freq_synset)
            return saved_synsets


        def get_features_from_word(sent, index, pos):
            word = sent[index]
            features = []
            features.append(str(pos)) # Part-of-Speech                   
            features.append(str(len(word))) # length of word
            features.append(str(index==0)) # beggining of a sentence
            features.append(str(index==len(sent)-1)) # end of sentence
            features.append(str(word.isdigit())) # is a digit
            return features

        def sent2features(sent):
            features = []
            tokens = [word for word in word_tokenize(sent) if not word in self.stop_words]
            features.append(tokens)
            pos_tags = pos_tag(tokens)
            features.append(pos_tags)
            lemmas = [lemmatize(t, pos) for t, pos in zip(tokens, pos_tags)]
            features.append(lemmas)
            synsets = get_synsets(sent)
            if len(synsets) > 0:
                features.append(synsets)
            else:
                features.append([0])
            temp_f = []
            for i in range(len(tokens)):
                temp_f += get_features_from_word(tokens, i, pos_tags[i])
            features.append(temp_f)

            return features
        
        def distance(features1, features2, sent1, sent2, index):
            distances = []
            init = True
            for f1, f2 in zip(features1, features2):
                distances.append(jaccard_distance(set(f1), set(f2)))
            return distances

        
        pairs_of_features = [(sent2features(sent1), sent2features(sent2)) for sent1, sent2 in x]
        distances = np.array([distance(features1, features2, sent1, sent2, index) for index, ((features1, features2), (sent1, sent2)) in enumerate(zip(pairs_of_features, x))])
        return distances

In [17]:
jaccardModel = JaccardModel(train_data, train_labels)
jaccardModel.cross_validate()

0.5744670790260964

### Bag of words
Cosine similarity of the union of the sets of bag of words.

In [10]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import pos_tag
import numpy as np
from nltk.corpus import stopwords 
from nltk.metrics import jaccard_distance
from string import punctuation

class BagWordsModel(Model):
    
    
    def __init__(self, *kwargs):
        self.name = 'BagWords'
        self.description = 'We get the bag of words of both setences, \
        calculate the union, get the count of each word of the union for each sentence \
        and get the distance as the sum of the element-wise distance'
        self.stop_words = set(stopwords.words('english'))
        super().__init__(*kwargs)
    
        
    def _extract_features(self, x):
        def preprocess(data):
            processed_data = []
            for sent1, sent2 in data:
                sent1 = sent1.lower()
                sent2 = sent2.lower()
                processed_sent1 = ""
                processed_sent2 = ""
                for char in sent1: 
                    if char.isalnum() or char == ' ': 
                        processed_sent1 += char
                for char in sent2: 
                    if char.isalnum() or char == ' ': 
                        processed_sent2 += char
                processed_data.append([processed_sent1, processed_sent2])
            return processed_data
        
        
        def cosine_similarity(a, b):
            cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
            return cos_sim
        
        
        def get_bag_of_words(sent1, sent2):
            tokens1 = [word for word in word_tokenize(sent1) if not word in self.stop_words]
            tokens2 = [word for word in word_tokenize(sent2) if not word in self.stop_words]
            union = np.union1d(tokens1, tokens2)
            count1 = np.zeros(len(union))
            count2 = np.zeros(len(union))
            for token in tokens1:
                count1[np.where(union == token)] += 1
            for token in tokens2:
                count2[np.where(union == token)] += 1
            return [cosine_similarity(count1, count2)]
        
        preprocessed_x = preprocess(x)
        BoW = np.array([get_bag_of_words(sent1, sent2) for sent1, sent2 in preprocessed_x])
        return BoW

bagWordsModel = BagWordsModel(train_data, train_labels)
bagWordsModel.cross_validate()

0.5968105783849429

### Bag of lemmas
Same as before, but with lemmas. Recall that we are still using a linear regression.

In [20]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import pos_tag
import numpy as np
from nltk.corpus import stopwords 
from nltk.metrics import jaccard_distance


class BagLemmasModel(Model):
    
    
    def __init__(self, *kwargs):
        self.name = 'BagLemmasModel'
        self.description = 'We get the bag of lemmas of both setences, calculate the union,\
        get the count of each word of the union for each sentence and get the cosine distance \
        between them'
        self.stop_words = set(stopwords.words('english'))
        self.wnl = WordNetLemmatizer()
        super().__init__(*kwargs)
    
        
    def _extract_features(self, x):
        def preprocess(data):
            processed_data = []
            for sent1, sent2 in data:
                sent1 = sent1.lower()
                sent2 = sent2.lower()
                processed_sent1 = ""
                processed_sent2 = ""
                for char in sent1: 
                    if char.isalnum() or char == ' ': 
                        processed_sent1 += char 
                for char in sent2: 
                    if char.isalnum() or char == ' ': 
                        processed_sent2 += char
                processed_data.append([processed_sent1, processed_sent2])
            return processed_data
        
        
        def cosine_similarity(a, b):
            cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
            return cos_sim
        
        
        def get_cosine_of_frequencies(vec1, vec2):
            union = np.union1d(vec1, vec2)
            count1 = np.zeros(len(union))
            count2 = np.zeros(len(union))
            for elem in vec1:
                count1[np.where(union == elem)] += 1
            for elem in vec2:
                count2[np.where(union == elem)] += 1
            return [cosine_similarity(count1, count2)]
        
        
        def lemmatize(token):
            return self.wnl.lemmatize(token)
        
        
        def get_bag_of_lemmas(sent1, sent2):
            tokens1 = [word for word in word_tokenize(sent1) if not word in self.stop_words]
            tokens2 = [word for word in word_tokenize(sent2) if not word in self.stop_words]
            lemmas1 = [lemmatize(word) for word in tokens1]
            lemmas2 = [lemmatize(word) for word in tokens2]
            return get_cosine_of_frequencies(lemmas1, lemmas2)
                
            
        preprocessed_x = preprocess(x)
        BoW = np.array([get_bag_of_lemmas(sent1, sent2) for sent1, sent2 in preprocessed_x])
        return BoW

bagLemmasModel = BagLemmasModel(train_data, train_labels)
bagLemmasModel.cross_validate()

0.6076451713539923

### Bag of stems

In [21]:
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet as wn
from nltk import pos_tag
import numpy as np
from nltk.corpus import stopwords 
from nltk.metrics import jaccard_distance


class BagStemsModel(Model):
    
    
    def __init__(self, *kwargs):
        self.name = 'BagWords'
        self.description = 'We get the bag of stems of both setences, calculate the union, \
        get the count of each word of the union for each sentence and get the cosine distance between them'
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        super().__init__(*kwargs)
    
        
    def _extract_features(self, x):
        def preprocess(data):
            processed_data = []
            for sent1, sent2 in data:
                sent1 = sent1.lower()
                sent2 = sent2.lower()
                processed_sent1 = ""
                processed_sent2 = ""
                for char in sent1: 
                    if char.isalnum() or char == ' ': 
                        processed_sent1 += char
                for char in sent2: 
                    if char.isalnum() or char == ' ': 
                        processed_sent2 += char
                processed_data.append([processed_sent1, processed_sent2])
            return processed_data
        
        
        def cosine_similarity(a, b):
            cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
            return cos_sim
        
        
        def get_cosine_of_frequencies(vec1, vec2):
            union = np.union1d(vec1, vec2)
            count1 = np.zeros(len(union))
            count2 = np.zeros(len(union))
            for elem in vec1:
                count1[np.where(union == elem)] += 1
            for elem in vec2:
                count2[np.where(union == elem)] += 1
            return [cosine_similarity(count1, count2)]
        
        
        def stemmatize(token):
            return self.stemmer.stem(token)
        
        
        def get_bag_of_stems(sent1, sent2):
            tokens1 = [word for word in word_tokenize(sent1) if not word in self.stop_words]
            tokens2 = [word for word in word_tokenize(sent2) if not word in self.stop_words]
            stems1 = [stemmatize(word) for word in tokens1]
            stems2 = [stemmatize(word) for word in tokens2]
            return get_cosine_of_frequencies(stems1, stems2)                
            
            
        preprocessed_x = preprocess(x)
        BoW = np.array([get_bag_of_stems(sent1, sent2) for sent1, sent2 in preprocessed_x])
        return BoW

bagStemsModel = BagStemsModel(train_data, train_labels)
bagStemsModel.cross_validate()

0.6488292812545408

### Bigrams vector representation
We compute the cosine similarity of a frequency bigram representation of the sentences.

In [22]:
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


class BiGramsModel(Model):
    
    
    def __init__(self, *kwargs):
        self.name = 'BiGramsModel'
        self.description = 'BiGramsModel'
        self.allowed = np.array(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', \
                                 '(', ')', '.', ' ', '!', '?', 'a', 'b', 'c', 'd', 'e', \
                                 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', \
                                 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'])
        self.not_allowed_words = set(stopwords.words('english'))
        self.stemmer = SnowballStemmer("english")
        super().__init__(*kwargs)
    
        
    def _extract_features(self, x):
        def get_allowed_characters(sent):
            sent = sent.lower()
            tokens = [word for word in word_tokenize(sent) if not word in self.not_allowed_words]
            stems = [self.stemmer.stem(word) for word in tokens]
            new_sent = " ".join(stems)
            return "".join([char for char in new_sent if char in self.allowed])
        
        
        def preprocess(data):
            word_freq = {}
            processed_data = []
            for sent1, sent2 in data:
                allowed_chars1 = get_allowed_characters(sent1)
                allowed_chars2 = get_allowed_characters(sent2)
                processed_data.append([allowed_chars1, allowed_chars2])
            return processed_data
        
        
        def cosine_similarity(a, b):
            cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
            return cos_sim
        
        
        def get_vector_bigrams(sent):
            count = np.zeros(len(self.allowed)**2) # number of possible bigrams
            for i in range(len(sent)-1):
                idx_f = int(np.where(self.allowed == sent[i])[0])
                idx_s = int(np.where(self.allowed == sent[i+1])[0])
                count[idx_f*len(self.allowed) + idx_s] += 1
            return count
            
                
        def get_cos_of_bigrams(sent1, sent2):
            return [cosine_similarity(get_vector_bigrams(sent1), get_vector_bigrams(sent2))]                
            
            
        preprocessed_x = preprocess(x)
        BiGms = np.array([get_cos_of_bigrams(sent1, sent2) for sent1, sent2 in preprocessed_x])
        return BiGms

biGramsModel = BiGramsModel(train_data, train_labels)
biGramsModel.cross_validate()

0.663661970721763

### Synsets
We apply the same algorithm as in the case of words and lemmas, but with synsets and lemmas.

In [23]:
import numpy as np
from functools import reduce
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer

class SynsetsLemmasModel(Model):
    
    
    def __init__(self, *kwargs):
        self.name = 'SynsetsLemmasModel'
        self.description = 'SynsetsLemmasModel'
        super().__init__(*kwargs)
    
        
    def _extract_features(self, x):
        
        def cosine_similarity(a, b):
            cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
            return cos_sim
        
        
        def get_cosine_of_frequencies(vec1, vec2):
            union = np.union1d(vec1, vec2)
            count1 = np.zeros(len(union))
            count2 = np.zeros(len(union))
            for elem in vec1:
                count1[np.where(union == elem)] += 1
            for elem in vec2:
                count2[np.where(union == elem)] += 1
            return [cosine_similarity(count1, count2)]
        
        
        def get_union_SynLemmas(word):
            synsets = wordnet.synsets(word)
            if len(synsets) > 0:
                return reduce(np.union1d, ([str(lemma.name()) for lemma in synsets[0].lemmas()]))
            return np.empty(0)
        
        
        def get_sentence_union_synsets(tokens):
            return reduce(np.union1d, ([get_union_SynLemmas(word) for word in tokens]))
        
        
        def get_union_synsets(sent1, sent2):
            synsets1 = get_sentence_union_synsets(word_tokenize(sent1))
            synsets2 = get_sentence_union_synsets(word_tokenize(sent2))
            return get_cosine_of_frequencies(synsets1, synsets2)
            
        
        BoW = np.array([get_union_synsets(sent1, sent2) for sent1, sent2 in x])
        return BoW

synsetsLemmasModel = SynsetsLemmasModel(train_data, train_labels)
synsetsLemmasModel.cross_validate()

0.46043551305575275

### Bigrams specific counts
Like in Bag of stems. 
This way we can try trigrams without that much sparsity.

In [35]:
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


class BiGramsSpecificModel(Model):
    
    
    def __init__(self, *kwargs):
        self.name = 'BiGramsSpecificModel'
        self.description = 'BiGramsSpecificModel'
        self.allowed = np.array(['0', '1', '2', '3', '4', '5', '6', '7',
                                 '8', '9', '(', ')', '.', ' ', '!', '?',
                                 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
                                 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r',
                                 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'])
        self.not_allowed_words = set(stopwords.words('english'))
        self.stemmer = SnowballStemmer("english")
        super().__init__(*kwargs)
    
        
    def _extract_features(self, x):        
        
        def get_cosine_of_frequencies(vec1, vec2):
            union = np.union1d(vec1, vec2)
            count1 = np.zeros(len(union))
            count2 = np.zeros(len(union))
            for elem in vec1:
                count1[np.where(union == elem)] += 1
            for elem in vec2:
                count2[np.where(union == elem)] += 1
            return [cosine_similarity(count1, count2)]
        
        
        def get_allowed_characters(sent):
            sent = sent.lower()
            tokens = np.array([word for word in word_tokenize(sent) if not word in self.not_allowed_words])
            stems = [self.stemmer.stem(word) for word in tokens]
            new_sent = " ".join(stems)
            final_sent = "".join([char for char in new_sent if char in self.allowed])
            return final_sent
        
        
        def preprocess(data):
            processed_data = []
            for sent1, sent2 in data:
                allowed_chars1 = get_allowed_characters(sent1)
                allowed_chars2 = get_allowed_characters(sent2)
                processed_data.append([allowed_chars1, allowed_chars2])
            return processed_data
        
        
        def cosine_similarity(a, b):
            cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
            return cos_sim
        
        
        def get_vector_bigrams(sent):
            bigrams = []
            for i in range(len(sent)):
                bigrams.append(sent[i])
            return np.array(bigrams)
        
        
        def get_cosine_distance_of_count_of_bigrams(sent1, sent2):
            bigrams1 = get_vector_bigrams(sent1)
            bigrams2 = get_vector_bigrams(sent2)
            union = np.union1d(bigrams1, bigrams2)
            count1 = np.zeros(len(union))
            count2 = np.zeros(len(union))
            for elem in bigrams1:
                count1[np.where(union == elem)] += 1
            for elem in bigrams2:
                count2[np.where(union == elem)] += 1
            return [cosine_similarity(count1, count2)]
            
                
        def get_cosinus_of_bigrams(sent1, sent2):
            return get_cosine_distance_of_count_of_bigrams(sent1, sent2)              
            
            
            
        preprocessed_x = preprocess(x)
        nBiGms = np.array([get_cosinus_of_bigrams(sent1, sent2) for sent1, sent2 in preprocessed_x])
        return nBiGms

biGramsSpecificModel = BiGramsSpecificModel(train_data, train_labels)
biGramsSpecificModel.cross_validate()

0.6049352032417628

### Noun phrases and verbs comparison
Spacy is a NLP toolkit similar to Stanford CoreNLP, but without requiring a Java server. We will use it to retrieve noun phrases and verbs.

In [29]:
!python3 -m pip install spacy --user
!python3 -m spacy download en_core_web_sm --user

Collecting en_core_web_sm==2.0.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz (37.4MB)
[K     |████████████████████████████████| 37.4MB 19.2MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: en-core-web-sm
  Building wheel for en-core-web-sm (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-sm: filename=en_core_web_sm-2.0.0-cp35-none-any.whl size=37406825 sha256=7e310d00ddd6164c7c0602d415746d2033bc9c6903d7ad7089bf826c23ae7513
  Stored in directory: /tmp/pip-ephem-wheel-cache-_q8i_uv4/wheels/54/7c/d8/f86364af8fbba7258e14adae115f18dd2c91552406edc3fdaa
Successfully built en-core-web-sm
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-2.0.0

[93m    Linking successful[0m
    /home/jordiae/.local/lib/python3.5/site-packages/en_core_web_sm -->
    /home/jordiae/.local/lib/python3.5/site-packages/spacy/data/en_core_web_sm

    You can now load the

In [38]:
import numpy as np
import spacy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import word_tokenize
import math


class NounPhraseModel(Model):
    
    
    def __init__(self, *kwargs):
        self.name = 'NounPhraseModel'
        self.description = 'NounPhraseModel'
        # Load English tokenizer, tagger, parser, NER and word vectors
        self.nlp = spacy.load("en_core_web_sm")
        self.not_allowed_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        super().__init__(*kwargs)
    
        
    def _extract_features(self, x):
        
        
        def get_distance(v1, v2):
            if not v1 or not v2: return 0
            count_pairs = 0
            for sent1 in v1:
                for sent2 in v2:
                    if sent1 in sent2 or sent2 in sent1:
                        count_pairs += 1
            return count_pairs/max(len(v1), len(v2))
        
        
        def get_noun_phrases(sent):
            doc = self.nlp(sent)
            noun_phrases = [chunk.text for chunk in doc.noun_chunks]
            return noun_phrases
        
        
        def get_verbs(sent):
            doc = self.nlp(sent)
            verbs = [token.lemma_ for token in doc if token.pos_ == "VERB"]
            return verbs
        
        
        def get_name_entities(sent):
            doc = self.nlp(sent)
            ne = [entity.text for entity in doc.ents]
            return ne
        
        def get_similarity(sent1, sent2):
            noun_phrases_1 = get_noun_phrases(str(sent1))
            noun_phrases_2 = get_noun_phrases(str(sent2))
            verbs1 = get_verbs(str(sent1))
            verbs2 = get_verbs(str(sent2))
            ne1 = get_name_entities(str(sent1))
            ne2 = get_name_entities(str(sent2))
            dist_nouns = get_distance(noun_phrases_1, noun_phrases_2)
            dist_verbs = get_distance(verbs1, verbs2)
            dist_ne = get_distance(ne1, ne2)
            return [dist_verbs, dist_nouns, dist_ne]
        
        
        
        sm = np.array([get_similarity(sent1, sent2) for sent1, sent2 in x])
        return sm

nounPhraseModel = NounPhraseModel(train_data, train_labels)
nounPhraseModel.cross_validate()

0.4110560196683541

Let us do a summary of what have been experimented with. So far, we have tested many features, still with linear regression. There are many tests that we do not include here, for brevity (eg. the same models but without removing stop words, which gave a worse result). Since linear regression is a very simple algorithm, many times we employed distances and cosine similarities as features, instead of the bags of words (for instance) themselves.

Now, are going to aggregate some features into a single model. The assumption here is that their contribution will be additive or at least will not damage the result. As preliminary research, we conducted some experiments on some of the combinations, but for brevity and computational constraints is not possible to test all the combinations. Instead, we will include the features that seemed the most promising. In the case of the NounPhraseModel, for instance, we will not include it, because it is way slower than the other ones and the obtained results were  mediocre.

### Combining features

Aggregating features gives better results, but we are still using a linear regression. Now, we are going to try non-linear models. Since they are more powerful, it may not be a good idea to reduce the dimensionality of the feature set by applying distances or cosine similarities.

In [99]:
class AggregatedFeaturesModel(Model):
    
    
    def __init__(self, *kwargs):
        self.name = 'AggregatedFeaturesModel'
        self.description = 'AggregatedFeaturesModel'
        self.allowed = np.array(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', \
                                 '(', ')', '.', ' ', '!', '?', 'a', 'b', 'c', 'd', 'e', \
                                 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', \
                                 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'])
        self.not_allowed_words = set(stopwords.words('english'))
        self.stemmer = SnowballStemmer("english")
        self.stop_words = set(stopwords.words('english'))
        super().__init__(*kwargs)
    
        
    def _extract_features(self, x):
        def get_cosine_of_frequencies(vec1, vec2):
            union = np.union1d(vec1, vec2)
            count1 = np.zeros(len(union))
            count2 = np.zeros(len(union))
            for elem in vec1:
                count1[np.where(union == elem)] += 1
            for elem in vec2:
                count2[np.where(union == elem)] += 1
            return [cosine_similarity(count1, count2)]
        
        
        def preprocess(data):
            word_freq = {}
            processed_data = []
            for sent1, sent2 in data:
                allowed_chars1 = get_allowed_characters(sent1)
                allowed_chars2 = get_allowed_characters(sent2)
                processed_data.append([allowed_chars1, allowed_chars2])
            return processed_data

        def get_allowed_characters(sent):
            sent = sent.lower()
            tokens = [word for word in word_tokenize(sent) if not word in self.not_allowed_words]
            stems = [self.stemmer.stem(word) for word in tokens]
            new_sent = " ".join(stems)
            return "".join([char for char in new_sent if char in self.allowed])


        def cosine_similarity(a, b):
            cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
            return cos_sim


        def get_vector_bigrams(sent):
            count = np.zeros(len(self.allowed)**2) # number of possible bigrams
            for i in range(len(sent)-1):
                idx_f = int(np.where(self.allowed == sent[i])[0])
                idx_s = int(np.where(self.allowed == sent[i+1])[0])
                count[idx_f*len(self.allowed) + idx_s] += 1
            return count


        def get_cos_of_bigrams(sent1, sent2):
            return [cosine_similarity(get_vector_bigrams(sent1), get_vector_bigrams(sent2))]   

        def stemmatize(token):
            return self.stemmer.stem(token)


        def get_bag_of_stems(sent1, sent2):
            tokens1 = [word for word in word_tokenize(sent1) if not word in self.stop_words]
            tokens2 = [word for word in word_tokenize(sent2) if not word in self.stop_words]
            stems1 = [stemmatize(word) for word in tokens1]
            stems2 = [stemmatize(word) for word in tokens2]
            return get_cosine_of_frequencies(stems1, stems2)
        
        def lemmatize(token, pos):
            if pos in {'N','V'}:
                return wnl.lemmatize(token.lower(), pos.lower())
            return token.lower()


        def nltk_pos_to_wordnet_pos(nltk_pos):
            mapping = {'NN': wn.NOUN, 'JJ': wn.ADJ, 'VB': wn.VERB, 'RB': wn.ADV}
            if nltk_pos in mapping:
                return mapping[nltk_pos]
            else:
                return None


        def get_synsets(sent):
            saved_synsets = []
            tokens = word_tokenize(sent)
            pos_tags = pos_tag(tokens)
            lemmas = [lemmatize(t, pos) for t, pos in zip(tokens, pos_tags)]
            for token, pos, lemma in zip(tokens, pos_tags, lemmas):
                wordnet_pos = nltk_pos_to_wordnet_pos(pos[1])
                if wordnet_pos is not None:
                    word_synsets = wn.synsets(lemma, wordnet_pos)
                    if len(word_synsets) > 0:
                        most_freq_synset = word_synsets[0] # The most frequent synset is the first one
                        saved_synsets.append(most_freq_synset)
            return saved_synsets


        def get_features_from_word(sent, index, pos):
            word = sent[index]
            features = []
            features.append(str(pos)) # Part-of-Speech                   
            features.append(str(len(word))) # length of word
            features.append(str(index==0)) # beggining of a sentence
            features.append(str(index==len(sent)-1)) # end of sentence
            features.append(str(word.isdigit())) # is a digit
            return features

        def sent2features(sent):
            features = []
            tokens = [word for word in word_tokenize(sent) if not word in self.stop_words]
            features.append(tokens)
            pos_tags = pos_tag(tokens)
            features.append(pos_tags)
            lemmas = [lemmatize(t, pos) for t, pos in zip(tokens, pos_tags)]
            features.append(lemmas)
            synsets = get_synsets(sent)
            if len(synsets) > 0:
                features.append(synsets)
            else:
                features.append([0])
            temp_f = []
            for i in range(len(tokens)):
                temp_f += get_features_from_word(tokens, i, pos_tags[i])
            features.append(temp_f)

            return features
        
        def distance(features1, features2, sent1, sent2, index):
            distances = []
            init = True
            for f1, f2 in zip(features1, features2):
                distances.append(jaccard_distance(set(f1), set(f2)))
            return distances
        def sent2features(sent):
            features = []
            tokens = [word for word in word_tokenize(sent) if not word in self.stop_words]
            features.append(tokens)
            pos_tags = pos_tag(tokens)
            features.append(pos_tags)
            lemmas = [lemmatize(t, pos) for t, pos in zip(tokens, pos_tags)]
            features.append(lemmas)
            synsets = get_synsets(sent)
            if len(synsets) > 0:
                features.append(synsets)
            else:
                features.append([0])
            temp_f = []
            for i in range(len(tokens)):
                temp_f += get_features_from_word(tokens, i, pos_tags[i])
            features.append(temp_f)

            return features
        
        def distance(features1, features2, sent1, sent2, index):
            distances = []
            init = True
            for f1, f2 in zip(features1, features2):
                distances.append(jaccard_distance(set(f1), set(f2)))
            return distances


        preprocessed_x = preprocess(x)
        pairs_of_features = [(sent2features(sent1), sent2features(sent2)) for sent1, sent2 in preprocessed_x]
        distances = np.array([distance(features1, features2, sent1, sent2, index) for index, ((features1, features2), (sent1, sent2)) in enumerate(zip(pairs_of_features, x))])
        
        feat = np.array([get_cos_of_bigrams(sent1, sent2) + get_bag_of_stems(sent1, sent2)
                           for sent1, sent2 in preprocessed_x])
        
        feat = np.concatenate((feat, distances), axis=1)
        return feat


        

In [100]:
class AggregatedFeaturesLRModel(AggregatedFeaturesModel):    
    def __init__(self, *kwargs):
        self.name = 'AggregatedFeaturesLRModel'
        self.description = 'AggregatedFeaturesLRModel'
        super().__init__(*kwargs)

aggregatedFeaturesLRModel = AggregatedFeaturesLRModel(train_data, train_labels)
aggregatedFeaturesLRModel.cross_validate()

0.6442796735068107

In [96]:
from sklearn.ensemble import RandomForestRegressor

class AggregatedFeaturesRFModel(AggregatedFeaturesModel):
    def __init__(self, *kwargs):
        self.name = 'AggregatedFeaturesRFModel'
        self.description = 'AggregatedFeaturesRFModel'
        super().__init__(*kwargs)
        self.regr = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=1)
        
aggregatedFeaturesRFModel = AggregatedFeaturesRFModel(train_data, train_labels)
aggregatedFeaturesRFModel.cross_validate()

0.6491118599806167

In [79]:
from sklearn.ensemble import AdaBoostRegressor

class AggregatedFeaturesAdaBoostModel(AggregatedFeaturesModel):
    def __init__(self, *kwargs):
        self.name = 'AggregatedFeaturesAdaBoostModel'
        self.description = 'AggregatedFeaturesAdaBoostModel'
        super().__init__(*kwargs)
        self.regr = AdaBoostRegressor(random_state=1)
    
aggregatedFeaturesAdaBoostModel = AggregatedFeaturesAdaBoostModel(train_data, train_labels)
aggregatedFeaturesAdaBoostModel.cross_validate()

0.6438921599375783

In [102]:
from sklearn.neural_network import MLPRegressor

class AggregatedFeaturesMLPModel(AggregatedFeaturesModel):
    def __init__(self, *kwargs):
        self.name = 'AggregatedFeaturesAdaBoostModel'
        self.description = 'AggregatedFeaturesAdaBoostModel'
        super().__init__(*kwargs)
        self.regr =  MLPRegressor(
            early_stopping=True, random_state=1, max_iter=1000, hidden_layer_sizes=(10))
        
aggregatedFeaturesMLPModel = AggregatedFeaturesMLPModel(train_data, train_labels)
aggregatedFeaturesMLPModel.cross_validate()

0.6400979480439546

In [104]:
from sklearn.svm import SVR

class AggregatedFeaturesSVRModel(AggregatedFeaturesModel):
    def __init__(self, *kwargs):
        self.name = 'AggregatedFeaturesSVRModel'
        self.description = 'AggregatedFeaturesSVRModel'
        super().__init__(*kwargs)
        self.regr = SVR(gamma='scale')
        
aggregatedFeaturesSVRModel = AggregatedFeaturesSVRModel(train_data, train_labels)
aggregatedFeaturesSVRModel.cross_validate()

0.642291372735609

## Alternative 2: Transfer learning

### Word embeddings

In [6]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip --directory-prefix=data
%cd data
!unzip wiki-news-300d-1M.vec.zip
%cd ..

--2019-12-06 15:12:23--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.20.22.166, 104.20.6.166, 2606:4700:10::6814:6a6, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.20.22.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 681808098 (650M) [application/zip]
Saving to: ‘data/wiki-news-300d-1M.vec.zip’


2019-12-06 15:13:28 (10,1 MB/s) - ‘data/wiki-news-300d-1M.vec.zip’ saved [681808098/681808098]

/home/jordiae/MAI/IHLT-MAI-clone/lab/project/data
Archive:  wiki-news-300d-1M.vec.zip
  inflating: wiki-news-300d-1M.vec   
/home/jordiae/MAI/IHLT-MAI-clone/lab/project


In [3]:
import os
import numpy as np
from nltk import word_tokenize
train_tokens = [word_tokenize(sent1) + word_tokenize(sent2) for sent1, sent2 in train_data]
vocabulary = set([])
for tokenized in train_tokens:
    for token in tokenized:
        vocabulary.add(token)
pretrained_embeddings_path = os.path.join('data', 'wiki-news-300d-1M.vec')
needed_tokens = set()
embedding_table = {}
dim = 0
for line in open(pretrained_embeddings_path, 'r').readlines():
    if dim == 0:
        dim = int(line.split()[1])
        continue
    row = line.split()
    token = row[0]
    if token not in vocabulary:
        continue
    vector = np.array(list(map(float, row[1:])))
    embedding_table[token] = vector

In [45]:
import os
import numpy as np
from nltk import word_tokenize
import scipy
import sklearn

from nltk.corpus import stopwords
import string
 
stop_words = set(stopwords.words('english'))

class WordEmbeddingsModel(Model):
    
    
    def __init__(self, embedding_table, dim, method, *kwargs):
        assert method in ['avg', 'sum', 'max']
        self.embedding_table = embedding_table
        self.dim = dim
        self.method = method
        super().__init__(*kwargs)
    
    def _get_sentence_embedding(self, sent):
        tokenized = word_tokenize(sent)
        def contains_punct(token):
            for c in string.punctuation:
                if c in token:
                    return True
            return False
        tokenized = [token for token in tokenized if not contains_punct(token)] # empitjora
        tokenized = [token.lower() for token in tokenized if token.lower() not in stop_words] # empitjora
        embeddings = np.zeros((len(tokenized), self.dim))
        for idx, token in enumerate(tokenized):
            if token in embedding_table:
                embeddings[idx] = self.embedding_table[token]
            else:
                embeddings[idx] = np.zeros(dim)       
        if self.method == 'avg':
            aggregated_embeddings = np.mean(embeddings, axis=0)
        elif self.method == 'sum':
            aggregated_embeddings = np.sum(embeddings, axis=0)
        elif self.method == 'max':
            aggregated_embeddings = np.max(embeddings, axis=0)
        return aggregated_embeddings
    
    def _get_embeddings_and_cosine_similarity(self, sent1, sent2):
        
        emb1 = self._get_sentence_embedding(sent1)
        emb2 = self._get_sentence_embedding(sent2)
        cos_sim = scipy.spatial.distance.cosine(emb1, emb2)
        emb1_emb2 = np.concatenate([emb1, emb2])
        return cos_sim, emb1_emb2

        
    def _extract_features(self, x):
        pass

In [59]:
class NegatedModel():
            def fit(self, X, y):
                return self
            def predict(self, X):
                pred = []
                for row in X:
                    pred.append(-row[0])
                return pred

class WordEmbeddingsCosineSimilarityModel(WordEmbeddingsModel):
    def __init__(self, *kwargs):
        super().__init__(*kwargs)
        self.regr = NegatedModel()
        self.name = 'WordEmbeddingsCosineSimilarityModel'
        self.description = 'Pre-trained word Embeddings + stop words and punctuation filtering + cosine sim'
        
    def _extract_features(self, x):
        return np.array([[self._get_embeddings_and_cosine_similarity(sent1, sent2)[0]] for sent1, sent2 in x])

wordEmbeddingsCosineSimilarityModel = WordEmbeddingsCosineSimilarityModel(
    embedding_table, dim, 'avg', train_data, train_labels)
wordEmbeddingsCosineSimilarityModel.cross_validate()

0.6213398555295904

In [58]:
from sklearn.neural_network import MLPRegressor

class WordEmbeddingsMLPModel(WordEmbeddingsModel):
    def __init__(self, *kwargs):
        super().__init__(*kwargs)
        self.regr =  MLPRegressor(
            early_stopping=True, random_state=1, max_iter=1000, hidden_layer_sizes=(300, 300))
        self.name = 'WordEmbeddingsMLPModel'
        self.description = 'Pre-trained word Embeddings + stop words and punctuation filtering + 2 layer MLP'
        
    def _extract_features(self, x):
        return np.array([self._get_embeddings_and_cosine_similarity(sent1, sent2)[1] for sent1, sent2 in x])
wordEmbeddingsMLPModel = WordEmbeddingsMLPModel(
    embedding_table, dim, 'avg', train_data, train_labels)
wordEmbeddingsMLPModel.cross_validate()

0.4173597246401437

## Contextual embeddings

In [84]:
!python3 -m pip install transformers --user



In [127]:
import transformers, torch
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased')
model = transformers.BertModel.from_pretrained('bert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
#padding = [0] * ( 128 - len(input_ids))
#input_ids += padding

#attn_mask = input_ids.ne(0) # I added this to create a mask for padded indices
outputs = model(input_ids)#, attention_mask=attn_mask)
last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
last_hidden_states

I1206 18:17:41.475112 140300861650688 file_utils.py:319] https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt not found in cache or force_download set to True, downloading to /tmp/tmppil2k8dg
100%|██████████| 213450/213450 [00:00<00:00, 398927.28B/s]
I1206 18:17:42.593273 140300861650688 file_utils.py:334] copying /tmp/tmppil2k8dg to cache at /home/jordiae/.cache/torch/transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1
I1206 18:17:42.603806 140300861650688 file_utils.py:338] creating metadata file for /home/jordiae/.cache/torch/transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1
I1206 18:17:42.610208 140300861650688 file_utils.py:347] removing temp file /tmp/tmppil2k8dg
I1206 18:17:42.612698 140300861650688 tokenization_utils.py:379] loading file https://s3.amazonaws.com/models.hug

tensor([[[ 0.5132,  0.5097,  0.1991,  ..., -0.3900,  0.4053, -0.2315],
         [ 0.5395, -0.3658,  0.6667,  ..., -0.3920,  0.2505,  0.0202],
         [ 0.7767,  0.6823,  0.7110,  ..., -0.0420, -0.3718,  0.3748],
         ...,
         [ 0.3555,  0.4486,  0.6175,  ..., -0.0388, -0.2631,  0.3514],
         [ 0.7927, -0.1282,  0.2737,  ..., -0.5220,  0.4836,  0.0937],
         [ 1.2903,  1.0356,  0.5054,  ..., -0.4344,  1.1973, -0.4236]]],
       grad_fn=<AddcmulBackward>)

In [128]:
last_hidden_states.detach().numpy().shape
#np.mean(last_hidden_states.detach().numpy()[0], axis=0)

(1, 8, 768)

In [129]:
import scipy

def get_sentence_embedding(sent, model, dim, method='avg'):
    assert method in ['avg', 'sum']
    input_ids = torch.tensor(tokenizer.encode(sent)).unsqueeze(0)
    outputs = model(input_ids)
    last_hidden_states = outputs[0]
    if method == 'avg':
        aggregated_embeddings = np.mean(last_hidden_states.detach().numpy()[0], axis=0)
    elif method == 'sum':
        aggregated_embeddings = np.sum(last_hidden_states.detach().numpy()[0], axis=0)
    return aggregated_embeddings

def get_embeddings_and_cosine_similarity(sent1, sent2, model, dim):
    emb1 = get_sentence_embedding(sent1, model, dim)
    emb2 = get_sentence_embedding(sent2, model, dim)
    cos_sim = scipy.spatial.distance.cosine(emb1, emb2)
    emb1_emb2 = np.concatenate([emb1, emb2])
    return cos_sim, emb1_emb2

cosine_similarities = np.zeros((len(train_data), 1))
embeddings = np.zeros((len(train_data), 768*2))
for idx, (sent1, sent2) in enumerate(train_data):
    if idx % 10 == 0:
        print(idx, 'of', len(train_data))
    cos_sim, emb1_emb2 = get_embeddings_and_cosine_similarity(sent1, sent2, model, 768)
    cosine_similarities[idx] = np.array([cos_sim])
    embeddings[idx] = np.array(emb1_emb2)

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
#cross_validate(cosine_similarities, train_labels, RandomForestRegressor(max_depth=4, random_state=0))

class NegatedModel():
    def fit(self, X, y):
        return self
    def predict(self, X):
        pred = []
        for row in X:
            pred.append(-row[0])
        return pred
print(cross_validate(cosine_similarities, train_labels, NegatedModel()))
print(cross_validate(embeddings, train_labels, LinearRegression()))

0 of 2234
10 of 2234
20 of 2234
30 of 2234
40 of 2234
50 of 2234
60 of 2234
70 of 2234
80 of 2234
90 of 2234
100 of 2234
110 of 2234
120 of 2234
130 of 2234
140 of 2234
150 of 2234
160 of 2234
170 of 2234
180 of 2234
190 of 2234
200 of 2234
210 of 2234
220 of 2234
230 of 2234
240 of 2234
250 of 2234
260 of 2234
270 of 2234
280 of 2234
290 of 2234
300 of 2234
310 of 2234
320 of 2234
330 of 2234
340 of 2234
350 of 2234
360 of 2234
370 of 2234
380 of 2234
390 of 2234
400 of 2234
410 of 2234
420 of 2234
430 of 2234
440 of 2234
450 of 2234
460 of 2234
470 of 2234
480 of 2234
490 of 2234
500 of 2234
510 of 2234
520 of 2234
530 of 2234
540 of 2234
550 of 2234
560 of 2234
570 of 2234
580 of 2234
590 of 2234
600 of 2234
610 of 2234
620 of 2234
630 of 2234
640 of 2234
650 of 2234
660 of 2234
670 of 2234
680 of 2234
690 of 2234
700 of 2234
710 of 2234
720 of 2234
730 of 2234
740 of 2234
750 of 2234
760 of 2234
770 of 2234
780 of 2234
790 of 2234
800 of 2234
810 of 2234
820 of 2234
830 of 2234
840

### Pre-trained sentence embeddings

In [62]:
!python3 -m pip install sentence-transformers --user

Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/b9/6e/5c98f5f26698276bacd09077b039fa1a00797ed080a628ee844bd9f281d4/sentence-transformers-0.2.4.1.tar.gz (49kB)
[K     |████████████████████████████████| 51kB 686kB/s eta 0:00:01
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-0.2.4.1-cp35-none-any.whl size=69118 sha256=7026083d88d1bb4496f67b2f2ad06d0e0a5d64f6f57afe708c328f59dd4456fc
  Stored in directory: /home/jordiae/.cache/pip/wheels/12/a5/1c/03b7d87e027121fe1e23048007594e73f39a23e833658529c7
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-0.2.4.1


In [63]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

SyntaxError: invalid syntax (datasets.py, line 24)

### Google's Universal Sentence Encoder

In [23]:
!python3 -m pip install tensorflow --user
!python3 -m pip install tensorflow_hub --user

Collecting tensorflow
  Using cached https://files.pythonhosted.org/packages/de/f0/96fb2e0412ae9692dbf400e5b04432885f677ad6241c088ccc5fe7724d69/tensorflow-1.14.0-cp36-cp36m-manylinux1_x86_64.whl
Installing collected packages: tensorflow
[33m  The scripts freeze_graph, saved_model_cli, tensorboard, tf_upgrade_v2, tflite_convert, toco and toco_from_protos are installed in '/home/jordiae/.local/bin' which is not on PATH.
Successfully installed tensorflow-1.14.0
[33mYou are using pip version 18.0, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting tensorflow_hub
  Using cached https://files.pythonhosted.org/packages/00/0e/a91780d07592b1abf9c91344ce459472cc19db3b67fdf3a61dca6ebb2f5c/tensorflow_hub-0.7.0-py2.py3-none-any.whl
Installing collected packages: tensorflow-hub
[33m  The script make_image_classifier is installed in '/home/jordiae/.local/bin' which is not on PATH.
Successfully installed tensorflow-hub-0.7.

In [8]:
import scipy
import tensorflow as tf
print(tf.__version__)
import tensorflow_hub as hub

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
embeddings = embed([
    "The quick brown fox jumps over the lazy dog.",
    "I am a sentence for which I would like to get its embedding"])

print(embeddings)

2.0.0
tf.Tensor(
[[-0.03133016 -0.06338634 -0.01607501 ... -0.0324278  -0.04575741
   0.05370457]
 [ 0.05080863 -0.0165243   0.01573782 ...  0.00976661  0.03170121
   0.01788118]], shape=(2, 512), dtype=float32)


In [11]:
class NegatedModel():
    def fit(self, X, y):
        return self
    def predict(self, X):
        pred = []
        for row in X:
            pred.append(-row[0])
        return pred
class UniversalSentenceEncoderEmbeddingsCosineSimModel(Model):
    
    
    def __init__(self, dim, *kwargs):
        self.embed =hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
        self.dim = dim
        super().__init__(*kwargs)
        self.regr = NegatedModel()
    def _get_embeddings_and_cosine_similarity(self, sent1, sent2):
        embeddings = embed([sent1, sent2])
        emb1 = embeddings[0]
        emb2 = embeddings[1]
        cos_sim = scipy.spatial.distance.cosine(emb1, emb2)
        emb1_emb2 = np.concatenate([emb1, emb2])
        return cos_sim, emb1_emb2

        
    def _extract_features(self, x):
        return np.array(
            [[self._get_embeddings_and_cosine_similarity(sent1, sent2)[0]] for sent1, sent2 in x])
    

universalSentenceEncoderEmbeddingsCosineSimModel = UniversalSentenceEncoderEmbeddingsCosineSimModel(
    512, train_data, train_labels)
universalSentenceEncoderEmbeddingsCosineSimModel.cross_validate()

0.5772823478552047

In [None]:
from sklearn.neural_network import MLPRegressor

class UniversalSentenceEncoderEmbeddingsMLPModel(Model):
    
    
    def __init__(self, dim, *kwargs):
        self.embed =hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
        self.dim = dim
        super().__init__(*kwargs)
        self.regr = MLPRegressor(
            early_stopping=True, random_state=1, max_iter=1000, hidden_layer_sizes=(512))
    def _get_embeddings_and_cosine_similarity(self, sent1, sent2):
        embeddings = embed([sent1, sent2])
        emb1 = embeddings[0]
        emb2 = embeddings[1]
        cos_sim = scipy.spatial.distance.cosine(emb1, emb2)
        emb1_emb2 = np.concatenate([emb1, emb2])
        return cos_sim, emb1_emb2

        
    def _extract_features(self, x):
        return np.array(
            [[self._get_embeddings_and_cosine_similarity(sent1, sent2)[0]] for sent1, sent2 in x])

universalSentenceEncoderEmbeddingsMLPModel = UniversalSentenceEncoderEmbeddingsMLPModel(
    512, train_data, train_labels)
universalSentenceEncoderEmbeddingsMLPModel.cross_validate()