# IHLT Final Project: Semantinc Textual Similarity
Jordi Armengol - Joan LLop

## Data collection
We start by downloading the SemEval 2012 dataset.

In [223]:
!mkdir -p data
!wget https://gebakx.github.io/ihlt/sts/resources/train.tgz --directory-prefix=data
!wget https://gebakx.github.io/ihlt/sts/resources/test-gold.tgz --directory-prefix=data
%cd data
!tar zxvf train.tgz
!tar zxvf test-gold.tgz
%cd ..

--2019-12-08 17:06:23--  https://gebakx.github.io/ihlt/sts/resources/train.tgz
Resolving gebakx.github.io (gebakx.github.io)... 185.199.109.153, 185.199.111.153, 185.199.110.153, ...
Connecting to gebakx.github.io (gebakx.github.io)|185.199.109.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 125822 (123K) [application/octet-stream]
Saving to: ‘data/train.tgz.9’


2019-12-08 17:06:24 (1,92 MB/s) - ‘data/train.tgz.9’ saved [125822/125822]

--2019-12-08 17:06:24--  https://gebakx.github.io/ihlt/sts/resources/test-gold.tgz
Resolving gebakx.github.io (gebakx.github.io)... 185.199.111.153, 185.199.110.153, 185.199.108.153, ...
Connecting to gebakx.github.io (gebakx.github.io)|185.199.111.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118094 (115K) [application/octet-stream]
Saving to: ‘data/test-gold.tgz.9’


2019-12-08 17:06:24 (1,82 MB/s) - ‘data/test-gold.tgz.9’ saved [118094/118094]

/home/nhikia/Documents/AI/IHLT/IHLT-MAI/lab/

## Corpus assembly

In [224]:
import os
import numpy as np
train_files = ['MSRpar', 'MSRvid', 'SMTeuroparl']
train_data = []
train_labels = []
for file in train_files:
    with open(os.path.join('data', 'train', 'STS.input.' + file + '.txt'), 'r') as f:
        train_data += [sent.split('\t') for sent in f.readlines()]
    with open(os.path.join('data', 'train', 'STS.gs.' + file + '.txt'), 'r') as f:
        train_labels += [float(num) for num in f.readlines()]
train_data = np.array(train_data)
train_labels = np.array(train_labels)

test_files = ['MSRpar', 'MSRvid', 'SMTeuroparl', 'surprise.OnWN', 'surprise.SMTnews']
test_data = []
test_labels = []
for file in test_files:
    with open(os.path.join('data', 'test-gold', 'STS.input.' + file + '.txt'), 'r') as f:
        test_data += [sent.split('\t') for sent in f.readlines()]
    with open(os.path.join('data', 'test-gold', 'STS.gs.'+ file + '.txt'), 'r') as f:
        test_labels += [float(num) for num in f.readlines()]
test_data = np.array(test_data)
test_labels = np.array(test_labels)

## General class/interface


In [225]:
import pickle
from sklearn.linear_model import LinearRegression
import numpy as np
from scipy import stats
import sklearn

class Model:
    def __init__(self,  x, y, regr=LinearRegression(),):
        self.regr = regr
        self.x_features = self._extract_features(x)
        self.y = y
        self.name = None
        self.description = None
    
    
    def save(self):
        pickle.dump(self, open(self.name + '.model', 'wb').write())
    
    @classmethod
    def load(cls, name):
        return pickle.load(open(self.name + '.model', 'rb').read())
        
    
    def _extract_features(self, x):
        raise NotImplementedError 
    
    def fit(self, x, y):
        self.x_features = self._extract_features(x)
        self.y = y
        self.regr.fit(self.x_features, self.y)
    
    
    def predict(self, new_x):
        new_x_features = self._extract_features(new_x)
        return self.regr.predict(new_x_features)
        
        
    def evaluate(self, true_labels, predicted_labels):
        pearson, p_value = stats.pearsonr(true_labels, predicted_labels)
        return pearson, p_value
    
    
    def cross_validate(self, n_folds=5, seed=1):
        assert self.x_features is not None
        kf = sklearn.model_selection.KFold(n_splits=n_folds, random_state=seed)
        average_pearson = 0
        for train_index, val_index in kf.split(self.x_features):
            X_train, X_val = self.x_features[train_index], self.x_features[val_index]
            y_train, y_val = self.y[train_index], self.y[val_index]
            self.regr.fit(X_train, y_train)
            predicted_labels = self.regr.predict(X_val)
            pearson, _ = self.evaluate(y_val, predicted_labels)
            average_pearson += abs(pearson)
        return average_pearson/n_folds

## Alternative 1: Linguistic feature engineering and classical machine learning

In [264]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from nltk.metrics import jaccard_distance
from nltk.corpus import stopwords

class JaccardModel(Model):
    
    
    def __init__(self, *kwargs):
        self.name = 'prova0'
        self.description = 'primera prova'
        self.stop_words = set(stopwords.words('english')) 
        super().__init__(*kwargs)

    
        
    def _extract_features(self, x):
        def preprocess(sent):
            preprocessed = ""
            for char in sent:
                if char.isdigit():
                    preprocessed += char
                elif char.isalpha():
                    preprocessed += char.lower()
                elif char == ' ':
                    preprocessed += char

            return str(preprocessed)

        x = [[preprocess(sent1), preprocess(sent2)] for sent1, sent2 in x]
        
        def lemmatize(token, pos):
            if pos in {'N','V'}:
                return wnl.lemmatize(token.lower(), pos.lower())
            return token.lower()


        def nltk_pos_to_wordnet_pos(nltk_pos):
            mapping = {'NN': wn.NOUN, 'JJ': wn.ADJ, 'VB': wn.VERB, 'RB': wn.ADV}
            if nltk_pos in mapping:
                return mapping[nltk_pos]
            else:
                return None


        def get_synsets(sent):
            saved_synsets = []
            tokens = word_tokenize(sent)
            pos_tags = pos_tag(tokens)
            lemmas = [lemmatize(t, pos) for t, pos in zip(tokens, pos_tags)]
            for token, pos, lemma in zip(tokens, pos_tags, lemmas):
                wordnet_pos = nltk_pos_to_wordnet_pos(pos[1])
                if wordnet_pos is not None:
                    word_synsets = wn.synsets(lemma, wordnet_pos)
                    if len(word_synsets) > 0:
                        most_freq_synset = word_synsets[0] # The most frequent synset is the first one
                        saved_synsets.append(most_freq_synset)
            return saved_synsets


        def get_features_from_word(sent, index, pos):
            word = sent[index]
            features = []
            features.append(str(pos)) # Part-of-Speech                   
            features.append(str(len(word))) # length of word
            features.append(str(index==0)) # beggining of a sentence
            features.append(str(index==len(sent)-1)) # end of sentence
            features.append(str(word.isdigit())) # is a digit
            return features

        def sent2features(sent):
            features = []
            tokens = [word for word in word_tokenize(sent) if not word in self.stop_words]
            features.append(tokens)
            pos_tags = pos_tag(tokens)
            features.append(pos_tags)
            lemmas = [lemmatize(t, pos) for t, pos in zip(tokens, pos_tags)]
            features.append(lemmas)
            synsets = get_synsets(sent)
            if len(synsets) > 0:
                features.append(synsets)
            else:
                features.append([0])
            temp_f = []
            for i in range(len(tokens)):
                temp_f += get_features_from_word(tokens, i, pos_tags[i])
            features.append(temp_f)

            return features
        
        def distance(features1, features2, sent1, sent2, index):
            distances = []
            init = True
            for f1, f2 in zip(features1, features2):
                distances.append(jaccard_distance(set(f1), set(f2)))

            # ...
            return distances

        
        pairs_of_features = [(sent2features(sent1), sent2features(sent2)) for sent1, sent2 in x]
        distances = np.array([distance(features1, features2, sent1, sent2, index) for index, ((features1, features2), (sent1, sent2)) in enumerate(zip(pairs_of_features, x))])
        return distances

In [265]:
prova0 = JaccardModel(train_data, train_labels)
prova0.cross_validate()

['sources', 'close', 'sale', 'said', 'vivendi', 'keeping', 'door', 'open', 'bids', 'hoped', 'see', 'bidders', 'interested', 'individual', 'assets', 'team']
['sources', 'close', 'sale', 'said', 'vivendi', 'keeping', 'door', 'open', 'bids', 'next', 'day', 'two']
['micron', 'declared', 'first', 'quarterly', 'profit', 'three', 'years']
['microns', 'numbers', 'also', 'marked', 'first', 'quarterly', 'profit', 'three', 'years', 'dram', 'manufacturer']
['fines', 'part', 'failed', 'republican', 'efforts', 'force', 'entice', 'democrats', 'return']
['perry', 'said', 'backs', 'senates', 'efforts', 'including', 'fines', 'force', 'democrats', 'return']
['american', 'anglican', 'council', 'represents', 'episcopalian', 'conservatives', 'said', 'seek', 'authorization', 'create', 'separate', 'group']
['american', 'anglican', 'council', 'represents', 'episcopalian', 'conservatives', 'said', 'seek', 'authorization', 'create', 'separate', 'province', 'north', 'america', 'last', 'weeks', 'actions']
['techlo

['dow', 'jones', 'industrial', 'average', 'dji', '775', 'points', '008', 'percent', '917547']
['monkeypox', 'usually', 'found', 'central', 'western', 'africa']
['prairie', 'dogs', 'usually', 'found', 'southwestern', 'western', 'states', 'arent', 'indigenous', 'wisconsin']
['board', 'chancellor', 'robert', 'bennett', 'declined', 'comment', 'personnel', 'matters', 'tuesday']
['mr', 'mills', 'declined', 'comment', 'yesterday', 'saying', 'never', 'discussed', 'personnel', 'matters']
['case', 'court', 'held', 'cincinnati', 'violated', 'first', 'amendment', 'banning', 'advertising', 'pamphlets', 'interest', 'aesthetics']
['case', 'court', 'held', 'city', 'cincinnati', 'violated', 'first', 'amendment', 'banning', 'interest', 'aesthetics', 'advertising', 'pamphlets']
['film', 'second', 'trilogy', 'wrap', 'november', 'matrix', 'revolutions']
['reloaded', 'second', 'installment', 'trilogy', 'matrix', 'revolutions', 'slated', 'debut', 'november']
['addition', 'justice', 'department', 'said', 'fbi

['wells', 'series', 'include', 'nbcs', 'er', 'third', 'watch']
['wells', 'series', 'include', 'nbcs', 'er', 'third', 'watch']
['thirtythree', '42', 'men', 'arrested', 'wednesday', 'evening', 'said', 'daniel', 'bogden', 'us', 'attorney', 'nevada']
['thirtyfour', 'men', 'arrested', 'others', 'sought', 'us', 'attorney', 'daniel', 'bogden', 'said', 'yesterday']
['advanced', 'micro', 'devices', 'said', 'fujitsu', 'siemens', 'computers', 'offering', 'highend', 'workstation', 'based', 'amds', 'opteron', '200', 'series']
['fujitsu', 'siemens', 'computers', 'tuesday', 'made', 'good', 'promise', 'offer', 'workstation', 'based', 'advanced', 'micro', 'devices', 'opteron', 'processor']
['fivetime', 'tour', 'de', 'france', 'winner', 'cancer', 'survivor', 'lance', 'armstrong', 'words', 'advice', 'cancer', 'survivors', 'denver', 'friday']
['fivetime', 'tour', 'de', 'france', 'winner', 'lance', 'armstrong', 'denver', 'today', 'meeting', 'surviving', 'cancer']
['survives', 'four', 'children', 'sons', 'a

['garner', 'said', 'selfproclaimed', 'baghdad', 'mayor', 'mohammed', 'mohsen', 'zubaidi', 'released', '48', 'hours', 'detention', 'late', 'april']
['enron', 'company', 'executives', 'engaged', 'widespread', 'pervasive', 'fraud', 'prosecutor', 'samuel', 'buell', 'told', 'associated', 'press']
['enron', 'company', 'executives', 'engaged', 'widespread', 'pervasive', 'fraud', 'manipulate', 'companys', 'earnings', 'results', 'buell', 'said']
['seems', 'dealing', 'bragging', 'rights', 'wins', 'loses', 'said', 'gammerman', 'heard', 'case', 'without', 'jury']
['leaving', 'aside', 'attorney', 'fees', 'dealing', 'bragging', 'rights', 'wins', 'loses', 'said', 'gammerman']
['customers', 'pay', '1219', 'entrance', 'fee', 'get', 'sms', '2003', '10', 'device', 'client', 'access', 'licenses']
['retail', 'pricing', 'sms', '2003', '10', 'device', 'client', 'access', 'licenses', '1219']
['results', '2001', 'aboriginal', 'peoples', 'survey', 'released', 'yesterday', 'statistics', 'canada', 'suggest', 'liv

['besides', 'battling', 'sales', 'slump', 'siebel', 'also', 'sparring', 'investors', 'upset', 'huge', 'stock', 'option', 'windfalls', 'company', 'managers', 'pocketed']
['besides', 'sales', 'slump', 'siebel', 'sparring', 'shareholders', 'management', 'stock', 'option', 'windfalls']
['compared', 'yearearlier', 'profit', '102', 'million', '13', 'cents', 'share']
['double', '102', 'million', '13', 'cents', 'share', 'yearearlier', 'quarter']
['stanford', '5117', 'rice', '5712', 'play', 'national', 'championship', 'tonight']
['rice', '5712', 'stanford', '5117', 'meet', 'winnertakeall', 'matchup', '605', 'pm', 'monday']
['economy', 'nonetheless', 'yet', 'exhibit', 'sustainable', 'growth']
['economy', 'hasnt', 'shown', 'signs', 'sustainable', 'growth']
['32count', 'indictment', 'strikes', 'one', 'top', 'targets', 'drug', 'trafficking', 'world', 'us', 'attorney', 'marcos', 'jimenez', 'said']
['newly', 'unsealed', '32count', 'indictment', 'alleges', 'money', 'laundering', 'conspiracy', 'strikes

['woman', 'slicing', 'garlic']
['woman', 'slicing', 'big', 'pepper']
['person', 'making', 'bed']
['person', 'eating', 'table']
['man', 'buttering', 'bread']
['man', 'stirring', 'rice']
['woman', 'hitting', 'man']
['woman', 'taking', 'bath']
['man', 'playing', 'keyboard']
['man', 'playing', 'guitar']
['man', 'playing', 'violin']
['man', 'playing', 'harp']
['three', 'men', 'dancing']
['women', 'dancing']
['man', 'speaking']
['man', 'dancing']
['man', 'dancing']
['man', 'speaking']
['woman', 'smoking']
['man', 'walking']
['man', 'dancing']
['man', 'thinking']
['man', 'erased', 'man', 'board']
['man', 'erased', 'work', 'board']
['man', 'riding', 'motorcycle']
['man', 'riding', 'motorcycle', 'town']
['man', 'riding', 'motorcycle', 'highway']
['man', 'riding', 'motorcycle']
['woman', 'slicing', 'meat']
['woman', 'slicing', 'onions']
['person', 'combing', 'cat', 'hair']
['person', 'brushing', 'cat']
['woman', 'cuts', 'broccoli']
['woman', 'cutting', 'broccoli']
['man', 'playing', 'piano']
['w

['monkey', 'swung', 'branch', 'branch']
['children', 'jumping', 'trampoline']
['two', 'boy', 'playing', 'trampoline']
['person', 'folding', 'square', 'paper', 'piece']
['person', 'folds', 'piece', 'paper']
['woman', 'knife', 'slicing', 'pepper']
['women', 'slicing', 'green', 'pepper']
['panda', 'bear', 'chewing', 'stick']
['baby', 'panda', 'plays', 'stick']
['man', 'adding', 'oil', 'car']
['man', 'putting', 'antifreeze', 'car']
['people', 'lifting', 'bags']
['several', 'people', 'carrying', 'large', 'bags']
['boy', 'playing', 'piano']
['woman', 'pouring', 'oil', 'pan']
['man', 'talking', 'phone']
['man', 'moonwalking', 'across', 'store']
['man', 'slicing', 'carrot', 'machine']
['man', 'climbing', 'rope']
['boy', 'played', 'keyboard']
['person', 'playing', 'piano', 'keyboard']
['cat', 'playing', 'antenna']
['boy', 'playing', 'mud']
['woman', 'slicing', 'pumpkin']
['person', 'slicing', 'cantaloupe']
['woman', 'braiding', 'hair']
['man', 'riding', 'motorcycle']
['man', 'cutting', 'potato'

['men', 'skating', 'road']
['two', 'boys', 'skate', 'boarding']
['man', 'pours', 'liquid', 'pan']
['chef', 'poured', 'oil', 'pan']
['boy', 'playing', 'piano', 'singing']
['pig', 'agitating', 'lion']
['woman', 'slicing', 'big', 'pepper']
['man', 'playing', 'instrument']
['person', 'boiling', 'noodles']
['cat', 'licking', 'bottle']
['cat', 'ferret', 'playing']
['ferret', 'kitten', 'play', 'together']
['monkey', 'walking', 'water']
['guy', 'playing', 'trumpet']
['woman', 'putting', 'makeup']
['band', 'singing']
['person', 'slicing', 'onions']
['woman', 'chopping', 'herbs']
['dog', 'driving', 'car']
['guy', 'talking', 'microphone']
['man', 'sang', 'stage', 'microphone']
['man', 'suit', 'standing', 'microphone', 'singing']
['car', 'backs', 'space']
['car', 'taking', 'reverse']
['animal', 'big', 'eyes', 'eating']
['slow', 'loris', 'eating']
['man', 'holding', 'frog']
['man', 'emptying', 'plastic', 'container']
['baby', 'crawling', 'happily']
['cat', 'walking', 'hardwood', 'floor']
['man', 't

['european', 'parliament', 'called', 'resolution', '16', 'march', '2000', 'initiatives', 'european', 'council', 'presidency', 'intend', 'take', 'play', 'active', 'role', 'order', 'ensure', 'full', 'implementation', 'un', 'peace', 'plan']
['unlikely', 'planned', 'protection', 'universal', 'provision', 'services', 'means', 'compensatory', 'fund', 'result', 'private', 'profits', 'ploughed', 'back', 'public', 'services', 'last']
['maintenance', 'universal', 'service', 'assistance', 'compensation', 'fund', 'enable', 'engaging', 'advantage', 'public', 'service', 'probably', 'long', 'fire']
['wonder', 'commissioner', 'planned', 'steps', 'respect', 'demonstrate', 'really', 'consider', 'kostunica', 'lawfully', 'elected', 'representative', 'serbian', 'people', 'partner', 'european', 'union', 'involved', 'today']
['wonder', 'commissioner', 'already', 'provides', 'steps', 'make', 'possible', 'show', 'consider', 'koitunica', 'legally', 'elected', 'representative', 'serbian', 'people', 'partner', 'e

['firstly', 'simplification', 'clarification', 'treaties']
['first', 'simplification', 'clarification', 'treaties']
['kind', 'future', 'would', 'also', 'like', 'see', 'countries', 'within', 'european', 'union']
['development', 'hope', 'member', 'states', 'european', 'union']
['mr', 'president', 'voted', 'garcíamargallo', 'marfil', 'report', 'taxation', 'electronic', 'commerce']
['mr', 'president', 'voted', 'garcíamargallo', 'marfil', 'report', 'concerns', 'taxation', 'services', 'provided', 'electronic', 'means']
['firstly', 'simplification', 'clarification', 'treaties']
['firstly', 'simplification', 'treaties']
['thank', 'much', 'commissioner']
['thank', 'much', 'commissioner']
['therefore', 'also', 'believe', 'reconciliation', 'sustainable', 'government', 'courageously', 'found', 'solution', 'university', 'issue', 'many', 'issues', 'needs', 'visible', 'signs', 'success', 'powerful', 'support', 'else', 'peace', 'throughout', 'region', 'threat']
['also', 'believe', 'reconciliation', 'h

['labelling', 'beef', 'decided', 'minimum', 'labelling', 'force', 'two', 'half', 'months', 'make', 'possible', 'properly', 'remind', 'origin', 'animals', 'banned', 'late', 'specified', 'risk', 'material']
['worst', 'situation', 'women', 'concerned', 'also', 'come', 'distant', 'countries', 'taken', 'work', 'desperation', 'way', 'continuing', 'provide']
['things', 'worse', 'comes', 'womens', 'distant', 'countries', 'accepted', 'work', 'need', 'option', 'continue', 'provide', 'vital', 'needs']
['hand', 'would', 'like', 'given', 'chance', 'prove', 'europe', 'able', 'develop', 'constitutional', 'equality', 'serbia', 'montenegro', 'within', 'yugoslavia', 'recognising', 'genuine', 'principles', 'democracy']
['furthermore', 'would', 'like', 'chance', 'prove', 'europe', 'able', 'develop', 'constitutional', 'equality', 'serbia', 'montenegro', 'yugoslavia', 'recognising', 'genuine', 'democratic', 'principles']
['keep', 'pressure', 'strengthening', 'position', 'revenue', 'generating', 'delivering'

['mr', 'president', 'ladies', 'gentlemen', 'like', 'say', 'words', 'joint', 'resolution', 'course', 'passed']
['mr', 'president', 'ladies', 'gentlemen', 'would', 'like', 'speak', 'joint', 'resolution', 'shall', 'course', 'voting']
['question', '6', 'h088600']
['question', '6', 'h0886', '00']
['priority', 'fight', 'order', 'ensure', 'affirmation', 'fundamental', 'rights', 'put', 'practice', 'political', 'decisions']
['fight', 'priority', 'affirmation', 'fundamental', 'rights', 'implemented', 'political', 'choices']
['council', 'europe', 'along', 'court', 'human', 'rights', 'wealth', 'experience', 'forms', 'supervision', 'build']
['court', 'human', 'rights', 'council', 'europe', 'also', 'solid', 'experience', 'regarding', 'forms', 'control', 'take', 'basis']
['reiterating', 'calls', 'made', 'european', 'parliament', 'resolution', '16', 'march', '2000', 'initiatives', 'presidency', 'european', 'council', 'propose', 'take', 'view', 'playing', 'active', 'role', 'guarantee', 'full', 'complet

['telling', 'loud', 'clear', 'kostunica', 'must', 'given', 'chance', 'therefore', 'hope', 'give', 'amnesty', 'next', 'couple', 'weeks', 'months', 'meanwhile', 'want', 'introduction', 'two', 'new', 'budget', 'lines', 'democratisation', 'reconstruction', 'accompanied', 'political', 'conditions', 'political', 'budgetary', 'perspective']
['say', 'clearly', 'need', 'give', 'opportunity', 'koitunica', 'therefore', 'hope', 'ensure', 'amnesty', 'coming', 'weeks', 'months', 'meanwhile', 'hope', 'however', 'political', 'terms', 'budgetary', 'terms', 'political', 'conditions', 'linked', 'introduction', 'two', 'new', 'budget', 'lines', 'namely', 'democratisation', 'reconstruction']
['aim', 'annual', 'review', 'identify', 'potential', 'improvements']
['annual', 'report', 'designed', 'identify', 'potential', 'improvements']
['action', 'needed', 'quickly', 'decided', 'include', 'item', 'agenda']
['urgent', 'decided', 'put', 'item', 'agenda']
['one', 'could', 'indeed', 'wish', 'improvement', 'honestly

0.5744670790260961

## Validate

In [43]:
from scipy import stats
import sklearn
def evaluate(true_labels, predicted_labels):
    pearson, p_value = stats.pearsonr(true_labels, predicted_labels)
    return pearson, p_value
def cross_validate(data, labels, model, n_folds=5, seed=1):
    kf = sklearn.model_selection.KFold(n_splits=n_folds, random_state=seed)
    average_pearson = 0
    for train_index, val_index in kf.split(data):
        X_train, X_val = data[train_index], data[val_index]
        y_train, y_val = labels[train_index], labels[val_index]
        m = model.fit(X_train, y_train)
        predicted_labels = model.predict(X_val)
        pearson, _ = evaluate(y_val, predicted_labels)
        average_pearson += pearson
    return average_pearson/n_folds

In [7]:
cross_validate(distances, train_labels, LinearRegression())

0.4109743529779384

## Alternative 2: Transfer learning

### Word embeddings

In [6]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip --directory-prefix=data
%cd data
!unzip wiki-news-300d-1M.vec.zip
%cd ..

--2019-12-06 15:12:23--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.20.22.166, 104.20.6.166, 2606:4700:10::6814:6a6, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.20.22.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 681808098 (650M) [application/zip]
Saving to: ‘data/wiki-news-300d-1M.vec.zip’


2019-12-06 15:13:28 (10,1 MB/s) - ‘data/wiki-news-300d-1M.vec.zip’ saved [681808098/681808098]

/home/jordiae/MAI/IHLT-MAI-clone/lab/project/data
Archive:  wiki-news-300d-1M.vec.zip
  inflating: wiki-news-300d-1M.vec   
/home/jordiae/MAI/IHLT-MAI-clone/lab/project


In [40]:
import os
import numpy as np
from nltk import word_tokenize
train_tokens = [word_tokenize(sent1) + word_tokenize(sent2) for sent1, sent2 in train_data]
vocabulary = set([])
for tokenized in train_tokens:
    for token in tokenized:
        vocabulary.add(token)
pretrained_embeddings_path = os.path.join('data', 'wiki-news-300d-1M.vec')
needed_tokens = set()
embedding_table = {}
dim = 0
for line in open(pretrained_embeddings_path, 'r').readlines():
    if dim == 0:
        dim = int(line.split()[1])
        continue
    row = line.split()
    token = row[0]
    if token not in vocabulary:
        continue
    vector = np.array(list(map(float, row[1:])))
    embedding_table[token] = vector

In [123]:
import scipy
import sklearn

from nltk.corpus import stopwords
import string
 
stop_words = set(stopwords.words('english'))

def get_sentence_embedding(sent, embedding_table, dim, method='max'):
    assert method in ['avg', 'sum', 'max']
    tokenized = word_tokenize(sent)
    def contains_punct(token):
        for c in string.punctuation:
            if c in token:
                return True
        return False
    tokenized = [token for token in tokenized if not contains_punct(token)] # empitjora
    tokenized = [token.lower() for token in tokenized if token.lower() not in stop_words] # empitjora
    embeddings = np.zeros((len(tokenized), dim))
    for idx, token in enumerate(tokenized):
        if token in embedding_table:
            embeddings[idx] = embedding_table[token]
        else:
            embeddings[idx] = np.zeros(dim)       
    if method == 'avg':
        aggregated_embeddings = np.mean(embeddings, axis=0)
    elif method == 'sum':
        aggregated_embeddings = np.sum(embeddings, axis=0)
    elif method == 'max':
        aggregated_embeddings = np.max(embeddings, axis=0)
    return aggregated_embeddings

from sklearn.decomposition import PCA
def get_embeddings_and_cosine_similarity(sent1, sent2, embedding_table, dim):
    emb1 = get_sentence_embedding(sent1, embedding_table, dim)
    emb2 = get_sentence_embedding(sent2, embedding_table, dim)
    cos_sim = scipy.spatial.distance.cosine(emb1, emb2)
    emb1_emb2 = np.concatenate([emb1, emb2])
    return cos_sim, emb1_emb2

cosine_similarities = np.zeros((len(train_data), 1))
embeddings = np.zeros((len(train_data), dim*2))
for idx, (sent1, sent2) in enumerate(train_data):
    cos_sim, emb1_emb2 = get_embeddings_and_cosine_similarity(sent1, sent2, embedding_table, dim)
    cosine_similarities[idx] = np.array([cos_sim])
    embeddings[idx] = np.array(emb1_emb2)

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
#cross_validate(cosine_similarities, train_labels, RandomForestRegressor(max_depth=4, random_state=0))

class NegatedModel():
    def fit(self, X, y):
        return self
    def predict(self, X):
        pred = []
        for row in X:
            pred.append(-row[0])
        return pred
print(cross_validate(cosine_similarities, train_labels, NegatedModel()))
print(cross_validate(embeddings, train_labels, LinearRegression()))
#print(cross_validate(embeddings, train_labels, MLPRegressor(early_stopping=True, random_state=1, max_iter=1000, hidden_layer_sizes=(300,300))))

0.6282995295159088
0.3378078759991627


In [124]:
np.save('outfile', cosine_similarities)

In [47]:
cos = np.load('outfile.npy')
cos[0]

array([0.05247012])

## Contextual embeddings

In [84]:
!python3 -m pip install transformers --user



In [127]:
import transformers, torch
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased')
model = transformers.BertModel.from_pretrained('bert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
#padding = [0] * ( 128 - len(input_ids))
#input_ids += padding

#attn_mask = input_ids.ne(0) # I added this to create a mask for padded indices
outputs = model(input_ids)#, attention_mask=attn_mask)
last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
last_hidden_states

I1206 18:17:41.475112 140300861650688 file_utils.py:319] https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt not found in cache or force_download set to True, downloading to /tmp/tmppil2k8dg
100%|██████████| 213450/213450 [00:00<00:00, 398927.28B/s]
I1206 18:17:42.593273 140300861650688 file_utils.py:334] copying /tmp/tmppil2k8dg to cache at /home/jordiae/.cache/torch/transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1
I1206 18:17:42.603806 140300861650688 file_utils.py:338] creating metadata file for /home/jordiae/.cache/torch/transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1
I1206 18:17:42.610208 140300861650688 file_utils.py:347] removing temp file /tmp/tmppil2k8dg
I1206 18:17:42.612698 140300861650688 tokenization_utils.py:379] loading file https://s3.amazonaws.com/models.hug

tensor([[[ 0.5132,  0.5097,  0.1991,  ..., -0.3900,  0.4053, -0.2315],
         [ 0.5395, -0.3658,  0.6667,  ..., -0.3920,  0.2505,  0.0202],
         [ 0.7767,  0.6823,  0.7110,  ..., -0.0420, -0.3718,  0.3748],
         ...,
         [ 0.3555,  0.4486,  0.6175,  ..., -0.0388, -0.2631,  0.3514],
         [ 0.7927, -0.1282,  0.2737,  ..., -0.5220,  0.4836,  0.0937],
         [ 1.2903,  1.0356,  0.5054,  ..., -0.4344,  1.1973, -0.4236]]],
       grad_fn=<AddcmulBackward>)

In [128]:
last_hidden_states.detach().numpy().shape
#np.mean(last_hidden_states.detach().numpy()[0], axis=0)

(1, 8, 768)

In [129]:
import scipy

def get_sentence_embedding(sent, model, dim, method='avg'):
    assert method in ['avg', 'sum']
    input_ids = torch.tensor(tokenizer.encode(sent)).unsqueeze(0)
    outputs = model(input_ids)
    last_hidden_states = outputs[0]
    if method == 'avg':
        aggregated_embeddings = np.mean(last_hidden_states.detach().numpy()[0], axis=0)
    elif method == 'sum':
        aggregated_embeddings = np.sum(last_hidden_states.detach().numpy()[0], axis=0)
    return aggregated_embeddings

def get_embeddings_and_cosine_similarity(sent1, sent2, model, dim):
    emb1 = get_sentence_embedding(sent1, model, dim)
    emb2 = get_sentence_embedding(sent2, model, dim)
    cos_sim = scipy.spatial.distance.cosine(emb1, emb2)
    emb1_emb2 = np.concatenate([emb1, emb2])
    return cos_sim, emb1_emb2

cosine_similarities = np.zeros((len(train_data), 1))
embeddings = np.zeros((len(train_data), 768*2))
for idx, (sent1, sent2) in enumerate(train_data):
    if idx % 10 == 0:
        print(idx, 'of', len(train_data))
    cos_sim, emb1_emb2 = get_embeddings_and_cosine_similarity(sent1, sent2, model, 768)
    cosine_similarities[idx] = np.array([cos_sim])
    embeddings[idx] = np.array(emb1_emb2)

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
#cross_validate(cosine_similarities, train_labels, RandomForestRegressor(max_depth=4, random_state=0))

class NegatedModel():
    def fit(self, X, y):
        return self
    def predict(self, X):
        pred = []
        for row in X:
            pred.append(-row[0])
        return pred
print(cross_validate(cosine_similarities, train_labels, NegatedModel()))
print(cross_validate(embeddings, train_labels, LinearRegression()))

0 of 2234
10 of 2234
20 of 2234
30 of 2234
40 of 2234
50 of 2234
60 of 2234
70 of 2234
80 of 2234
90 of 2234
100 of 2234
110 of 2234
120 of 2234
130 of 2234
140 of 2234
150 of 2234
160 of 2234
170 of 2234
180 of 2234
190 of 2234
200 of 2234
210 of 2234
220 of 2234
230 of 2234
240 of 2234
250 of 2234
260 of 2234
270 of 2234
280 of 2234
290 of 2234
300 of 2234
310 of 2234
320 of 2234
330 of 2234
340 of 2234
350 of 2234
360 of 2234
370 of 2234
380 of 2234
390 of 2234
400 of 2234
410 of 2234
420 of 2234
430 of 2234
440 of 2234
450 of 2234
460 of 2234
470 of 2234
480 of 2234
490 of 2234
500 of 2234
510 of 2234
520 of 2234
530 of 2234
540 of 2234
550 of 2234
560 of 2234
570 of 2234
580 of 2234
590 of 2234
600 of 2234
610 of 2234
620 of 2234
630 of 2234
640 of 2234
650 of 2234
660 of 2234
670 of 2234
680 of 2234
690 of 2234
700 of 2234
710 of 2234
720 of 2234
730 of 2234
740 of 2234
750 of 2234
760 of 2234
770 of 2234
780 of 2234
790 of 2234
800 of 2234
810 of 2234
820 of 2234
830 of 2234
840

In [None]:
# Template




## Bag of words

In [266]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import pos_tag
import numpy as np
from nltk.corpus import stopwords 
from nltk.metrics import jaccard_distance

class BagWordsModel(Model):
    
    
    def __init__(self, *kwargs):
        self.name = 'BagWords'
        self.description = 'We get the bag of words of both setences, calculate the union, get the count of each word of the union for each sentence and get the distance as the sum of the element-wise distance'
        self.stop_words = set(stopwords.words('english'))
        super().__init__(*kwargs)
    
        
    def _extract_features(self, x):
        def preprocess(data):
            processed_data = []
            for sent1, sent2 in data:
                sent1 = sent1.lower()
                sent2 = sent2.lower()
                processed_sent1 = ""
                processed_sent2 = ""
                for char in sent1: 
                    if char.isalnum() or char == ' ': 
                        processed_sent1 += char 
                for char in sent2: 
                    if char.isalnum() or char == ' ': 
                        processed_sent2 += char
                processed_data.append([processed_sent1, processed_sent2])
            return processed_data
        
        
        def cosine_similarity(a, b):
            cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
            return cos_sim
        
        
        def get_bag_of_words(sent1, sent2):
            tokens1 = [word for word in word_tokenize(sent1) if not word in self.stop_words]
            tokens2 = [word for word in word_tokenize(sent2) if not word in self.stop_words]
            union = np.union1d(tokens1, tokens2)
            count1 = np.zeros(len(union))
            count2 = np.zeros(len(union))
            for token in tokens1:
                count1[np.where(union == token)] += 1
            for token in tokens2:
                count2[np.where(union == token)] += 1
            return [cosine_similarity(count1, count2)]
#             return [np.average(np.abs(count1 - count2))]
                
            
            
        preprocessed_x = preprocess(x)
        BoW = np.array([get_bag_of_words(sent1, sent2) for sent1, sent2 in preprocessed_x])
        return BoW

model0 = BagWordsModel(train_data, train_labels)
model0.cross_validate()

0.5968105783849428

In [None]:
best: 0.5968105783849428

## Bag of lemmas

In [364]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import pos_tag
import numpy as np
from nltk.corpus import stopwords 
from nltk.metrics import jaccard_distance


class BagLemmasModel(Model):
    
    
    def __init__(self, *kwargs):
        self.name = 'BagLemmasModel'
        self.description = 'We get the bag of lemmas of both setences, calculate the union, get the count of each word of the union for each sentence and get the cosine distance between them'
        self.stop_words = set(stopwords.words('english'))
        self.wnl = WordNetLemmatizer()
        super().__init__(*kwargs)
    
        
    def _extract_features(self, x):
        def preprocess(data):
            processed_data = []
            for sent1, sent2 in data:
                sent1 = sent1.lower()
                sent2 = sent2.lower()
                processed_sent1 = ""
                processed_sent2 = ""
                for char in sent1: 
                    if char.isalnum() or char == ' ': 
                        processed_sent1 += char 
                for char in sent2: 
                    if char.isalnum() or char == ' ': 
                        processed_sent2 += char
                processed_data.append([processed_sent1, processed_sent2])
            return processed_data
        
        
        def cosine_similarity(a, b):
            cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
            return cos_sim
        
        
        def get_cosine_of_frequencies(vec1, vec2):
            union = np.union1d(vec1, vec2)
            count1 = np.zeros(len(union))
            count2 = np.zeros(len(union))
            for elem in vec1:
                count1[np.where(union == elem)] += 1
            for elem in vec2:
                count2[np.where(union == elem)] += 1
            return [cosine_similarity(count1, count2)]
        
        
        def lemmatize(token):
            return self.wnl.lemmatize(token)
        
        
        def get_bag_of_lemmas(sent1, sent2):
            tokens1 = [word for word in word_tokenize(sent1) if not word in self.stop_words]
            tokens2 = [word for word in word_tokenize(sent2) if not word in self.stop_words]
            lemmas1 = [lemmatize(word) for word in tokens1]
            lemmas2 = [lemmatize(word) for word in tokens2]
            return get_cosine_of_frequencies(lemmas1, lemmas2)
#             return [np.average(np.abs(count1 - count2))]
                
            
            
        preprocessed_x = preprocess(x)
        BoW = np.array([get_bag_of_lemmas(sent1, sent2) for sent1, sent2 in preprocessed_x])
        return BoW

model0 = BagLemmasModel(train_data, train_labels)
model0.cross_validate()

0.6076451713539923

In [None]:
best: 0.6076451713539923

## Bag of stems

In [365]:
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet as wn
from nltk import pos_tag
import numpy as np
from nltk.corpus import stopwords 
from nltk.metrics import jaccard_distance


class BagStemsModel(Model):
    
    
    def __init__(self, *kwargs):
        self.name = 'BagWords'
        self.description = 'We get the bag of stems of both setences, calculate the union, get the count of each word of the union for each sentence and get the cosine distance between them'
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        super().__init__(*kwargs)
    
        
    def _extract_features(self, x):
        def preprocess(data):
            processed_data = []
            for sent1, sent2 in data:
                sent1 = sent1.lower()
                sent2 = sent2.lower()
                processed_sent1 = ""
                processed_sent2 = ""
                for char in sent1: 
                    if char.isalnum() or char == ' ': 
                        processed_sent1 += char
                for char in sent2: 
                    if char.isalnum() or char == ' ': 
                        processed_sent2 += char
                processed_data.append([processed_sent1, processed_sent2])
            return processed_data
        
        
        def cosine_similarity(a, b):
            cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
            return cos_sim
        
        
        def get_cosine_of_frequencies(vec1, vec2):
            union = np.union1d(vec1, vec2)
            count1 = np.zeros(len(union))
            count2 = np.zeros(len(union))
            for elem in vec1:
                count1[np.where(union == elem)] += 1
            for elem in vec2:
                count2[np.where(union == elem)] += 1
            return [cosine_similarity(count1, count2)]
        
        
        def stemmatize(token):
            return self.stemmer.stem(token)
        
        
        def get_bag_of_stems(sent1, sent2):
            tokens1 = [word for word in word_tokenize(sent1) if not word in self.stop_words]
            tokens2 = [word for word in word_tokenize(sent2) if not word in self.stop_words]
            stems1 = [stemmatize(word) for word in tokens1]
            stems2 = [stemmatize(word) for word in tokens2]
            return get_cosine_of_frequencies(stems1, stems2)                
            
            
        preprocessed_x = preprocess(x)
        BoW = np.array([get_bag_of_stems(sent1, sent2) for sent1, sent2 in preprocessed_x])
        return BoW

model0 = BagStemsModel(train_data, train_labels)
model0.cross_validate()

0.6488292812545408

In [None]:
best: 0.6488292812545408

## Bigrams vector representation

In [506]:
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


class BiGramsModel(Model):
    
    
    def __init__(self, *kwargs):
        self.name = 'BiGramsModel'
        self.description = 'BiGramsModel'
        self.allowed = np.array(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '(', ')', '.', ' ', '!', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'])
        self.not_allowed_words = set(stopwords.words('english'))
        self.stemmer = SnowballStemmer("english")
        super().__init__(*kwargs)
    
        
    def _extract_features(self, x):
        def get_allowed_characters(sent):
            sent = sent.lower()
            tokens = [word for word in word_tokenize(sent) if not word in self.not_allowed_words]
            stems = [self.stemmer.stem(word) for word in tokens]
            new_sent = " ".join(stems)
            return "".join([char for char in new_sent if char in self.allowed])
        
        
        def preprocess(data):
            word_freq = {}
            processed_data = []
            for sent1, sent2 in data:
                allowed_chars1 = get_allowed_characters(sent1)
                allowed_chars2 = get_allowed_characters(sent2)
                processed_data.append([allowed_chars1, allowed_chars2])
            return processed_data
        
        
        def cosine_similarity(a, b):
            cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
            return cos_sim
        
        
        def get_vector_bigrams(sent):
            count = np.zeros(len(self.allowed)**2) # number of possible bigrams
            for i in range(len(sent)-1):
                idx_f = int(np.where(self.allowed == sent[i])[0])
                idx_s = int(np.where(self.allowed == sent[i+1])[0])
                count[idx_f*len(self.allowed) + idx_s] += 1
            return count
            
                
        def get_cos_of_bigrams(sent1, sent2):
            return [cosine_similarity(get_vector_bigrams(sent1), get_vector_bigrams(sent2))]                
            
            
        preprocessed_x = preprocess(x)
        BiGms = np.array([get_cos_of_bigrams(sent1, sent2) for sent1, sent2 in preprocessed_x])
        return BiGms

model0 = BiGramsModel(train_data, train_labels)
model0.cross_validate()
# model0.fit(train_data, train_labels)
# predicted_labels = model0.predict(test_data)
# model0.evaluate(test_labels, predicted_labels)

0.6637117020982714

In [455]:
best = 0.6637117020982714

## Synsets

In [488]:
import numpy as np
from functools import reduce
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer

class SynsetsLemmasModel(Model):
    
    
    def __init__(self, *kwargs):
        self.name = 'SynsetsLemmasModel'
        self.description = 'SynsetsLemmasModel'
        self.not_allowed_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        super().__init__(*kwargs)
    
        
    def _extract_features(self, x):
        
#         def get_allowed(sent):
#             sent = sent.lower()
#             tokens = [word for word in word_tokenize(sent) if not word in self.not_allowed_words]
# #             stems = [self.stemmer.stem(word) for word in tokens]
# #             new_sent = " ".join(stems)
#             return tokens
        
        
#         def preprocess(data):
            
#             # remove most frequent words
            
#             processed_data = []
#             for sent1, sent2 in data:
#                 processed_data.append([get_allowed(sent1), get_allowed(sent2)])
#             return processed_data
        
        def cosine_similarity(a, b):
            cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
            return cos_sim
        
        
        def get_cosine_of_frequencies(vec1, vec2):
            union = np.union1d(vec1, vec2)
            count1 = np.zeros(len(union))
            count2 = np.zeros(len(union))
            for elem in vec1:
                count1[np.where(union == elem)] += 1
            for elem in vec2:
                count2[np.where(union == elem)] += 1
            return [cosine_similarity(count1, count2)]
        
        
        def get_union_SynLemmas(word):
            synsets = wordnet.synsets(word)
            if len(synsets) > 0:
                return reduce(np.union1d, ([[str(lemma.name()) for lemma in synset.lemmas()] for synset in synsets]))
            return np.empty(0)
        
        
        def get_sentence_union_synsets(tokens):
            return reduce(np.union1d, ([get_union_SynLemmas(word) for word in tokens]))
        
        
        def get_union_synsets(sent1, sent2):
            synsets1 = get_sentence_union_synsets(word_tokenize(sent1))
            synsets2 = get_sentence_union_synsets(word_tokenize(sent2))
            return get_cosine_of_frequencies(synsets1, synsets2)
            
        
#         preprocessed_x = preprocess(x)
        BoW = np.array([get_union_synsets(sent1, sent2) for sent1, sent2 in x])
        return BoW

model0 = SynsetsLemmasModel(train_data, train_labels)
model0.cross_validate()

0.4066406523739613

In [None]:
0.4066406523739613

## Bigrams specific counts
Like in Bag of stems. 
This way we can try trigrams without the sparse vector problem.

In [504]:
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


class BiGramsSpecificModel(Model):
    
    
    def __init__(self, *kwargs):
        self.name = 'BiGramsSpecificModel'
        self.description = 'BiGramsSpecificModel'
        self.allowed = np.array(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '(', ')', '.', ' ', '!', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'])
        self.not_allowed_words = set(stopwords.words('english'))
        self.stemmer = SnowballStemmer("english")
        super().__init__(*kwargs)
    
        
    def _extract_features(self, x):        
        
        def get_cosine_of_frequencies(vec1, vec2):
            union = np.union1d(vec1, vec2)
            count1 = np.zeros(len(union))
            count2 = np.zeros(len(union))
            for elem in vec1:
                count1[np.where(union == elem)] += 1
            for elem in vec2:
                count2[np.where(union == elem)] += 1
            return [cosine_similarity(count1, count2)]
        
        
        def stemmatize(token):
            return self.stemmer.stem(token)
        
        
        def get_bag_of_stems(sent1, sent2):
            tokens1 = [word for word in word_tokenize(sent1) if not word in self.not_allowed_words]
            tokens2 = [word for word in word_tokenize(sent2) if not word in self.not_allowed_words]
            stems1 = [stemmatize(word) for word in tokens1]
            stems2 = [stemmatize(word) for word in tokens2]
            return get_cosine_of_frequencies(stems1, stems2)
        
        
        def get_allowed_characters(sent):
            sent = sent.lower()
            tokens = np.array([word for word in word_tokenize(sent) if not word in self.not_allowed_words])
            stems = [self.stemmer.stem(word) for word in tokens]
            new_sent = " ".join(stems)
            final_sent = "".join([char for char in new_sent if char in self.allowed])
            return final_sent
        
        
        def preprocess(data):
            processed_data = []
            for sent1, sent2 in data:
                allowed_chars1 = get_allowed_characters(sent1)
                allowed_chars2 = get_allowed_characters(sent2)
                processed_data.append([allowed_chars1, allowed_chars2])
            return processed_data
        
        
        def cosine_similarity(a, b):
            cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
            return cos_sim
        
        
        def get_vector_bigrams(sent):
            bigrams = []
            for i in range(len(sent)):
                bigrams.append(sent[i])
            return np.array(bigrams)
        
        
        def get_cosine_distance_of_count_of_bigrams(sent1, sent2):
            bigrams1 = get_vector_bigrams(sent1)
            bigrams2 = get_vector_bigrams(sent2)
            union = np.union1d(bigrams1, bigrams2)
            count1 = np.zeros(len(union))
            count2 = np.zeros(len(union))
            for elem in bigrams1:
                count1[np.where(union == elem)] += 1
            for elem in bigrams2:
                count2[np.where(union == elem)] += 1
            return [cosine_similarity(count1, count2)]
            
                
        def get_cosinus_of_bigrams(sent1, sent2):
            return get_cosine_distance_of_count_of_bigrams(sent1, sent2)                
            
            
        preprocessed_x = preprocess(x)
        nBiGms = np.array([get_cosinus_of_bigrams(sent1, sent2) + get_bag_of_stems(sent1, sent2) for sent1, sent2 in preprocessed_x])
        return nBiGms

model0 = BiGramsSpecificModel(train_data, train_labels)
model0.cross_validate()

0.6686812421906371

## Using CountVectorizer: full text, smal batchs, line by line...