# IHLT Final Project: Semantic Textual Similarity Project

# Data collection
We start by downloading the SemEval-2012 Semantic Textual Similarity (STS) dataset: the training archive and the gold-standard test archive.

In [None]:
!mkdir -p data
!wget https://gebakx.github.io/ihlt/sts/resources/train.tgz --directory-prefix=data
!wget https://gebakx.github.io/ihlt/sts/resources/test-gold.tgz --directory-prefix=data
%cd data
!tar zxvf train.tgz
!tar zxvf test-gold.tgz
%cd ..

## Corpus assembly
We load the training and test sets (sentence pairs and their gold similarity scores). The test set is held out: it will not be used for learning or model selection.

In [1]:
import os
import numpy as np
def load_sts_split(split_dir, dataset_names):
    """Load sentence pairs and gold scores for one split of the STS data.

    Parameters
    ----------
    split_dir : str
        Sub-directory of ``data`` holding the split (``'train'`` or ``'test-gold'``).
    dataset_names : list of str
        Dataset identifiers; for each one the files ``STS.input.<name>.txt``
        and ``STS.gs.<name>.txt`` are read.

    Returns
    -------
    pairs : np.ndarray
        One row per line of the input files, split on tabs.
    scores : np.ndarray
        One float per line of the gold-standard files.
    """
    pairs, scores = [], []
    for name in dataset_names:
        input_path = os.path.join('data', split_dir, 'STS.input.' + name + '.txt')
        gold_path = os.path.join('data', split_dir, 'STS.gs.' + name + '.txt')
        with open(input_path, 'r', encoding='utf-8') as f:
            # Strip the line terminator first; otherwise the second sentence
            # of every pair keeps a trailing '\n'.
            pairs += [line.rstrip('\r\n').split('\t') for line in f]
        with open(gold_path, 'r', encoding='utf-8') as f:
            scores += [float(line) for line in f]
    # Every sentence pair must have exactly one gold score.
    assert len(pairs) == len(scores), 'sentence pairs and gold scores are misaligned'
    return np.array(pairs), np.array(scores)


train_files = ['MSRpar', 'MSRvid', 'SMTeuroparl']
train_data, train_labels = load_sts_split('train', train_files)

test_files = ['MSRpar', 'MSRvid', 'SMTeuroparl', 'surprise.OnWN', 'surprise.SMTnews']
test_data, test_labels = load_sts_split('test-gold', test_files)

## Alternative 1: Classical NLP and machine learning

### Preprocessing and tagging

In [15]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from nltk import ne_chunk

# English stopword list from NLTK, stored as a set for O(1) membership tests
# inside `preprocess`.
stopwords_set = set(stopwords.words('english'))

def preprocess(X):
    """Tokenize, clean and annotate every sentence pair in ``X``.

    For each sentence the function produces:

    * a list of clean tokens: named-entity tokens are replaced by their entity
      label, stopwords and tokens containing punctuation are dropped, numbers
      become ``'IS_NUMBER'``, and the remaining tokens are lower-cased (nouns
      and verbs are additionally lemmatized);
    * the set of first-listed WordNet synsets of the kept tokens;
    * the sentence as a string with punctuation and whitespace removed.

    Parameters
    ----------
    X : iterable of (str, str)
        Sentence pairs.

    Returns
    -------
    clean_tokens : list of (list of str, list of str)
    synsets : list of (set, set)
    chars : list of (str, str)
        All three lists are aligned with ``X``.
    """
    lemmatizer = WordNetLemmatizer()
    # Penn Treebank tags are matched on their two-letter prefix so that
    # inflected variants (NNS, VBD, JJR, RBS, ...) are covered as well.
    penn_prefix_to_wordnet = {'NN': wn.NOUN, 'JJ': wn.ADJ, 'VB': wn.VERB, 'RB': wn.ADV}

    def is_number(s):
        """Return True when ``s`` can be parsed by ``float``."""
        try:
            float(s)
        except ValueError:
            return False
        return True

    def characters_not_punct(token):
        """Return True when no character of ``token`` is punctuation."""
        return not any(c in punctuation for c in token)

    def lemmatize(token, tag):
        """Lower-case ``token``; lemmatize it when ``tag`` marks a noun or verb."""
        word = token.lower()
        if tag[:1] in {'N', 'V'}:
            # WordNetLemmatizer expects 'n' / 'v' as the POS argument.
            return lemmatizer.lemmatize(word, tag[0].lower())
        return word

    def nltk_pos_to_wordnet_pos(nltk_pos):
        """Map a Penn Treebank tag to a WordNet POS constant, or None."""
        return penn_prefix_to_wordnet.get(nltk_pos[:2])

    def get_synset(lemma, tag):
        """Return the first synset WordNet lists for ``lemma``, or None."""
        wordnet_pos = nltk_pos_to_wordnet_pos(tag)
        if wordnet_pos is None:
            return None
        word_synsets = wn.synsets(lemma, wordnet_pos)
        # The most frequent synset is the first one.
        return word_synsets[0] if word_synsets else None

    def get_nes(pos_tags):
        """Return one entry per token: its entity label, or None outside entities."""
        nes_map = []
        for node in ne_chunk(pos_tags, binary=False):
            if isinstance(node, nltk.tree.Tree):
                # An entity subtree spans one or more tokens; repeat its label
                # for each so the result stays aligned with the token list.
                nes_map.extend([node.label()] * len(node))
            else:
                nes_map.append(None)
        return nes_map

    def preprocess_sentence(sent):
        tokens = word_tokenize(sent)
        pos_tags = pos_tag(tokens)
        # Drop punctuation and every kind of whitespace (including a trailing
        # newline left over from reading the file line by line).
        chars = ''.join(c for c in sent if c not in punctuation and not c.isspace())
        clean_tokens = []
        synsets = set()
        # `pos_tags` holds (token, tag) pairs, so unpack the tag explicitly.
        for (token, tag), ne in zip(pos_tags, get_nes(pos_tags)):
            if ne is not None:
                # Entity tokens are represented by their label only; skip the
                # regular pipeline so the label is not added a second time.
                clean_tokens.append(ne)
                continue
            # The NLTK stopword list is lower-case, so compare case-insensitively.
            if token.lower() in stopwords_set or not characters_not_punct(token):
                continue
            if is_number(token):
                clean_tokens.append('IS_NUMBER')
                continue
            lemma = lemmatize(token, tag)
            clean_tokens.append(lemma)
            synset = get_synset(lemma, tag)
            if synset is not None:
                synsets.add(synset)
        return clean_tokens, synsets, chars

    clean_tokens = []
    synsets = []
    chars = []
    for sent1, sent2 in X:
        tok1, syn1, ch1 = preprocess_sentence(sent1)
        tok2, syn2, ch2 = preprocess_sentence(sent2)
        clean_tokens.append((tok1, tok2))
        synsets.append((syn1, syn2))
        chars.append((ch1, ch2))

    return clean_tokens, synsets, chars

# Run the preprocessing pipeline on the training pairs; the three results are
# lists aligned with `train_data` (one tuple per sentence pair).
preprocessed_tokens_train, synsets_train, chars_train = preprocess(train_data)

### Feature extraction: Text representation and distances

In [21]:
from nltk.metrics.distance import edit_distance
from nltk.metrics import jaccard_distance
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length.

    Returns
    -------
    float
        ``dot(a, b) / (||a|| * ||b||)``. When either vector has zero norm the
        cosine is undefined; ``0.0`` is returned instead of NaN so the value
        can be used directly as a feature.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

def get_features(preprocessed_tokens, synsets, chars):
    """Fit the text vectorizers on a corpus and extract features for every pair.

    Parameters
    ----------
    preprocessed_tokens : sequence of (list of str, list of str)
        Clean tokens of both sentences of each pair.
    synsets : sequence of (set, set)
        WordNet synsets of both sentences of each pair.
    chars : sequence of (str, str)
        Character strings of both sentences of each pair.

    Returns
    -------
    features : list of dict
        One feature dictionary per pair (see ``feature_encoder``).
    feature_encoder : callable
        ``feature_encoder(tokens, synsets, chars)`` computes the same features
        for a single pair using the vectorizers fitted here, so it can be
        applied to unseen data.
    """

    def build_bow(sequences, method, chars=False):
        """Fit a binary ('bow'), count ('tf') or TF-IDF ('tf_idf') vectorizer."""
        assert method in ['bow', 'tf', 'tf_idf']
        if not chars:
            # Each sentence is its own document, matching how sentences are
            # encoded later. Joining both sentences of a pair without a
            # separator would fuse two tokens into a fake one.
            corpus = [' '.join(tokens) for pair in sequences for tokens in pair]
            analyzer = 'word'
        else:
            corpus = [text for pair in sequences for text in pair]
            analyzer = 'char'
        if method == 'bow':
            cv = CountVectorizer(binary=True, analyzer=analyzer)
        elif method == 'tf':
            cv = CountVectorizer(binary=False, analyzer=analyzer)
        else:
            # Pass the analyzer here too, otherwise the character TF-IDF
            # silently falls back to word-level tokenization.
            cv = TfidfVectorizer(analyzer=analyzer)
        cv.fit(corpus)
        return cv

    def encode(vectorizer, document):
        """Encode one document as a dense 1-D vector."""
        # `transform` expects an iterable of documents, so wrap the single
        # document in a list and take the only row of the result.
        return vectorizer.transform([document]).toarray()[0]

    def safe_cosine(u, v):
        """Cosine similarity that yields 0.0 when either vector is all zeros."""
        if not u.any() or not v.any():
            return 0.0
        return cosine_similarity(u, v)

    def safe_jaccard(set1, set2):
        """Jaccard distance, defined as 1 when either set is empty."""
        if set1 and set2:
            return jaccard_distance(set1, set2)
        return 1

    bow_tokens = build_bow(preprocessed_tokens, 'bow')
    tf_tokens = build_bow(preprocessed_tokens, 'tf')
    tf_idf_tokens = build_bow(preprocessed_tokens, 'tf_idf')
    bow_chars = build_bow(chars, 'bow', chars=True)
    tf_chars = build_bow(chars, 'tf', chars=True)
    tf_idf_chars = build_bow(chars, 'tf_idf', chars=True)

    def feature_encoder(tokens, synsets, chars,
                        bow_tokens=bow_tokens, tf_tokens=tf_tokens, tf_idf_tokens=tf_idf_tokens,
                        bow_chars=bow_chars, tf_chars=tf_chars, tf_idf_chars=tf_idf_chars):
        """Compute the feature dictionary of a single sentence pair.

        ``encoded_*`` entries hold the two sentence vectors concatenated;
        ``cos_encoded_*`` entries hold the cosine similarity between them.
        The fitted vectorizers are bound as default arguments so the returned
        encoder is self-contained.
        """
        toks1, toks2 = tokens
        syns1, syns2 = synsets
        chars1, chars2 = chars
        text1, text2 = ' '.join(toks1), ' '.join(toks2)

        # name -> (vectorizer, first document, second document)
        representations = {
            'bow_tokens': (bow_tokens, text1, text2),
            'tf_tokens': (tf_tokens, text1, text2),
            'tf_idf_tokens': (tf_idf_tokens, text1, text2),
            'bow_chars': (bow_chars, chars1, chars2),
            'tf_chars': (tf_chars, chars1, chars2),
            'tf_idf_chars': (tf_idf_chars, chars1, chars2),
        }
        encoded = {}
        cosines = {}
        for name, (vectorizer, doc1, doc2) in representations.items():
            vec1 = encode(vectorizer, doc1)
            vec2 = encode(vectorizer, doc2)
            encoded['encoded_' + name] = np.concatenate((vec1, vec2))
            cosines['cos_encoded_' + name] = safe_cosine(vec1, vec2)

        result = dict(encoded)
        result['edit_dist'] = edit_distance(chars1, chars2)
        result.update(cosines)
        result['tok_jaccard'] = safe_jaccard(set(toks1), set(toks2))
        result['syn_jaccard'] = safe_jaccard(syns1, syns2)
        result['chars_jaccard'] = safe_jaccard(set(chars1), set(chars2))
        return result

    features = [
        feature_encoder(token_pair, synset_pair, char_pair)
        for token_pair, synset_pair, char_pair in zip(preprocessed_tokens, synsets, chars)
    ]

    return features, feature_encoder