# IHLT Final Project: Semantic Textual Similarity Project

# Data collection
We start by downloading the SemEval 2012 dataset.

In [None]:
%%time
!mkdir -p data
!wget https://gebakx.github.io/ihlt/sts/resources/train.tgz --directory-prefix=data
!wget https://gebakx.github.io/ihlt/sts/resources/test-gold.tgz --directory-prefix=data
%cd data
!tar zxvf train.tgz
!tar zxvf test-gold.tgz
%cd ..

## Corpus assembly
Train and test sets. The test set will not be used for learning or model selection.

In [1]:
%%time
import os
import numpy as np
train_files = ['MSRpar', 'MSRvid', 'SMTeuroparl']
train_data = []
train_labels = []
for file in train_files:
    with open(os.path.join('data', 'train', 'STS.input.' + file + '.txt'), 'r') as f:
        train_data += [sent.split('\t') for sent in f.readlines()]
    with open(os.path.join('data', 'train', 'STS.gs.' + file + '.txt'), 'r') as f:
        train_labels += [float(num) for num in f.readlines()]
train_data = np.array(train_data)
train_labels = np.array(train_labels)

test_files = ['MSRpar', 'MSRvid', 'SMTeuroparl', 'surprise.OnWN', 'surprise.SMTnews']
test_data = []
test_labels = []
for file in test_files:
    with open(os.path.join('data', 'test-gold', 'STS.input.' + file + '.txt'), 'r') as f:
        test_data += [sent.split('\t') for sent in f.readlines()]
    with open(os.path.join('data', 'test-gold', 'STS.gs.'+ file + '.txt'), 'r') as f:
        test_labels += [float(num) for num in f.readlines()]
test_data = np.array(test_data)
test_labels = np.array(test_labels)

CPU times: user 223 ms, sys: 164 ms, total: 387 ms
Wall time: 129 ms


## Alternative 1: Classical NLP and machine learning

### Preprocessing and tagging

In [238]:
%%time
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from nltk import ne_chunk
from nltk.stem import PorterStemmer

stopwords_set = set(stopwords.words('english')) 

def preprocess(X):
    def is_number(s):
        try:
            x = float(s)
            return True
        except ValueError:
            return False

    def characters_not_punct(token):
        for c in token:
            if c in punctuation:
                return False
        return True
    
    def lemmatize(token, pos):
        if pos in {'N','V'}:
            return wnl.lemmatize(token.lower(), pos.lower())
        return token.lower()
    
    def stemmatize(token):
            return PorterStemmer().stem(token)

    def nltk_pos_to_wordnet_pos(nltk_pos):
        mapping = {'NN': wn.NOUN, 'JJ': wn.ADJ, 'VB': wn.VERB, 'RB': wn.ADV}
        if nltk_pos in mapping:
            return mapping[nltk_pos]
        else:
            return None
    
    def get_synset(lemma, pos):
        wordnet_pos = nltk_pos_to_wordnet_pos(pos[1])
        if wordnet_pos is not None:
            word_synsets = wn.synsets(lemma, wordnet_pos)
            if len(word_synsets) > 0:
                most_freq_synset = word_synsets[0] # The most frequent synset is the first one
                return most_freq_synset
        return None
    
    def get_nes(pos_tags):
        nes = ne_chunk(pos_tags, binary=False)
        nes_map = []
        for tree_element in nes:
            if type(tree_element) == nltk.tree.Tree:
                for element in tree_element:
                        nes_map.append(tree_element.label())
            else:
                nes_map.append(None)
        return nes_map
                
    def preprocess_sentence(sent):
        tokens = word_tokenize(sent)
        pos_tags = pos_tag(tokens)
        clean_tokens = []
        synsets = set([])
        chars = ''.join([c for c in sent if c not in punctuation + ' '])
        nes_map = get_nes(pos_tags)
        for token, pos, ne in zip(tokens, pos_tags, nes_map):
#             token = token.lower()
#             if ne is not None:
#                 token = ne
#                 clean_tokens.append(token)
#             lemma = lemmatize(token, pos)
#             clean_tokens.append(lemma)
            stem = stemmatize(token)
            clean_tokens.append(stem)
#             synset = get_synset(lemma, pos)
#             if synset is not None:
#                 synsets.add(synset)
        return clean_tokens, synsets, chars

    clean_tokens = []
    synsets = []
    chars = []
    nes = []
    for sent1, sent2 in X:
        tok1, syn1, ch1 = preprocess_sentence(sent1)
        tok2, syn2, ch2 = preprocess_sentence(sent2)
        clean_tokens.append((tok1, tok2))
        synsets.append((syn1, syn2))
        chars.append((ch1, ch2))
        
    return clean_tokens, synsets, chars

preprocessed_tokens_train, synsets_train, chars_train = preprocess(train_data)
preprocessed_tokens_test, synsets_test, chars_test = preprocess(test_data)

CPU times: user 49 s, sys: 120 ms, total: 49.1 s
Wall time: 49.1 s


### Feature extraction: Text representation and distances

In [240]:
%%time
from nltk.metrics.distance import edit_distance
from nltk.metrics import jaccard_distance
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import pandas as pd
from collections import OrderedDict


def cosine_similarity(a, b):
    cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cos_sim

def overlap(A, B):
    return len(A.intersection(B))/(min(len(A), len(B)))

def set_kernel_distance(a, b):
    return np.exp2(np.dot(a, b.T))

def get_features(preprocessed_tokens, synsets, chars):
    
    def build_bow(sequences, method, chars=False, ngram_range=(1,1)):
        assert method in ['bow', 'tf', 'tf_idf']
        if not chars:
            corpus = [' '.join(tokens1) + ' '.join(tokens2) for tokens1, tokens2 in sequences]
            analyzer = 'word'
        else:
            corpus = [' '.join(list(chars1)) + ' '.join(list(chars2)) for chars1, chars2 in sequences]
            analyzer = 'char'
        if method == 'bow':
            cv = CountVectorizer(binary=True, analyzer=analyzer, ngram_range=ngram_range)
        elif method == 'tf':
            cv = CountVectorizer(binary=False, analyzer=analyzer, ngram_range=ngram_range)
        else:
            cv = TfidfVectorizer()
        cv.fit(corpus)
        return cv
    bow_tokens = build_bow(preprocessed_tokens, 'bow')
    tf_tokens = build_bow(preprocessed_tokens, 'tf')
    tf_idf_tokens = build_bow(preprocessed_tokens, 'tf_idf')
    bow_chars = build_bow(chars, 'bow', chars=True)
    tf_chars = build_bow(chars, 'tf', chars=True)
    tf_idf_chars = build_bow(chars, 'tf_idf', chars=True)
    bow_bigrams = build_bow(preprocessed_tokens, 'bow', (2, 2))
    tf_bigrams = build_bow(preprocessed_tokens, 'tf', (2, 2))
    tf_idf_bigrams = build_bow(preprocessed_tokens, 'tf_idf', (2, 2))
    bow_trigrams = build_bow(preprocessed_tokens, 'bow', (3, 3))
    tf_trigrams = build_bow(preprocessed_tokens, 'tf', (3, 3))
    tf_idf_trigrams = build_bow(preprocessed_tokens, 'tf_idf', (3, 3))
    def feature_encoder(tokens, synsets, chars,
                        bow_tokens=bow_tokens, tf_tokens=tf_tokens, tf_idf_tokens=tf_idf_tokens,
                        bow_chars=bow_chars, tf_chars=tf_chars, tf_idf_chars=tf_idf_chars):
                toks1, toks2 = tokens
                syns1, syns2 = synsets
                chars1, chars2 = chars
                
                # encoding
                encoded_bow_tokens = np.concatenate((bow_tokens.transform([' '.join(toks1)]).toarray(),
                                                     bow_tokens.transform([' '.join(toks2)]).toarray())).flatten()
                encoded_tf_tokens = np.concatenate((tf_tokens.transform([' '.join(toks1)]).toarray(),
                                                     tf_tokens.transform([' '.join(toks2)]).toarray())).flatten()
                encoded_tf_idf_tokens = np.concatenate((tf_idf_tokens.transform([' '.join(toks1)]).toarray(),
                                                     tf_idf_tokens.transform([' '.join(toks2)]).toarray())).flatten()
                encoded_bow_chars = np.concatenate((bow_chars.transform([' '.join(chars1)]).toarray(),
                                                     bow_chars.transform([' '.join(chars2)]).toarray())).flatten()
                encoded_tf_chars = np.concatenate((tf_chars.transform([' '.join(chars1)]).toarray(),
                                                     tf_chars.transform([' '.join(chars2)]).toarray())).flatten()
                encoded_tf_idf_chars = np.concatenate((tf_idf_chars.transform([' '.join(chars1)]).toarray(),
                                                     tf_idf_chars.transform([' '.join(chars2)]).toarray())).flatten()
                encoded_bow_bigrams = np.concatenate((bow_chars.transform([' '.join(toks1)]).toarray(),
                                                     bow_chars.transform([' '.join(toks2)]).toarray())).flatten()
                encoded_tf_bigrams = np.concatenate((tf_chars.transform([' '.join(toks1)]).toarray(),
                                                     tf_chars.transform([' '.join(toks2)]).toarray())).flatten()
                encoded_tf_idf_bigrams = np.concatenate((tf_idf_chars.transform([' '.join(toks1)]).toarray(),
                                                     tf_idf_chars.transform([' '.join(toks2)]).toarray())).flatten()
                encoded_bow_trigrams = np.concatenate((bow_chars.transform([' '.join(toks1)]).toarray(),
                                                     bow_chars.transform([' '.join(toks2)]).toarray())).flatten()
                encoded_tf_trigrams = np.concatenate((tf_chars.transform([' '.join(toks1)]).toarray(),
                                                     tf_chars.transform([' '.join(toks2)]).toarray())).flatten()
                encoded_tf_idf_trigrams = np.concatenate((tf_idf_chars.transform([' '.join(toks1)]).toarray(),
                                                     tf_idf_chars.transform([' '.join(toks2)]).toarray())).flatten()
                                                     
                # distances
                edit_dist = edit_distance(chars1, chars2)
                set_dist_encoded_bow_tokens = set_kernel_distance(
                    encoded_bow_tokens[:encoded_bow_tokens.shape[0]//2],
                    encoded_bow_tokens[encoded_bow_tokens.shape[0]//2:])
                cos_encoded_tf_tokens = cosine_similarity(
                    encoded_tf_tokens[:encoded_tf_tokens.shape[0]//2],
                    encoded_tf_tokens[encoded_tf_tokens.shape[0]//2:])
                cos_encoded_tf_idf_tokens = cosine_similarity(
                    encoded_tf_idf_tokens[:encoded_tf_idf_tokens.shape[0]//2],
                    encoded_tf_idf_tokens[encoded_tf_idf_tokens.shape[0]//2:])
                cos_encoded_tf_chars = cosine_similarity(
                    encoded_tf_chars[:encoded_tf_chars.shape[0]//2],
                    encoded_tf_chars[encoded_tf_chars.shape[0]//2:])
                set_dist_encoded_bow_bigrams = set_kernel_distance(
                    encoded_bow_bigrams[:encoded_bow_bigrams.shape[0]//2],
                    encoded_bow_bigrams[encoded_bow_bigrams.shape[0]//2:])
                cos_encoded_tf_bigrams = cosine_similarity(
                    encoded_tf_bigrams[:encoded_tf_bigrams.shape[0]//2],
                    encoded_tf_bigrams[encoded_tf_bigrams.shape[0]//2:])
                set_dist_encoded_bow_trigrams = set_kernel_distance(
                    encoded_bow_trigrams[:encoded_bow_trigrams.shape[0]//2],
                    encoded_bow_trigrams[encoded_bow_trigrams.shape[0]//2:])
                cos_encoded_tf_trigrams = cosine_similarity(
                    encoded_tf_trigrams[:encoded_tf_trigrams.shape[0]//2],
                    encoded_tf_trigrams[encoded_tf_trigrams.shape[0]//2:])
                
                                                      
                toks1_set, toks2_set = set(toks1), set(toks2)
                if len(toks1_set) > 0 and len(toks2_set) > 0:
                    tok_jaccard = jaccard_distance(set(toks1), set(toks2))
                    tok_overlap = overlap(set(toks1), set(toks2))
                else:
                    tok_jaccard = 1
                    tok_overlap = 1
                if len(syns1) > 0 and len(syns2) > 0:
                    syn_jaccard = jaccard_distance(syns1, syns2)
                    syn_overlap = overlap(syns1, syns2)
                else:
                    syn_jaccard = 1
                    syn_overlap = 1
                if len(chars1) > 0 and len(chars2) > 0:
                    chars_jaccard = jaccard_distance(set(list(chars1)), set(list(chars2)))
                    chars_overlap = overlap(set(list(chars1)), set(list(chars2)))
                else:
                    chars_jaccard = 1
                    chars_overlap = 1
                return OrderedDict(edit_dist=edit_dist, set_dist_encoded_bow_tokens=set_dist_encoded_bow_tokens,
                           cos_encoded_tf_idf_tokens=cos_encoded_tf_idf_tokens,
                           cos_encoded_tf_chars=cos_encoded_tf_chars,
                           set_dist_encoded_bow_bigrams=set_dist_encoded_bow_bigrams,
                           cos_encoded_tf_bigrams=cos_encoded_tf_bigrams,
                           set_dist_encoded_bow_trigrams=set_dist_encoded_bow_trigrams,
                           cos_encoded_tf_trigrams=cos_encoded_tf_trigrams,
                           tok_jaccard=tok_jaccard,
                           tok_overlap=tok_overlap,    
                           syn_jaccard=syn_jaccard,
                           syn_overlap=syn_overlap,
                           chars_jaccard=chars_jaccard,
                           chars_overlap=chars_overlap)                      
                
    features = []
    for toks, syns, ch in zip(preprocessed_tokens, synsets, chars):
        feat = feature_encoder(toks, syns, ch)
        features.append(feat)
    mat_features = pd.DataFrame(features).values
    std_scaler = StandardScaler().fit(mat_features)
    def scale(features_row, scaler=std_scaler):
        scaled = scaler.transform(pd.DataFrame([features_row]).values)
        scaled_features_row = OrderedDict(zip(features_row.keys(), scaled[0]))
        return scaled_features_row
    scaled_feat = []
    for feat in features:
        scaled_feat.append(scale(feat))
    return scaled_feat, feature_encoder, scale

features_train, feature_encoder, scaler = get_features(preprocessed_tokens_train, synsets_train, chars_train)
features_test = []
for toks, syns, ch in zip(preprocessed_tokens_test, synsets_test, chars_test):
    feat = feature_encoder(toks, syns, ch)
    features_test.append(feat)
scaled_feat = []
for feat in features_test:
    scaled_feat.append(scaler(feat))

  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


CPU times: user 54.5 s, sys: 15.7 ms, total: 54.5 s
Wall time: 54.5 s


In [241]:
import math
features_test = []
for feat in scaled_feat:
    new_d = {}
    for key in feat:
        if not math.isnan(feat[key]):
            new_d[key] = feat[key]
        else:
            new_d[key] = 0
    features_test.append(new_d)

In [242]:
features_test[0]

{'chars_jaccard': 0.25559013584448587,
 'chars_overlap': -0.5072534499650403,
 'cos_encoded_tf_bigrams': 0.47702630078606734,
 'cos_encoded_tf_chars': 0.32444359152442065,
 'cos_encoded_tf_idf_tokens': 0.6147071247428714,
 'cos_encoded_tf_trigrams': 0.47702630078606734,
 'edit_dist': -0.017885558102357722,
 'set_dist_encoded_bow_bigrams': -0.11286957047122254,
 'set_dist_encoded_bow_tokens': -0.06002446431769902,
 'set_dist_encoded_bow_trigrams': -0.11286957047122254,
 'syn_jaccard': 0.0,
 'syn_overlap': 0.0,
 'tok_jaccard': 0.4972678363433251,
 'tok_overlap': -0.3372444337413412}

In [243]:
import pickle
from sklearn.linear_model import LinearRegression
import numpy as np
from scipy import stats
import sklearn

class Model:
    def __init__(self,  x, y, regr=LinearRegression(),):
        self.regr = regr
        self.x_features = self._extract_features(x)
        self.y = y
        self.name = None
        self.description = None
    
    
    def save(self):
        pickle.dump(self, open(self.name + '.model', 'wb').write())
    
    @classmethod
    def load(cls, name):
        return pickle.load(open(self.name + '.model', 'rb').read())
        
    
    def _extract_features(self, x):
        raise NotImplementedError 
    
    def fit(self, x, y):
        self.x_features = self._extract_features(x)
        self.y = y
        self.regr.fit(self.x_features, self.y)
    
    
    def predict(self, new_x):
        new_x_features = self._extract_features(new_x)
        return self.regr.predict(new_x_features)
        
        
    def evaluate(self, true_labels, predicted_labels):
        pearson, p_value = stats.pearsonr(true_labels, predicted_labels)
        return pearson, p_value
    
    
    def cross_validate(self, n_folds=5, seed=1):
        assert self.x_features is not None
        kf = sklearn.model_selection.KFold(n_splits=n_folds, random_state=seed)
        average_pearson = 0
        for train_index, val_index in kf.split(self.x_features):
            X_train, X_val = self.x_features[train_index], self.x_features[val_index]
            y_train, y_val = self.y[train_index], self.y[val_index]
            self.regr.fit(X_train, y_train)
            predicted_labels = self.regr.predict(X_val)
            pearson, _ = self.evaluate(y_val, predicted_labels)
            average_pearson += abs(pearson)
        return average_pearson/n_folds

In [244]:
from sklearn.ensemble import RandomForestRegressor

class RandomForestModel(Model):
    def __init__(self, *kwargs):
        self.name = 'RandomForestModel'
        self.description = 'RandomForestModel'    
        super().__init__(*kwargs)
        self.regr = RandomForestRegressor(n_estimators=100, max_depth=8, random_state=1)
        
    def _extract_features(self, x):
        return x

randomForestModel = RandomForestModel(pd.DataFrame(features_train).values, train_labels)
# randomForestModel.cross_validate()

randomForestModel.fit(pd.DataFrame(features_train).values, train_labels)
pred = randomForestModel.predict(pd.DataFrame(features_test).values)
randomForestModel.evaluate(test_labels, pred)

(0.7432510799933818, 0.0)

In [245]:
from sklearn.ensemble import GradientBoostingRegressor
class GradientBoostingModel(Model):
    def __init__(self, *kwargs):
        self.name = 'GradientBoostingModel'
        self.description = 'GradientBoostingModel'    
        super().__init__(*kwargs)
        self.regr = GradientBoostingRegressor(alpha=0.95,
                                n_estimators=55, max_depth=4,
                                learning_rate=.101, min_samples_leaf=9,
                                min_samples_split=4, random_state=40)
        
    def _extract_features(self, x):
        return x

gradientBoostingModel = GradientBoostingModel(pd.DataFrame(features).values, train_labels)
# gradientBoostingModel.cross_validate()
gradientBoostingModel.fit(pd.DataFrame(features_train).values, train_labels)
pred = gradientBoostingModel.predict(pd.DataFrame(features_test).values)
gradientBoostingModel.evaluate(test_labels, pred)

(0.7461038362794891, 0.0)

In [246]:
from sklearn.svm import SVR

class SVMModel(Model):
    def __init__(self, *kwargs):
        self.name = 'SVMModel'
        self.description = 'SVMModel'    
        super().__init__(*kwargs)
        self.regr = SVR(C=1.4, epsilon=0.0, gamma='scale')
        
    def _extract_features(self, x):
        return x

sVMModel = SVMModel(pd.DataFrame(features).values, train_labels)
# sVMModel.cross_validate()
sVMModel.fit(pd.DataFrame(features_train).values, train_labels)
pred = sVMModel.predict(pd.DataFrame(features_test).values)
sVMModel.evaluate(test_labels, pred)

(0.7379090863984689, 0.0)