# String-based similarity measures

In [56]:
from nltk.corpus import stopwords
import os
import tarfile
import glob, os, sys
import pathlib
from fnmatch import fnmatch
from pycorenlp import StanfordCoreNLP
from shutil import rmtree
import timeit
import numpy as np
import utils
import operator
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import ngram
import pandas as pd

In [111]:
# Shared accumulator for the correlation results; entries are keyed by
# (measure name, stop word list) and filled in by evaluate_similarities().
results = dict()

In [112]:
def evaluate_similarities(annotation_scores, similarities, stop_word_set, name=None):
    """Correlate computed similarities with the annotation scores.

    Pearson and Spearman correlations are obtained from
    ``utils.calculate_ppmc`` and ``utils.calculate_spearman``. When ``name``
    is truthy, the first element of each correlation result is rounded to
    three decimals and stored in the module-level ``results`` dictionary
    under the key ``(name, stop_word_set)`` as ``{'r': ..., 'rs': ...}``.

    Args:
        annotation_scores: Reference scores to correlate against.
        similarities: Computed similarity values.
        stop_word_set: Stop word list identifier; second part of the key.
        name: Measure name; first part of the key. Nothing is stored when
            it is falsy.

    Returns:
        The module-level ``results`` dictionary.
    """
    pearson = utils.calculate_ppmc(similarities, annotation_scores)
    spearman = utils.calculate_spearman(similarities, annotation_scores)
    if not name:
        return results
    entry = {'r': round(pearson[0], 3), 'rs': round(spearman[0], 3)}
    results[(name, stop_word_set)] = entry
    return results

def remove_stopwords(sent, stop_word_set):
    """Remove stop words and punctuation tokens from a sentence.

    The sentence is tokenized with NLTK's ``word_tokenize``. Tokens that are
    members of the selected stop word list or of the punctuation set are
    dropped (exact, case-sensitive membership), and the remaining tokens are
    joined with single spaces.

    Args:
        sent: Sentence to filter.
        stop_word_set: Which stop word list to apply: ``'ranks'``,
            ``'stanford_core'`` or ``'nltk'``.

    Returns:
        The filtered sentence as a single string.

    Raises:
        ValueError: If ``stop_word_set`` is not one of the supported names.
    """
    # Note: the original BIOSSES paper used the stop words from
    # https://www.ranks.nl/stopwords together with the punctuation
    # characters listed below.
    if stop_word_set == 'ranks':
        stop_words = set(ranks_stopwords)
    elif stop_word_set == 'stanford_core':
        stop_words = set(stanford_core_stopwords)
    elif stop_word_set == 'nltk':
        stop_words = set(stopwords.words('english'))
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            "Unknown stop_word_set %r; expected 'ranks', 'stanford_core' or 'nltk'"
            % (stop_word_set,)
        )
    # set() over a string yields one entry per punctuation character.
    punctuation_tokens = set('.,-!;/?:')
    kept_tokens = [
        token
        for token in word_tokenize(sent)
        if token not in stop_words and token not in punctuation_tokens
    ]
    return ' '.join(kept_tokens)

def calculate_string_similarity(strings, method, stop_word_set, N=3):
    """Calculate string similarities for interleaved sentence pairs.

    ``strings`` holds the pairs back to back: elements 0 and 1 form the first
    pair, elements 2 and 3 the second, and so on. Both sentences of a pair
    are passed through ``remove_stopwords`` before being compared.

    Args:
        strings: Sequence of sentences with an even number of elements.
        method: ``'jaccard'`` (``utils.dist_jaccard``) or ``'qgram'``
            (``utils.dist_qgram``).
        stop_word_set: Stop word list name forwarded to ``remove_stopwords``.
        N: Forwarded as ``N`` to ``utils.dist_qgram``; unused for
            ``'jaccard'``.

    Returns:
        A float ``numpy`` array with one value per sentence pair.

    Raises:
        ValueError: If ``method`` is not supported or ``strings`` has an odd
            number of elements.
    """
    # Validate up front: an unknown method used to yield an empty array
    # silently, and an odd-length input only failed at the very last pair.
    if method not in ('jaccard', 'qgram'):
        raise ValueError(
            "Unknown method %r; expected 'jaccard' or 'qgram'" % (method,)
        )
    if len(strings) % 2 != 0:
        raise ValueError(
            "Expected an even number of sentences (interleaved pairs), got %d"
            % len(strings)
        )

    similarities = []
    for start in range(0, len(strings), 2):
        first = remove_stopwords(strings[start], stop_word_set)
        second = remove_stopwords(strings[start + 1], stop_word_set)
        if method == 'jaccard':
            similarities.append(utils.dist_jaccard(first, second))
        else:
            similarities.append(utils.dist_qgram(first, second, N=N))
    return np.array(similarities, dtype=float)

## Files and directories

In [2]:
# Directory holding all input and output files. Set the BIOSSES_DATA_DIR
# environment variable to point somewhere else; the original location is
# used as the fallback.
DATA_DIR = os.environ.get(
    'BIOSSES_DATA_DIR',
    '/home/matthias/Documents/Intelligence/SDL1 - Embeddings/data/',
)

# Stop word lists, read from DATA_DIR
RANKS_STOPWORDS = 'ranks_stopwords.txt'
STANFORD_STOPWORDS = 'stanford_core_stopwords.txt'

# Scores assigned by the human experts for each of the 100 sentence pairs
ANNOTATION_SCORES = 'annotation_scores_from_github.txt'

# Original BIOSSES sentence pairs (no pre-processing)
BIOSSES_SENTENCE_PAIRS = 'biosses_sentence_pairs_test_derived_from_github.txt'

# Pre-processed sentences (lower-case, words and punctuation separated by
# whitespace), one sentence per line.
BIOSSES_SENTENCE_PAIRS_PREPROCESSED = 'biosses_sentence_pairs_test_derived_from_github_preprocessed.txt'

## Compare stop word lists

In [3]:
# Identifiers of the stop word lists that are compared against each other
stop_word_lists = ["ranks", "nltk", "stanford_core"]

In [4]:
# English stop word list shipped with NLTK
nltk_stopwords = stopwords.words('english')

# The two other lists are plain text files with one stop word per line.
# They are read explicitly as UTF-8 so the result does not depend on the
# platform's default encoding.
with open(os.path.join(DATA_DIR, RANKS_STOPWORDS), 'r', encoding='utf-8') as words:
    ranks_stopwords = [line.strip() for line in words]

with open(os.path.join(DATA_DIR, STANFORD_STOPWORDS), 'r', encoding='utf-8') as words:
    stanford_core_stopwords = [line.strip() for line in words]

print("Number of stopwords in NLTK list: ", len(nltk_stopwords))
print("Number of stopwords in RANKS list: ", len(ranks_stopwords))
print("Number of stopwords in Stanford Core list: ", len(stanford_core_stopwords))

Number of stopwords in NLTK list:  153
Number of stopwords in RANKS list:  174
Number of stopwords in Stanford Core list:  257


In [25]:
#set(nltk_stopwords).symmetric_difference(set(ranks_stopwords))
#set(nltk_stopwords)-(set(ranks_stopwords))

In [8]:
#set(ranks_stopwords)-(set(nltk_stopwords))

In [43]:
#print(nltk_stopwords)

## Annotation scores

In [5]:
# Load the scores that the human experts assigned to the 100 sentence pairs
# into a NumPy array.
scores_path = os.path.join(DATA_DIR, ANNOTATION_SCORES)
with open(scores_path, mode="r", encoding="utf-8") as scores:
    annotation_scores = np.loadtxt(scores)

#print(annotation_scores)

## Calculate Jaccard and Q-gram measures for BIOSSES sentences

### Pre-process sentences with StanfordCoreNLP

In [11]:
# Client for the Stanford CoreNLP server addressed at localhost:9000
nlp = StanfordCoreNLP('http://localhost:9000')


def _normalize_token(word):
    """Return the text written to the pre-processed file for one token.

    A token containing one of CoreNLP's bracket placeholders is replaced by
    the bracket character itself; in any other token every hyphen is
    surrounded by spaces.
    """
    bracket_placeholders = (
        ('-LRB-', '('),
        ('-RRB-', ')'),
        ('-LCB-', '{'),
        ('-RCB-', '}'),
        ('-LSB-', '['),
        ('-RSB-', ']'),
    )
    for placeholder, bracket in bracket_placeholders:
        if placeholder in word:
            return bracket
    return word.replace('-', ' - ')


first_sentence = True

# Both files are opened explicitly as UTF-8 so that reading and writing do
# not depend on the platform's default encoding.
with open(os.path.join(DATA_DIR, BIOSSES_SENTENCE_PAIRS), 'r', encoding='utf-8') as fin, \
     open(os.path.join(DATA_DIR, BIOSSES_SENTENCE_PAIRS_PREPROCESSED), 'w', encoding='utf-8') as fout:
    for line in fin:
        # Tokenize only; the JSON response carries the tokens under "tokens".
        res = nlp.annotate(line, properties={'annotators': 'tokenize', 'outputFormat': 'json', 'timeout': 10000})
        sentence = " ".join(_normalize_token(t["word"]) for t in res["tokens"])
        sentence = sentence.lower().strip()
        # Separate sentences with newlines without leaving a trailing newline
        # at the end of the file.
        if first_sentence:
            first_sentence = False
        else:
            sentence = "\n" + sentence
        fout.write(sentence)

In [12]:
# Read the pre-processed sentence pairs into a list, one stripped sentence per
# line. The file is read explicitly as UTF-8 so the result does not depend on
# the platform's default encoding.
with open(os.path.join(DATA_DIR, BIOSSES_SENTENCE_PAIRS_PREPROCESSED), 'r', encoding='utf-8') as BIOSSES_sentences_tokenized:
    strings = [line.strip() for line in BIOSSES_sentences_tokenized]

### Jaccard distance

In [113]:
# Evaluate the Jaccard measure once per stop word list; each call records its
# correlations in ``results`` under ('Jaccard', <stop word list>).
for stop_list in stop_word_lists:
    jaccard_similarities = calculate_string_similarity(
        strings, method='jaccard', stop_word_set=stop_list
    )
    evaluate_similarities(
        annotation_scores, jaccard_similarities, stop_word_set=stop_list, name='Jaccard'
    )

### Q-gram distance

In [114]:
# Evaluate the Q-gram measure once per stop word list; each call records its
# correlations in ``results`` under ('Qgram', <stop word list>).
for stop_list in stop_word_lists:
    qgram_similarities = calculate_string_similarity(
        strings, method='qgram', stop_word_set=stop_list
    )
    evaluate_similarities(
        annotation_scores, qgram_similarities, stop_word_set=stop_list, name='Qgram'
    )

## Results overview

In [123]:
# Build a table from ``results`` and transpose it so that every
# (measure, stop word list) key becomes a row.
string_based_measures_results_df = pd.DataFrame.from_dict(results).T
string_based_measures_results_df

Unnamed: 0,Unnamed: 1,r,rs
Jaccard,ranks,0.746,0.758
Jaccard,nltk,0.751,0.764
Jaccard,stanford_core,0.767,0.789
Qgram,ranks,0.72,0.763
Qgram,nltk,0.723,0.763
Qgram,stanford_core,0.727,0.769


In [125]:
# Write the results table as CSV into the data directory
results_csv_path = os.path.join(DATA_DIR, 'string_based_measures_results.csv')
string_based_measures_results_df.to_csv(results_csv_path)