# IHLT Project: Semantic Textual Similarity

# Imports

In [None]:
import pandas as pd
import numpy as np
import glob
from nltk import ne_chunk, word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
import nltk
import spacy
import string
from nltk.metrics import jaccard_distance
from nltk.corpus import wordnet_ic
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from scipy.stats import pearsonr
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import TrigramCollocationFinder
# Download the NLTK resources used by the cells below (corpora, tokenizer
# model, POS tagger, named-entity chunker).
nltk.download('wordnet_ic')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('treebank')
# Information-content tables for the IC-based WordNet similarities.
# semcor_ic is the one passed to lin_similarity / res_similarity below;
# brown_ic is loaded but not referenced in the visible part of the notebook.
brown_ic = wordnet_ic.ic('ic-brown.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')

[nltk_data] Downloading package wordnet_ic to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet_ic.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


In [None]:
import sklearn 
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression

In [None]:
# Mount Google Drive so that the dataset paths under /content/drive resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Functions
First we developed functions to calculate the features of sentences learned on the previous labs. In the beginning we created a function which creates a dataframe from sentences that are in the training and testing datasets. Then we wrote functions that read pairs of sentences and perform different featurizing algorithms. The features we used are:
- tokenizing
- deleting stopwords
- lemmatizing
- getting part-of-speech tags
- name entity chunking
- Lesk's algorithm
- extracting synsets
- extracting n-grams




Function that creates a dataframe from the STS datasets:

In [None]:
def create_df(files, gs):
  """Build a DataFrame of sentence pairs and gold-standard scores.

  Both path lists are sorted so that the i-th input file is paired with the
  i-th gold-standard file.

  Args:
    files: paths of the input files; every line holds two tab-separated
      sentences.
    gs: paths of the gold-standard files; the first tab-separated field of
      every line is the score. Files whose *file name* contains 'ALL' are
      ignored.

  Returns:
    A DataFrame with columns 's1', 's2' and 'gs'. All three hold strings
    (scores are not converted to float here).

  Raises:
    IndexError: if an input line has fewer than two tab-separated fields.
    ValueError: if the number of sentence pairs and scores differ.
  """
  import os

  input_paths = sorted(files)
  # The aggregated 'gs.ALL' file has no matching input file, so it is left
  # out. Only the file name is inspected: a directory that merely contains
  # 'ALL' in its path must not cause a real gold-standard file to be dropped.
  gold_paths = sorted(p for p in gs if 'ALL' not in os.path.basename(p))

  first_sentences = []
  second_sentences = []
  scores = []

  for path in input_paths:
    with open(path, encoding="utf8") as txt:
      for line in txt:
        fields = line.strip().split('\t')
        first_sentences.append(fields[0])
        second_sentences.append(fields[1])

  for path in gold_paths:
    with open(path, encoding="utf8") as txt:
      for line in txt:
        scores.append(line.strip().split('\t')[0])

  return pd.DataFrame({'s1': first_sentences,
                       's2': second_sentences,
                       'gs': scores})

Function that returns set of words from a sentence, followed by function which creates a list of tokenized sentences from dataframe:

In [None]:
def tokenize(sentence):
  """Return the set of distinct lowercase tokens of `sentence`.

  The text is split into sentences, each sentence is lowercased and
  word-tokenized, and tokens found in `string.punctuation` are dropped.
  An empty string yields an empty set.

  Because the result is a set, token order and repetitions are lost.
  """
  tokens = set()
  for part in nltk.sent_tokenize(sentence):
    # lowercase so that comparisons between sentences are case-insensitive
    tokens.update(nltk.word_tokenize(part.lower()))
  # string.punctuation is a str, so this is a substring test: single
  # punctuation characters are removed, while multi-character tokens such
  # as '``' or '...' are kept.
  return {tok for tok in tokens if tok not in string.punctuation}

def tokenize_df(df):
  """Tokenize columns 's1' and 's2' of `df`.

  Returns a two-element list: the token sets of every 's1' sentence, then
  the token sets of every 's2' sentence, both in row order.
  """
  return [[tokenize(text) for text in df[column]] for column in ('s1', 's2')]

Function that tokenizes sentences and filters the stopwords, followed by function which creates a list of tokenized sentences from dataframe:

In [None]:
def tokenize_wout_stopwords(sentence):
  """Return the set of distinct lowercase tokens of `sentence`, without
  English stopwords and without punctuation.

  An empty string yields an empty set. Token order and repetitions are lost
  because the result is a set.
  """
  # Load the stopword list once per call and use a set for O(1) membership,
  # instead of re-reading the list for every token.
  english_stopwords = set(stopwords.words('english'))
  tokens = set()
  for part in nltk.sent_tokenize(sentence):
    # lowercase so that comparisons between sentences are case-insensitive
    tokens.update(nltk.word_tokenize(part.lower()))
  return {tok for tok in tokens
          if tok not in english_stopwords and tok not in string.punctuation}
  
def tokenize_wout_stopwords_df(df):
  """Apply tokenize_wout_stopwords to columns 's1' and 's2' of `df`.

  Returns a two-element list: the results for 's1', then the results for
  's2', both in row order.
  """
  return [[tokenize_wout_stopwords(text) for text in df[column]]
          for column in ('s1', 's2')]

Function that lemmatizes sentences, followed by function which creates a list of lemmatized sentences from dataframe:

In [None]:
def lemmatize(pos_tags):
    """Lemmatize a single `(word, tag)` pair with the WordNet lemmatizer.

    Args:
        pos_tags: a pair whose first item is the word and whose second item
            is its POS tag (assumed to be a Penn Treebank tag, as produced
            by nltk.pos_tag -- TODO confirm for other callers).

    Returns:
        The lowercased lemma for nouns, verbs, adjectives and adverbs; the
        word unchanged for every other tag.
    """
    word, tag = pos_tags[0], pos_tags[1]
    # Penn Treebank tag initial -> WordNet POS letter. Adjective tags start
    # with 'J' (JJ, JJR, JJS), not 'A', so they need an explicit mapping.
    wordnet_pos = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}.get(tag[:1])
    if wordnet_pos is None:
        return word
    return WordNetLemmatizer().lemmatize(word.lower(), pos=wordnet_pos)

def lemmatize_df(df):
  """Lemmatize columns 's1' and 's2' of `df`.

  Every sentence is tokenized, POS-tagged and reduced to the set of its
  lemmas. Returns [lemma sets of 's1', lemma sets of 's2'].
  """
  result = []
  for column in ('s1', 's2'):
    lemma_sets = []
    for text in df[column]:
      tagged = nltk.pos_tag(tokenize(text))
      lemma_sets.append({lemmatize(pair) for pair in tagged})
    result.append(lemma_sets)
  return result

Function which creates a list of lemmatized sentences without stopwords from dataframe:

In [None]:
def lemmatize_wout_stopwords_df(df):
  """Lemmatize columns 's1' and 's2' of `df` after stopword removal.

  Every sentence is tokenized without stopwords, POS-tagged and reduced to
  the set of its lemmas. Returns [lemma sets of 's1', lemma sets of 's2'].
  """
  result = []
  for column in ('s1', 's2'):
    lemma_sets = []
    for text in df[column]:
      tagged = nltk.pos_tag(tokenize_wout_stopwords(text))
      lemma_sets.append({lemmatize(pair) for pair in tagged})
    result.append(lemma_sets)
  return result

Function that returns the POS tag of every word in a sentence that has one, followed by a function which creates a list of POS-tagged sentences from the dataframe. This function only returns the part of speech, without its properties (for example n for noun and v for verb, instead of NNP, PRP, etc.):

In [None]:
def get_pos_tag_simple(sentence):
  """Return `(word, pos_letter)` pairs for the tokens of `sentence`.

  `pos_letter` is the lowercased first character of the tag assigned by
  `pos_tag`. A pair is kept only when `wn.synsets(word, pos_letter)` does
  not raise; the synsets themselves are not inspected.

  NOTE(review): which tag letters make the lookup raise depends on the
  installed NLTK version -- confirm that the intended parts of speech
  (in particular adjectives, tagged 'J*') survive this filter.
  """
  kept = []
  for word, tag in pos_tag(tokenize(sentence)):
    word, pos_letter = word.lower(), tag[0].lower()
    try:
      # The result is discarded on purpose: the call is only a probe for
      # whether WordNet accepts this word/POS combination.
      wn.synsets(word, pos_letter)
    except Exception:
      continue
    kept.append((word, pos_letter))
  return kept
  
def pos_tag_simple_df(df):
  """Collect the simplified POS pairs of columns 's1' and 's2' of `df`.

  Returns [sets for 's1', sets for 's2'], where each set holds the pairs
  produced by get_pos_tag_simple for one sentence.
  """
  return [[set(get_pos_tag_simple(text)) for text in df[column]]
          for column in ('s1', 's2')]

Function which creates a list of pos-tagged sentences from dataframe:

In [None]:
def pos_tag_df(df):
  """POS-tag the tokens of columns 's1' and 's2' of `df`.

  Returns [sets for 's1', sets for 's2'], where each set holds the
  (token, tag) pairs returned by `pos_tag` for one sentence.
  """
  return [[set(pos_tag(tokenize(text))) for text in df[column]]
          for column in ('s1', 's2')]

Function that returns list of Name Entity chunks from a dataframe of sentences:

In [None]:
def ne_chunk_df(df):
  """Extract the named entities of every sentence in columns 's1' and 's2'.

  Returns:
    [entities of 's1', entities of 's2']; each entry is the set of
    (label, text) pairs found in that one sentence, where `text` joins the
    words of the chunk with single spaces.
  """
  def entities(sentence):
    # Nodes that carry a label() are the named-entity subtrees; plain
    # (word, tag) leaves do not have that attribute.
    tree = ne_chunk(pos_tag(word_tokenize(sentence)))
    return {(node.label(), ' '.join(leaf[0] for leaf in node))
            for node in tree if hasattr(node, 'label')}

  # A fresh set is built per sentence, so entities from earlier rows do not
  # leak into later ones.
  return [[entities(text) for text in df['s1']],
          [entities(text) for text in df['s2']]]

   

Function to apply Lesk’s algorithm to the words in the sentences, followed by function which creates a list of products of Lesk's algorithm applied to sentences from the dataframe. To perform Lesk's algorithm we must obtain the simple pos tags from the sentences:

In [None]:
def get_lesk(sentence, pairs_useful):
  """Disambiguate words with Lesk's algorithm and return the synset names.

  Args:
    sentence: the context, either a raw string or an already tokenized
      sequence of words.
    pairs_useful: iterable of (word, pos_letter) pairs to disambiguate.

  Returns:
    The set of names of the synsets Lesk selected; words for which Lesk
    returns None are skipped.
  """
  # Lesk builds its context from the items of the iterable it is given. A
  # raw string would therefore contribute single characters instead of
  # words, so strings are tokenized first.
  if isinstance(sentence, str):
    context = nltk.word_tokenize(sentence.lower())
  else:
    context = sentence

  names = set()
  for pair in pairs_useful:
    sense = nltk.wsd.lesk(context, pair[0], pair[1])
    if sense is not None:
      names.add(sense.name())
  return names
  
def lesk_df(df):
  """Run Lesk on every sentence of columns 's1' and 's2' of `df`.

  For each sentence the simplified POS pairs are computed and handed to
  get_lesk together with the sentence. Returns
  [synset-name sets of 's1', synset-name sets of 's2'].
  """
  return [[get_lesk(text, get_pos_tag_simple(text)) for text in df[column]]
          for column in ('s1', 's2')]

Functions which create a list of the length differences between the sentences of each pair in the dataframe:

In [None]:
def length_difference_df(df):
  """Length similarity of every sentence pair, based on distinct tokens.

  For token-set sizes n1 and n2 the score is 1 - |n1 - n2| / max(n1, n2),
  i.e. 1.0 for equal sizes, approaching 0 as they diverge. When both sets
  are empty the score is 1.0.

  Returns:
    A list with one score per row of `df`.
  """
  scores = []
  for sent1, sent2 in zip(df['s1'], df['s2']):
    n1, n2 = len(tokenize(sent1)), len(tokenize(sent2))
    # normalise by the longer of the TWO sentences so the score stays in [0, 1]
    longest = max(n1, n2)
    scores.append(1 - abs(n1 - n2) / longest if longest else 1.0)
  return scores

def length_difference_wout_stopwords_df(df):
  """Length similarity of every sentence pair after stopword removal.

  For token-set sizes n1 and n2 (stopwords and punctuation excluded) the
  score is 1 - |n1 - n2| / max(n1, n2). When both sets are empty the score
  is 1.0.

  Returns:
    A list with one score per row of `df`.
  """
  scores = []
  for sent1, sent2 in zip(df['s1'], df['s2']):
    n1 = len(tokenize_wout_stopwords(sent1))
    n2 = len(tokenize_wout_stopwords(sent2))
    # normalise by the longer of the TWO sentences so the score stays in [0, 1]
    longest = max(n1, n2)
    scores.append(1 - abs(n1 - n2) / longest if longest else 1.0)
  return scores

def length_difference_lemmatized_df(df):
  """Length similarity of every sentence pair, based on lemma sets.

  Uses lemmatize_df (stopwords kept). The score for set sizes n1 and n2 is
  1 - |n1 - n2| / max(n1, n2), or 1.0 when both sets are empty.
  """
  first, second = lemmatize_df(df)
  scores = []
  for lemmas1, lemmas2 in zip(first, second):
    longest = max(len(lemmas1), len(lemmas2))
    scores.append(1 - abs(len(lemmas1) - len(lemmas2)) / longest if longest else 1.0)
  return scores

def length_difference_lemmatized_wout_stopwords_df(df):
  """Length similarity of every sentence pair, based on lemma sets without
  stopwords.

  Uses lemmatize_wout_stopwords_df. The score for set sizes n1 and n2 is
  1 - |n1 - n2| / max(n1, n2), or 1.0 when both sets are empty.
  """
  first, second = lemmatize_wout_stopwords_df(df)
  scores = []
  for lemmas1, lemmas2 in zip(first, second):
    longest = max(len(lemmas1), len(lemmas2))
    scores.append(1 - abs(len(lemmas1) - len(lemmas2)) / longest if longest else 1.0)
  return scores

Function that calculates the n-grams of a sentence, followed by a function which creates a list of the n-grams of all pairs of sentences in the dataframe:

In [None]:
def ngram(sentence, n):
  """Return the set of word n-grams of `sentence`.

  Tokens are lowercased, punctuation tokens are dropped, and the original
  word order is preserved before the n-grams are built. A sentence with
  fewer than `n` tokens yields an empty set.
  """
  ordered_tokens = []
  for part in nltk.sent_tokenize(sentence):
    for tok in nltk.word_tokenize(part.lower()):
      if tok not in string.punctuation:
        ordered_tokens.append(tok)
  # n-grams are only meaningful over an ordered sequence; a set would give
  # an arbitrary adjacency that can change between runs.
  return set(ngrams(ordered_tokens, n))

def ngram_df(df, n):
  """Compute the n-gram sets of columns 's1' and 's2' of `df`.

  Returns:
    [n-gram sets of 's1', n-gram sets of 's2'], aligned row by row. When
    `ngram` raises ValueError for either sentence of a row, both entries of
    that row are empty sets.
  """
  first = []
  second = []
  for sent1, sent2 in zip(df['s1'], df['s2']):
    try:
      # Compute both sides before appending so that a failure on the second
      # sentence cannot leave the two lists with different lengths.
      pair = (ngram(sent1, n), ngram(sent2, n))
    except ValueError:
      pair = (set(), set())
    first.append(pair[0])
    second.append(pair[1])
  return [first, second]

The next feature to retrieve is the synset of each word: more precisely, the most frequent synset given the POS tag of the word, which is used to calculate the similarity between words.

But first we need to extract the words of each sentence without punctuation and stopwords. For this we use the useful_words function.

In [None]:
def useful_words(sentence):
    """Return the lowercased tokens of `sentence`, in order, without
    punctuation tokens and without English stopwords.
    """
    # Load the stopword list once per call and use a set for O(1) lookups,
    # instead of re-reading the list for every token.
    english_stopwords = set(stopwords.words('english'))
    return [word.lower() for word in nltk.word_tokenize(sentence)
            if word not in string.punctuation
            and word.lower() not in english_stopwords]

Then we obtain the POS tag in the form that the WordNet library expects. To do this we convert the format of the POS tag using the get_useful_pos_tag function.

In [None]:
def get_useful_pos_tag(pos_tag):
  """Map a POS tag to the WordNet POS letter selected by its first character.

  'N...' -> 'n', 'V...' -> 'v', 'J...' -> 'a', 'R...' -> 'r'; every other
  tag (including the empty string) gives None.
  """
  letter_by_initial = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}
  return letter_by_initial.get(pos_tag[:1])

In [None]:
import collections
# Synset record: `syns` holds the WordNet synset and `postag` the WordNet POS
# letter it was looked up with.
Synset = collections.namedtuple('Synset',['syns', 'postag'])

After that we get the most frequent synset given the POS tag of the word, using the function below.

In [None]:
def get_synset(pos_tagged_words):
  """Pick one synset per word from a list of (word, tag) pairs.

  Words whose tag has no WordNet equivalent (see get_useful_pos_tag), or for
  which WordNet returns no synsets, are skipped.

  Returns:
    A list of `Synset(syns, postag)` records, where `syns` is the first
    synset WordNet returns for the word and POS (used here as the most
    frequent sense) and `postag` is the WordNet POS letter.
  """
  found = []
  for word, tag in pos_tagged_words:
    wordnet_pos = get_useful_pos_tag(tag)
    if not wordnet_pos:
      continue
    candidates = wn.synsets(word, wordnet_pos)
    # An empty result simply means the word is unknown to WordNet; any other
    # failure (e.g. a missing corpus) is allowed to surface.
    if candidates:
      found.append(Synset(candidates[0], wordnet_pos))
  return found

Now we can calculate the similarity between two synsets. For this we use the following measures: path, wup, lch and lin, taking into account the conditions under which each of them is defined.

In [None]:
def synset_path_similarity(ss1, ss2):
  """Average best path similarity from the synsets of `ss1` to those of `ss2`.

  For every record in `ss1` the highest truthy `path_similarity` against the
  records of `ss2` is taken (0 when there is none); the result is the mean
  of those maxima over `ss1`, or 0 when `ss1` is empty.
  """
  best_scores = []
  for left in ss1:
    candidates = (left.syns.path_similarity(right.syns) for right in ss2)
    # filter(None, ...) drops both None (no path) and 0
    best_scores.append(max(filter(None, candidates), default=0))

  if not best_scores:
    return 0

  total = 0.0
  for best in best_scores:
    total += best
  return total / len(best_scores)

def synset_wup_similarity(ss1, ss2):
  """Average best Wu-Palmer similarity from the synsets of `ss1` to `ss2`.

  For every record in `ss1` the highest truthy `wup_similarity` against the
  records of `ss2` is taken (0 when there is none); the result is the mean
  of those maxima over `ss1`, or 0 when `ss1` is empty.
  """
  running_total = 0.0
  compared = 0
  for left in ss1:
    hits = []
    for right in ss2:
      value = left.syns.wup_similarity(right.syns)
      if not value:
        # None or 0: nothing usable for this pair
        continue
      hits.append(value)
    running_total += max(hits) if hits else 0
    compared += 1

  if not compared:
    return 0
  return running_total / compared

def synset_lch_similarity(ss1, ss2):
  """Average best Leacock-Chodorow similarity from `ss1` to `ss2`.

  Only pairs with the same `postag`, other than 'a', are compared. For
  every record in `ss1` the highest truthy `lch_similarity` is taken (0 when
  there is none); the result is the mean of those maxima over `ss1`, or 0
  when `ss1` is empty.
  """
  best_scores = []
  for left in ss1:
    values = [
        left.syns.lch_similarity(right.syns)
        for right in ss2
        if left.postag == right.postag and left.postag != 'a'
    ]
    usable = [value for value in values if value]
    best_scores.append(max(usable) if usable else 0)

  if not best_scores:
    return 0

  total = 0.0
  for best in best_scores:
    total += best
  return total / len(best_scores)

def synset_lin_similarity(ss1, ss2):
  """Average best Lin similarity from the synsets of `ss1` to `ss2`.

  Only pairs with the same `postag`, other than 'a' and 'r', are compared,
  using the `semcor_ic` information-content table. A pair whose comparison
  raises is treated as having no similarity. For every record in `ss1` the
  highest truthy score is taken (0 when there is none); the result is the
  mean of those maxima over `ss1`, or 0 when `ss1` is empty.
  """
  score, count = 0.0, 0
  for s1 in ss1:
    sim_values = []
    for s2 in ss2:
      # the POS tags must match and be neither adjective nor adverb
      if s1.postag == s2.postag and s1.postag not in {'a', 'r'}:
        try:
          synset_sim = s1.syns.lin_similarity(s2.syns, semcor_ic)
        except Exception:
          # best effort: a failed comparison contributes nothing, but
          # KeyboardInterrupt / SystemExit are no longer swallowed
          synset_sim = 0
        if synset_sim:
          sim_values.append(synset_sim)
    score += max(sim_values) if sim_values else 0
    count += 1
  # average similarity over the synsets of the first sentence
  if count:
    return score / count
  return 0

def synset_res_similarity(ss1, ss2):
  """Average best Resnik similarity from the synsets of `ss1` to `ss2`.

  Only pairs with the same `postag`, other than 'a' and 'r', are compared,
  using the `semcor_ic` information-content table. For every record in
  `ss1` the highest truthy score is taken (0 when there is none); the result
  is the mean of those maxima over `ss1`, or 0 when `ss1` is empty.
  """
  best_scores = []
  for left in ss1:
    usable = []
    for right in ss2:
      if left.postag != right.postag or left.postag in {'a', 'r'}:
        continue
      value = left.syns.res_similarity(right.syns, semcor_ic)
      if value:
        usable.append(value)
    best_scores.append(max(usable) if usable else 0)

  if not best_scores:
    return 0

  total = 0.0
  for best in best_scores:
    total += best
  return total / len(best_scores)

Next we develop a function that integrates the previous functions and outputs one list of similarities per measure, with one value for each sentence pair.

In [None]:
# Computes four WordNet similarity scores (path, wup, lch, lin) for every
# sentence pair of the dataframe.

def synsets_similarities(df):
  """Compute the WordNet-based similarity features of every sentence pair.

  For each row, both sentences are reduced to their useful words, POS-tagged
  and mapped to synsets; the four sentence-level similarities are then
  computed from the first sentence's synsets to the second's.

  Returns:
    A 4-tuple of lists `(path, wup, lch, lin)`, each with one score per row
    of `df`.
  """
  path_similarity = []
  wup_similarity = []
  lch_similarity = []
  lin_similarity = []

  # Iterate positionally so that a dataframe with a non-default index is
  # handled as well.
  for sent1, sent2 in zip(df['s1'], df['s2']):
    ss0 = get_synset(pos_tag(useful_words(sent1)))
    ss1 = get_synset(pos_tag(useful_words(sent2)))

    path_similarity.append(synset_path_similarity(ss0, ss1))
    wup_similarity.append(synset_wup_similarity(ss0, ss1))
    lch_similarity.append(synset_lch_similarity(ss0, ss1))
    lin_similarity.append(synset_lin_similarity(ss0, ss1))
    # The Resnik score is not part of the returned tuple, so it is not
    # computed here.

  return path_similarity, wup_similarity, lch_similarity, lin_similarity

Function responsible for calculating frequencies of bigrams in given sentences. Further down, there are functions responsible for finding bigrams frequencies in:
- tokenized sentences
- lemmatized sentences
- tokenized and lemmatized sentences without stopwords

In [None]:
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import TrigramCollocationFinder

def bigram_frequency(sent1, sent2):
  """Shared-bigram count of two token collections, normalised by length.

  Every bigram that occurs in both inputs contributes the smaller of its
  two counts; the sum is divided by the average length of the inputs.
  Returns 0.0 when both inputs are empty.

  NOTE(review): the callers in this notebook pass sets, whose iteration
  order is arbitrary -- confirm whether ordered token lists were intended.
  """
  counts1 = BigramCollocationFinder.from_words(sent1).ngram_fd
  counts2 = BigramCollocationFinder.from_words(sent2).ngram_fd
  shared = sum(min(count, counts2[bigram])
               for bigram, count in counts1.items()
               if bigram in counts2)
  avg_len = (len(sent1) + len(sent2)) / 2
  if not avg_len:
    return 0.0
  return shared / avg_len
  
def bigram_frequency_tokenized_df(df):
  """Bigram frequency of every sentence pair of `df`, on plain token sets."""
  return [bigram_frequency(tokenize(first), tokenize(second))
          for first, second in zip(df['s1'], df['s2'])]

def bigram_frequency_tokenized_wout_stopwords_df(df):
  """Bigram frequency of every sentence pair of `df`, on token sets without
  stopwords.
  """
  return [bigram_frequency(tokenize_wout_stopwords(first),
                           tokenize_wout_stopwords(second))
          for first, second in zip(df['s1'], df['s2'])]

def bigram_frequency_lemmatized_df(df):
  """Bigram frequency of every sentence pair of `df`, on lemma sets."""
  lemmas_s1, lemmas_s2 = lemmatize_df(df)
  return [bigram_frequency(first, second)
          for first, second in zip(lemmas_s1, lemmas_s2)]

def bigram_frequency_lemmatized_wout_stopwords_df(df):
  """Bigram frequency of every sentence pair of `df`, on lemma sets without
  stopwords.
  """
  lemmas_s1, lemmas_s2 = lemmatize_wout_stopwords_df(df)
  return [bigram_frequency(first, second)
          for first, second in zip(lemmas_s1, lemmas_s2)]

Function responsible for calculating frequencies of trigrams in given sentences. Further down, there are functions responsible for finding trigrams frequencies in:
- tokenized sentences
- lemmatized sentences
- tokenized and lemmatized sentences without stopwords

In [None]:
def trigram_frequency(sent1, sent2):
  """Shared-trigram count of two token collections, normalised by length.

  Every trigram that occurs in both inputs contributes the smaller of its
  two counts; the sum is divided by the average length of the inputs.
  Returns 0.0 when both inputs are empty.

  NOTE(review): the callers in this notebook pass sets, whose iteration
  order is arbitrary -- confirm whether ordered token lists were intended.
  """
  counts1 = TrigramCollocationFinder.from_words(sent1).ngram_fd
  counts2 = TrigramCollocationFinder.from_words(sent2).ngram_fd
  shared = sum(min(count, counts2[trigram])
               for trigram, count in counts1.items()
               if trigram in counts2)
  avg_len = (len(sent1) + len(sent2)) / 2
  if not avg_len:
    return 0.0
  return shared / avg_len
  
def trigram_frequency_tokenized_df(df):
  """Trigram frequency of every sentence pair of `df`, on plain token sets."""
  return [trigram_frequency(tokenize(first), tokenize(second))
          for first, second in zip(df['s1'], df['s2'])]

def trigram_frequency_tokenized_wout_stopwords_df(df):
  """Trigram frequency of every sentence pair of `df`, on token sets without
  stopwords.
  """
  return [trigram_frequency(tokenize_wout_stopwords(first),
                            tokenize_wout_stopwords(second))
          for first, second in zip(df['s1'], df['s2'])]

def trigram_frequency_lemmatized_df(df):
  """Trigram frequency of every sentence pair of `df`, on lemma sets."""
  lemmas_s1, lemmas_s2 = lemmatize_df(df)
  return [trigram_frequency(first, second)
          for first, second in zip(lemmas_s1, lemmas_s2)]

def trigram_frequency_lemmatized_wout_stopwords_df(df):
  """Trigram frequency of every sentence pair of `df`, on lemma sets without
  stopwords.
  """
  lemmas_s1, lemmas_s2 = lemmatize_wout_stopwords_df(df)
  return [trigram_frequency(first, second)
          for first, second in zip(lemmas_s1, lemmas_s2)]

Function that calculates jaccard similarity, followed by function which creates a list of jaccard similarities between features retrieved from the sentences from a dataframe:

In [None]:
def jaccard_similarity(x, y):
  """Return the Jaccard similarity of `x` and `y`, i.e. one minus their
  Jaccard distance.
  """
  distance = jaccard_distance(x, y)
  return 1 - distance

def jaccard_similarity_list(feature):
  """Jaccard similarity of every aligned pair in `feature`.

  Args:
    feature: a two-element sequence `[items_s1, items_s2]` of equally long
      lists of sets.

  Returns:
    A list with one similarity per pair; a pair for which the similarity is
    undefined because of a division by zero scores 0.
  """
  similarities = []
  for first, second in zip(feature[0], feature[1]):
    try:
      similarities.append(jaccard_similarity(first, second))
    except ZeroDivisionError:
      # nothing to compare on either side
      similarities.append(0)
  return similarities

Function responsible for calculating cosine similarity, followed by a function which creates a list of cosine similarities between features retrieved from the sentences of a dataframe. It is needed for comparing n-grams, because it is meant to take duplicate elements into account, which Jaccard similarity does not:

In [None]:
def cosine_similarity(x, y):
  """Cosine similarity of two collections treated as binary vectors.

  Each distinct element is one dimension with value 1 when present, so the
  result is |x ∩ y| / sqrt(|x| * |y|) over the distinct elements. Repeated
  elements do not add weight.

  Returns 0.0 when either collection is empty.
  """
  x_items, y_items = set(x), set(y)
  if not x_items or not y_items:
    # the norm of an all-zero vector is 0; treat the similarity as 0
    return 0.0
  shared = len(x_items & y_items)
  return shared / float((len(x_items) * len(y_items)) ** 0.5)

def cosine_similarity_list(feature):
  """Cosine similarity of every aligned pair in `feature`.

  Args:
    feature: a two-element sequence `[items_s1, items_s2]` of equally long
      lists of sets.

  Returns:
    A list with one similarity per pair; a pair for which the similarity is
    undefined because of a division by zero scores 0.
  """
  similarities = []
  for first, second in zip(feature[0], feature[1]):
    try:
      similarities.append(cosine_similarity(first, second))
    except ZeroDivisionError:
      # at least one side is empty
      similarities.append(0)
  return similarities


Function that combines all the features and calculates the Jaccard similarities between the featurized sentences:

In [None]:
def get_jaccard_features(df):
  """Build the feature matrix for every sentence pair of `df`.

  Returns:
    A list with one row per sentence pair. Each row holds 22 values, in
    this order: Jaccard similarity of tokens, tokens without stopwords,
    lemmas, lemmas without stopwords, Lesk synsets and simplified POS pairs;
    path, wup, lch and lin synset similarities; the four length-difference
    scores; the four bigram frequencies; the four trigram frequencies.

  The full-POS-tag, named-entity and n-gram cosine features are left out of
  the matrix, so they are not computed here.
  """
  path_similarity, wup_similarity, lch_similarity, lin_similarity = synsets_similarities(df)

  columns = [
      jaccard_similarity_list(tokenize_df(df)),
      jaccard_similarity_list(tokenize_wout_stopwords_df(df)),
      jaccard_similarity_list(lemmatize_df(df)),
      jaccard_similarity_list(lemmatize_wout_stopwords_df(df)),
      jaccard_similarity_list(lesk_df(df)),
      # simplified (one-letter) POS pairs, not the full tags
      jaccard_similarity_list(pos_tag_simple_df(df)),
      path_similarity,
      wup_similarity,
      lch_similarity,
      lin_similarity,
      length_difference_df(df),
      length_difference_wout_stopwords_df(df),
      length_difference_lemmatized_df(df),
      length_difference_lemmatized_wout_stopwords_df(df),
      bigram_frequency_tokenized_df(df),
      bigram_frequency_tokenized_wout_stopwords_df(df),
      bigram_frequency_lemmatized_df(df),
      bigram_frequency_lemmatized_wout_stopwords_df(df),
      trigram_frequency_tokenized_df(df),
      trigram_frequency_tokenized_wout_stopwords_df(df),
      trigram_frequency_lemmatized_df(df),
      trigram_frequency_lemmatized_wout_stopwords_df(df),
  ]
  # transpose: one list per feature -> one row per sentence pair
  return [list(row) for row in zip(*columns)]

# TRAINING PART
For the training, we used the train folder from STS dataset. We computed jaccard similarities of every feature as well as the synset similarities. Then we provided the features and the golden standard, to train the Support Vector Regression(SVR), as well as the linear regression model. Then we chose the one which got better results.

Importing the train dataset:

In [None]:
# Collect the training input and gold-standard files and load them into one
# dataframe (columns s1, s2, gs).
files_train = glob.glob('/content/drive/MyDrive/IHLT/data_STS/train/STS.input.*')
gs_train = glob.glob('/content/drive/MyDrive/IHLT/data_STS/train/STS.gs.*')
df_train = create_df(files_train,gs_train)
df_train

Unnamed: 0,s1,s2,gs
0,But other sources close to the sale said Viven...,But other sources close to the sale said Viven...,4.000
1,Micron has declared its first quarterly profit...,Micron's numbers also marked the first quarter...,3.750
2,The fines are part of failed Republican effort...,"Perry said he backs the Senate's efforts, incl...",2.800
3,"The American Anglican Council, which represent...","The American Anglican Council, which represent...",3.400
4,The tech-loaded Nasdaq composite rose 20.96 po...,The technology-laced Nasdaq Composite Index <....,2.400
...,...,...,...
2229,"Action is needed quickly, which is why we deci...",It is urgent and that is why we have decided t...,5.000
2230,One could indeed wish for more and for improve...,"We can actually want more and better, but I th...",4.800
2231,(Parliament accepted the oral amendment),(Parliament accepted the oral amendment),5.000
2232,- My party has serious reservations about Comm...,My party serious reservations about the regula...,4.800


Importing the test dataset:

In [None]:
# Collect the test input and gold-standard files and load them into one
# dataframe (columns s1, s2, gs).
files_test = glob.glob('/content/drive/MyDrive/IHLT/data_STS/test-gold//STS.input.*')
gs_test = glob.glob('/content/drive/MyDrive/IHLT/data_STS/test-gold/STS.gs.*')
df_test = create_df(files_test,gs_test)
df_test

Unnamed: 0,s1,s2,gs
0,The problem likely will mean corrective change...,He said the problem needs to be corrected befo...,4.400
1,The technology-laced Nasdaq Composite Index .I...,The broad Standard & Poor's 500 Index .SPX inc...,0.800
2,"""It's a huge black eye,"" said publisher Arthur...","""It's a huge black eye,"" Arthur Sulzberger, th...",3.600
3,SEC Chairman William Donaldson said there is a...,"""I think there's a building confidence that th...",3.400
4,Vivendi shares closed 1.9 percent at 15.80 eur...,"In New York, Vivendi shares were 1.4 percent d...",1.400
...,...,...,...
3103,A defeat on Alstom would have profound consequ...,Losing on the issue of Alstom would have serio...,4.750
3104,Tocqueville believed that there are no effecti...,"Tocqueville thought that on the long run, noth...",4.500
3105,Will it give us the right to divorce the husba...,A couple who have left?,1.000
3106,But US stock prices fell only 5.2% between May...,"However, the Americans have accused that lower...",3.250


Getting the features from the train and test datasets:

In [None]:
# Feature matrix of the training pairs.
features_jaccard_train = get_jaccard_features(df_train)
# gs_train is rebound here: from the list of gold-standard file paths to the
# array of gold scores converted to float.
gs_train = np.array([float(g) for g in df_train['gs']])

In [None]:
# Feature matrix of the test pairs.
features_jaccard_test = get_jaccard_features(df_test)
# gs_test is rebound here: from the list of gold-standard file paths to the
# array of gold scores converted to float.
gs_test = np.array([float(g) for g in df_test['gs']])

Scaling the feature lists:

In [None]:
# Min-max scale the features. The scaler learns its ranges from the training
# features only and the same ranges are then applied to the test features.
scaler = MinMaxScaler()
features_jaccard_train_scaled = scaler.fit_transform(features_jaccard_train)
features_jaccard_test_scaled = scaler.transform(features_jaccard_test)

Setting up an SVR model and feeding the feature list from the training set as well as the golden standard from the training set:

In [None]:
# SVR hyper-parameters
kernel = 'rbf'
gamma = 4
C = 1
epsilon = 0.5
tol = 1

# Pass the kernel explicitly so that the setting above actually takes effect.
svr = SVR(kernel = kernel, gamma = gamma, C = C, epsilon = epsilon, tol = tol)
svr.fit(features_jaccard_train_scaled, gs_train)

SVR(C=1, epsilon=0.5, gamma=4, tol=1)

Testing the regression of the model on the test set:

In [None]:
# Predicted similarity scores for the scaled test features.
test_prediction = svr.predict(features_jaccard_test_scaled)

Computing the pearson correlation of the predicted values from the testing set and the gold standard:

In [None]:
# pearsonr returns (coefficient, p-value); only the coefficient is kept.
correlation = pearsonr(test_prediction, gs_test)[0]
print("Pearson correlation:", correlation)

Pearson correlation: 0.6917634011205702


MLP Regression:

Finding the right parameters for the model:

In [None]:
# Hyper-parameter grid explored for the MLP regressor.
parameter_space = {
    'hidden_layer_sizes': [(100, 50), (200, 100), (50, 100), (40, 150)],
    'activation': ['relu'],
    'solver': ['sgd', 'adam','lbfgs'],
    'alpha': [0.0001, 0.001, 0.1,0.05, 0.5],
    'learning_rate': ['constant','adaptive','invscaling'],
}
# Search over a fresh MLPRegressor: `regr` is only created in a later cell,
# so referring to it here fails when the notebook is run from top to bottom.
clf = GridSearchCV(MLPRegressor(), parameter_space, n_jobs=-1, cv=5)
clf.fit(features_jaccard_train_scaled, gs_train)
print('Best parameters found:\n', clf.best_params_)

Best parameters found:
 {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (100, 50), 'learning_rate': 'constant', 'solver': 'adam'}




In [None]:
# Train the MLP with the parameters reported by the grid search, then score
# its predictions on the test set.
regr = MLPRegressor(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='adam',
    alpha=0.001,
    learning_rate='constant',
)
regr.fit(features_jaccard_train_scaled, gs_train)
mlp_predictions = regr.predict(features_jaccard_test_scaled)
correlation = pearsonr(mlp_predictions, gs_test)[0]



Computing the pearson correlation of the predicted values from the testing set and the gold standard:

In [None]:
# pearsonr returns (coefficient, p-value); only the coefficient is kept.
correlation = pearsonr(mlp_predictions, gs_test)[0]
print("Pearson correlation:", correlation)

Pearson correlation: 0.7117823424160945


# Conclusions
Comparing the similarity of sentences is not a simple task, and requires a lot of methods to perform it. There is a lot of algorithms that extract features from the sentences. We used features such as:
- tokenizing (with and without stopwords)
- lemmatizing (with and without stopwords)
- getting part-of-speech tags (simple and complex)
- name entity chunking
- Lesk's algorithm
- synsets similarity
- n-grams similarity
- n-grams frequency


We applied Jaccard distances to find the similarities between most of the featurized sentences. We found out that the cosine similarity only works well for comparing n-grams, because it takes into account the duplicates of elements. To combine all of the features we fed them to regression models, namely Support Vector Regression and Multi-Layer Perceptron Regression. We found out that the number of features is not what matters most; rather, the quality of the features has more influence on the overall quality of the model. We obtained a Pearson correlation of 0.712 with MLP Regression and 0.692 with SVR. The results are similar, but the MLP performed better, so we kept it. We found the parameters by performing GridSearchCV. We tested a lot of combinations of features, kept the most useful ones and deleted the ones that were creating noise:
- n-grams similarity (lots of not meaningful data)
- name entity chunking (only the entities were returned, and because of that, the data was insufficient to perform comparisons)
- complex part-of-speech tagging (too complex, and we already performed the simple version)


We also found out that the regression methods are the best for combining these features, because they find correlation between all of them and can output a final result. There is still area to improve, for example searching bigger parameter spaces in the MLP model. For that however, we would need more computing power which we did not have.