Code done by Serden-Yilmaz Kose, Jesper Nyman and Jussi Saariniemi

## Task 1

In [7]:
import nltk
import numpy as np
from nltk.corpus import genesis
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [8]:
import pandas as pd

# Load the MSR paraphrase corpus from a semicolon-separated CSV whose first row is the header.
path = "./msr_paraphrase_corpus.csv"
# on_bad_lines='skip' drops rows that fail to parse, so the frame may hold fewer rows than the file.
data = pd.read_csv(path, sep=";", header = 0, on_bad_lines='skip')

In [None]:
# Preview the first rows of the freshly loaded corpus.
print(data.head())

In [None]:
# Fetch the NLTK resources this notebook relies on.
nltk.download('stopwords')  # stopword list read in preProcess
nltk.download('punkt')  # presumably the tokenizer models behind word_tokenize — TODO confirm
nltk.download('wordnet')  # WordNet, accessed through `wn`
nltk.download('genesis')  # corpus used to build genesis_ic

nltk.download('punkt_tab')  # presumably required by newer NLTK tokenizers — TODO confirm
nltk.download('averaged_perceptron_tagger_eng')  # presumably the tagger behind nltk.pos_tag — TODO confirm
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')  # presumably the lexicon for SentimentIntensityAnalyzer — TODO confirm

In [11]:
# Information-content data derived from the Genesis corpus; resnik() passes it to res_similarity.
# NOTE(review): the positional arguments (False, 0.0) are presumably weight_senses_equally and
# smoothing — confirm against the NLTK documentation.
genesis_ic = wn.ic(genesis, False, 0.0)

def wup(S1, S2):
    """Return the Wu-Palmer similarity that synset ``S1`` reports against ``S2``."""
    wu_palmer_score = S1.wup_similarity(S2)
    return wu_palmer_score

def resnik(S1, S2):
    """Return the Resnik similarity of ``S1`` and ``S2`` using the module-level ``genesis_ic``."""
    information_content = genesis_ic
    return S1.res_similarity(S2, information_content)

# Dispatch table used by word_similarity: measure index -> similarity function (0 = Wu-Palmer, 1 = Resnik).
options = {0: wup, 1: resnik}

def preProcess(sentence):
    """Tokenize a sentence and keep only lowercase, alphabetic, non-stopword tokens.

    Args:
        sentence: Raw sentence text.

    Returns:
        list[str]: Lowercased tokens in their original order, with English
        stopwords and tokens containing non-alphabetic characters removed.
    """
    # A set gives constant-time membership tests for the per-token filter below.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    # Lowercase *before* the stopword test: the stopword list is compared against
    # the lowercased token, so capitalised stopwords such as "The" are removed too.
    lowered_tokens = (token.lower() for token in word_tokenize(sentence))
    return [
        token
        for token in lowered_tokens
        if token.isalpha() and token not in stopwords
    ]

def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word`` tagged in isolation.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
    letter of the resulting tag selects ADJ (J), NOUN (N), VERB (V) or ADV (R).
    Any other tag falls back to NOUN.
    """
    _, tag_name = nltk.pos_tag([word])[0]
    initial = tag_name[0].upper()
    pos_by_initial = {
        "J": wn.ADJ,
        "N": wn.NOUN,
        "V": wn.VERB,
        "R": wn.ADV,
    }
    return pos_by_initial.get(initial, wn.NOUN)

def word_similarity(w1, w2, num):
    """Score two words with the similarity measure selected by ``num``.

    Each word is looked up in WordNet under the POS returned by
    ``get_wordnet_pos`` and only its first synset is compared. ``num`` indexes
    the module-level ``options`` table.

    Returns:
        The measure's score rounded to two decimals, or 0 when either word has
        no synset, the measure returns a falsy value, or WordNet raises
        ``WordNetError``.
    """
    first_pos = get_wordnet_pos(w1)
    second_pos = get_wordnet_pos(w2)
    first_candidates = wn.synsets(w1, pos=first_pos)
    second_candidates = wn.synsets(w2, pos=second_pos)

    # Nothing to compare unless both words resolve to at least one synset.
    if not (first_candidates and second_candidates):
        return 0

    measure = options[num]
    try:
        score = measure(first_candidates[0], second_candidates[0])
    except nltk.corpus.reader.wordnet.WordNetError:
        return 0
    return round(score, 2) if score else 0

def Similarity(T1, T2, num):
    """Calculate sentence-to-sentence similarity using TF-IDF and WordNet similarity.

    Every word of one sentence is matched with its best-scoring word in the
    other sentence (via ``word_similarity``); the best scores are averaged with
    IDF weights, once per direction, and the two directional scores are averaged.

    Args:
        T1: First sentence.
        T2: Second sentence.
        num: Index of the word-level measure in ``options``.

    Returns:
        The similarity rounded to two decimals. Returns 0.0 when either sentence
        has no usable token after preprocessing or no token receives an IDF
        weight, instead of failing with a division by zero.
    """
    words1 = preProcess(T1)
    words2 = preProcess(T2)

    # Without tokens on both sides there is nothing to compare.
    if not words1 or not words2:
        return 0.0

    tf = TfidfVectorizer(use_idf=True)
    try:
        # Only the fitted IDF weights are needed, not the transformed matrix.
        tf.fit([' '.join(words1), ' '.join(words2)])
    except ValueError:
        # Raised when the vectorizer's tokenizer keeps none of the tokens
        # (empty vocabulary); no weights means no measurable similarity.
        return 0.0

    Idf = dict(zip(tf.get_feature_names_out(), tf.idf_))

    # Score each (w1, w2) pair once; both directions reuse the same table.
    pair_scores = [
        [word_similarity(w1, w2, num) for w2 in words2]
        for w1 in words1
    ]

    def _weighted_best(words, best_scores):
        """IDF-weighted mean of the best match scores; 0.0 if no word is weighted."""
        total_weight = sum(Idf.get(word, 0) for word in words)
        if total_weight == 0:
            return 0.0
        weighted_sum = sum(
            best * Idf.get(word, 0) for word, best in zip(words, best_scores)
        )
        return weighted_sum / total_weight

    # The best match never drops below 0, mirroring a running maximum that starts at 0.
    best_for_words1 = [max(0, max(row)) for row in pair_scores]
    best_for_words2 = [max(0, max(column)) for column in zip(*pair_scores)]

    Sim_score1 = _weighted_best(words1, best_for_words1)
    Sim_score2 = _weighted_best(words2, best_for_words2)
    Sim = (Sim_score1 + Sim_score2) / 2

    return round(Sim, 2)

In [None]:
# Wup similarity

# Score every sentence pair with the Wu-Palmer based measure (index 0 in `options`).
wup_scores = [
    Similarity(str(first), str(second), 0)
    for first, second in zip(data['String1'], data['String2'])
]
data['Similarity score'] = pd.Series(wup_scores, index=data.index, dtype=float)
print(data.head())
        

In [13]:
# Persist the scored corpus (pipe-separated, without the index column) so the similarities
# do not have to be recomputed.
data.to_csv('msr_paraphrase_corpus_sim.csv', index=False, sep='|')

In [12]:
# Reload the cached, pipe-separated scores from msr_paraphrase_corpus_sim.csv.
path = "./msr_paraphrase_corpus_sim.csv"
data = pd.read_csv(path, sep="|", header = 0)

In [None]:
# Preview the first rows of the reloaded frame.
print(data.head())

In [None]:
from scipy.stats import pearsonr

# Pearson correlation between the manual label and the Wu-Palmer based score.
cc, p = pearsonr(data['Quality'], data['Similarity score'])
print("Comparing the Quality and Wup similarity score:")
print(f"Pearson correlation coefficient: {cc}", f"p-value: {p}", sep="\n")


## Task 2

In [None]:
# Score every sentence pair with the Resnik based measure (index 1 in `options`).
resnik_scores = [
    Similarity(str(first), str(second), 1)
    for first, second in zip(data['String1'], data['String2'])
]
data['Resnik'] = pd.Series(resnik_scores, index=data.index, dtype=float)
print(data.head())

In [18]:
# Persist the frame, now including the 'Resnik' column, as a pipe-separated file without the index.
data.to_csv('msr_paraphrase_corpus_sim_res.csv', index=False, sep='|')

In [19]:
# Reload the cached, pipe-separated scores from msr_paraphrase_corpus_sim_res.csv.
path = "./msr_paraphrase_corpus_sim_res.csv"
data = pd.read_csv(path, sep="|", header = 0)

In [None]:
# Pearson correlation between the manual label and the Resnik based score.
res_cc, res_p = pearsonr(data['Quality'], data['Resnik'])
print("Comparing the Quality and Resnik similarity score:")
print(f"Pearson correlation coefficient: {res_cc}", f"p-value: {res_p}", sep="\n")

In [None]:
# Side-by-side table of the two correlation results computed above.
CC_table = pd.DataFrame(
    {
        "Type": ["Wup", "Resnik"],
        "Pearson correlation coefficient": [cc, res_cc],
        "p-value": [p, res_p],
    }
)

print("Comparison between Wup and Resnik similiarity:")
print(CC_table)


## Task 3

In [1]:
# Next, we want to use the preceding results to compute a threshold value beyond which a sentence-to-sentence similarity is considered a paraphrase.
# Suggest an approach and a script that allow you to do so by exploring the minimum value for both paraphrasing and non-paraphrasing.

In [2]:
# We need to see how similar two sentences have to be for the similarity score to be 1
# We should iterate through each sentence and find the lowest semantic similarity score, and make that the threshold

# Import first csv sim file
import pandas as pd

# Reload the scored corpus saved as msr_paraphrase_corpus_sim_res.csv; rows that fail to parse are skipped.
# The file must already exist in the working directory, otherwise read_csv raises FileNotFoundError.
path = "./msr_paraphrase_corpus_sim_res.csv"
data = pd.read_csv(path, sep="|", header = 0, on_bad_lines='skip')

FileNotFoundError: [Errno 2] No such file or directory: './msr_paraphrase_corpus_sim_res.csv'

In [None]:
# This function will find the maximum similarity score needed for the score to be 0 or "non - paraphrased"
def non_paraphrasing_threshold():
    """Return the highest 'Similarity score' among rows labelled Quality == 0.

    Reads the module-level ``data`` frame. The search starts from 0, so 0 is
    returned when no non-paraphrase row has a positive score. Each time a new
    running maximum is found it is printed.
    """
    highest_so_far = 0
    for _, pair in data.iterrows():
        # Only pairs manually labelled as non-paraphrases are relevant here.
        if pair['Quality'] != 0:
            continue
        candidate = pair['Similarity score']
        if candidate > highest_so_far:
            highest_so_far = candidate
            print(f"current maximum score: {highest_so_far}")
    return highest_so_far
        
# Evaluate the function on the currently loaded `data` frame and report the result.
non_threshold = non_paraphrasing_threshold()
print(f"Non-paraphrasing threshold is {non_threshold}.")

In [None]:
# This function will find the minimum similarity score needed for the score to be 1 or "paraphrased"
def paraphrasing_threshold():
    """Return the lowest 'Similarity score' among rows labelled Quality == 1.

    Reads the module-level ``data`` frame. The search starts from 1, so 1 is
    returned when no paraphrase row scores below 1. Each time a new running
    minimum is found it is printed.
    """
    lowest_so_far = 1
    for _, pair in data.iterrows():
        # Only pairs manually labelled as paraphrases are relevant here.
        if pair['Quality'] != 1:
            continue
        candidate = pair['Similarity score']
        if candidate < lowest_so_far:
            lowest_so_far = candidate
            print(f"current minimum score: {lowest_so_far}")
    return lowest_so_far
        
# Evaluate the function on the currently loaded `data` frame; `threshold` is read again by find_negated.
threshold = paraphrasing_threshold()
print(f"Paraphrasing threshold is {threshold}")

## Task 4

In [None]:
# Next, we want to comprehend whether the pairs that do not match the manual labelling contain some linguistic quantifiers.
# Suggest a script that identifies the presence of quantifiers, such as negation, in the sentences, and test the validity of the statement
# that “incorrect matching is often due to the presence of some specific linguistic quantifiers”. You may need to manually explore the pairs
# of sentences for which the manual annotation and the sentence-to-sentence similarity score disagree, in order to identify the quantifiers present
# in such sentences.

In [None]:
# Import first csv sim file
import pandas as pd

# Reload the scored corpus from msr_paraphrase_corpus_sim_res.csv; rows that fail to parse are skipped.
path = "./msr_paraphrase_corpus_sim_res.csv"
data = pd.read_csv(path, sep="|", header = 0, on_bad_lines='skip')

In [None]:
# First we need to find the threshold of negation, i.e. what is the average difference in negation between two sentences labeled as 1?
# https://codefather.tech/blog/nltk-python-sentiment-analysis/
from nltk.sentiment import SentimentIntensityAnalyzer

# Find the negation of a string
analyzer = SentimentIntensityAnalyzer()
def negation(string):
    """Return the 'neg' component of the analyzer's polarity scores for ``string``.

    The input is converted with ``str`` first, so non-string cell values are accepted.
    NOTE(review): 'neg' is presumably VADER's negative-sentiment share rather than a
    direct detector of grammatical negation — confirm that this is the intended signal.
    """
    scores = analyzer.polarity_scores(str(string))
    return scores['neg']

def negation_threshold():
    """Return the mean absolute difference in negation score over Quality == 1 pairs.

    Reads the module-level ``data`` frame and compares ``negation(String1)`` with
    ``negation(String2)`` for every row labelled 1.

    Returns:
        The average absolute difference, or 0.0 when no row is labelled 1
        (avoids a division by zero).
    """
    negation_difference_total = 0
    sentence_count = 0
    # Iterate through each row
    for index, row in data.iterrows():
        # Only pairs labelled as paraphrases contribute to the average
        if row['Quality'] == 1:
            # Absolute difference between the two sentences' negation scores
            temp_negation = abs(negation(str(row['String1'])) - negation(str(row['String2'])))
            negation_difference_total += temp_negation
            sentence_count += 1

    if sentence_count == 0:
        return 0.0
    return negation_difference_total / sentence_count

# store negation variable (the call must come after the definition so the cell runs on a fresh kernel)
negation_thresh = negation_threshold()
print(f"Average difference in negation value of 1 label strings is {negation_thresh}")

In [None]:
# This function will find the negated strings among those labeled as 0
# NOTE(review): this list is not referenced by find_negated below, which relies on negation() instead.
negations_quantifiers_list = ["not", "no", "never", "haven't", "hasn't", "hadn't", "wasn't", "weren't"]
def find_negated():
    """Report how many high-scoring non-paraphrase pairs have conflicting negation scores.

    Reads the module-level ``data`` frame and ``threshold``. A pair is considered
    when its 'Similarity score' is at least ``threshold`` while its 'Quality' label
    is 0. It counts as conflicting when exactly one of its two sentences has a
    negation score above 0. Prints the counts and, when at least one pair
    qualifies, the percentage of conflicting pairs.
    """
    # create variables to count sentences, and negated sentences
    negated_sentences = 0
    sentences = 0
    # Iterate through each row
    for index, row in data.iterrows():
        # If the label is 0 but the similarity score is above the paraphrasing threshold, do the following
        if row['Similarity score'] >= threshold and row['Quality'] == 0:
            sentences += 1
            # Score each sentence once and reuse the values in the comparison
            first_negation = negation(row['String1'])
            second_negation = negation(row['String2'])
            # If one sentence is negated but the other isn't
            if (first_negation > 0 and second_negation == 0) or (second_negation > 0 and first_negation == 0):
                negated_sentences += 1

    print(f"The number of 0 labeled sentences which reach the threshold is {sentences}, but only {negated_sentences} have conlficting negations.")
    if sentences == 0:
        # No qualifying pair: a percentage would divide by zero.
        print("No 0 labeled sentences reach the threshold, so no percentage can be computed.")
        return
    print(f"{negated_sentences * 100 / sentences}% of 0 labeled sentences which reach the threshold have conflicting negations.")
find_negated()

## Task 5

In [20]:
from scipy.spatial.distance import cosine
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
import fasttext
import fasttext.util

# Load pre-trained word embeddings (Word2Vec, FastText, and GloVe) from the data folder
word2vec = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)
# NOTE: this rebinds the name `fasttext` from the imported module to the loaded model object.
fasttext = fasttext.load_model('data/cc.en.300.bin')
# glove2word2vec('data/glove.6B.300d.txt', 'data/glove.6B.300d.word2vec.txt')
# The raw GloVe text file is read directly; no_header=True is presumably needed because the
# file lacks the word2vec header line — TODO confirm.
glove = KeyedVectors.load_word2vec_format('data/glove.6B.300d.txt', binary=False, no_header=True)

In [21]:
def sentence_embedding(sentence, model):
    """Average the vectors of the sentence's preprocessed tokens found in ``model``.

    Tokens come from ``preProcess``; a token is used only if ``token in model``
    holds, and its vector is read with ``model[token]``. The result is the
    element-wise mean (``np.mean`` along axis 0) of the collected vectors.
    """
    known_vectors = []
    for token in preProcess(sentence):
        if token in model:
            known_vectors.append(model[token])
    return np.mean(known_vectors, axis=0)

def embedding_similarity(data, model):
    """Cosine similarity between the sentence embeddings of every pair in ``data``.

    Args:
        data: Frame with 'String1' and 'String2' columns.
        model: Word-vector lookup passed through to ``sentence_embedding``.

    Returns:
        list: One similarity per row, in row order. A pair gets 0.0 when either
        sentence has no usable embedding (no token known to the model, an
        all-zero vector, or NaN components), because the cosine is undefined there.
    """
    def _usable(embedding):
        """True for a 1-D, non-zero embedding without NaN components."""
        # Averaging an empty list of vectors yields a NaN scalar, not a vector.
        if np.ndim(embedding) != 1:
            return False
        return bool(np.any(embedding)) and not bool(np.any(np.isnan(embedding)))

    similarities = []
    for index, row in data.iterrows():
        T1, T2 = str(row['String1']), str(row['String2'])
        emb1 = sentence_embedding(T1, model)
        emb2 = sentence_embedding(T2, model)
        if _usable(emb1) and _usable(emb2):
            similarity = 1 - cosine(emb1, emb2)
        else:
            # Keep the column numeric and NaN-free so later correlations stay defined.
            similarity = 0.0
        similarities.append(similarity)
    return similarities

In [None]:
# Compute similarities using Word2Vec, FastText, and GloVe
data['Word2Vec'] = embedding_similarity(data, word2vec)
data['FastText'] = embedding_similarity(data, fasttext)
data['GloVe'] = embedding_similarity(data, glove)

# Compute Pearson correlations for Word2Vec, FastText, and GloVe
word2vec_cc, word2vec_p = pearsonr(data['Quality'], data['Word2Vec'])
fasttext_cc, fasttext_p = pearsonr(data['Quality'], data['FastText'])
glove_cc, glove_p = pearsonr(data['Quality'], data['GloVe'])

print(f"Word2Vec Pearson correlation coefficient: {word2vec_cc}, p-value: {word2vec_p}")
print(f"FastText Pearson correlation coefficient: {fasttext_cc}, p-value: {fasttext_p}")
print(f"GloVe Pearson correlation coefficient: {glove_cc}, p-value: {glove_p}")

## Task 6

In [23]:
from fuzzywuzzy import fuzz

In [None]:
# String-level fuzzy match ratio (fuzz.ratio) for each sentence pair.
data['FuzzyWuzzy'] = [
    fuzz.ratio(str(first), str(second))
    for first, second in zip(data['String1'], data['String2'])
]

# Pearson correlation between the manual label and the fuzzy ratio.
fuzzy_cc, fuzzy_p = pearsonr(data['Quality'], data['FuzzyWuzzy'])

print(f"FuzzyWuzzy Pearson correlation coefficient: {fuzzy_cc}, p-value: {fuzzy_p}")

In [None]:
# Collect the correlation results of the embedding models and FuzzyWuzzy, one row per method.
summary_table = {
    "Method": ["Word2Vec", "FastText", "GloVe", "FuzzyWuzzy"],
    "Pearson correlation coefficient": [word2vec_cc, fasttext_cc, glove_cc, fuzzy_cc],
    "p-value": [word2vec_p, fasttext_p, glove_p, fuzzy_p]
}

# Render the collected results as a table.
summary_df = pd.DataFrame(summary_table)
print(summary_df)