Code done by Serden-Yilmaz Kose, Jesper Nyman and Jussi Saariniemi

Task 1

In [11]:
import nltk
import numpy as np
from nltk.corpus import genesis
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [39]:
import pandas as pd

path = "./msr_paraphrase_corpus.csv"
data = pd.read_csv(path, sep=";", header = 0, on_bad_lines='skip')

In [40]:
print(data.head())

   Quality      ID1      ID2  \
0        1   702876   702977   
1        0  2108705  2108831   
2        1  1330381  1330521   
3        0  3344667  3344648   
4        1  1236820  1236712   

                                             String1  \
0  Amrozi accused his brother, whom he called "th...   
1  Yucaipa owned Dominick's before selling the ch...   
2  They had published an advertisement on the Int...   
3  Around 0335 GMT, Tab shares were up 19 cents, ...   
4  The stock rose $2.11, or about 11 percent, to ...   

                                             String2 Unnamed: 5  
0  Referring to him as only "the witness", Amrozi...        NaN  
1  Yucaipa bought Dominick's in 1995 for $693 mil...        NaN  
2  On June 10, the ship's owners had published an...        NaN  
3  Tab shares jumped 20 cents, or 4.6%, to set a ...        NaN  
4  PG&E Corp. shares jumped $1.63 or 8 percent to...        NaN  


In [None]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('genesis')

In [42]:
genesis_ic = wn.ic(genesis, False, 0.0)

def wup(S1, S2):
    """Wu-Palmer similarity."""
    return S1.wup_similarity(S2)

def resnik(S1, S2):
    """Resnik similarity."""
    return S1.res_similarity(S2, genesis_ic)

options = {0: wup, 1: resnik}

def preProcess(sentence):
    """Tokenize, remove stopwords, and clean the sentence."""
    Stopwords = list(set(nltk.corpus.stopwords.words('english')))
    words = word_tokenize(sentence)
    words = [word.lower() for word in words if word.isalpha() and word not in Stopwords]
    return words

def get_wordnet_pos(word):
    """Map POS tag to first character for lemmatization with WordNet."""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wn.ADJ, "N": wn.NOUN, "V": wn.VERB, "R": wn.ADV}
    return tag_dict.get(tag, wn.NOUN)

def word_similarity(w1, w2, num):
    """Calculate similarity between two words only if they share the same POS."""
    pos1 = get_wordnet_pos(w1)
    pos2 = get_wordnet_pos(w2)

    synsets1 = wn.synsets(w1, pos=pos1)
    synsets2 = wn.synsets(w2, pos=pos2)
    
    if synsets1 and synsets2:
        S1 = synsets1[0]
        S2 = synsets2[0]
        try:
            similarity = options[num](S1, S2)
            if similarity:
                return round(similarity, 2)
        except nltk.corpus.reader.wordnet.WordNetError:
            return 0
    return 0

def Similarity(T1, T2, num):
    """Calculate sentence-to-sentence similarity using TF-IDF and WordNet similarity."""
    words1 = preProcess(T1)
    words2 = preProcess(T2)

    tf = TfidfVectorizer(use_idf=True)
    tf.fit_transform([' '.join(words1), ' '.join(words2)])
    
    Idf = dict(zip(tf.get_feature_names_out(), tf.idf_))
    
    Sim_score1 = 0
    Sim_score2 = 0

    for w1 in words1:
        Max = 0
        for w2 in words2:
            score = word_similarity(w1, w2, num)
            if Max < score:
                Max = score
        Sim_score1 += Max * Idf.get(w1, 0)
    Sim_score1 /= sum([Idf.get(w1, 0) for w1 in words1])

    for w2 in words2:
        Max = 0
        for w1 in words1:
            score = word_similarity(w1, w2, num)
            if Max < score:
                Max = score
        Sim_score2 += Max * Idf.get(w2, 0)
    Sim_score2 /= sum([Idf.get(w2, 0) for w2 in words2])

    Sim = (Sim_score1 + Sim_score2) / 2
    
    return round(Sim, 2)

In [43]:
# Wup similarity

data['Similarity score'] = 0.0

for index, row in data.iterrows():
        T1, T2 = str(row['String1']), str(row['String2'])
        similarity_score = Similarity(T1, T2, 0)
        data.at[index, 'Similarity score'] = similarity_score
print(data.head())  
        

   Quality      ID1      ID2  \
0        1   702876   702977   
1        0  2108705  2108831   
2        1  1330381  1330521   
3        0  3344667  3344648   
4        1  1236820  1236712   

                                             String1  \
0  Amrozi accused his brother, whom he called "th...   
1  Yucaipa owned Dominick's before selling the ch...   
2  They had published an advertisement on the Int...   
3  Around 0335 GMT, Tab shares were up 19 cents, ...   
4  The stock rose $2.11, or about 11 percent, to ...   

                                             String2 Unnamed: 5  \
0  Referring to him as only "the witness", Amrozi...        NaN   
1  Yucaipa bought Dominick's in 1995 for $693 mil...        NaN   
2  On June 10, the ship's owners had published an...        NaN   
3  Tab shares jumped 20 cents, or 4.6%, to set a ...        NaN   
4  PG&E Corp. shares jumped $1.63 or 8 percent to...        NaN   

   Similarity score  
0              0.75  
1              0.54  
2

In [48]:
# To save the data, so we wont have to calculate the similarities again
data.to_csv('msr_paraphrase_corpus_sim.csv', index=False, sep='|')

In [54]:
path = "./msr_paraphrase_corpus_sim.csv"
data = pd.read_csv(path, sep="|", header = 0)

In [55]:
print(data.head())

   Quality      ID1      ID2  \
0        1   702876   702977   
1        0  2108705  2108831   
2        1  1330381  1330521   
3        0  3344667  3344648   
4        1  1236820  1236712   

                                             String1  \
0  Amrozi accused his brother, whom he called "th...   
1  Yucaipa owned Dominick's before selling the ch...   
2  They had published an advertisement on the Int...   
3  Around 0335 GMT, Tab shares were up 19 cents, ...   
4  The stock rose $2.11, or about 11 percent, to ...   

                                             String2  Similarity score  
0  Referring to him as only "the witness", Amrozi...              0.75  
1  Yucaipa bought Dominick's in 1995 for $693 mil...              0.54  
2  On June 10, the ship's owners had published an...              0.69  
3  Tab shares jumped 20 cents, or 4.6%, to set a ...              0.72  
4  PG&E Corp. shares jumped $1.63 or 8 percent to...              0.72  


In [57]:
from scipy.stats import pearsonr

cc, p = pearsonr(data['Quality'], data['Similarity score'])
print(f"Comparing the Quality and Wup similarity score:")
print(f"Pearson correlation coefficient: {cc}")
print(f"p-value: {p}")


Comparing the Quality and Similarity score:
Pearson correlation coefficient: 0.257558139727016
p-value: 4.654742450348613e-61


Task 2

In [62]:
data['Resnik'] = 0.0

for index, row in data.iterrows():
        T1, T2 = str(row['String1']), str(row['String2'])
        resnik_sim = Similarity(T1, T2, 1)
        data.at[index, 'Resnik'] = resnik_sim
print(data.head())

   Quality      ID1      ID2  \
0        1   702876   702977   
1        0  2108705  2108831   
2        1  1330381  1330521   
3        0  3344667  3344648   
4        1  1236820  1236712   

                                             String1  \
0  Amrozi accused his brother, whom he called "th...   
1  Yucaipa owned Dominick's before selling the ch...   
2  They had published an advertisement on the Int...   
3  Around 0335 GMT, Tab shares were up 19 cents, ...   
4  The stock rose $2.11, or about 11 percent, to ...   

                                             String2  Similarity score  Wup  \
0  Referring to him as only "the witness", Amrozi...              0.75  0.0   
1  Yucaipa bought Dominick's in 1995 for $693 mil...              0.54  0.0   
2  On June 10, the ship's owners had published an...              0.69  0.0   
3  Tab shares jumped 20 cents, or 4.6%, to set a ...              0.72  0.0   
4  PG&E Corp. shares jumped $1.63 or 8 percent to...              0.72  0.0

In [64]:
data.to_csv('msr_paraphrase_corpus_sim_res.csv', index=False, sep='|')

In [65]:
path = "./msr_paraphrase_corpus_sim_res.csv"
data = pd.read_csv(path, sep="|", header = 0)

In [66]:
res_cc, res_p = pearsonr(data['Quality'], data['Resnik'])
print(f"Comparing the Quality and Resnik similarity score:")
print(f"Pearson correlation coefficient: {res_cc}")
print(f"p-value: {res_p}")

Comparing the Quality and Resnik similarity score:
Pearson correlation coefficient: 0.08501047640669863
p-value: 8.370731431809998e-08


In [67]:
CC_table = {
    "Type": ["Wup", "Resnik"],
    "Pearson correlation coefficient": [cc, res_cc],
    "p-value": [p, res_p]
}
CC_table = pd.DataFrame(CC_table)

print("Comparison between Wup and Resnik similiarity:")
print(CC_table)


Comparison between Wup and Resnik similiarity:
     Type  Pearson correlation coefficient       p-value
0     Wup                         0.257558  4.654742e-61
1  Resnik                         0.085010  8.370731e-08
