# Preparation

## Imports

In [1]:
import spacy 
import en_core_web_lg
import numpy as np
import pandas as pd
import sklearn as sklearn
import time
import copy
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import warnings; warnings.simplefilter('ignore')
from spacy import displacy
import statistics

nlp = en_core_web_lg.load()
nlp_spacy = spacy.load("en_core_web_lg")

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

import nltk
import codecs
from nltk.tokenize import PunktSentenceTokenizer,sent_tokenize, word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /home/ivan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ivan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ivan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Loading data from file

In [2]:
trainingX = pd.read_csv('subtaskA_data_all.csv')
trainingY = pd.read_csv('subtaskA_answers_all.csv')
testX = pd.read_csv('data/taskA_trial_data.csv')
testY = pd.read_csv('data/taskA_trial_answer.csv')
pairsOfSentences=[]
pairsOfSentences_test = []
for i in tqdm(range(len(trainingX.values))):
    pairsOfSentences.append([trainingX.values[i][1], trainingX.values[i][2]])
                    
for i in tqdm(range(len(testX.values))):
    pairsOfSentences_test.append([testX.values[i][1], testX.values[i][2]])
    
whichOneIsWrong = [0]
for i in tqdm(range (len(trainingY.values))):
    whichOneIsWrong.append(trainingY.values[i][1])
    
whichOneIsWrong_test = [0]                   
for i in tqdm(range(len(testY.values))):
    whichOneIsWrong_test.append(testY.values[i][1])
print(len(pairsOfSentences))
print(len(whichOneIsWrong))

100%|██████████| 10000/10000 [00:07<00:00, 1389.74it/s]
100%|██████████| 2021/2021 [00:00<00:00, 4748.34it/s]
100%|██████████| 9999/9999 [00:00<00:00, 188500.21it/s]
100%|██████████| 2020/2020 [00:00<00:00, 182325.72it/s]

10000
10000





# Filters

## Filter functions

### Lemmatization filter

In [3]:
def lemmatization_filter(sentence):

    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))
    words = word_tokenize(sentence)
    filtered_sent = ""
    
    for word in words:
           if word not in stop_words:
                    filtered_sent += (lemmatizer.lemmatize(word)) + " "

    return filtered_sent

###  First category filter

In [4]:
def first_cat_filter(first_sent, second_sent):
    first_sent = lemmatization_filter(first_sent)
    second_sent = lemmatization_filter(second_sent)

    tokens_first = nlp_spacy(first_sent)
    tokens_second = nlp_spacy(second_sent)
    
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer() 
    
    words_first = {}
    words_second = {}
  
    
    for i in range(len(tokens_first)):
        if tokens_first[i].text.lower() not in stop_words:
            words_first.update({i: lemmatizer.lemmatize(tokens_first[i].text.lower())})
    for i in range(len(tokens_second)):
        if tokens_second[i].text.lower() not in stop_words:
            words_second.update({i: lemmatizer.lemmatize(tokens_second[i].text.lower())})
        
    first_sent_reduced = ""
    second_sent_reduced = ""
    
    if len(words_second) == len(words_first):
        for index, word in words_first.items():
            if word in words_second.values():
                if word != words_second.get(index):
                    return False
    else:            

        for word in words_second.values():
            if word in words_first.values():
                second_sent_reduced += word

        for word in words_first.values():
            if word in words_second.values():
                first_sent_reduced += word

    
        return lemmatization_filter(first_sent_reduced) == lemmatization_filter(second_sent_reduced)

## Dafa filtering

### First category filtering

In [None]:
first_cat_pairs = []
first_cat_answers = []
for i in tqdm(range(len(pairsOfSentences))):
    if first_cat_filter(pairsOfSentences[i][0], pairsOfSentences[i][1]):
        first_cat_pairs.append(pairsOfSentences[i])
        first_cat_answers.append(whichOneIsWrong[i])

# Data init

In [32]:
# data[0], y[0], X[0] -> training set
# data[1], y[1], X[1] -> test set


data = [None, None]
y = [None, None]
X = [None, None]

data[0] = copy.deepcopy(pairsOfSentences)
y[0] = copy.deepcopy(whichOneIsWrong)
data[1] = copy.deepcopy(pairsOfSentences_test)
y[1] = copy.deepcopy(whichOneIsWrong_test)

# data[0] = copy.deepcopy(first_cat_pairs) 
# y[0] = copy.deepcopy(first_cat_answers)


X[0] = [list() for i in range(len(data[0]))]
X[1] = [list() for i in range(len(data[1]))]


#print(data)

In [78]:
# print(X[0][0])
# print(len(X[0]))
# print(X[1][0])
# print(len(X[1]))
# for i in range(100):
#     print(data[0][i][0])
#     print(data[0][i][1])
#     print("___________")

# Various functions

## Words similarity with spacy

In [6]:
def find_similarity_spacy_tokens(tokens_1, tokens_2):
    # spacy tokens are used as input
    sim = 0.0
    count = 0
    for tokenOuter in tokens_1:
        for tokenInner in tokens_2:
            sim += tokenOuter.similarity(tokenInner)
            count += 1

    if count != 0:
        return sim / count
    else:
        return 0
    
    #     if len(sim) != 0:
#         return statistics.avg(sim)
#     else:
#         return 0
    

## Reducing sentences (removing subj, obj, pred, subj+obj, subj+pred, pred+obj)

In [7]:
def make_reduced_sentences(tokens, predicates, subjects, objects, forbidden):
    without_sub = ""
    without_obj = ""
    without_pred = ""
    without_sub_obj = ""
    without_sub_pred = ""
    without_pred_obj = ""
    
    for token in tokens:
       # if token.text not in forbidden:
        if token.text not in subjects:
            without_sub += (token.text + " ")
            if token.text not in objects:
                without_sub_obj += (token.text + " ")
            if token.text not in predicates:
                without_sub_pred += (token.text + " ")        
        if token.text not in objects:
            without_obj += (token.text + " ")
            if token.text not in predicates:
                without_pred_obj += (token.text + " ")
        if token.text not in predicates:
            without_pred += (token.text + " ")
    return [without_sub, without_obj, without_pred, without_sub_obj, without_sub_pred, without_pred_obj]

## Find subjects, objects and predicates

In [8]:
def find_subj_obj_pred(spacy_tokens):
    
    SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
    OBJECTS = ["dobj", "dative", "attr", "oprd"]
    
    subjects = []
    objects = []
    predicates = []
    
    for token in spacy_tokens:
        if token.dep_ in SUBJECTS:
            subjects.append(token)
        elif token.dep_ in OBJECTS:
            objects.append(token)
        elif token.dep_ == "ROOT" or token.pos_ == "VERB":
            predicates.append(token)
    return subjects, objects, predicates

## Pronouns to nouns

In [9]:
def nounsToPronouns(sentence):
    listOfWords = sentence.split()
    for i in range(len(listOfWords)):
        if listOfWords[i] == "he" or listOfWords[i] == "him":
            listOfWords[i] = "man"
        if listOfWords[i] == "she" or listOfWords[i] == "her":
            listOfWords[i] = "woman"
        if listOfWords[i] == "they" or listOfWords[i] == "them":
            listOfWords[i] = "persons"
        if listOfWords[i] == "i" or listOfWords[i] == "me":
            listOfWords[i] = "person"
        if listOfWords[i] == "you":
            listOfWords[i] = "person"
    return ' '.join(listOfWords)

## Extract important parts of sentence as new sentence

In [10]:
def extract_important_as_sent(sentence):
    new_sentence = ""
    tokens = extract_important_as_tokens(sentence)
    for token in tokens:
        new_sentence += token.text + " "
    return new_sentence

## Extract important parts of sentence as spacy tokens

In [11]:
def extract_important_as_tokens(sentence):
    important_tokens = set()
    parts = nlp_spacy(sentence)
    for part in parts.noun_chunks:
        important_tokens.add(part.root)
        important_tokens.add(part.root.head)
    return important_tokens

## Similarity between words in sentence and hypernyms

In [None]:
def find_similarity_hypernyms(tokens_1, tokens_2):
    sent_sim = -1
                                    
    for tokenOuter in tokens_1:
        for tokenInner in tokens_2:
            if len(wordnet.synsets(tokenOuter)) == 0:
                break
            else:
                first_outer = (wordnet.synsets(tokenOuter))[0]
            if len(wordnet.synsets(tokenInner)) == 0:
                continue
            else:
                first_inner = (wordnet.synsets(tokenInner))[0]
           
            if first_outer.pos() == first_inner.pos() and tokenOuter != tokenInner: 
                for hypernym in first_inner.hypernyms(): 
                    sim = nlp_spacy(hypernym.name().split(".")[0]).similarity(nlp_spacy(first_outer.name().split(".")[0]))
                    #sim = hypernym.wup_similarity(first_outer)
                    if sim is not None:
                        if sim > sent_sim:
                            sent_sim = sim

    return sent_sim

In [None]:
print(find_similarity_hypernyms(lemmatization_filter("He drinks juice"),
                                ["juice"]))
print(find_similarity_hypernyms(lemmatization_filter("He drinks apple"),
                                ["apple"]))

# Sentences manipulation

## Changing pronouns to nouns and lowering all words

In [None]:
for x_set in range 
for i in tqdm(range(len(data))):
    data[x_set][i][0] = nounsToPronouns(data[x_set][i][0].lower())
    data[x_set][i][1] = nounsToPronouns(data[x_set][i][1].lower())

## Removing less important parts of sentences

In [67]:
 for x_set in range(2):
    for i in tqdm(range(len(data[x_set]))):
        data[x_set][i][0] = extract_important_as_sent(data[x_set][i][0])
        data[x_set][i][1] = extract_important_as_sent(data[x_set][i][1])
    
# for i in range(100):
#     print(data[i][0])
#     print(data[i][1])

100%|██████████| 10000/10000 [02:02<00:00, 81.37it/s]
100%|██████████| 2021/2021 [00:24<00:00, 81.13it/s]


# Vector building models

## POS tagging with relations (BAD)

In [None]:
for i in range(len(data)): 
    firstSentence = nlp(data[i][0])
    secondSentence = nlp(data[i][1])
    firstSentenceNumberOfRelations = 0
    firstSentenceRelationSum = 0
    secondSentenceNumberOfRelations = 0
    secondSentenceRelationSum = 0 
    for token in firstSentence:
        for child in token.children:
            firstSentenceNumberOfRelations += 1
            firstSentenceRelationSum += token.similarity(child)
    for token in secondSentence:
        for child in token.children:
            secondSentenceNumberOfRelations += 1
            secondSentenceRelationSum += token.similarity(child)
    firstSentenceAVG = firstSentenceRelationSum / firstSentenceNumberOfRelations
    secondSentenceAVG = secondSentenceRelationSum / secondSentenceNumberOfRelations
    X[i].append(firstSentenceAVG)
    X[i].append(secondSentenceAVG)

## Similarity of different words in between sentences with words in the same sentence - regular

In [48]:
for x_set in range(2):
    for i in tqdm(range(len(data[x_set]))):
        firstSentence = data[x_set][i][0]
        secondSentence = data[x_set][i][1]
        tokensFirst = nlp_spacy(firstSentence)
        tokensSecond = nlp_spacy(secondSentence)

        firstSentDiffs = [] # words which are not in a second sentence
        secondSentDiffs = [] # words which are not in a first sentence

        for token in tokensFirst:
            if token.text not in tokensSecond.text:
                firstSentDiffs.append(token)
        # print(firstSentDiffs)
        for token in tokensSecond:
            if token.text not in tokensFirst.text:
                secondSentDiffs.append(token)

        X[x_set][i].append(find_similarity_spacy_tokens(tokensFirst, firstSentDiffs))
        X[x_set][i].append(find_similarity_spacy_tokens(tokensSecond, secondSentDiffs))


100%|██████████| 10000/10000 [02:18<00:00, 72.02it/s]
100%|██████████| 2021/2021 [00:26<00:00, 75.52it/s]


## Similarity of different words in between sentences with words in the same sentence - lemmatization version

In [68]:
for x_set in range(2):
    for i in tqdm(range(len(data[x_set]))):
        firstSentence = lemmatization_filter(data[x_set][i][0])
        secondSentence = lemmatization_filter(data[x_set][i][1])
        tokensFirst = nlp_spacy(firstSentence)
        tokensSecond = nlp_spacy(secondSentence)

        firstSentDiffs = [] # words which are not in a second sentence
        secondSentDiffs = [] # words which are not in a first sentence

        for token in tokensFirst:
            if token.text not in tokensSecond.text:
                firstSentDiffs.append(token)
        # print(firstSentDiffs)
        for token in tokensSecond:
            if token.text not in tokensFirst.text:
                secondSentDiffs.append(token)

        X[x_set][i].append(find_similarity_spacy_tokens(tokensFirst, firstSentDiffs))
        X[x_set][i].append(find_similarity_spacy_tokens(tokensSecond, secondSentDiffs))


100%|██████████| 10000/10000 [01:46<00:00, 94.29it/s]
100%|██████████| 2021/2021 [00:21<00:00, 95.00it/s]


## Similarity of different words in between sentences with common words of two sentences - regular

In [70]:
for x_set in range(2):    
    for i in tqdm(range(len(data[x_set]))):
        firstSentence = data[x_set][i][0]
        secondSentence = data[x_set][i][1]
        tokensFirst = nlp_spacy(firstSentence)
        tokensSecond = nlp_spacy(secondSentence)
        firstSentDiffs = [] # words which are not in a second sentence
        secondSentDiffs = [] # words which are not in a first sentence
        CommonWords = [] # words which are common to sentences
        CommonTokens = [] # tokens which are common to sentences
        for token in tokensFirst:
            if token.text not in tokensSecond.text.split():
                firstSentDiffs.append(token)
            else:
                if token.text not in CommonWords:
                    CommonWords.append(token.text)
                    CommonTokens.append(token)
        #print(firstSentence)
        #print(firstSentDiffs)
        #print(CommonWords)
        #print(CommonTokens)
        for token in tokensSecond:
            if token.text not in tokensFirst.text.split():
                secondSentDiffs.append(token)
            else:
                if token.text not in CommonWords:
                    CommonWords.append(token.text)
                    CommonTokens.append(token)
        firstDiffSimil = 0 # similarity of words that are not found in a second sentence with common words of sentences
        secondDiffSimil = 0 # similarity of words that are not found in a first sentence with common words of sentences
        for tokenOuter in CommonTokens:
            for tokenInner in firstSentDiffs:
                firstDiffSimil += tokenOuter.similarity(tokenInner)
        for tokenOuter in CommonTokens:
            for tokenInner in secondSentDiffs:
                secondDiffSimil += tokenOuter.similarity(tokenInner)
        if len(firstSentDiffs) != 0:
            firstSentDiffAvg = firstDiffSimil / len(firstSentDiffs)
            X[x_set][i].append(firstSentDiffAvg)
        else:
            X[x_set][i].append(0)
        if len(secondSentDiffs) != 0:
            secondSentDiffAvg = secondDiffSimil / len(secondSentDiffs)
            X[x_set][i].append(secondSentDiffAvg)
        else:
            X[x_set][i].append(0)

100%|██████████| 10000/10000 [01:49<00:00, 91.03it/s]
100%|██████████| 2021/2021 [00:22<00:00, 90.68it/s]


## Similarity of different words in between sentences with common words of two sentences - lemmatization version

In [50]:
for x_set in range(2):    
    for i in tqdm(range(len(data[x_set]))):
        tokensFirst = nlp_spacy(lemmatization_filter(data[x_set][i][0]))
        tokensSecond = nlp_spacy(lemmatization_filter(data[x_set][i][1]))
        firstSentDiffs = [] # words which are not in a second sentence
        secondSentDiffs = [] # words which are not in a first sentence
        CommonWords = [] # words which are common to sentences
        CommonTokens = [] # tokens which are common to sentences
        for token in tokensFirst:
            if token.text not in tokensSecond.text.split():
                firstSentDiffs.append(token)
            else:
                if token.text not in CommonWords:
                    CommonWords.append(token.text)
                    CommonTokens.append(token)

        for token in tokensSecond:
            if token.text not in tokensFirst.text.split():
                secondSentDiffs.append(token)
            else:
                if token.text not in CommonWords:
                    CommonWords.append(token.text)
                    CommonTokens.append(token)
                    
        firstDiffSimil = 0 # similarity of words that are not found in a second sentence with common words of sentences
        secondDiffSimil = 0 # similarity of words that are not found in a first sentence with common words of sentences
        
        X[x_set][i].append(find_similarity_spacy_tokens(CommonTokens, firstSentDiffs))
        X[x_set][i].append(find_similarity_spacy_tokens(CommonTokens, secondSentDiffs))
       

100%|██████████| 10000/10000 [01:58<00:00, 84.11it/s]
100%|██████████| 2021/2021 [00:23<00:00, 86.65it/s]


## Similarity between predicate, subject and object

In [61]:
for x_set in range(2):
    for i in tqdm(range(len(data[x_set]))):
        firstSentence = data[x_set][i][0]
        secondSentence = data[x_set][i][1]

        tokensFirst = nlp_spacy(firstSentence)
        tokensSecond = nlp_spacy(secondSentence)

        firstSubjects, firstObjects, firstPredicates = find_subj_obj_pred(tokensFirst)
        secondSubjects, secondObjects, secondPredicates = find_subj_obj_pred(tokensSecond)

        # vectors for similarity and number of vectors per category       
        firstPS = find_similarity_spacy_tokens(firstPredicates, firstSubjects)
        firstPO = find_similarity_spacy_tokens(firstPredicates, firstObjects)
        firstSO = find_similarity_spacy_tokens(firstSubjects, firstObjects) 
        secondPS = find_similarity_spacy_tokens(secondPredicates, secondSubjects)
        secondPO = find_similarity_spacy_tokens(secondPredicates, secondObjects) 
        secondSO = find_similarity_spacy_tokens(secondSubjects, secondObjects)  

        X[x_set][i].append((firstPS + firstPO + firstSO) / 3)
        X[x_set][i].append((secondPS + secondPO + secondSO) / 3)  

100%|██████████| 10000/10000 [02:07<00:00, 78.52it/s]
100%|██████████| 2021/2021 [00:25<00:00, 78.32it/s]


## Similarity between predicate, subject and object - lemmatization version

In [56]:
for x_set in range(2):
    for i in tqdm(range(len(data[x_set]))):
        
        tokensFirst = nlp_spacy(lemmatization_filter(data[x_set][i][0]))
        tokensSecond = nlp_spacy(lemmatization_filter(data[x_set][i][1]))

        firstSubjects, firstObjects, firstPredicates = find_subj_obj_pred(tokensFirst)
        secondSubjects, secondObjects, secondPredicates = find_subj_obj_pred(tokensSecond)

        # vectors for similarity and number of vectors per category       
        firstPS = find_similarity_spacy_tokens(firstPredicates, firstSubjects)
        firstPO = find_similarity_spacy_tokens(firstPredicates, firstObjects)
        firstSO = find_similarity_spacy_tokens(firstSubjects, firstObjects) 
        secondPS = find_similarity_spacy_tokens(secondPredicates, secondSubjects)
        secondPO = find_similarity_spacy_tokens(secondPredicates, secondObjects) 
        secondSO = find_similarity_spacy_tokens(secondSubjects, secondObjects)  

        X[x_set][i].append((firstPS + firstPO + firstSO) / 3)
        X[x_set][i].append((secondPS + secondPO + secondSO) / 3)  

100%|██████████| 10000/10000 [01:57<00:00, 85.46it/s]
100%|██████████| 2021/2021 [00:23<00:00, 86.55it/s]


### Similarity between subject and objects(ivan:reused,rino) NOT BAD

In [None]:
for i in tqdm(range(len(pairsOfSentences))):
    firstSentence = pairsOfSentences[i][0]
    secondSentence = pairsOfSentences[i][1]
    tokensFirst = nlp_spacy(firstSentence)
    tokensSecond = nlp_spacy(secondSentence)
    
    firstSubjects = []
    firstObjects = []
    firstPredicates = []
    
    secondSubjects = []
    secondObjects = []
    secondPredicates = []
    
    # getting predicates, subjects and objects out of sentences
    for token in tokensFirst:
        if token.dep_ == "nsubj":
            firstSubjects.append(token)
        elif token.dep_ == "pobj" or token.dep_ == "nobj" or token.dep_ == "dobj":
            firstObjects.append(token)
        elif token.dep_ == "ROOT" or token.dep_ == "conj":
            firstPredicates.append(token)
    for token in tokensSecond:
        if token.dep_ == "nsubj":
            secondSubjects.append(token)
        elif token.dep_ == "pobj" or token.dep_ == "nobj" or token.dep_ == "dobj":
            secondObjects.append(token)
        elif token.dep_ == "ROOT" or token.dep_ == "conj":
            secondPredicates.append(token)
            
    # vectors for similarity and number of vectors per category       
    firstPO = 0 
    fPoNum = len(firstPredicates) * len(firstObjects)
    firstSO = 0 
    fSoNum = len(firstSubjects) * len(firstObjects)
    secondPO = 0 
    sPoNum = len(secondPredicates) * len(secondObjects)
    secondSO = 0
    sSoNum = len(secondSubjects) * len(secondObjects)
    
    
    for tokenP in firstPredicates:
        for tokenO in firstObjects:
            firstPO += (tokenP.similarity(tokenO))
    for tokenS in firstSubjects:
        for tokenO in firstObjects:
            firstSO += (tokenS.similarity(tokenO))
    
    for tokenP in secondPredicates:
        for tokenO in secondObjects:
            secondPO += (tokenP.similarity(tokenO))
    for tokenS in secondSubjects:
        for tokenO in secondObjects:
            secondSO +=(tokenS.similarity(tokenO))
    
    
    
    if (fPoNum + fSoNum) != 0:
        avg = (firstPO + firstSO) / (fPoNum + fSoNum)
        X[i].append(avg)
    else:
        X[i].append(0)
        
    
    if (sPoNum + sSoNum) != 0:
        avg = (secondPO + secondSO) / (sPoNum + sSoNum)
        X[i].append(avg)
    else:
        X[i].append(0)

### Similarity between predicate, subject and object (appended separately) (ivan:reused,rino) NOT BAD

In [None]:
for i in tqdm(range(len(pairsOfSentences))):
    firstSentence = pairsOfSentences[i][0]
    secondSentence = pairsOfSentences[i][1]
    tokensFirst = nlp_spacy(firstSentence)
    tokensSecond = nlp_spacy(secondSentence)
    
    firstSubjects = []
    firstObjects = []
    firstPredicates = []
    
    secondSubjects = []
    secondObjects = []
    secondPredicates = []
    
    # getting predicates, subjects and objects out of sentences
    for token in tokensFirst:
        if token.dep_ == "nsubj":
            firstSubjects.append(token)
        elif token.dep_ == "pobj" or token.dep_ == "nobj" or token.dep_ == "dobj":
            firstObjects.append(token)
        elif token.dep_ == "ROOT" or token.dep_ == "conj":
            firstPredicates.append(token)
    for token in tokensSecond:
        if token.dep_ == "nsubj":
            secondSubjects.append(token)
        elif token.dep_ == "pobj" or token.dep_ == "nobj" or token.dep_ == "dobj":
            secondObjects.append(token)
        elif token.dep_ == "ROOT" or token.dep_ == "conj":
            secondPredicates.append(token)
            
    # vectors for similarity and number of vectors per category       
    firstPS = 0 
    fPsNum = len(firstPredicates) * len(firstSubjects)
    firstPO = 0 
    fPoNum = len(firstPredicates) * len(firstObjects)
    firstSO = 0 
    fSoNum = len(firstSubjects) * len(firstObjects)
    secondPS = 0
    sPsNum = len(secondPredicates) * len(secondSubjects)
    secondPO = 0 
    sPoNum = len(secondPredicates) * len(secondObjects)
    secondSO = 0
    sSoNum = len(secondSubjects) * len(secondObjects)
    
    
    for tokenP in firstPredicates:
        for tokenS in firstSubjects:
            firstPS += (tokenP.similarity(tokenS))
    for tokenP in firstPredicates:
        for tokenO in firstObjects:
            firstPO += (tokenP.similarity(tokenO))
    for tokenS in firstSubjects:
        for tokenO in firstObjects:
            firstSO += (tokenS.similarity(tokenO))
    
    for tokenP in secondPredicates:
        for tokenS in secondSubjects:
            secondPS += (tokenP.similarity(tokenS))
    for tokenP in secondPredicates:
        for tokenO in secondObjects:
            secondPO += (tokenP.similarity(tokenO))
    for tokenS in secondSubjects:
        for tokenO in secondObjects:
            secondSO +=(tokenS.similarity(tokenO))
    
    #FIRST SENTENCE
    
    if fPsNum > 0:
        X[i].append(firstPS/fPsNum)
    else:
        X[i].append(0)
        
    if fPoNum > 0:
        X[i].append(firstPO/fPoNum)
    else:
        X[i].append(0)

    if fSoNum > 0:
        X[i].append(firstSO/fSoNum)
    else:
        X[i].append(0)
        
    #SECONDSENTENCE
    
    if sPsNum > 0:
        X[i].append(secondPS/sPsNum)
    else:
        X[i].append(0)
    
    if sPoNum > 0:
        X[i].append(secondPO/sPoNum)
    else:
        X[i].append(0)
    
    if sSoNum > 0:
        X[i].append(secondSO/sSoNum)
    else:
        X[i].append(0)
        

## Similarity of disctinct words and hypernyms


In [None]:
X = [list() for i in range(len(data))]

In [None]:
for i in tqdm(range(len(data))):
    firstSentence = data[i][0]
    secondSentence = data[i][1]
    
    tokensFirst = nlp_spacy(firstSentence)
    tokensSecond = nlp_spacy(secondSentence)
    
    firstSentDiffs = [] # words which are not in a second sentence
    secondSentDiffs = [] # words which are not in a first sentence
  
    for token in tokensFirst:
        if token.text not in tokensSecond.text:
            firstSentDiffs.append(token.text)

    for token in tokensSecond:
        if token.text not in tokensFirst.text:
            secondSentDiffs.append(token.text)
    
    X[i].append(find_similarity_hypernyms(lemmatization_filter(firstSentence), secondSentDiffs))
    X[i].append(find_similarity_hypernyms(lemmatization_filter(secondSentence), firstSentDiffs))
    

# Training and validation

## Manual cross validation with SVC(rbf) model and wrong predictions extraction

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

classifier = SVC(kernel = 'rbf', random_state = 53)


number_of_cross_validations = 10
X_parts = [list() for i in range (number_of_cross_validations)]
y_parts = [list() for i in range (number_of_cross_validations)]


# spliting data in number_of_cross_validations training sets and test sets
for i in range(len(X)):
    X_parts[i % number_of_cross_validations].append(X[i])
for i in range(len(X)):
    y_parts[i % number_of_cross_validations].append(y[i])


X_training_set = [list() for i in range (number_of_cross_validations)]
for i in range(number_of_cross_validations):
    for j in range(number_of_cross_validations):
        if j != i:
            X_training_set[i].extend(X_parts[j])
            
            
y_training_set = [list() for i in range (number_of_cross_validations)]
for i in range(number_of_cross_validations):
    for j in range(number_of_cross_validations):
        if j != i:
            y_training_set[i].extend(y_parts[j])

            

wrong_sentence_pairs = [list() for i in range (number_of_cross_validations)] # stores every pair that is wrongly predicted
wrong_sent_answers = [list() for i in range (number_of_cross_validations)] # which of 2 sentences is wrong


sc = StandardScaler()
test_sum = 0

# performing cross validation using linear regression
for i in range(number_of_cross_validations):
    
    X_train = sc.fit_transform(X_training_set[i])
    X_test = sc.transform(X_parts[i])
    
    classifier.fit(X_train, y_training_set[i])
    
    y_pred = classifier.predict(X_test)
    
    for j in range(len(y_pred)):
        if y_pred[j] != y_parts[i][j]:
            wrong_sentence_pairs[i].append(pairsOfSentences[i + 10 * j])
            wrong_sent_answers[i].append(y_parts[i][j])
    
    cm = confusion_matrix(y_parts[i], y_pred)
    true = cm[0][0]+cm[1][1]
    false = cm[0][1]+cm[1][0]
    
    test_sum += (true / (true + false))
    
    print("Accuracy in run number " + str(i + 1) + " is: " +  str(true / (true + false)))
    
print("Avg accuracy is: ", test_sum / number_of_cross_validations)

## Cross-validation

### Run models

In [96]:
print("SVM-linear: " + svc_linear(X[0], y[0], 0.1))

SVM-linear: Accuracy: 0.70 (+/- 0.08)


In [97]:
print("SVM-rbf: " + svc_rbf(X[0] + X[1], y[0] + y[1], 2))

SVM-rbf: Accuracy: 0.70 (+/- 0.09)


In [31]:
print("Logistic regression: " + logistic_regression(X, y, 1))

Logistic regression: Accuracy: 0.66 (+/- 0.09)


In [32]:
print("Random forest: " + random_forest(X, y))

Random forest: Accuracy: 0.65 (+/- 0.06)


In [None]:
print(X[0])

### SVM-linear (66%)

In [25]:
def svc_linear(X, y, c):
    clf = svm.SVC(kernel='linear', C=c)
    scores = cross_val_score(clf, X, y, cv=10)
    return("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

### Logistic regression (67%)

In [26]:
def logistic_regression(X, y, rs):
    scaler = StandardScaler()
    #X = scaler.fit_transform(X)
    classifier = LogisticRegression(random_state = rs, max_iter=2000)
    scores = cross_val_score(classifier, X, y, cv=10)
    return("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

### SVM-rbf (67%)

In [27]:
def svc_rbf(X, y, rs):
    classifier = SVC(kernel = 'rbf', random_state = rs)
    scores = cross_val_score(classifier, X, y, cv=10)
    return("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

### Random Forest

In [28]:
def random_forest(X, y):
    classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0)
    scores = cross_val_score(classifier, X, y, cv=10)
    return("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

## Manual train and test

In [66]:
X_moj = copy.deepcopy(X)
#X = copy.deepcopy(X_moj)
print(X[0][0])
print(X_moj[0][0])

[0.4629705523451169, 0.4936219722032547, 0.3935041083022952, 0.40340409108570646, 0.29969875141978264, 0.3670274652540684, 0.9438835829496384, 0.9514634708563486]
[0.4629705523451169, 0.4936219722032547, 0.3935041083022952, 0.40340409108570646, 0.29969875141978264, 0.3670274652540684, 0.9438835829496384, 0.9514634708563486]


In [71]:
#best -> 6.2 + 6.3 + 6.4 + 6.5 + 6. 6 + 6.3(reduced) -> 76.79 %

from sklearn.metrics import confusion_matrix
classifier1 = svm.SVC(kernel='linear', C=2)
classifier2 = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0)
classifier3 = SVC(kernel = 'rbf', random_state = 10)
classifier4 = LogisticRegression(random_state = 10, max_iter=2000)

classifier2.fit(X[0], y[0])
y_pred = classifier2.predict(X[1])

cm = confusion_matrix(y[1], y_pred)
true = cm[0][0]+cm[1][1]
false = cm[0][1]+cm[1][0]
print("Accuracy", true/(true + false)) 

Accuracy 0.7570509648688768
