In [0]:
# Install the package that provides SentenceTransformer (imported further down).
# The version is not pinned; the recorded output below shows release 0.2.6.1
# being downloaded when this notebook was last run.
!pip install sentence_transformers

Collecting sentence_transformers
[?25l  Downloading https://files.pythonhosted.org/packages/b9/46/b7d6c37d92d1bd65319220beabe4df845434930e3f30e42d3cfaecb74dc4/sentence-transformers-0.2.6.1.tar.gz (55kB)
[K     |████████████████████████████████| 61kB 4.1MB/s 
[?25hCollecting transformers>=2.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/12/b5/ac41e3e95205ebf53439e4dd087c58e9fd371fd8e3724f2b9b4cdb8282e5/transformers-2.10.0-py3-none-any.whl (660kB)
[K     |████████████████████████████████| 665kB 12.9MB/s 
Collecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 23.7MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86

In [0]:
import numpy as np
from sklearn import metrics
import pandas as pd
import matplotlib.pyplot as plt
import scipy.spatial
from nltk.metrics import precision, recall, f_measure
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer


In [0]:
# Mount Google Drive at /content/drive; the CSV paths used later in this
# notebook all live under '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


### Preparation

In [0]:
def load_data(path):
    """Read a header-less CSV file and report how many rows it contains.

    Parameters
    ----------
    path : str or file-like
        Location of the CSV, handed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table. Because the file is read with ``header=None`` the
        columns are labelled with integers (0, 1, 2, ...).
    """
    frame = pd.read_csv(path, header=None)
    row_count = len(frame)
    print(row_count)  # quick sanity check shown in the cell output
    return frame

In [0]:
def normalize(data):
    """Lower-case, strip punctuation from and trim the text in column 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose column labelled ``0`` holds the raw sentences as strings.

    Returns
    -------
    pandas.Series
        One cleaned string per input row, in the original order.
    """
    sentences = (data[0].str.lower()  # lowercase
                        # regex=True is required: pandas >= 2.0 treats the
                        # pattern as a literal string by default, which would
                        # leave all punctuation in place.
                        .str.replace(r'[^\w\s]+', '', regex=True)  # remove punctuation
                        .str.strip())  # remove leading/trailing whitespace
    return sentences

In [0]:
def get_embedd(data1, method = "CountVectorizer"):
    """Turn a collection of sentences into one embedding vector per sentence.

    Parameters
    ----------
    data1 : sequence of str
        The sentences to embed.
    method : str, optional
        Which encoder to use:

        * ``"use"``      - Universal Sentence Encoder v4 loaded from TF-Hub.
        * ``"sbert"``    - SentenceTransformer 'bert-large-nli-stsb-mean-tokens'.
        * ``"sroberta"`` - SentenceTransformer 'roberta-large-nli-stsb-mean-tokens'.
        * anything else (including the default) - a ``CountVectorizer``
          fitted on ``data1``, returned as a dense array.

    Returns
    -------
    tuple
        ``(X1, X1)`` - the same embedding object twice, so callers can unpack
        it into a "corpus" and a "query" variable.
    """
    if method == "use":
        # Imported here so this branch does not depend on some other notebook
        # cell having imported tensorflow_hub before the function is called.
        import tensorflow_hub as hub
        embedder = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
        X1 = embedder(data1)
    elif method == "sbert":
        embedder = SentenceTransformer('bert-large-nli-stsb-mean-tokens')
        X1 = embedder.encode(data1)
    elif method == "sroberta":
        embedder = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')
        X1 = embedder.encode(data1)
    else:
        # Fallback for the default and for any unrecognised method name.
        vectorizer = CountVectorizer()
        X1 = vectorizer.fit_transform(data1).toarray()
    return X1, X1
    

In [0]:
def find_candidates(X1, X2, closest_n=40, threshold=0.8):
    """Return the indices of rows in ``X1`` that are close to some query in ``X2``.

    Every row of ``X2`` is used as a query. Its cosine distance to every row
    of ``X1`` is computed, the ``closest_n`` nearest rows are inspected, and a
    row index is kept when its cosine similarity (``1 - distance``) is above
    ``threshold`` and it is not at the same position as the query itself.

    Parameters
    ----------
    X1 : array-like, shape (n_rows, dim)
        Embeddings that are searched.
    X2 : array-like, shape (n_queries, dim)
        Query embeddings; position ``i`` in ``X2`` is treated as the same
        sentence as position ``i`` in ``X1`` and is never matched to itself.
    closest_n : int, optional
        How many nearest rows to inspect per query (default 40).
    threshold : float, optional
        Minimum cosine similarity, exclusive (default 0.8).

    Returns
    -------
    list of int
        Unique matching row indices in ascending order. The list is also
        printed.
    """
    my_data = set()
    # Iterate over the query embeddings themselves. (Zipping them with X1[0]
    # would cap the number of queries at the embedding dimension.)
    for num_sentence, query_embedd in enumerate(X2):
        distances = scipy.spatial.distance.cdist([query_embedd], X1, "cosine")[0]

        # A stable argsort keeps index order among ties and pushes NaN
        # distances (e.g. from all-zero vectors) to the end.
        nearest = np.argsort(distances, kind="stable")[:closest_n]

        for idx in nearest:
            if idx != num_sentence and (1 - distances[idx]) > threshold:
                my_data.add(int(idx))

    my_data = sorted(my_data)
    print(my_data)
    return my_data


In [0]:
def get_result(path_true, candidates):
    """Print precision, recall and F-measure of ``candidates`` against a truth file.

    Parameters
    ----------
    path_true : str
        CSV file (no header row) read with ``load_data``; the values in its
        column labelled ``4`` form the reference set.
    candidates : iterable
        Predicted items; duplicates are ignored.

    Returns
    -------
    None
        The three scores are printed, one per line, with three decimals.
        A score that NLTK cannot compute (it returns ``None``, e.g. when
        ``candidates`` is empty) is printed as "undefined" rather than
        raising a ``TypeError`` in the number formatting.
    """
    true = load_data(path_true)
    candidates = set(candidates)
    true_set = set(true[4])
    scores = (
        ("Precision", precision(true_set, candidates)),
        ("Recall", recall(true_set, candidates)),
        ("F-measure", f_measure(true_set, candidates)),
    )
    for label, score in scores:
        if score is None:
            print("%s: undefined" % label)
        else:
            print("%s: %0.3f" % (label, score))

### Load data


In [1]:
# Read the input CSV from the mounted Drive; load_data prints its row count.
data1 = load_data('/content/drive/My Drive/Step1/slf.csv')

133


In [0]:
# NOTE: this rebinds data1 from the loaded DataFrame to the cleaned column-0
# Series, so the cell is not idempotent - re-run the load cell before
# re-running this one.
data1 = normalize(data1)

### CountVectorizer

In [0]:
# Default method: CountVectorizer term counts. get_embedd returns the same
# matrix twice, so X1 and X2 refer to the same embeddings.
X1, X2 = get_embedd(data1)

In [0]:
# Row indices whose cosine similarity to a different row exceeds 0.8.
candidates = find_candidates(X1,X2)

[9, 15, 19, 20, 27, 29, 34, 35, 36, 37, 38, 39, 40, 41, 43, 44, 46, 47, 49, 54, 55, 59, 60, 62, 63, 64, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 99, 102, 110, 111, 112, 113, 118, 119, 120, 127, 129, 130, 131, 136, 137, 138, 139, 140, 141, 142, 143, 146, 147, 149, 151, 152, 154, 155, 161, 162, 163, 164, 166, 168, 169, 180, 181, 182, 183, 187, 188, 190, 192, 196, 199, 202, 218, 221, 232, 234, 243, 244, 245, 246, 249, 250, 259, 260, 261, 269, 270, 271, 272, 274, 275, 276, 277, 278, 279, 280, 281, 283, 293, 294, 295, 296, 297, 298, 300, 301, 302, 304, 306, 308, 309, 310, 311, 314, 315, 317, 318, 319, 320, 321, 322, 323, 329, 336, 340, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 369, 370, 371, 372, 377, 378, 380, 396, 397, 398, 410, 412, 415, 416, 417, 420, 433, 434, 435]


In [0]:
# Ground-truth CSV; get_result compares its column 4 with the candidates.
# `path` is reused by the evaluation cells of the later sections.
path = '/content/drive/My Drive/Step1/slf_true.csv'
get_result(path, candidates)

133
Precision: 0.810
Recall: 0.992
F-measure: 0.892


### SBERT (Sentence-BERT)

In [0]:
# Embed with the SentenceTransformer model 'bert-large-nli-stsb-mean-tokens'.
X1, X2 = get_embedd(data1, "sbert")

In [0]:
# Same candidate search as before, now on the SBERT embeddings.
candidates = find_candidates(X1,X2)

[0, 1, 3, 5, 6, 8, 9, 12, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 78, 80, 81, 84, 85, 86, 87, 89, 90, 91, 93, 104, 107, 109, 110, 111, 112, 113, 115, 116, 121, 124, 127, 128, 129, 130, 132, 134, 136, 140, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 177, 179, 182, 186, 188, 192, 193, 194, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 208, 211, 212, 213, 214, 215, 220, 221, 222, 223, 224, 225, 226]


In [0]:
# Score the SBERT candidates; `path` was set in an earlier evaluation cell.
get_result(path, candidates)

133
Precision: 0.761
Recall: 0.910
F-measure: 0.829


### SRoBERTa (Sentence-RoBERTa)

In [0]:
# Embed with the SentenceTransformer model 'roberta-large-nli-stsb-mean-tokens'.
X1, X2 = get_embedd(data1, "sroberta")

100%|██████████| 1.31G/1.31G [00:22<00:00, 59.3MB/s]


In [0]:
# Same candidate search as before, now on the SRoBERTa embeddings.
candidates = find_candidates(X1,X2)

[0, 3, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 78, 80, 81, 84, 85, 87, 88, 90, 91, 93, 107, 109, 111, 113, 115, 116, 121, 124, 127, 132, 134, 136, 137, 138, 139, 140, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 177, 179, 182, 186, 188, 192, 193, 194, 197, 199, 200, 202, 203, 204, 205, 206, 207, 208, 211, 212, 213, 214, 215, 217, 219, 220, 221, 222, 223, 224, 226]


In [0]:
# Score the SRoBERTa candidates; `path` was set in an earlier evaluation cell.
get_result(path, candidates)

133
Precision: 0.825
Recall: 0.955
F-measure: 0.885


### USE (Universal Sentence Encoder)

In [0]:
import tensorflow_hub as hub
import tensorflow as tf
import numpy as np

In [0]:
# Embed with the Universal Sentence Encoder v4 loaded from TF-Hub.
X1, X2 = get_embedd(data1, "use")

INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.
INFO:absl:Downloaded https://tfhub.dev/google/universal-sentence-encoder/4, Total size: 987.47MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.


In [0]:
# Same candidate search as before, now on the USE embeddings.
candidates = find_candidates(X1,X2)

[0, 2, 3, 5, 8, 9, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 79, 81, 84, 85, 86, 87, 90, 91, 93, 107, 109, 111, 113, 115, 116, 118, 119, 120, 121, 124, 127, 130, 132, 134, 136, 138, 139, 140, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 177, 179, 182, 186, 188, 192, 193, 194, 195, 197, 199, 200, 201, 202, 203, 204, 205, 206, 208, 211, 212, 213, 214, 215, 217, 220, 221, 222, 223, 224, 225, 226]


In [0]:
# Score the USE candidates; `path` was set in an earlier evaluation cell.
get_result(path, candidates)

133
Precision: 0.816
Recall: 0.970
F-measure: 0.887
