In [1]:
# Install the third-party packages imported further down (run once per fresh runtime).
# NOTE(review): no versions are pinned, so a later re-run may resolve different releases.
!pip install sentence_transformers
!pip install transformers
# NOTE(review): `faiss` is requested here and again, together with `faiss-gpu`,
# on the next line — confirm whether both install commands are needed.
!pip install faiss
!python -m pip install --upgrade faiss faiss-gpu
!pip install p_tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 1.3 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 25.6 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 39.1 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 8.7 MB/s 
Collecting tokenizers!

In [2]:
# Mount Google Drive inside the Colab runtime so the project folder is reachable.
from google.colab import drive
drive.mount('/content/drive')
# Symlink the project folder into the working directory; with a single operand
# `ln -s` names the link after the target's last path component, i.e. ./tfm.
# The relative "tfm/..." paths used below rely on this link.
!ln -s "/content/drive/My Drive/Colab Notebooks/tfm"
import sys
# Add the project's checkpoints and outputs folders to the module search path.
sys.path.append('tfm/checkpoints/')
sys.path.append('tfm/outputs/')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import faiss
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
from p_tqdm import p_map
from transformers import AutoTokenizer

In [4]:
# Candidate locations of the dataset prefix (a suffix such as "_annotated.csv"
# is appended when the file is read).
FILEPATH_LOCAL = "../../data/outputs/dataset_2021_paragraph"
FILEPATH_GCLOUD = "https://storage.googleapis.com/tfm_aideas_datasets/dataset_2021_paragraph"  # Filepath GCloud
FILEPATH_COLAB = "tfm/outputs/dataset_2021_paragraph"  # Filepath COLAB

# Select the active location explicitly instead of overwriting the same
# variable three times (the first two assignments were dead stores).
filepath = FILEPATH_COLAB

In [5]:
# Load the annotated dataset: semicolon-separated, indexed by its second
# column, with the "date" column parsed as datetimes.
annotated_csv = f"{filepath}_annotated.csv"
df = pd.read_csv(
    annotated_csv,
    sep=";",
    index_col=1,
    parse_dates=["date"],
    low_memory=False,
)

In [6]:
# Keep only the rows in which every one of these tag columns is filled in.
required_tag_columns = [
    "finanzas_tag_news_s_n",
    "impacto_tag_news_s_n",
    "topic_tag_news",
    "finanzas_tag_s_n",
    "impacto_tag_s_n",
]
annotated = df.dropna(subset=required_tag_columns)

In [7]:
# Unique paragraph bodies, still keyed by the DataFrame index.
body_column = annotated["body"]
sentences = body_column.drop_duplicates()
sentences

pk_paragraph
1656289470276_43316_IAG_0000001769    El primer ejecutivo del grupo Qatar Airways, A...
1656289470276_43316_IAG_0000001770    “La evolución de la aplicación de la vacuna a ...
1656289470276_43316_IAG_0000001771    Qatar Airways, primer accionista de IAG con el...
1656289470276_43316_IAG_0000001772    De forma paralela, Catar está invirtiendo en e...
1656289470276_43316_IAG_0000001773    Durante la pandemia, Qatar Airways ha repatria...
                                                            ...                        
1656407554675_6_AMS_0000000024        El chico del periódico (Bambú): El periodista ...
1656407554675_6_AMS_0000000025        Taurus, salvar la tierra (Montena): El guionis...
1656407554675_6_AMS_0000000026        El quicio (Bruguera): Marina, la protagonista ...
1656407554675_6_AMS_0000000027        Escritos en la guerra (Kalandraka): Para lecto...
1656407554675_6_AMS_0000000028        Puedes seguir De mamas & de papas en Facebook,...
Name: body, Length:

In [8]:
# Pretrained sentence-embedding model used to encode every unique paragraph.
MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'

# No explicit device: SentenceTransformer then uses a GPU when one is available
# and falls back to CPU otherwise, so the cell no longer fails on a runtime
# without CUDA (the previous hard-coded device='cuda' did).
model = SentenceTransformer(MODEL_NAME)
embeddings = model.encode(sentences.tolist())

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/402 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [10]:
# Persist the computed embeddings so they can be reloaded without re-encoding.
np.save(
    file="tfm/outputs/embeddings_semantic_matching.npy",
    arr=embeddings,
)

In [None]:
# Reload the embeddings written by the save cell above (alternative to re-encoding).
embeddings = np.load(
    file="tfm/outputs/embeddings_semantic_matching.npy",
)

## FAISS - Fast similarity search (huge datasets)

In [11]:
# Embedding dimensionality, read from the data rather than hard-coded (was 768)
# so the index keeps matching if the embedding model changes.
d = embeddings.shape[1]
# Flat (exhaustive) inner-product index.
index = faiss.IndexFlatIP(d)
# L2-normalise the rows in place; on unit-length vectors the inner product
# returned by the index equals the cosine similarity.
faiss.normalize_L2(embeddings)
index.add(embeddings)

In [12]:
# Parallel version (one query per sentence, dispatched through p_map below):

neighbors = 50  # number of nearest neighbours requested per query
similarity_threshold = 0.7  # a neighbour is kept only if its score exceeds this
# (index label, sentence text) pairs, one per unique sentence.
# Series.items() replaces Series.iteritems(), which was deprecated in
# pandas 1.5 and removed in pandas 2.0.
i_query = list(sentences.items())

def calculate_cosine_df(i_query, query_embedding):
    """Collect the indexed sentences that are similar enough to one query.

    Relies on the notebook-level globals ``index``, ``sentences``, ``d``,
    ``neighbors`` and ``similarity_threshold``.

    Args:
        i_query: ``(label, text)`` pair for the query sentence; ``label`` is
            its entry in ``sentences.index``.
        query_embedding: Embedding of the query; it is reshaped to ``(1, d)``
            before being passed to ``index.search``.

    Returns:
        An empty dict when no neighbour passes the filter. Otherwise a dict
        with keys ``first_idx``, ``first_body`` (query label/text repeated
        once per match), ``second_idx``, ``second_body`` (labels/texts of the
        matches) and ``cosine`` (the scores returned by the index search).
    """
    query_label, query_text = i_query

    # The index is searched with a batch of one vector; take row 0 of each output.
    scores, positions = index.search(query_embedding.reshape(1, d), neighbors)
    scores = scores[0]
    positions = positions[0]
    neighbor_labels = sentences.index[positions]

    # Keep neighbours that are not the query itself and score above the threshold.
    keep = (neighbor_labels != query_label) & (scores > similarity_threshold)
    n_matches = keep.sum()
    if not n_matches > 0:
        return {}

    return {
        "first_idx": np.repeat(query_label, n_matches),
        "first_body": np.repeat(query_text, n_matches),
        "second_idx": neighbor_labels[keep],
        "second_body": sentences.iloc[positions[keep]],
        "cosine": scores[keep],
    }

# Run one similarity query per (sentence, embedding) pair.
results = p_map(calculate_cosine_df, i_query, embeddings)

# Flatten the per-query dicts into one column-oriented dict of lists.
result_columns = ("first_idx", "first_body", "second_idx", "second_body", "cosine")
converted_results = {column: [] for column in result_columns}
for res in results:
    if not res:
        # Queries without any neighbour above the threshold return an empty dict.
        continue
    for column in result_columns:
        converted_results[column].extend(res[column])

res_df = pd.DataFrame(converted_results)
res_df.to_csv("tfm/outputs/similarity_results.csv", sep=";")

  0%|          | 0/2157 [00:00<?, ?it/s]

In [14]:
# (rows, columns) of the full similarity table built from the query results.
res_df.shape

(9448, 5)

In [13]:
# Treat (a, b) and (b, a) as the same pair: sort the two bodies within each
# row, then drop every row whose sorted pair has already appeared.
body_pairs = res_df[['first_body', 'second_body']]
ordered_pairs = pd.DataFrame(np.sort(body_pairs, axis=1), index=res_df.index)
is_repeated_pair = ordered_pairs.duplicated()
df_no_duplicates = res_df[~is_repeated_pair]
df_no_duplicates

Unnamed: 0,first_idx,first_body,second_idx,second_body,cosine
0,1656289470276_43316_IAG_0000001770,“La evolución de la aplicación de la vacuna a ...,1656323453375_6_BBVA_0000000379,"Carlos Ocaña, director general de Funcas, ha s...",0.825646
1,1656289470276_43316_IAG_0000001770,“La evolución de la aplicación de la vacuna a ...,1656346595982_65884_ANA_0000000013,"Con este acuerdo, informaba la pasada semana l...",0.808709
2,1656289470276_43316_IAG_0000001770,“La evolución de la aplicación de la vacuna a ...,1656346595982_65884_ANA_0000000019,El consultor de Estrategia Digital y profesor ...,0.775043
3,1656289470276_43316_IAG_0000001770,“La evolución de la aplicación de la vacuna a ...,1656346595982_65884_ANA_0000000016,“Nuestro único interés es colaborar en el avan...,0.762434
4,1656289470276_43316_IAG_0000001770,“La evolución de la aplicación de la vacuna a ...,1656346595982_65884_ANA_0000000011,"A partir de este miércoles, los madrileños y m...",0.740546
...,...,...,...,...,...
9440,1656406912310_9_CABK_0000000739,"CÓRDOBA, 28 Dic. (EUROPA PRESS) -",1656326995744_6_REE_0000000488,"MADRID, 15 Abr. (EUROPA PRESS) -",0.745921
9441,1656406912310_9_CABK_0000000739,"CÓRDOBA, 28 Dic. (EUROPA PRESS) -",1656330974344_22020_SAN_0000000121,"MADRID, 26 Mar. (EUROPA PRESS) -",0.744082
9442,1656406912310_9_CABK_0000000739,"CÓRDOBA, 28 Dic. (EUROPA PRESS) -",1656358324332_65884_ANA_0000001394,"MADRID, 22 Jul. (EUROPA PRESS) -",0.744027
9443,1656406912310_9_CABK_0000000739,"CÓRDOBA, 28 Dic. (EUROPA PRESS) -",1656361500829_65884_SAN_0000000544,"MADRID, 26 Jul. (EUROPA PRESS) -",0.740852


In [15]:
# (rows, columns) left after removing mirrored sentence pairs.
df_no_duplicates.shape

(5321, 5)

In [18]:
# Most similar pairs first; rows containing missing values are dropped after sorting.
df_sorted = (
    df_no_duplicates
    .sort_values(by='cosine', ascending=False)
    .dropna()
)
df_sorted

Unnamed: 0,first_idx,first_body,second_idx,second_body,cosine
4032,1656341099276_9_ANA_0000000216,hidrógeno,1656341099276_9_ANA_0000000220,de hidrógeno,0.989330
8173,1656389914689_66916_BKT_0000000674,"MADRID, 27 Oct. (EUROPA PRESS) -",1656391719044_66916_ELE_0000002360,"MADRID, 29 Oct. (EUROPA PRESS) -",0.987717
5849,1656349825393_65884_ACS_0000000777,"MADRID, 12 Jul. (EUROPA PRESS) -",1656350610025_65884_BBVA_0000001587,"MADRID, 13 Jul. (EUROPA PRESS) -",0.984713
7666,1656374233353_66916_SAB_0000001208,"MADRID, 8 Oct. (EUROPA PRESS) -",1656377019094_66916_ELE_0000000496,"MADRID, 12 Oct. (EUROPA PRESS) -",0.983863
6211,1656358324332_65884_ANA_0000001394,"MADRID, 22 Jul. (EUROPA PRESS) -",1656362413790_65884_CLNX_0000000857,"MADRID, 27 Jul. (EUROPA PRESS) -",0.983143
...,...,...,...,...,...
4195,1656341099276_9_ANA_0000000232,En el llamado Corredor Vasco del Hidrógeno hab...,1656341920409_6_IBE_0000000361,"Porque a pesar del camino recorrido, desde el ...",0.700087
6579,1656365944780_65884_ITX_0000001562,"Pero es en el sector energético donde, por el ...",1656365944780_65884_ITX_0000001563,"Tras un 2020 de relativa calma inversora, marc...",0.700077
2962,1656332653487_9_MEL_0000000774,"Mientras llegan las ayudas, el sector tiene cl...",1656373661536_6_AENA_0000000225,"En Renta 4 opinan que, aunque las cotizaciones...",0.700077
3005,1656332653487_9_MEL_0000000789,Ya hay centros logísticos en España donde los ...,1656332653487_9_MEL_0000000791,"Tras este proyecto piloto, el ITH extrajo vari...",0.700066


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


In [26]:
# Display the sorted similarity pairs again.
df_sorted

Unnamed: 0,first_idx,first_body,second_idx,second_body,cosine
4032,1656341099276_9_ANA_0000000216,hidrógeno,1656341099276_9_ANA_0000000220,de hidrógeno,0.989330
8173,1656389914689_66916_BKT_0000000674,"MADRID, 27 Oct. (EUROPA PRESS) -",1656391719044_66916_ELE_0000002360,"MADRID, 29 Oct. (EUROPA PRESS) -",0.987717
5849,1656349825393_65884_ACS_0000000777,"MADRID, 12 Jul. (EUROPA PRESS) -",1656350610025_65884_BBVA_0000001587,"MADRID, 13 Jul. (EUROPA PRESS) -",0.984713
7666,1656374233353_66916_SAB_0000001208,"MADRID, 8 Oct. (EUROPA PRESS) -",1656377019094_66916_ELE_0000000496,"MADRID, 12 Oct. (EUROPA PRESS) -",0.983863
6211,1656358324332_65884_ANA_0000001394,"MADRID, 22 Jul. (EUROPA PRESS) -",1656362413790_65884_CLNX_0000000857,"MADRID, 27 Jul. (EUROPA PRESS) -",0.983143
...,...,...,...,...,...
4195,1656341099276_9_ANA_0000000232,En el llamado Corredor Vasco del Hidrógeno hab...,1656341920409_6_IBE_0000000361,"Porque a pesar del camino recorrido, desde el ...",0.700087
6579,1656365944780_65884_ITX_0000001562,"Pero es en el sector energético donde, por el ...",1656365944780_65884_ITX_0000001563,"Tras un 2020 de relativa calma inversora, marc...",0.700077
2962,1656332653487_9_MEL_0000000774,"Mientras llegan las ayudas, el sector tiene cl...",1656373661536_6_AENA_0000000225,"En Renta 4 opinan que, aunque las cotizaciones...",0.700077
3005,1656332653487_9_MEL_0000000789,Ya hay centros logísticos en España donde los ...,1656332653487_9_MEL_0000000791,"Tras este proyecto piloto, el ITH extrajo vari...",0.700066
