# Connect TPU / GPU

In [None]:
import os
import tensorflow as tf

# Choose the tf.distribute strategy from the accelerator attached to this runtime.
if os.environ.get('COLAB_TPU_ADDR'):
  # A non-empty COLAB_TPU_ADDR is taken as "a TPU is attached": connect to the
  # cluster and initialise the TPU system before building the strategy.
  cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
  tf.config.experimental_connect_to_cluster(cluster_resolver)
  tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
  strategy = tf.distribute.TPUStrategy(cluster_resolver)
  print('Using TPU')
elif tf.config.list_physical_devices('GPU'):
  # No TPU, but at least one GPU is visible to TensorFlow.
  strategy = tf.distribute.MirroredStrategy()
  print('Using GPU')
else:
  # Neither accelerator is available: stop here instead of running on CPU.
  raise ValueError('Running on CPU is not recommended.')

Using TPU


# Libraries

In [None]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 5.2 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 47.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 43.7 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


In [None]:
! pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 2.7 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 29.8 MB/s 
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=840292130f8882f5e6bae65623ad7f79e5d8c1cccdd3ce3c5015a45f3e7f98df
  Stored in directory: /root/.cache/pip/wheels/bf/06/fb/d59c1e5bd1dac7f6cf61ec0036cc3a10ab8fecaa6b2c3d3ee9
Successfully built sentence-transformers
Installing collected packages: sentencepiece, sentence-transformers
Successfully installed sentenc

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import random
import json, pickle

In [None]:
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras import backend as K
from keras import models, layers, callbacks, metrics
from transformers import AutoTokenizer, AutoModel, TFBertModel

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Constants

In [None]:
base_path = '/content/drive/MyDrive/Università/5/Stage/tweets-analysis/'

In [None]:
stopwords_list = set(stopwords.words('italian'))

In [None]:
KEYWORDS = ['febbre', 'brividi', 'dolori alle ossa', 'dolori muscolari', 'malessere generale', 
            'mal di testa', 'mal di gola', 'raffreddore', 'tosse', 'congiuntivite']

In [None]:
model_name = "dbmdz/bert-base-italian-xxl-uncased"
# model_name = "GroNLP/gpt2-medium-italian-embeddings"
# model_name = "m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0"
# model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# model_name = 'Jiva/xlm-roberta-large-it-mnli'

# Import files

In [None]:
# with open(base_path+'tweets/covid/covid_contents.json') as fp:
#   data = json.load(fp)

In [None]:
! mkdir files
! cp "drive/MyDrive/Università/5/Stage/tweets-analysis/TweetsUtils.py" .

from TweetsUtils import *

tmp_tweets = read_file(base_path + 'tweets/flu_tweets/' + 'flu_tweets_contents.json')

# data = select_fields(tmp_tweets, ['text'], as_list=True, unique=True)
# print(len(data))

# data = data[:20000]

# Clean

In [None]:
def clean_tweet_lite(text):
  """Normalise a raw tweet into lowercase, link-free, single-spaced text.

  Steps, in order: lowercase; strip URLs; drop a leading retweet marker
  ('rt '); decode HTML markup and entities; map square brackets to round
  ones; replace every @mention with the placeholder '@user'; remove the
  space before isolated punctuation marks; collapse runs of whitespace.

  Args:
    text: raw tweet text (str).

  Returns:
    The cleaned tweet (str).
  """
  # lowercase
  text = text.lower()

  # remove links
  regex = r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'
  text = re.sub(regex, '', text)

  # fix start of tweet
  if text.startswith('rt '):
    text = text[3:]

  # replace html chars
  # The parser is named explicitly: without it bs4 picks whichever parser is
  # installed (and warns), so the cleaned text could differ between machines.
  text = BeautifulSoup(text, 'html.parser').get_text()
  text = text.replace('\x92', '\'') # quotes
  # Entities that survive get_text() (e.g. double-encoded ones) are decoded here.
  text = text.replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&').replace('&#39;', '\'').replace('&#039;', '\'')
  text = text.replace('[', '(').replace(']', ')') # brackets

  # replace users mentions
  text = ' '.join(re.sub("(@[A-Za-z0-9_]+)", "@user", text).split())

  # fix punctuation
  text = text.replace(' . ', '. ').replace(' , ', ', ').replace(' : ', ': ').replace(' ; ', '; ').replace(' ? ', '? ').replace(' ! ', '! ')

  # fix whitespaces
  text = ' '.join(text.split())

  return text

In [None]:
# One row per tweet, indexed by its 'datetime' field, keeping only the raw text.
df = pd.DataFrame(tmp_tweets).set_index('datetime')[['text']]

In [None]:
# Cap the working set at SAMPLE_SIZE tweets. The fixed seed makes the sample,
# and everything derived from it, reproducible across runs. min() avoids the
# ValueError DataFrame.sample raises when asked for more rows than exist.
SAMPLE_SIZE = 100000
SAMPLE_SEED = 42
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)

In [None]:
# Cleaned version of every tweet, stored next to the raw text.
df['clean'] = df['text'].apply(clean_tweet_lite)

In [None]:
# Drop rows with missing values and repeated cleaned texts, keep tweets whose
# cleaned text is at most 100 characters, then renumber the rows from 0.
df = (
    df.dropna()
      .drop_duplicates('clean')
      .loc[lambda frame: frame.clean.str.len() <= 100]
      .reset_index(drop=True)
)

In [None]:
# Parallel lists in row order: the raw tweets and their cleaned counterparts.
original_t = df['text'].tolist()
t = df['clean'].tolist()

# PROVA - con bert (words)

In [None]:
from transformers import AutoTokenizer, AutoModel, BertModel, BertTokenizer
from sentence_transformers import util
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from scipy.spatial.distance import cosine
import torch

In [None]:
def get_tweets_by_keyword(tweets, keyword):
  """Return the tweets that contain `keyword` as a standalone term.

  An occurrence counts when the characters immediately before and after it
  (if any) are not alphabetic according to str.isalpha, so 'tosse' matches
  'tosse,' but not 'tossetta'. Every occurrence inside a tweet is examined:
  the tweet is kept as soon as one of them is standalone, even when an
  earlier occurrence is embedded in a longer word.

  Args:
    tweets: iterable of tweet strings.
    keyword: term to look for (case-sensitive; may contain spaces).

  Returns:
    List of the matching tweets, in input order.
  """
  def _has_standalone_occurrence(tweet):
    # Walk through all occurrences: checking only the first one would reject
    # tweets such as 'tossetta e poi tosse'.
    start = tweet.find(keyword)
    while start != -1:
      end = start + len(keyword)
      letter_before = start > 0 and tweet[start - 1].isalpha()
      letter_after = end < len(tweet) and tweet[end].isalpha()
      if not (letter_before or letter_after):
        return True
      start = tweet.find(keyword, start + 1)
    return False

  return [tweet for tweet in tweets if _has_standalone_occurrence(tweet)]

In [None]:
tmp = {keyword: get_tweets_by_keyword(t, keyword) for keyword in KEYWORDS}

{k: len(v) for k,v in tmp.items()}

{'febbre': 12068,
 'brividi': 21234,
 'dolori alle ossa': 11,
 'dolori muscolari': 105,
 'malessere generale': 13,
 'mal di testa': 10394,
 'mal di gola': 2371,
 'raffreddore': 4464,
 'tosse': 2511,
 'congiuntivite': 228}

In [None]:
# BertTokenizer / BertModel are the PyTorch classes (the model is put in eval
# mode and later called under torch.no_grad), so they are created outside
# `strategy.scope()`: a tf.distribute scope only governs TensorFlow variables.

# tokenizer
tokenizer = BertTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# load model from HuggingFace; encode_tweet reads the per-layer hidden states,
# hence output_hidden_states=True
model = BertModel.from_pretrained(model_name, output_hidden_states=True)
model.eval();  # inference mode; the trailing ';' keeps the module repr out of the cell output

Some weights of the model checkpoint at dbmdz/bert-base-italian-xxl-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def tokenize_tweet(tweet):
  """Encode a single tweet with the module-level `tokenizer`.

  The tweet is submitted as a batch of one and padded / truncated to 128
  tokens.

  Returns:
    (input_ids, attention_masks): torch tensors built from the encoder's
    'input_ids' and 'attention_mask' entries.
  """
  batch = tokenizer.batch_encode_plus(
      [tweet],
      max_length=128,
      padding='max_length',
      truncation=True,
  )
  return torch.tensor(batch['input_ids']), torch.tensor(batch['attention_mask'])


def encode_tweet(input_ids, attention_masks, last_n_layers=1):
  """Return one contextual vector per token position of a single encoded tweet.

  Runs the module-level `model` without gradients and, for every token
  position, sums that token's hidden states over the last `last_n_layers`
  layers.

  Args:
    input_ids: token-id tensor as returned by tokenize_tweet (batch of one).
    attention_masks: matching attention-mask tensor.
    last_n_layers: how many of the final hidden-state layers to sum. The
      default of 1 uses the last layer only.

  Returns:
    List with one 1-D tensor per token position (padding positions included).

  Raises:
    ValueError: if last_n_layers is smaller than 1.
  """
  if last_n_layers < 1:
    # token[-0:] would silently select every layer, so reject it up front.
    raise ValueError('last_n_layers must be >= 1')

  with torch.no_grad():
      model_output = model(input_ids, attention_masks)

  # Element 2 of the output is the tuple of per-layer hidden states
  # (the model is loaded with output_hidden_states=True).
  hidden_states = model_output[2]

  # Stack the layers on a new leading axis, drop axis 1 (the batch of one),
  # then put the token axis first so iterating yields one token at a time.
  token_embeddings = torch.stack(hidden_states, dim=0)
  token_embeddings = torch.squeeze(token_embeddings, dim=1)
  token_embeddings = token_embeddings.permute(1, 0, 2)

  # For each token, sum its vectors from the last `last_n_layers` layers.
  return [torch.sum(token[-last_n_layers:], dim=0) for token in token_embeddings]


def tensor2array(t):
  """Convert a tensor to a NumPy array on the CPU and return its first row."""
  as_numpy = t.detach().cpu().numpy()
  return as_numpy[0]


def word2token(word):
  """Return the token id found at position 1 of the encoded `word`.

  Position 0 is skipped — presumably the tokenizer's leading special token;
  confirm against the tokenizer in use.
  NOTE(review): only a single id is returned, so a word that the tokenizer
  splits into several pieces is represented by its first piece alone.
  """
  input_ids, _ = tokenize_tweet(word)
  return tensor2array(input_ids)[1]


def get_word_vector(tweet, word_token):
  """Return the contextual vector of `word_token` inside `tweet`.

  The tweet is stripped of emojis, tokenized and encoded; the vector returned
  is the one at the first position whose token id equals `word_token`.

  Raises:
    ValueError: if `word_token` does not occur among the tweet's token ids.
  """
  ids, masks = tokenize_tweet(remove_emojis(tweet))
  token_vectors = encode_tweet(ids, masks)
  position = tensor2array(ids).tolist().index(word_token)
  return token_vectors[position]


def remove_emojis(tweet):
    """Delete every character of `tweet` that falls in the ranges listed below.

    The ranges target emoji and pictographs, but they are deliberately kept as
    they were and several are very wide.
    NOTE(review): U+24C2-U+1F251 and U+10000-U+10FFFF also cover non-emoji
    characters (e.g. CJK ideographs), which are removed as well.
    """
    code_point_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # regional-indicator letters (flags)
        "\U00002500-\U00002BEF",  # box drawing through miscellaneous symbols and arrows
        "\U00002702-\U000027B0",  # dingbats
        "\U00002702-\U000027B0",  # (repeated in the original pattern)
        "\U000024C2-\U0001F251",  # wide catch-all, see NOTE above
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",  # all supplementary planes
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",  # zero-width joiner
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",  # variation selector-16
        "\u3030",
    )
    pattern = re.compile("[" + "".join(code_point_ranges) + "]+", re.UNICODE)
    return pattern.sub('', tweet)

In [None]:
k = 'tosse'

word_token = word2token(k)
corpus = tmp[k]

vectors = [get_word_vector(tweet, word_token) for tweet in corpus]

In [None]:
# keyword vectors
tweet = 'ho febbre, tosse, raffreddore, brividi, mal di gola, mal di testa, influenza, congiuntivite'
keyword_vector = get_word_vector(tweet, word_token)

In [None]:
sim = [1 - cosine(keyword_vector, v) for v in vectors]
sim_df = pd.DataFrame(zip(corpus, sim), columns=['tweet', 'sim'])
sim_df.to_csv(base_path+k+'_similarity.csv', index=False)

In [None]:
sim_df.sort_values(by='sim', ascending=False)

Unnamed: 0,tweet,sim
1326,"beh, ho mal di testa, tosse, raffreddore, e fe...",0.961765
2337,"raffreddore, tosse, mal di gola, otiti e febbr...",0.959292
662,"mal di gola, tosse, raffreddore, ciclo ed è su...",0.953406
812,"ciclo, tosse, mal di gola, raffreddore ok mi s...",0.952353
1498,"febbre, tosse, mal di gola e male alle ossa. g...",0.947403
...,...,...
2476,petizione per cambiare il nome delle marlboro ...,0.402155
458,"#risorgimento odoardo borrani ""cucitrici di ca...",0.390052
586,"alla tosse va in scena 'giusto', la...storia d...",0.375202
52,"an evening with manuel agnelli, sold out alla ...",0.370321


In [None]:
list(sim_df.sort_values(by='sim', ascending=False)['tweet'])[-20:]

["a javier servirà un'intera bottiglia di miele dopo tutta la tosse che ha fatto #amici19",
 'tosse noli, daino cade in un pozzo: liberato dai vigili del fuoco (video)',
 "l'amore è come la tosse... non si può nascondere... ivan cattaneo rules #gfvip #ivancattaneo",
 'hai delle labbra troppo tosse, non sono legali',
 'buoni i ravioli al gusto tosse di tommaso, li voglio provare pure io #gfvip',
 'task, progetto per la partecipazione culturale: presentazione alla tosse',
 "la tosse d'estate 2018: il programma degli spettacoli tra ville e centro storico",
 'giorno di tosse numero 4: altro che palestra, li faccio così gli addominali',
 'ho visto più macchine tosse oggi che in tutta la mia vita',
 'tre alberghi di jon robin baitz alla tosse',
 '#kean ora come ora segnerebbe pure con un colpo di tosse.. #spaljuve',
 'frida kahlo raccontata ai bambini alla tosse #xixeventi',
 'il filtro di tik tok si è per forza ispirato a lui, sinceramente invidiandolo molto *tosse*',
 'luigi marangoni alla

# PROVA - con bert (sentence)

In [None]:
from transformers import AutoTokenizer, AutoModel, BertModel, BertTokenizer
from sentence_transformers import util
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
import torch

In [None]:
# AutoTokenizer / AutoModel load the PyTorch implementation (the model is put
# in eval mode and later called under torch.no_grad), so they are created
# outside `strategy.scope()`: a tf.distribute scope only governs TensorFlow
# variables.

# tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# load model from HuggingFace
model = AutoModel.from_pretrained(model_name)
model.eval();  # inference mode; the trailing ';' keeps the module repr out of the cell output

Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-base-italian-xxl-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def encode_tweets(tweets, batch_size=None, max_length=128):
  """Embed tweets as the attention-masked mean of the model's last hidden state.

  Uses the module-level `tokenizer` and `model`.

  Args:
    tweets: sequence of strings.
    batch_size: number of tweets per forward pass; None (or 0) encodes the
      whole input as one batch.
    max_length: token length every tweet is padded / truncated to.

  Returns:
    Float tensor with one row per tweet and one column per hidden unit. An
    empty input yields a tensor with zero rows.
  """
  tweets = list(tweets)
  if not tweets:
    # Nothing to encode: range() rejects a zero step and torch.cat rejects an
    # empty list, so answer directly with an empty matrix.
    hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 0)
    return torch.empty((0, hidden_size))

  if not batch_size:
    batch_size = len(tweets)

  pooled = []

  for start in range(0, len(tweets), batch_size):
    batch = tweets[start:start + batch_size]

    encoded = tokenizer.batch_encode_plus(batch,
                                          max_length=max_length,
                                          padding='max_length',
                                          truncation=True)
    encoded = {key: torch.LongTensor(value) for key, value in encoded.items()}

    with torch.no_grad():
        model_output = model(**encoded)

    lhs = model_output.last_hidden_state

    # Broadcast the mask over the hidden axis instead of expanding it to a
    # hard-coded 768, so models with any hidden size work.
    mask = encoded['attention_mask'].unsqueeze(-1).to(lhs.dtype)
    summation = (lhs * mask).sum(dim=1)

    # Divide by the number of real tokens taken from the mask itself; counting
    # non-zero embedding values would under-count whenever an activation is
    # exactly 0. clamp() guards against an all-padding row.
    token_counts = mask.sum(dim=1).clamp(min=1)

    pooled.append(summation / token_counts)

  return torch.cat(pooled)


In [None]:
# ### write
# with open(base_path+"files/covid_embeddings_new.pkl","wb") as f:
#   pickle.dump(tot_embeddings, f)

# ### read
# with open(base_path+"files/covid_embeddings_new.pkl","rb") as f:
#   tot_embeddings = pickle.load(f)

In [None]:
keywords = [', '.join(['raffreddore', 'tosse', 'febbre', 'mal di gola', 'mal di testa', 'brividi'])]
# keywords = [
#     'tosse, mal di gola', 
#     'mal di testa, emicrania', 
#     'raffreddore, sinusite', 
#     'febbre, brividi'
#     ]

tmp = [
    'mattarella è il presidente della repubblica', 
    'il mio computer si blocca continuamente', 
    'questa canzone è da brividi', 
    'mario corre così veloce che mi fa venire il mal di testa', 
    'scopri 3 nuovi rimedi contro il covid, da oggi in edicola!', 
    'aggiornamento dati da fonti certificate su covid-19', 
    'ma com\'è che quando mi sveglio sono così stanco', 
    'ho contratto il covid-19', 
    'ho preso il raffreddore, che palle!', 
    'ho la febbre a 38, sto morendo'
]

In [None]:
k = encode_tweets(keywords)
tw0 = encode_tweets(tmp)

In [None]:
# Similarity table: one row per example tweet, one column per keyword query.
dists = pd.DataFrame(cosine_similarity(k, tw0).T, index=tmp, columns=keywords)

# With more than one keyword query, append their row-wise mean.
if len(keywords) > 1:
  dists['avg'] = dists.mean(axis=1)

dists

Unnamed: 0,"raffreddore, tosse, febbre, mal di gola, mal di testa, brividi"
mattarella è il presidente della repubblica,0.437108
il mio computer si blocca continuamente,0.439949
questa canzone è da brividi,0.407013
mario corre così veloce che mi fa venire il mal di testa,0.59841
"scopri 3 nuovi rimedi contro il covid, da oggi in edicola!",0.510595
aggiornamento dati da fonti certificate su covid-19,0.401599
ma com'è che quando mi sveglio sono così stanco,0.430103
ho contratto il covid-19,0.331797
"ho preso il raffreddore, che palle!",0.58247
"ho la febbre a 38, sto morendo",0.637218


In [None]:
tot_embeddings = encode_tweets(t[:1000], 32)

In [None]:
dists = cosine_similarity(k, tot_embeddings)[0]

In [None]:
n = 15
topn = sorted(range(len(dists)), key=lambda i: -dists[i])[:n]

for i in topn:
  print(dists[i], '|', t[i], '\n')

0.6787563 | ma questa tossettina fastidiosa cos'è? l'effetto del reflusso, un principio di tracheite, o covid? 

0.663388 | è normale avere il raffreddore e l’istinto di vomitare? sta a vedere che ho il covid 

0.6421105 | covid. «la tosse improvvisa, il ricovero... e ora mio marito è in fin di vita» 

0.6345444 | «dal covid ai dolori articolari, noi medici di base sotto pressione» 

0.6225733 | covid, un colpo di tosse al telefono smaschera gli asintomatici 

0.60636055 | voce alta e aerosol, così il covid si diffonde negli spazi chiusi: ecco come fare per difendersi 

0.59295654 | voce alta e aerosol, così il covid si diffonde negli spazi chiusi: ecco come fare per d... 

0.5915977 | talmente in ansia che avevo caldo e ho cominciato a pensare se avere caldo fosse sintomo di covid ok 

0.5892333 | coronavirus: polmonite da covid, l'ecografia è più efficace del tampone per diagnosticarla 

0.5835334 | covid, uso prolungato della mascherina? può far inalare meglio il virus. video - 

0.

# Embeddings
https://www.kaggle.com/code/rhtsingh/utilizing-transformer-representations-efficiently

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [None]:
def bertTokenize(docs, tokenizer, max_len):
  """Encode documents into fixed-width token-id and attention-mask matrices.

  Args:
    docs: iterable of strings.
    tokenizer: object exposing a HuggingFace-style `encode_plus`.
    max_len: width of every row; shorter documents are padded, longer ones
      are truncated.

  Returns:
    (input_ids, attention_masks): two int32 NumPy arrays of shape
    (len(docs), max_len).
  """
  input_ids = []
  attention_masks = []

  for doc in docs:
    # truncation=True is required: without it a document longer than max_len
    # produces a longer row, and ragged rows cannot be stacked into a matrix.
    bert_inp = tokenizer.encode_plus(doc,
                                     add_special_tokens=True,
                                     max_length=max_len,
                                     padding='max_length',
                                     truncation=True,
                                     return_attention_mask=True)
    input_ids.append(bert_inp['input_ids'])
    attention_masks.append(bert_inp['attention_mask'])

  # Token ids and masks are integers; int32 matches the dtype declared by the
  # Keras Input layers these arrays are fed to. reshape() keeps the result
  # two-dimensional even when `docs` is empty.
  input_ids = np.asarray(input_ids, dtype='int32').reshape(-1, max_len)
  attention_masks = np.asarray(attention_masks, dtype='int32').reshape(-1, max_len)

  return input_ids, attention_masks

In [None]:
### observe max length
encoded_tweets = [tokenizer.encode(tweet, add_special_tokens=True) for tweet in t[:100]]
max_len = max([len(tweet) for tweet in encoded_tweets])
print('Max length: ', max_len)

Max length:  76


In [None]:
max_len = 100
tot_ids, tot_masks = bertTokenize(t[:100], tokenizer, max_len)

In [None]:
# Wrap a TF BERT encoder in a Keras model that outputs, for each input row,
# the last-hidden-state vector at sequence position 0.
# NOTE(review): the recorded output of this cell reports the checkpoint
# 'GroNLP/gpt2-small-italian' being loaded into TFBertModel with the 'bert'
# layers newly initialised, i.e. `model_name` pointed at a GPT-2 checkpoint in
# that run. Confirm `model_name` before trusting embeddings from this model.
# NOTE(review): this rebinds `model`, the name the PyTorch cells also use.
with strategy.scope():
  bert_model = TFBertModel.from_pretrained(model_name)

  # model
  ids = layers.Input(shape=(max_len,), dtype='int32')  # token ids, max_len per row
  masks = layers.Input(shape=(max_len,), dtype='int32')  # attention masks, same layout
  embed = bert_model([ids, masks]).last_hidden_state[:, 0, :]  # keep only sequence position 0
  model = models.Model(inputs=[ids, masks], outputs=embed)
  model.compile('adam', 'categorical_crossentropy') # won't be used, but necessary

You are using a model of type gpt2 to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some layers from the model checkpoint at GroNLP/gpt2-small-italian were not used when initializing TFBertModel: ['transformer']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at GroNLP/gpt2-small-italian and are newly initialized: ['bert']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
tweet_embeddings = model.predict([tot_ids, tot_masks])

# ### write
# with open(base_path+"files/covid_embeddings.pkl","wb") as f:
#   pickle.dump(tweet_embeddings, f)

In [None]:
# ### read
# with open(base_path+"files/covid_embeddings.pkl","rb") as f:
#   tweet_embeddings = pickle.load(f)

# Find useful tweets

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import spatial

In [None]:
keywords = ['raffreddore', 'tosse', 'febbre', 'mal di gola', 'mal di testa', 'perdita gusto', 'perdita olfatto']
# keywords = ['raffreddore', 'tosse', 'febbre']
tmp_ids, tmp_masks = bertTokenize(keywords, tokenizer, max_len)
k = model.predict([tmp_ids, tmp_masks])

In [None]:
# Hand-written probe tweets, some about symptoms and some unrelated.
tw0 = [
    'ho preso il raffreddore, che palle!',
    'mattarella è il presidente della repubblica',
    'scopri 3 nuovi rimedi contro il covid, da oggi in edicola!',
    # The trailing comma matters: without it Python concatenates this string
    # with the next one and the list silently loses an entry.
    'questa canzone è da brividi',
    # 'non riesco a deglutire',
    'ma com\'è che quando mi sveglio sono così stanco',
    # 'sto davvero male, ho preso l\'influenza',
    'aggiornamento dati da fonti certificate su covid-19',
    'ho contratto il covid-19'
]

# Embed the probe tweets with the Keras model (tw0 is rebound to the embeddings).
tmp_ids, tmp_masks = bertTokenize(tw0, tokenizer, max_len)
tw0 = model.predict([tmp_ids, tmp_masks])

# Rows are keywords, columns are probe tweets.
dists = cosine_similarity(k, tw0)
print(dists, '\n')

# Average over the keywords: one score per probe tweet.
dists = np.mean(dists, axis=0)
print(dists)

[[0.7527263  0.7133758  0.7303872  0.71861243 0.75824785 0.75078475]
 [0.6510491  0.67106974 0.684951   0.6743056  0.66973805 0.65135956]
 [0.6833457  0.72121173 0.6904731  0.676184   0.70746124 0.6910893 ]
 [0.70464337 0.73065984 0.7487775  0.68738353 0.76297086 0.70045006]
 [0.70516294 0.7179037  0.7489948  0.6951818  0.7622527  0.6952837 ]
 [0.69690937 0.75395626 0.700867   0.6922024  0.7531489  0.7142317 ]
 [0.7200891  0.7676362  0.71621    0.70339394 0.76748174 0.7419561 ]] 

[0.70198935 0.7251162  0.7172373  0.69246626 0.7401859  0.70645076]


In [None]:
dists = cosine_similarity(k, tweet_embeddings)
dists = np.mean(dists, axis=0)

NameError: ignored

In [None]:
n = 10
topn = sorted(range(len(dists)), key=lambda i: -dists[i])[:n]

for i in topn:
  print(dists[i], '|', original_t[i], '\n')

In [None]:
# keyword = 'raffreddore'
# tmp_ids, tmp_masks = bertTokenize([keyword], tokenizer, max_len)
# k_embedding = model.predict([tmp_ids, tmp_masks])

# keyword = 'naso'
# tmp_ids, tmp_masks = bertTokenize([keyword], tokenizer, max_len)
# k_embedding2 = model.predict([tmp_ids, tmp_masks])

# cosine_similarity(k_embedding, k_embedding2)

In [None]:
# tree = spatial.KDTree(tweet_embeddings)

In [None]:
# # diz = {tree.indices[i]: t[i] for i in range(len(original_t))} 
# diz = {tree.indices[i]: original_t[i] for i in range(len(original_t))}

In [None]:
# x = [(i, len(tree.query_ball_point(k_embedding, r=i)[0])) for i in np.arange(5,10,0.1)]
# tmp = pd.DataFrame(x, columns=['i', 'neighbours'])
# plt.plot(tmp['i'], tmp['neighbours'])

In [None]:
# # dist = 6
# # indices = tree.query_ball_point(k_embedding, r=dist)[0]
# # print(len(indices))
# distances, indices = tree.query(k_embedding, k=3)
# distances

In [None]:
# for i in indices[0]:
#   print(diz[i])
#   print()