<a href="https://colab.research.google.com/github/liesemarques/covid_kaggle/blob/main/covid_kaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Carregamento das bibliotecas

* numpy 
* pandas 
* glob
* json
* seaborn
* spacy
* nltk
* matplotlib plt
* from IPython.core.display import HTML


# - Data Acquisition

* Função para leitura dos arquivos json
* Criação do DataFrame com os artigos json
* Criação do CSV com os dados 


# - Text Extraction and Cleanup


## Pre-processamento

* Identificar e remover valores NaN, o DataFrame deve estar com textos completos
* Remover valores duplicados
* Retirar uma amostra da base de dados



## Função para o pré-processamento
* Criação de tokens 'tokenize'
* Remoção das stop words 
* lematização
* Named Entity Recognizer (NER) Nomeando entidades

## Termos frequentes e Nuvem de Palavras

* Identificar termos frequentes para uma possível atualização das stop words

# - Evaluation
* Pesquisas com uma palavra e NLTK
* Pesquisa com 'find'



# Carregamento das bibliotecas

In [None]:
import glob
import html
import json

import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from IPython.core.display import HTML
from matplotlib import pyplot as plt

## Instalação do scispaCy
scispaCy é um pacote Python que contém modelos spaCy para processamento de textos biomédicos, científicos ou clínicos.


https://allenai.github.io/scispacy/

In [None]:
!pip install scispacy

In [None]:
pip install spacy==2.2

In [None]:
import scispacy

## Instalação do modelo en_core_sci_md
Um pipeline spaCy completo para dados biomédicos com um vocabulário maior e vetores de 50 mil palavras.

In [None]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_md-0.2.4.tar.gz

In [None]:
# Importando o modelo
import en_core_sci_md

In [None]:
!python -m spacy download en

--------------------------------------

## Criação do DataSet

In [None]:
corona_features = {'paper_id': [], 'title': [],
                   'abstract': [], 'text': []}

In [None]:
corona_df = pd.DataFrame(corona_features)
corona_df

In [None]:
json_filenames = glob.glob(f'{"/kaggle/input"}//**/*.json', recursive = True)

In [None]:
len(json_filenames)

In [None]:
def _join_section_texts(sections):
  """Join the 'text' entry of every section dict with newline + space and strip the result."""
  return '\n '.join(section['text'] for section in sections).strip()


def return_corona_df(json_filenames, df):
  """Read article JSON files and return ``df`` extended with one row per file.

  Each file contributes the fields 'paper_id', 'title' (taken from
  ``metadata.title``), 'abstract' and 'text' (taken from ``body_text``).
  'abstract' and 'body_text' are expected to be lists of dicts carrying a
  'text' key; their texts are joined with newline + space.

  A field that is absent from the JSON is stored as NaN under its own column,
  so incomplete articles can be filtered out afterwards with a null check.

  Args:
    json_filenames: iterable of paths to JSON files.
    df: DataFrame to extend. It is not modified.

  Returns:
    A new DataFrame holding the rows of ``df`` followed by one row per file,
    with a fresh 0..n-1 index.
  """
  fields = ['paper_id', 'title', 'abstract', 'text']
  records = []

  for file_name in json_filenames:
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_name, encoding='utf-8') as json_data:
      data = json.load(json_data)

    title = (data.get('metadata') or {}).get('title')

    records.append({
      'paper_id': data['paper_id'].strip() if 'paper_id' in data else np.nan,
      'title': title.strip() if isinstance(title, str) else np.nan,
      'abstract': _join_section_texts(data['abstract']) if 'abstract' in data else np.nan,
      'text': _join_section_texts(data['body_text']) if 'body_text' in data else np.nan,
    })

  # Build the new rows in one go: growing a DataFrame row by row is quadratic,
  # and DataFrame.append no longer exists in pandas 2.x.
  columns = list(df.columns) + [name for name in fields if name not in df.columns]
  new_rows = pd.DataFrame(records, columns=columns)
  if len(df) == 0:
    return new_rows
  return pd.concat([df, new_rows], ignore_index=True)


In [None]:
corona_df = return_corona_df(json_filenames, corona_df)

In [None]:
corona_df.to_csv('/kaggle/working/corona_df.csv')

----------------------------------------------------------

# - Text Extraction and Cleanup


## Pre-processamento

* Identificar e remover valores NaN, o DataFrame deve estar com textos completos
* Remover valores duplicados



In [None]:
corona_df = pd.read_csv('../input/covid-text/corona_df.csv')

In [None]:
corona_df.head(3)

In [None]:
corona_df.shape

In [None]:
corona_df = corona_df.iloc[:,1:5]
corona_df.head(3)

In [None]:
corona_df.shape

In [None]:
sns.heatmap(corona_df.isnull());

In [None]:
# Keep only complete articles: filter column by column so that any row with a
# missing value in at least one column is removed.
for i in corona_df.columns:
    corona_df = corona_df[corona_df[i].notnull()]

In [None]:
corona_df.shape

In [None]:
corona_df.isnull().sum()

In [None]:
sns.heatmap(corona_df.isnull());

In [None]:
for i in corona_df.columns:
    print(f" Numeros de {i} vazio {len(corona_df[corona_df[i] == ''])}")

In [None]:
corona_df.drop_duplicates(['abstract', 'text', 'title'], inplace = True)
corona_df.shape

# Amostra da base de dados
Foi retirada uma amostra aleatória de 500 artigos de um total de 79099 artigos.

In [None]:
# Corona_df contendo 500 artigos
corona_df = corona_df.sample(n = 500, random_state=1)

In [None]:
corona_df.head()

In [None]:
sample_text = corona_df['text'][119326]
sample_text

# Função pre-processamento

* tokenize - spaCy converte o texto em 'spacy.tokens.doc.Doc'
https://spacy.io/api/tokenizer
* stop words - Remoção das palavras menos relevantes em termos médicos
* lemmatization - Extração dos lemas (forma canônica) das palavras


In [None]:
# Pre-trained scispaCy model for medical text.
# Disabled pipeline components:
# - 'tagger': only stop-word removal is performed here, so it is not needed
#   (the original note said 'target'; presumably 'tagger' was meant)
# - 'parser': describes how one word is linked to another; not needed
# - 'ner': entity recognition is done further down in the notebook
# https://spacy.io/usage/processing-pipelines#disabling
nlp = en_core_sci_md.load(disable=['tagger', 'parser', 'ner'])
# Raise the pipeline's max_length — presumably so that long article bodies are accepted.
nlp.max_length = 2000000

In [None]:
# print(spacy.lang.en.stop_words.STOP_WORDS)
# len(spacy.lang.en.stop_words.STOP_WORDS)

In [None]:
# Words spotted in the word cloud that were not among spaCy's stop words.
new_stop_words = ['et', 'al', 'doi', 'copyright', 'http', 'https', 'fig', 'table', 'result', 'show']
# Flag each of them as a stop word on the pipeline's vocabulary.
for word in new_stop_words:
  nlp.vocab[word].is_stop = True

In [None]:
# lower() converts the whole text to lower case.
# token.lemma_ gives the lemma of each token.
# https://spacy.io/usage/linguistic-features

def spacy_tokenizer(sentence):
  """Return ``sentence`` reduced to a space-separated string of lemmas.

  The text is lower-cased and run through the module-level ``nlp`` pipeline.
  Tokens flagged as stop words, number-like, punctuation or whitespace, and
  tokens that are a single character long, are discarded.

  Args:
    sentence: raw text to process.

  Returns:
    A single string with the lemmas of the kept tokens joined by one space
    (an empty string when no token is kept).
  """
  doc = nlp(sentence.lower())
  # A dedicated name is used so the builtin `list` is not shadowed.
  lemmas = [
    str(token.lemma_)
    for token in doc
    if not (token.is_stop or
            token.like_num or
            token.is_punct or
            token.is_space or
            len(token) == 1)
  ]
  return ' '.join(lemmas)

In [None]:
# Texto oriiginal
print(sample_text)

In [None]:
# texto processado
test = sample_text
result = spacy_tokenizer(test)
print(result)

In [None]:
# Aplicando a função apcy_tokenizer em nossa base de 500 artigos
corona_df['text'] = corona_df['text'].apply(spacy_tokenizer)

# Termos frequentes

In [None]:
# Dump every processed article to './<paper_id>.txt' so the files can be read
# back as a plain-text corpus.
for index, row in corona_df.iterrows():
  # `with` closes the handle even if the write fails. UTF-8 is set explicitly
  # because that is the encoding NLTK's PlaintextCorpusReader decodes with by
  # default, whereas open() would otherwise use the platform encoding.
  with open('./' + row['paper_id'] + '.txt', 'w', encoding='utf-8') as text_file:
    text_file.write(row['text'])

In [None]:
from nltk.corpus import PlaintextCorpusReader
# Restrict the corpus to the '.txt' files: the pattern '.*' would also pull in
# every other file in the working directory (e.g. a CSV export), polluting the
# word frequencies.
corpus = PlaintextCorpusReader('./', r'.*\.txt')

In [None]:
files = corpus.fileids()

In [None]:
files[0]

In [None]:
corpus.raw('00467bd1940aae7539467e3ae56a8210fd44fc80.txt')

In [None]:
words = corpus.words()
print(words)

In [None]:
len(words)

In [None]:
frequency = nltk.FreqDist(words)
most_common = frequency.most_common(100)
most_common

# Nuvem de palavras

In [None]:
from matplotlib.colors import ListedColormap
color_map = ListedColormap(['orange', 'green', 'red', 'magenta'])

In [None]:
from wordcloud import WordCloud
cloud = WordCloud(background_color = 'white', max_words=100, colormap=color_map)

In [None]:
cloud = cloud.generate(corona_df['text'].str.cat(sep='\n'))
plt.figure(figsize=(15,15))
plt.imshow(cloud)
plt.axis('off')
plt.show()

# Extração de entidades nomeadas

Named Entity Recognizer (NER)

In [None]:
text = str(corona_df['text'][119326])
print(text)

In [None]:
nlp_ent = spacy.load('en')
nlp_ent.max_length = 2000000

In [None]:
doc = nlp_ent(text)
type(doc)

In [None]:
# Entidades contidas no texto de exemplo
# https://spacy.io/api/annotation#named-entities
# 'NORP' - NACIONALIDADE 'GPE' PAISES
for entity in doc.ents:
  if entity.label_ == 'NORP' or entity.label_ == 'GPE':
    print(entity.text, entity.label_)

In [None]:
print(doc)

In [None]:
from spacy import displacy
displacy.render(doc, style = 'ent')

Contagem das entidades na base de dados

In [None]:
# (the paper ids could be collected here instead)
# Gather the text of every entity labelled 'GPE' across all articles.
gpe = []
for article_text in corona_df['text']:
    gpe.extend(
        str(found.text)
        for found in nlp_ent(article_text).ents
        if found.label_ == 'GPE'
    )

In [None]:
print(gpe)

In [None]:
values_gpe, counts_gpe = np.unique(np.array(gpe), return_counts = True)

In [None]:
gpe_df = pd.DataFrame({'value': values_gpe, 'counts': counts_gpe})

In [None]:
gpe_df.head(20)

In [None]:
gpe_df.shape

In [None]:
gpe_df_filtered = gpe_df[gpe_df.counts > 50]

In [None]:
gpe_df_filtered.shape

In [None]:
gpe_df_filtered.head(16)

In [None]:
sns.set(rc={'figure.figsize': (15,8)})
sns.barplot(x = 'value', y = 'counts', hue='value', data=gpe_df_filtered);

# Textos utilizados para pesquisa

# Pesquisas com uma palavra e NLTK

In [None]:
# corpus c termos frequente
text = nltk.Text(corpus.words())

In [None]:
match = text.concordance('smoke', width = 50, lines = 30)

In [None]:
# Função muito limitada
type(match)
# dir(match)

# Pesquisa com 'find'

In [None]:
string = corona_df['text'][119326]
search_string = 'korea'
print(string.find(search_string))

In [None]:
#dir(string)
#help(string.find)

In [None]:
string[13:13+10]

In [None]:
string[13:13]

# Aplicação na base de dados

In [None]:
search_string = 'Smoking'
#search_string = 'Socio-economic'

In [None]:
search_string = spacy_tokenizer(search_string)
search_string

In [None]:
def find_all_texts(input_str, search_str, number_of_words):
  """Return the surrounding context of every occurrence of ``search_str``.

  Args:
    input_str: text to search in.
    search_str: substring to look for (case-sensitive, located with str.find).
    number_of_words: size of the context window on each side of the position
      where a match starts. Despite its name it is measured in characters,
      because the window is cut by string slicing.

  Returns:
    A list with one excerpt per occurrence, overlapping occurrences included.
    Each excerpt runs from up to ``number_of_words`` characters before the
    match start to ``number_of_words`` characters after it, clipped to the
    bounds of ``input_str``. The list is empty when there is no occurrence or
    when ``search_str`` is empty (an empty needle would match at every offset).
  """
  if not search_str:
    return []

  text_list = []
  index = input_str.find(search_str)
  while index != -1:
    # Clamp at 0: a negative slice start would wrap around to the end of the
    # string and silently truncate the left-hand context.
    start = max(index - number_of_words, 0)
    text_list.append(input_str[start:index + number_of_words])
    index = input_str.find(search_str, index + 1)
  return text_list

In [None]:
documents = []
for index, row in corona_df.iterrows():
  documents.append(find_all_texts(row['text'], search_string, 40))

In [None]:
for doc in documents:
  if doc != []:
    print(doc)

In [None]:
# For every article containing the search term, render a header with the
# article's title, id and hit count, followed by one highlighted excerpt per
# occurrence.
safe_term = html.escape(search_string)
for index, row in corona_df.iterrows():
  texts = find_all_texts(row['text'], search_string, 400)
  if texts == []:
    continue

  # Everything that comes from the articles is escaped before being placed in
  # HTML, so characters such as '<' or '&' cannot break or inject markup.
  paper_id = html.escape(str(row['paper_id']))
  title = html.escape(str(row['title']))
  display(HTML(f'<h1>{html.escape(search_string.upper())}</h1>'))
  display(HTML(f"""<p>
                      <strong>Titulo:</strong> {title}<br>
                      <strong>ID:</strong> {paper_id}<br>
                      <strong>Numero de vezes:</strong> {len(texts)}
                   </p>"""))
  for i in texts:
    # Split on the raw term and escape the pieces separately; replacing inside
    # already-escaped text could match inside an entity such as '&amp;'.
    pieces = i.split(search_string) if search_string else [i]
    marked_text = f"<mark>{safe_term}</mark>".join(html.escape(piece) for piece in pieces)
    display(HTML(f"""<blockquote>... {marked_text} ...</blockquote>"""))

# Pesquisa com mais palavras e spaCy

## Testando o spaCy

In [None]:
search_strings = ['smoking','pulmonary disease']
tokens_list = [nlp(spacy_tokenizer(item)) for item in search_strings]
tokens_list

In [None]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
matcher.add('SEARCH', None, *tokens_list)
numbers_of_words = 50

In [None]:
search_string_html = ' '.join([str(element) for element in search_strings])

In [None]:
# For every article with at least one phrase match, render a header with the
# article's title, id and match count, followed by a highlighted excerpt for
# each match.
for index, row in corona_df.iterrows():
    doc = nlp(row['text'])
    matches = matcher(doc)
    if matches == []:
        continue

    # Escape article-derived values so they cannot break or inject markup.
    paper_id = html.escape(str(row['paper_id']))
    title = html.escape(str(row['title']))
    display(HTML(f'<h1>{html.escape(search_string_html.upper())}</h1>'))
    display(HTML(f"""<p>
                      <strong>Titulo:</strong> {title}<br>
                      <strong>ID:</strong> {paper_id}<br>
                      <strong>Numero de vezes:</strong> {len(matches)}
                     </p>"""))

    excerpts = []
    for match_id, match_start, match_end in matches:
        # Window of `numbers_of_words` tokens on each side, clamped at the start.
        window_start = max(match_start - numbers_of_words, 0)
        window = str(doc[window_start:match_end + numbers_of_words])
        # Highlight the phrase that actually matched. Taking it from the span
        # avoids the float comparison `similarity(...) == 1.0`, which could
        # miss and leave the previous excerpt in place.
        phrase = str(doc[match_start:match_end])
        safe_phrase = html.escape(phrase)
        pieces = window.split(phrase) if phrase else [window]
        excerpts.append(
            f"<mark>{safe_phrase}</mark>".join(html.escape(piece) for piece in pieces)
        )

    # Accumulate all excerpts; assigning inside the loop showed only the last
    # one even though the header reports every match.
    marked_text = ''.join(excerpt + "<br /><br /> " for excerpt in excerpts)
    display(HTML(f"""<blockquote>...{marked_text}...</blockquote>"""))