# Analise exploratória

In [3]:
from pathlib import Path
import sys

parent = Path().absolute().parents[0].as_posix()

sys.path.insert(0, parent)

In [130]:
from tqdm import tqdm

import pandas as pd

import yake
import spacy

#from spacytextblob.spacytextblob import SpacyTextBlob

from nlpiper.core import Compose
from nlpiper.transformers import cleaners, tokenizers
from nlpiper.core import Document


from gensim.corpora.dictionary import Dictionary
from gensim import models 

import ssl
ssl._create_default_https_context = ssl._create_unverified_context
from linguakit import Sentiment

from resources.stopwords import WORDS

Downloading linguakit-streaming...
[OK!]
Installing linguakit-streaming...
[OK!]
Installing the Python wrapper...


In [131]:
spacy.__version__

'3.2.4'

In [132]:
data = pd.read_csv('../data/scraping_data.csv.gz', compression='gzip')
data_political_parties = pd.read_csv('../data/scraping_political_parties.csv.gz', compression='gzip')

In [133]:
data.head()

Unnamed: 0.1,Unnamed: 0,city,title,content,year,tstamp,link
0,0,Lisboa,POL | Local Lisboa,POL | Local Lisboa SECÇÕES 1ª Página Destaque ...,1999,19991111042737,https://arquivo.pt/wayback/19991111042737/http...
1,1,Lisboa,JN Editorial - Text57,JN Editorial - Text57 26 milhões para dar casa...,1999,19990822002536,https://arquivo.pt/wayback/19990822002536/http...
2,2,Lisboa,Outras Paginas,Outras Paginas 11 de Novembro de 1999 Igreja d...,1999,19991117215651,https://arquivo.pt/wayback/19991117215651/http...
3,3,Lisboa,JN Editorial - Texult1,JN Editorial - Texult1 Macau: Rão Kyao é o aut...,1999,19991118004529,https://arquivo.pt/wayback/19991118004529/http...
4,4,Lisboa,PÚBLICONLINE-Os Destaques da Primeira Página,PÚBLICONLINE-Os Destaques da Primeira Página S...,1999,19991012235908,https://arquivo.pt/wayback/19991012235908/http...


In [134]:
data.shape

(30759, 7)

In [135]:
data.isna().any()

Unnamed: 0    False
city          False
title          True
content        True
year          False
tstamp        False
link          False
dtype: bool

In [136]:
data.dropna(inplace=True)
data.shape

(30712, 7)

In [137]:
data_political_parties.shape

(1042, 5)

In [138]:
data.city.value_counts()

Lisboa              5350
Porto               3827
Coimbra             2078
Braga               2045
Guarda              1967
Setúbal             1789
Aveiro              1469
Beja                1400
Leiria              1342
Bragança            1214
Faro                1207
Viseu               1186
Vila Real           1103
Santarém            1070
Castelo Branco       957
Viana do Castelo     943
Évora                907
Portalegre           858
Name: city, dtype: int64

In [139]:
data.groupby(['year', 'city']).size()

year  city            
1999  Aveiro               4
      Beja                 3
      Braga                4
      Bragança             6
      Castelo Branco       3
                          ..
2021  Setúbal             47
      Viana do Castelo    34
      Vila Real           40
      Viseu               36
      Évora               50
Length: 335, dtype: int64

## Data Pre-processing

In [140]:
pipeline = Compose([
    cleaners.CleanURL(),
    cleaners.CleanPunctuation(),
    cleaners.CleanEOF(),
    cleaners.CleanMarkup(),
    cleaners.CleanAccents(),
    cleaners.CleanNumber(),
    #tokenizers.BasicTokenizer()
])

In [141]:
simple_pipeline = Compose([
    cleaners.CleanURL(),
    cleaners.CleanEOF(),
    cleaners.CleanMarkup(),
    cleaners.CleanAccents(),
    #cleaners.CleanNumber(),
])

In [142]:
stop_words = Document(WORDS)
stop_words_ = pipeline(stop_words)
stop_words_ = stop_words_.cleaned.split(' ')
stop_words_ = list(filter(None, stop_words_))


In [143]:
nlp = spacy.load("pt_core_news_lg")

In [144]:
class TextCleaner:
    
    def __init__(self, model, stop_words):
        self.model = model
        self.stop_words = stop_words
        
    def __call__(self, document):
        
        processed_doc = self._remove_stop_words(document)
        processed_doc = self._remove_double_spaces(processed_doc)
        processed_doc = self._remove_one_char_words(processed_doc)
        processed_doc = self._apply_pos_tagger(processed_doc)
        
        return processed_doc

    def _remove_stop_words(self, document):
        return ' '.join([word for word in document.split(' ') if word not in self.stop_words])        

    def _remove_double_spaces(self, document):
        return document.replace('  ', ' ')

    def _remove_one_char_words(self, document):
        return ' '.join([word for word in document.split(' ') if len(word) > 1])

    def _apply_pos_tagger(self, document):

        tokens = []
        for word in self.model(document):
            if word.pos_ in ['NOUN', 'ADJ', 'VERB', 'ADV']:
                tokens.append(word.text)
        return ' '.join(tokens)
        

In [145]:
tc = TextCleaner(model=nlp, stop_words=stop_words_)

In [215]:
docs = []
docs_on_tokens = []
for _, val in tqdm(data.sample(n=5000, random_state=1).iterrows()):
    doc = Document(val['content'].lower())
    doc_p = pipeline(doc)
    doc_p = tc(doc_p.cleaned)
    docs.append(doc_p)
    docs_on_tokens.append(doc_p.split(' '))

5000it [05:50, 14.26it/s]


In [187]:
len(docs)

1000

In [92]:
# Create a corpus from a list of texts
dictionary = Dictionary(docs_on_tokens)
corpus = [dictionary.doc2bow(text) for text in docs_on_tokens]

In [None]:
dictionary.token2id.keys()

In [None]:
corpus

In [93]:
model = models.LdaModel(corpus, num_topics=50)

In [70]:
print(model.print_topics())


[(46, '0.033*"47" + 0.010*"48" + 0.008*"31" + 0.008*"4106" + 0.006*"50" + 0.005*"57" + 0.004*"72" + 0.004*"1" + 0.004*"1658" + 0.004*"63"'), (32, '0.015*"48" + 0.009*"31" + 0.007*"47" + 0.006*"57" + 0.005*"72" + 0.005*"50" + 0.004*"321" + 0.004*"387" + 0.004*"67" + 0.004*"19"'), (29, '0.011*"48" + 0.009*"31" + 0.008*"321" + 0.008*"72" + 0.007*"50" + 0.006*"57" + 0.004*"67" + 0.004*"200" + 0.004*"12" + 0.004*"1"'), (3, '0.021*"321" + 0.018*"48" + 0.006*"1810" + 0.006*"1720" + 0.005*"1749" + 0.005*"1727" + 0.005*"34" + 0.005*"1776" + 0.005*"16" + 0.005*"72"'), (26, '0.015*"48" + 0.013*"321" + 0.009*"2017" + 0.009*"1810" + 0.008*"1749" + 0.007*"1705" + 0.007*"57" + 0.006*"72" + 0.006*"268" + 0.005*"1776"'), (28, '0.010*"48" + 0.009*"321" + 0.007*"57" + 0.005*"50" + 0.005*"31" + 0.004*"67" + 0.004*"387" + 0.004*"161" + 0.004*"47" + 0.003*"1705"'), (13, '0.013*"321" + 0.011*"48" + 0.007*"31" + 0.007*"72" + 0.005*"57" + 0.005*"46" + 0.005*"200" + 0.004*"1749" + 0.004*"19" + 0.004*"1727"'), (

In [94]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

pyLDAvis.enable_notebook()
gensimvis.prepare(model, corpus, dictionary)


  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


## Keyword Detection

In [56]:
data.link.iloc[-1]

'https://arquivo.pt/wayback/20201218202033/https://expresso.pt/coronavirus/2020-12-14-Covid.-Esta-e-lista-atualizada-dos-concelhos-de-risco-moderado-elevado-muito-elevado-e-extremamente-elevado--e-com-as-devidas-restricoes-'

In [212]:
language = "pt"
max_ngram_size = 3
deduplication_thresold = 0.9
deduplication_algo = 'seqm'
windowSize = 2
numOfKeywords = 5

custom_kw_extractor = yake.KeywordExtractor(
    lan=language, 
    n=max_ngram_size, 
    dedupLim=deduplication_thresold, 
    dedupFunc=deduplication_algo, 
    windowsSize=windowSize, 
    top=numOfKeywords, 
    features=None
)
keywords = custom_kw_extractor.extract_keywords(docs[-100])

for kw in keywords:
    print(kw)
    

('xadrez ioga esgrima', 0.00015773327234901594)
('esgrima escolas publicas', 0.0001897702043662771)
('atividades enriquecimento curricular', 0.0001971241338082687)
('ioga esgrima escolas', 0.00020433610828626955)
('publicas loja twitter', 0.00036833869104985206)


In [98]:
keywords_on_docs = []
for doc in docs:
    keywords_on_docs.append(custom_kw_extractor.extract_keywords(doc))

In [213]:
keywords_on_docs[-10]

[('entrar facebook entrar', 4.9460822120934905e-05),
 ('facebook entrar palavrachave', 5.279479591477676e-05),
 ('entrar palavrachave lembrarse', 5.49496014976677e-05),
 ('palavrachave lembrarse palavrachave', 5.865457133069792e-05),
 ('sessao entrar facebook', 6.418176471758751e-05),
 ('iniciar sessao entrar', 7.52373582936868e-05),
 ('faces projecto editorial', 0.00012353963441901375),
 ('projecto editorial inovador', 0.00012353963441901375),
 ('projecto editorial recorrer', 0.00012353963441901375),
 ('lembrarse palavrachave assine', 0.0001262453755688322)]

In [214]:
custom_kw_extractor.extract_keywords(simple_pipeline(Document(data.content.iloc[-10].lower())).cleaned)

[('protecao civil decidiu', 0.006365226982637995),
 ('civil decidiu colocar', 0.006445494013096144),
 ('especial nivel vermelho', 0.007149952565596091),
 ('alerta vermelho', 0.007242397962312481),
 ('alerta especial nivel', 0.009814278200911847)]

In [None]:
data.content.iloc[1].lower()

In [161]:
data.title.iloc[8].lower()

'jornal publico: sondagem expresso-euroexpansão de 29 de julho de 1995'

## Embeddings

In [172]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [226]:
model = Word2Vec(sentences=docs_on_tokens, vector_size=100, window=4, min_count=1, workers=4, sg=0, epochs=20)

In [227]:
model.wv.similar_by_word('ditadura')

[('democratica', 0.5903539061546326),
 ('censura', 0.5763965249061584),
 ('democracia', 0.572188138961792),
 ('ideologia', 0.5699273943901062),
 ('derrubar', 0.5666220188140869),
 ('conservadora', 0.5597254633903503),
 ('radical', 0.5586162805557251),
 ('fascista', 0.5425527691841125),
 ('conservador', 0.5366416573524475),
 ('retorica', 0.5243933200836182)]

In [229]:
model.wv.most_similar('poluicao', topn=5)

[('ozono', 0.6491729021072388),
 ('poluentes', 0.6271681785583496),
 ('subsidencia', 0.6127200722694397),
 ('niveis', 0.6022851467132568),
 ('particulas', 0.5948370695114136)]

In [209]:
model.wv.similarity('homen', 'poluicao')

0.90934694

## Sentiment Analysis