# Análise exploratória

In [None]:
from pathlib import Path
import sys

# Put the directory one level above the current working directory at the
# front of the import path (presumably so the project-local `resources`
# package imported below resolves — confirm against the repo layout).
parent = Path.cwd().parents[0].as_posix()
sys.path.insert(0, parent)

from tqdm import tqdm

import pandas as pd
import numpy as np

import yake
import spacy

#from spacytextblob.spacytextblob import SpacyTextBlob

from nlpiper.core import Compose
from nlpiper.transformers import cleaners
from nlpiper.core import Document


from gensim.corpora.dictionary import Dictionary
from gensim import models 

from sklearn.cluster import DBSCAN, KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

from resources.stopwords import WORDS

In [None]:
spacy.__version__

In [None]:
nlp = spacy.load("pt_core_news_lg")

In [None]:
# Quick check of the loaded spaCy pipeline: print the part-of-speech tag
# (`pos_`) of every token in a sample Portuguese sentence.
for word in nlp('Esta é uma fila.'):
    print(word.pos_)

In [None]:
# Load the two scraped datasets from gzip-compressed CSV files
# (paths are relative to the notebook's working directory).
data = pd.read_csv('../data/scraping_data.csv.gz', compression='gzip')
data_political_parties = pd.read_csv('../data/scraping_political_parties.csv.gz', compression='gzip')

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.isna().any()

In [None]:
# Drop every row containing at least one missing value (mutates `data`
# in place), then display the remaining shape.
data.dropna(inplace=True)
data.shape

In [None]:
data_political_parties.shape

In [None]:
data.city.value_counts()

In [None]:
data.groupby(['year', 'city']).size()

## Data Pre-processing

In [None]:
# Main nlpiper cleaning pipeline applied to documents and to the stop-word
# list (cleaners presumably run in the listed order — confirm in nlpiper).
# Punctuation cleaning and tokenization are disabled here; punctuation is
# stripped later by `TextCleaner._remove_punctuation`.
pipeline = Compose([
    cleaners.CleanURL(),
    #cleaners.CleanPunctuation(),
    cleaners.CleanEOF(),
    cleaners.CleanMarkup(),
    cleaners.CleanAccents(),
    cleaners.CleanNumber(),
    #tokenizers.BasicTokenizer()
])

In [None]:
# Lighter variant of `pipeline` with number cleaning disabled; it prepares
# the raw text handed to the YAKE keyword extractor further down.
simple_pipeline = Compose([
    cleaners.CleanURL(),
    cleaners.CleanEOF(),
    cleaners.CleanMarkup(),
    cleaners.CleanAccents(),
    #cleaners.CleanNumber(),
])

In [None]:
# Run the stop-word list through the same cleaning pipeline as the
# documents, then split it on single spaces and discard empty fragments.
stop_words = Document(WORDS)
cleaned_stop_words = pipeline(stop_words).cleaned
stop_words_ = [token for token in cleaned_stop_words.split(' ') if token]


In [None]:
import re

class TextCleaner:
    """Reduce a document string to its content words.

    Calling an instance runs, in order: part-of-speech filtering,
    punctuation removal, stop-word removal, collapsing of space runs and
    removal of the space left in front of periods.

    Parameters
    ----------
    model : callable
        Called with the document string; must return an iterable of tokens
        exposing ``pos_`` and ``text`` (the notebook passes a loaded spaCy
        pipeline).
    stop_words : iterable of str
        Words to drop from the document.
    """

    # Part-of-speech tags kept by ``_apply_pos_tagger``.
    _KEPT_POS = frozenset({'NOUN', 'ADJ', 'VERB', 'ADV', 'PUNCT'})

    # Translation table deleting the punctuation marks below; the period is
    # not in the set, so sentence ends survive this step.
    _PUNCTUATION_TABLE = str.maketrans('', '', '!"#$%&\'()*+,-/:;<=>?@[\\]”^“_`{|}~')

    # Two or more consecutive spaces.
    _SPACE_RUN = re.compile(r' {2,}')

    def __init__(self, model, stop_words):
        self.model = model
        self.stop_words = stop_words

    def __call__(self, document):
        """Return the cleaned version of the string ``document``."""
        processed_doc = self._apply_pos_tagger(document)
        processed_doc = self._remove_punctuation(processed_doc)
        processed_doc = self._remove_stop_words(processed_doc)
        processed_doc = self._remove_double_spaces(processed_doc)
        processed_doc = self._remove_space_at_sentence_end(processed_doc)

        return processed_doc

    def _apply_pos_tagger(self, document):
        """Keep only tokens whose POS tag is in ``_KEPT_POS``, space-joined."""
        return ' '.join(
            word.text for word in self.model(document) if word.pos_ in self._KEPT_POS
        )

    def _remove_stop_words(self, document):
        """Drop every space-separated word found in ``self.stop_words``."""
        # Built per call so later changes to ``self.stop_words`` are honoured,
        # while each membership test stays O(1).
        blocked = set(self.stop_words)
        return ' '.join(word for word in document.split(' ') if word not in blocked)

    def _remove_double_spaces(self, document):
        """Collapse every run of two or more spaces into a single space.

        A single ``str.replace('  ', ' ')`` pass leaves double spaces behind
        for runs of three or more, which appear whenever adjacent punctuation
        or stop-word tokens are removed.
        """
        return self._SPACE_RUN.sub(' ', document)

    def _remove_punctuation(self, document):
        """Delete all characters covered by ``_PUNCTUATION_TABLE``."""
        return document.translate(self._PUNCTUATION_TABLE)

    def _remove_space_at_sentence_end(self, document):
        """Re-attach periods to the preceding word (``'fim .'`` -> ``'fim.'``)."""
        return document.replace(' .', '.')

        

In [None]:
tc = TextCleaner(model=nlp, stop_words=stop_words_)

In [None]:
# Clean a reproducible 100-document sample: lowercase the text, run the
# nlpiper `pipeline`, then the `TextCleaner`. `docs` collects the cleaned
# strings and `docs_on_tokens` the same documents split on single spaces.
docs = []
docs_on_tokens = []
sample_contents = data.sample(n=100, random_state=1)['content']
# Iterating the column instead of DataFrame.iterrows() avoids building a
# Series per row and lets tqdm see the length, so it can report progress.
for content in tqdm(sample_contents):
    doc = Document(content.lower())
    doc_p = pipeline(doc)
    doc_p = tc(doc_p.cleaned)
    docs.append(doc_p)
    docs_on_tokens.append(doc_p.split(' '))

In [None]:
# Replace the `docs_on_tokens` built in the previous cell with the array
# stored under the 'files' key of a saved .npz archive (presumably the
# tokenised full corpus — confirm). `docs` is NOT reloaded, so after this
# cell `docs` and `docs_on_tokens` may describe different documents.
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
# archives from a trusted source.
docs_on_tokens = np.load('../data/processed/docs_cleaned.npz', allow_pickle=True)['files']

In [None]:
len(docs_on_tokens)

In [None]:
docs_on_tokens[0]

In [None]:
data.content.iloc[2].lower()

In [None]:
docs[2]

In [None]:
# Re-attach periods to the preceding word. The dot must be escaped: an
# unescaped '.' matches ANY character, which would replace the first letter
# of every word that follows a space.
re.sub(r' \.', '.', docs[2])

## Topic modeling

In [None]:
# Build the token -> id dictionary from the tokenised documents, then encode
# every document as a bag of words with it.
dictionary = Dictionary(docs_on_tokens)
corpus = list(map(dictionary.doc2bow, docs_on_tokens))

In [None]:
dictionary.token2id.keys()

In [None]:
corpus

In [None]:
# Train a 50-topic LDA model on the bag-of-words corpus. Passing the
# dictionary as `id2word` makes topic printing show the actual tokens
# instead of their integer ids.
model = models.LdaModel(corpus, num_topics=50, id2word=dictionary)

In [None]:
print(model.print_topics())


In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Enable inline rendering, then build the interactive topic visualisation;
# as the cell's last expression, the result is displayed by the notebook.
# NOTE: `model` and `corpus` must still be the LDA model and bag-of-words
# corpus here — both names are rebound by later cells (Word2Vec, TF-IDF).
pyLDAvis.enable_notebook()
gensimvis.prepare(model, corpus, dictionary)


## Keyword Detection

In [None]:
data.link.iloc[-1]

In [None]:
# YAKE keyword-extraction settings: Portuguese text, keywords of up to
# 3 words, the 5 best keywords per document.
language = "pt"
max_ngram_size = 3
deduplication_thresold = 0.9
deduplication_algo = 'seqm'
windowSize = 1
numOfKeywords = 5

custom_kw_extractor = yake.KeywordExtractor(
    lan=language, 
    n=max_ngram_size, 
    dedupLim=deduplication_thresold, 
    dedupFunc=deduplication_algo, 
    windowsSize=windowSize, 
    top=numOfKeywords, 
    features=None
)
# Try the extractor on the first cleaned document and print each result
# (later cells unpack these results as 2-tuples whose first item is the keyword).
keywords = custom_kw_extractor.extract_keywords(docs[0])

for kw in keywords:
    print(kw)
    

In [None]:
# Extract keywords for every document: lowercase the raw content, clean it
# with `simple_pipeline`, and hand the cleaned text to the YAKE extractor.
keywords_on_docs = [
    custom_kw_extractor.extract_keywords(
        simple_pipeline(Document(content.lower())).cleaned
    )
    for content in data.content
]

In [None]:
custom_kw_extractor.extract_keywords(simple_pipeline(Document(data.content.iloc[0].lower())).cleaned)

## Embeddings

In [None]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [None]:
# Train 100-dimensional Word2Vec embeddings on the tokenised documents
# (sg=0, min_count=1 so every token gets a vector, 30 epochs).
# NOTE: this rebinds `model`, which previously held the LDA model — the
# topic-modelling cells above must be re-run before using LDA again.
model = Word2Vec(sentences=docs_on_tokens, vector_size=100, window=5, min_count=1, workers=4, sg=0, epochs=30)

In [None]:
model.wv.most_similar('rei', topn=10)

In [None]:
# NOTE(review): 'homen' looks like a misspelling of 'homem' — confirm the
# intended key; a key missing from the vocabulary presumably raises KeyError.
model.wv.similarity('homen', 'rei')

In [None]:
# Collect the embedding of every extracted keyword that exists in the
# Word2Vec vocabulary. Keywords can be multi-word n-grams, which have no
# vector of their own and are skipped.
keywords_embeddings = []
for doc in keywords_on_docs:
    for word, _ in doc:
        try:
            vector = model.wv.get_vector(word)
        except KeyError:
            # Keyword not in the vocabulary. Only this case is ignored, so
            # real errors and KeyboardInterrupt are no longer swallowed.
            continue
        keywords_embeddings.append(vector)

In [None]:
# Build one (word, vector) pair per distinct word, in order of first
# appearance, and persist the pairs to a compressed .npz archive.
embs_on_docs = []
word_corpora = []
# Set mirror of `word_corpora`: O(1) membership tests instead of scanning
# the growing list for every token (quadratic on a large vocabulary).
seen_words = set()
for doc in tqdm(docs_on_tokens):
    for word in doc:
        if word in seen_words:
            continue
        embs_on_docs.append((word, model.wv.get_vector(word)))
        word_corpora.append(word)
        seen_words.add(word)

np.savez_compressed('../data/processed/docs_embbeded', files=embs_on_docs)

In [None]:
len(embs_on_docs)

## Clustering

In [None]:
# Re-join each token list into one string per document and fit a TF-IDF
# vectorizer on them; `X` is the resulting document-term matrix.
# NOTE: this rebinds `corpus`, which previously held the gensim
# bag-of-words corpus used by the LDA / pyLDAvis cells.
corpus = [' '.join(doc) for doc in docs_on_tokens]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

In [None]:
x = vectorizer.transform(['covid ataca cada vez mais.'])

In [None]:
len(x.toarray()[0])

In [None]:
# Map every vocabulary term to its TF-IDF weight in the sample sentence.
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installs.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
w = dict(zip(feature_names, x.toarray()[0]))

In [None]:
w['covid']

In [None]:
from sklearn.cluster import DBSCAN, KMeans, MiniBatchKMeans

In [None]:
# Density-based clustering of the keyword embedding vectors collected above
# (neighbourhood radius eps=3, at least 2 samples per core neighbourhood).
clustering = DBSCAN(eps=3, min_samples=2).fit(keywords_embeddings)

In [None]:
len(np.unique(clustering.labels_))  

In [None]:
# Fit a 20-cluster KMeans on the word vectors (second item of each
# (word, vector) pair). Despite its name, `kmeans_preds` is the fitted
# estimator; the cluster assignments are read from its `labels_` attribute.
kmeans_preds = KMeans(n_clusters=20, random_state=0).fit([val[1] for val in embs_on_docs])

In [None]:
kmeans_preds.labels_ 

In [None]:
# Table pairing every distinct word with the KMeans cluster ("Concept") it
# was assigned to.
res = pd.DataFrame({
    'Word': [pair[0] for pair in embs_on_docs],
    'Concept': kmeans_preds.labels_,
})

In [None]:
res.head()

In [None]:
res.Concept.value_counts()

In [None]:
res[res.Concept == 15].head(50)

## Sentiment Analysis