In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# PDFs stored under /content/drive/MyDrive/ can be opened like local files.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
!pip install PyPDF2

In [None]:
import unicodedata
import re
import os
import string
import numpy as np

import PyPDF2
from PyPDF2 import PdfReader

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

from nltk.corpus.reader.plaintext import PlaintextCorpusReader

from nltk.probability import FreqDist
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
from wordcloud import WordCloud


In [None]:
# Download the spaCy pipeline `pt_core_news_lg` and load it as `nlp`;
# `nlp` is used further down to lemmatize the sample tokens.
import spacy
!python -m spacy download pt_core_news_lg
nlp = spacy.load("pt_core_news_lg")

In [None]:
# NLTK resources used throughout the notebook. Newer NLTK releases look up
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' instead of the older
# 'punkt' / 'averaged_perceptron_tagger' packages, so both generations are
# requested to keep the notebook runnable regardless of the installed version.
NLTK_RESOURCES = (
    'punkt',                           # sentence splitter (older NLTK)
    'punkt_tab',                       # sentence splitter (newer NLTK)
    'stopwords',                       # stop-word lists
    'wordnet',                         # lexical database
    'rslp',                            # rules for the RSLP stemmer
    'averaged_perceptron_tagger',      # POS tagger (older NLTK)
    'averaged_perceptron_tagger_eng',  # POS tagger (newer NLTK)
)

for resource in NLTK_RESOURCES:
    nltk.download(resource)


# Leitura dos dados

In [None]:
def getText(fileName, baseDir='/content/drive/MyDrive/dados/artigos/'):
    """Extract the text of a PDF file as a single string.

    Args:
        fileName: Name of the PDF file inside ``baseDir``.
        baseDir: Directory that holds the PDFs. Defaults to the Drive folder
            used by this notebook.

    Returns:
        The text of every page, in page order, joined with newlines. Tab
        characters are replaced with single spaces.
    """
    # The context manager closes the file even when parsing or text
    # extraction raises, which the explicit close() did not guarantee.
    with open(os.path.join(baseDir, fileName), 'rb') as pdf:
        reader = PdfReader(pdf)
        # `or ''` keeps the join working if a page yields no text at all.
        pages = [(page.extract_text() or '').replace('\t', ' ') for page in reader.pages]
    return '\n'.join(pages)

# Raw text of the 37 articles file0.pdf ... file36.pdf, in numeric order.
files = [getText(f'file{number}.pdf') for number in range(37)]

# Pré-processamento

Some of the common text preprocessing / cleaning steps are:

- Normalization
- Lower casing
- Removal of Punctuations
- Removal of Stopwords
- Removal of Frequent words
- Removal of Rare words
- Stemming
- Lemmatization
- Removal of emojis
- Removal of emoticons
- Conversion of emoticons to words
- Conversion of emojis to words
- Removal of URLs
- Removal of HTML tags
- Chat words conversion
- Spelling correction

These are the different kinds of preprocessing steps that can be applied to text data. We do not need to apply all of them every time: the steps should be chosen carefully for the use case at hand, since that choice also plays an important role in the results.

In [None]:
# The same letter written two ways: a single precomposed code point versus a
# base letter followed by a combining mark.
precomposed = "\u00C7"
decomposed = "\u0043" + "\u0327"
print(precomposed, decomposed)

In [None]:
# One word, two encodings: word1 uses the precomposed U+00C7, while word2
# spells the same letter as U+0043 followed by the combining mark U+0327.
word1, word2 = "CACHA\u00C7A", "CACHA\u0043\u0327A"
print(word1, word2)

In [None]:
# False: the two strings hold different code point sequences
# (U+00C7 versus U+0043 U+0327), and str comparison is code point by code point.
word1 == word2

In [None]:
# NFD decomposes characters into their smaller components:
# \u00C7 => \u0043 + \u0327
unicodedata.normalize('NFD', word1) == word2

In [None]:
# NFC decomposes into smaller components and then recomposes:
# \u00C7 => \u0043 + \u0327 => \u00C7
# In this example: \u0043 + \u0327 => \u00C7
unicodedata.normalize('NFC', word2) == word1

In [None]:
# NFKD (compatibility decomposition) is applied to both spellings before
# comparing, so each side is brought to the same decomposed form.
unicodedata.normalize('NFKD', word1) == unicodedata.normalize('NFKD', word2)

In [None]:
# Sample text used by the following cells; it contains accented letters,
# digits, punctuation and a run of repeated spaces.
texto = "O plenário do     Senado aprovou hoje no dia 08 a proposta de reforma tributária, uma das principais pautas da agenda econômica do governo Lula (PT), com o apoio das bancadas do centrão. Como o texto sofreu alterações, o projeto voltará para mais uma análise da Câmara dos Deputados."


In [None]:
# Normalizar caracteres acentuados para suas formas não acentuadas
texto = unicodedata.normalize('NFKD', texto).encode('ASCII', 'ignore').decode('utf-8')
print(texto)

In [None]:
# Remove caracteres não alfanuméricos
texto = re.sub(r'[^\w\sáÁâÂãÃàÀéÉêÊíÍóÓôÔõÕúÚçÇ]', ' ', texto)
print(texto)

# Remove números
texto = ''.join([i for i in texto if not i.isdigit()])
print(texto)

# Remove espaços em branco adicionais
texto = re.sub('[\s]+', ' ', texto)
texto = re.sub('[\n]+', ' ', texto)
print(texto)

In [None]:
# Portuguese stop-word set, used by the filtering cell below.
stop_words = set(stopwords.words('portuguese'))

# Tokenize with the Portuguese Punkt model; both NLTK helpers default to
# English when no language is given.
s = sent_tokenize(texto, language='portuguese')
t = [word_tokenize(sent, language='portuguese') for sent in s]
print(texto)
print(s)
print(t)


In [None]:
# Flatten the per-sentence token lists into one list, keeping only the tokens
# whose lowercase form is not a stop word.
s_tokens = []
for sentence_tokens in t:
    s_tokens.extend(token for token in sentence_tokens if token.lower() not in stop_words)

print("Com stopwords:", t)
print("Sem stopwords:", s_tokens)

In [None]:
# Compare two lemmatizers and two stemmers on the stop-word-free tokens.

# 1) NLTK WordNet lemmatizer.
# NOTE(review): WordNet is presumably an English resource — confirm that this
# lemmatizer does anything useful on Portuguese tokens.
lemmatizer = WordNetLemmatizer()
lemmatizer_tokens1 = [lemmatizer.lemmatize(token) for token in s_tokens]

# 2) spaCy lemmas: the tokens are re-joined into one string and run through
# the `nlp` pipeline, so spaCy applies its own tokenization.
doc = nlp(' '.join(s_tokens))
lemmatizer_tokens2 = [token.lemma_ for token in doc]

# 3) Snowball stemmer configured for Portuguese.
stemmer = SnowballStemmer("portuguese")
stemmer_tokens1 = [stemmer.stem(token) for token in s_tokens]

# 4) RSLP stemmer; note that `stemmer` is rebound here.
stemmer = nltk.stem.RSLPStemmer()
stemmer_tokens2 = [stemmer.stem(token) for token in s_tokens]

# Both lemmatizer results share the same label: first WordNet, then spaCy.
print("Lemmatizer:", lemmatizer_tokens1)
print("Lemmatizer:", lemmatizer_tokens2)

# Both stemmer results share the same label: first Snowball, then RSLP.
print("Stemmer:", stemmer_tokens1)
print("Stemmer:", stemmer_tokens2)


In [None]:
def _foldToAscii(text):
    """Return ``text`` with accents removed and remaining non-ASCII characters dropped."""
    return unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')


def preProcessing(text):
    """Clean, tokenize and stem a Portuguese text.

    Steps, in order: fold accents to ASCII, replace non-word characters with
    spaces, drop digits, collapse whitespace, tokenize, lowercase and keep
    alphabetic tokens only, remove stop words, stem with the Portuguese
    Snowball stemmer. (Lemmatization with WordNet or spaCy was tried as an
    alternative to stemming and is not used.)

    Args:
        text: Raw document text.

    Returns:
        A list of sentences, each one a list of stemmed tokens. Punctuation is
        stripped before sentence splitting, so in practice the result normally
        holds a single sentence containing every token of the document.
    """
    # Replace accented characters with their unaccented forms
    text = _foldToAscii(text)

    # Replace non-alphanumeric characters with spaces (\w already covers
    # every letter that can remain after the ASCII fold)
    text = re.sub(r'[^\w\s]', ' ', text)

    # Remove digits
    text = ''.join(ch for ch in text if not ch.isdigit())

    # Collapse whitespace runs, newlines included, into single spaces
    text = re.sub(r'\s+', ' ', text)

    # Tokenization with the Portuguese models (NLTK defaults to English)
    sentences = sent_tokenize(text, language='portuguese')
    tokens = [word_tokenize(sent, language='portuguese') for sent in sentences]

    # Lowercase and keep purely alphabetic tokens; this also discards any
    # punctuation token, so no separate punctuation filter is needed
    tokens = [[token.lower() for token in sent if token.isalpha()] for sent in tokens]

    # Stop-word removal. The text was folded to ASCII above, so the stop words
    # must be folded too; otherwise accented entries such as "não" would never
    # match their folded form "nao" in the text.
    stop_words = {_foldToAscii(word) for word in stopwords.words('portuguese')}
    stopwords_tokens = [[token for token in sent if token not in stop_words] for sent in tokens]

    # Stemming
    stemmer = SnowballStemmer("portuguese")
    return [[stemmer.stem(token) for token in sent] for sent in stopwords_tokens]


# Montagem do corpus

In [None]:
def createCorpus(files, path='corpus/'):
    """Pre-process every document and store the result as a plain-text corpus.

    Document ``idx`` is written to ``<path>/<idx>.txt`` with one pre-processed
    sentence per line and tokens separated by single spaces.

    Args:
        files: Iterable of raw document texts.
        path: Directory that receives the processed files; created if missing.

    Returns:
        A ``PlaintextCorpusReader`` over the ``.txt`` files in ``path``.
    """
    # Create the output directory; safe to call when it already exists
    os.makedirs(path, exist_ok=True)

    for idx, file in enumerate(files):
        preprocessed_tokens = preProcessing(file)

        # Write the processed document with an explicit encoding so the
        # result does not depend on the platform default
        with open(os.path.join(path, str(idx) + '.txt'), 'w', encoding='utf-8') as fout:
            fout.write('\n'.join([' '.join(sent) for sent in preprocessed_tokens]))

    # Only read the files written above; a bare '.*' would also pick up any
    # stray non-text file that ends up in the directory.
    # NOTE(review): fileids() presumably come back sorted as strings
    # ('10.txt' before '2.txt') — confirm before indexing documents by position.
    return PlaintextCorpusReader(path, r'.*\.txt', encoding='utf-8')


def showCorpus(corpus):
    """Print, for every file in the corpus, its id, its tokens and their POS tags.

    Args:
        corpus: Object exposing ``fileids()`` and ``words(file_id)``, such as
            the reader returned by ``createCorpus``.
    """
    divider = "=" * 80
    for file_id in corpus.fileids():
        print("File:", file_id)
        words = corpus.words(file_id)
        print(words)
        tagged = nltk.pos_tag(words)
        print(tagged)
        print(divider)


# Build the on-disk corpus from the extracted texts, then print every document.
corpus = createCorpus(files)
showCorpus(corpus)
# To inspect a single processed document directly:
#open('corpus/0.txt', 'r').read()

# Agrupamento de textos por afinidade

In [None]:
# Re-open the processed corpus from disk
path = 'corpus/'
corpus = PlaintextCorpusReader(path, '.*')

# Raw text of each pre-processed document, in fileids() order
text = list(map(corpus.raw, corpus.fileids()))

# Print a few documents as a sanity check
for position in (0, 1, 36):
    print("- ", text[position])

In [None]:
# Build the TF-IDF matrix:
# convert the collection of documents in `text` to a matrix of TF-IDF features.

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text)
print(X)

In [None]:
# K-Means clustering of the TF-IDF vectors.
# random_state makes the cluster assignment reproducible across runs, and an
# explicit n_init keeps the number of restarts independent of the installed
# scikit-learn version (whose default for it has changed between releases).
number_clusters = 6
kmeans = KMeans(
    n_clusters=number_clusters,
    init='k-means++',
    algorithm='lloyd',
    n_init=10,
    random_state=42,
)
kmeans.fit(X)

# Row i of X corresponds to the i-th file id; fetch the list once instead of
# on every print.
file_ids = corpus.fileids()

for cluster_id in range(number_clusters):
    print(f"Cluster {cluster_id + 1}:")
    cluster_indices = np.where(kmeans.labels_ == cluster_id)[0]
    for idx in cluster_indices:
        print(file_ids[idx])
    print("=" * 80)

# Nuvem de palavras

In [None]:
# Portuguese stop-word set.
# NOTE(review): showWordCloud below does not reference this name
# (preProcessing builds its own set), so the assignment looks redundant — confirm.
stop_words = set(stopwords.words('portuguese'))

def showWordCloud(nameFile):
    """Draw a word cloud of the stemmed tokens of one PDF.

    Args:
        nameFile: Name of the PDF file, as accepted by ``getText``.

    Raises:
        ValueError: If pre-processing leaves no tokens to plot.
    """
    file = getText(nameFile)

    # Count tokens across every sentence returned by preProcessing rather than
    # indexing the first one, which failed on an empty result and would ignore
    # any further sentences.
    tokens = [token for sent in preProcessing(file) for token in sent]
    if not tokens:
        raise ValueError(f"No tokens extracted from {nameFile!r}; cannot build a word cloud.")
    frequency_dist = nltk.FreqDist(tokens)

    wcloud = WordCloud().generate_from_frequencies(frequency_dist)
    plt.figure(figsize=(10,10))
    plt.imshow(wcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()


In [None]:
# Word cloud for file0.pdf
showWordCloud('file0.pdf')

In [None]:
# Word cloud for file10.pdf
showWordCloud('file10.pdf')

In [None]:
# Word cloud for file35.pdf
showWordCloud('file35.pdf')

# Bag of words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Raw (not pre-processed) text of three of the articles
doc1, doc2, doc3 = (getText(name) for name in ('file0.pdf', 'file10.pdf', 'file35.pdf'))

# Alternative input: the pre-processed corpus instead of the raw PDFs
#path = 'corpus/'
#corpus = PlaintextCorpusReader(path, '.*')
#text = [corpus.raw(file) for file in corpus.fileids()]
#doc1 = text[0]
#doc2 = text[10]
#doc3 = text[35]

# Count how often each vocabulary term occurs in each document
bow_vectorizer = CountVectorizer()
X = bow_vectorizer.fit_transform([doc1, doc2, doc3])

# One row per document, one column per vocabulary term
vocabulary = bow_vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(X.toarray(), columns=vocabulary)
bow_df.head()

# Visualizando as palavras mais frequentes

In [None]:
# For every document in the corpus, show its five most frequent tokens.
for file_id in corpus.fileids():
    print("File:", file_id)

    tokens = corpus.words(file_id)
    frequencia = nltk.FreqDist(tokens)
    most_common = frequencia.most_common(5)

    # Result followed by the separator line, each on its own line
    print(most_common, "=" * 80, sep="\n")
