In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../data/df_requests.csv")
len(df)

  df = pd.read_csv("../data/df_requests.csv")


934516

In [3]:
df = df.drop_duplicates()
len(df)

934516

In [4]:
import re

def remove_emails_com(text):
    text = str(text)
    return re.sub(r'[^\s]+.com', '', text).strip()

def remove_emails_br(text):
    text = str(text)
    return re.sub(r'[^\s]+.br', '', text).strip()

def remove_links(text):
    text = str(text)
    return re.sub(r'http[^\s]+', '', text).strip()

def remove_citations(text):
    text = str(text)
    return re.sub(r'@[^\s]+', '', text).strip()

def remove_numbers(text):
    text = str(text)
    return re.sub(r'\d+', '', text).strip()

def remove_special_characters(text):
    text = str(text)
    return re.sub(r'[^\w\s.!?]', '', text).strip()

def remove_empty_lines(text):
    text = str(text)
    return "\n".join([line for line in text.split('\n') if line.strip() != ''])

def remove_repeated_symbols(text):
    text = str(text)
    return re.sub(r'(\W)\1+', r'\1', text).strip()

def remove_excessive_spaces(text):
    text = str(text)
    return re.sub(r'\s+', ' ', text).strip()

def fix_isolated_commas(text):
    text = str(text)
    # Replace punctuation with a blank character before
    text = re.sub(r' ([.,:;!?])', r'\1', text)
    return text.strip()

In [5]:
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import FunctionTransformer


pipeline_clean_text = Pipeline([
    ('remove_links', FunctionTransformer(remove_links)),
    ('remove_emails_br', FunctionTransformer(remove_emails_br)),
    ('remove_emails_com', FunctionTransformer(remove_emails_com)),
    ('remove_citations', FunctionTransformer(remove_citations)),
    ('remove_excessive_spaces', FunctionTransformer(remove_excessive_spaces)),
    ('remove_repeated_symbols', FunctionTransformer(remove_repeated_symbols)),
    ('fix_isolated_commas', FunctionTransformer(fix_isolated_commas)),
    ('remove_numbers', FunctionTransformer(remove_numbers)),
    ('remove_special_characters', FunctionTransformer(remove_special_characters)),
    ('remove_empty_lines', FunctionTransformer(remove_empty_lines)),
])

In [6]:
df['DetalhamentoSolicitacao'][0], pipeline_clean_text.transform(df['DetalhamentoSolicitacao'][0])

('Solicitação de processo administrativo 03000200973201520, cadastrado no site  protocolointegrado.gov.br',
 'Solicitação de processo administrativo  cadastrado no site')

In [7]:
df['DetalhamentoSolicitacao'] = df['DetalhamentoSolicitacao'].apply(pipeline_clean_text.transform)

In [8]:
!python -m spacy download pt_core_news_sm

Collecting pt-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.8.0/pt_core_news_sm-3.8.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pt-core-news-sm
Successfully installed pt-core-news-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')


In [9]:
import spacy

nlp = spacy.load('pt_core_news_sm')
stopwords_spacy = nlp.Defaults.stop_words
list(stopwords_spacy)[:10]

['estiveste',
 'após',
 'quatro',
 'entre',
 'ademais',
 'ambos',
 'seu',
 'tu',
 'área',
 'estar']

In [10]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stopwords_nltk = stopwords.words('portuguese')
list(stopwords_nltk)[:10]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/isaaclourenco/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['a',
 'à',
 'ao',
 'aos',
 'aquela',
 'aquelas',
 'aquele',
 'aqueles',
 'aquilo',
 'as']

In [11]:
both_stopwords = set(stopwords_nltk) | set(stopwords_spacy)

In [12]:
def remove_stop_word(text):
    text = re.sub(r'[^\w\s]', '', text)

    tokens = text.split()

    tokens = filter(lambda token: token not in both_stopwords, tokens)

    return ' '.join(tokens)

In [13]:
df['DetalhamentoSolicitacao'] = df['DetalhamentoSolicitacao'].str.lower().apply(remove_stop_word)
df['DetalhamentoSolicitacao']

0         solicitação processo administrativo cadastrado...
1         prezados senhores enviar processo acima fotos ...
2         presidente associação projeto ação comunitária...
3         vista cópia termos acusação relatório processo...
4         dia nome andré pieve período curso ciências bi...
                                ...                        
934511    solicito gentileza informarem situação defesa ...
934512    solicito taxa evasão taxa reprovação disciplin...
934513    solicito reposta reclamação aberta protocolo n...
934514    íntegra seguintes processos fiscalização ccc i...
934515    solicito mctic especificamente comissão técnic...
Name: DetalhamentoSolicitacao, Length: 934516, dtype: object

In [14]:
def spacy_lemmatizer(text):
    doc = nlp(text)

    txt = [token.lemma_ for token in doc]

    txt = [word for word in txt if len(word) > 2]

    return ' '.join(txt)

In [15]:
df['DetalhamentoSolicitacao'] = df['DetalhamentoSolicitacao'].str.lower().apply(spacy_lemmatizer)

In [19]:
lengths = df['DetalhamentoSolicitacao'].dropna().str.len()

q1 = lengths.quantile(0.25)
q3 = lengths.quantile(0.75)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          
iqr = q3 - q1
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr
outliers = lengths[(lengths < lower) | (lengths > upper)]

indices = outliers.index.tolist()
df = df.drop(index=indices)
len(df)

831341

In [17]:
df.dropna(subset=['DetalhamentoSolicitacao'], inplace=True)

In [18]:
df.to_csv("../data/df_requests_cleaned.csv")