In [None]:
import pandas as pd
import nltk
import string
import os
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import StanfordPOSTagger
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

## Leitura dos dados

In [None]:
# Read the training split from train.csv (path is relative to the working directory).
train_df = pd.read_csv('train.csv')
# Trailing expression: the notebook displays the frame's shape as the cell output.
train_df.shape

In [None]:
# Read the test split from test.csv (path is relative to the working directory).
test_df = pd.read_csv('test.csv')
# Trailing expression: the notebook displays the frame's shape as the cell output.
test_df.shape

In [None]:
# Preview the first five training rows to check which columns were loaded.
train_df.head(5)

## Obtendo vetor de documentos

In [None]:
# Shared Porter stemmer instance used by stem_token().
porter_stemmer = PorterStemmer()

# Both variables are set before the tagger is constructed.
# NOTE(review): presumably NLTK's Stanford wrapper reads them to locate the
# jar and model files under ./stanford-pos -- confirm against the NLTK docs.
os.environ.update({
    'CLASSPATH': 'stanford-pos',
    'STANFORD_MODELS': 'stanford-pos/models',
})

# Shared POS tagger instance used by postag_sentence().
st = StanfordPOSTagger('english-bidirectional-distsim.tagger')

def postag_sentence(sentence_tokens):
    """Tag one tokenized sentence with the module-level tagger ``st``.

    Args:
        sentence_tokens: Tokens of a single sentence.

    Returns:
        Whatever ``st.tag`` returns for the tokens; ``postag_filter`` reads
        each item as ``item[0]`` (token) and ``item[1]`` (tag).
    """
    tagged_tokens = st.tag(sentence_tokens)
    return tagged_tokens

def postag_filter(sentence_tokens):
    """Keep only the tokens of a sentence whose POS tag is in the allow-list.

    Args:
        sentence_tokens: Tokens of a single sentence.

    Returns:
        list: The tokens, in their original order, whose tag returned by
        ``postag_sentence`` is one of the allowed tags.
    """
    # Allowed tags. NOTE(review): these look like Penn Treebank noun, verb,
    # adverb and adjective tags -- confirm against the tagger model in use.
    pos_keep = (
        'NN', 'NNS', 'NNP', 'NNPS',
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
        'RB', 'RBR', 'RBS',
        'JJ', 'JJR', 'JJS',
    )

    kept_tokens = []
    for tagged in postag_sentence(sentence_tokens):
        # tagged[0] is the token, tagged[1] is its tag.
        if tagged[1] in pos_keep:
            kept_tokens.append(tagged[0])
    return kept_tokens

def stem_token(token):
    """Return the stem of ``token`` produced by the module-level ``porter_stemmer``.

    Args:
        token: A single word token.

    Returns:
        The value returned by ``porter_stemmer.stem(token)``.
    """
    stemmed = porter_stemmer.stem(token)
    return stemmed

def tokenize_doc(doc, use_pt_filter=False,
                      use_lowercase_filter=False,
                      use_stopwords_filter=False,
                      use_stemming_filter=False):
    """Split a document into a flat list of word tokens, with optional filters.

    The document is split into sentences, and each sentence into word tokens.
    The enabled filters are applied sentence by sentence, always in this
    order: POS-tag filter, lowercasing, stopword removal, stemming.

    Args:
        doc: Text of the document.
        use_pt_filter: If true, keep only tokens returned by ``postag_filter``.
        use_lowercase_filter: If true, lowercase every token.
        use_stopwords_filter: If true, drop tokens that appear in NLTK's
            English stopword list. The comparison is an exact match, so it is
            case-sensitive unless lowercasing is also enabled.
        use_stemming_filter: If true, replace each token by ``stem_token(token)``.

    Returns:
        list: All remaining tokens of the document, in document order.
    """
    # Load the stopword list once per call and keep it as a set. Evaluating
    # stopwords.words('english') inside the comprehension re-read the list for
    # every single token and scanned it linearly each time.
    english_stopwords = (
        set(stopwords.words('english')) if use_stopwords_filter else None
    )

    tokens = []
    for sentence in sent_tokenize(doc):
        tk_sentence = word_tokenize(sentence)
        if use_pt_filter:
            # Runs first so the tagger sees the original casing.
            tk_sentence = postag_filter(tk_sentence)
        if use_lowercase_filter:
            tk_sentence = [tk.lower() for tk in tk_sentence]
        if use_stopwords_filter:
            tk_sentence = [tk for tk in tk_sentence if tk not in english_stopwords]
        if use_stemming_filter:
            tk_sentence = [stem_token(tk) for tk in tk_sentence]
        tokens.extend(tk_sentence)

    return tokens
        
def remove_punctuation(tokens):
    """Drop the tokens that consist solely of punctuation characters.

    A token is removed when every one of its characters is in
    ``string.punctuation``. Testing ``token in string.punctuation`` instead is
    a substring test: it only removes single characters (and accidental
    contiguous runs such as ``"()"``) and keeps multi-character punctuation
    tokens such as ``"..."``, ``"--"`` or the quote tokens `` `` `` and ``''``.
    Tokens that mix punctuation with other characters (``"don't"``,
    ``"U.S."``) are kept. Empty tokens are removed.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        list: The tokens that contain at least one non-punctuation character,
        in their original order.
    """
    punctuation_chars = set(string.punctuation)
    return [
        token for token in tokens
        if not all(char in punctuation_chars for char in token)
    ]

def get_vocabulary_tokenized_from_docs(tk_docs):
    """Count how often each token occurs across all tokenized documents.

    Args:
        tk_docs: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        collections.Counter: Maps every token to its total number of
        occurrences over all documents.
    """
    # Flatten the documents lazily; a generator is counted element by element.
    all_tokens = (token for document in tk_docs for token in document)
    return Counter(all_tokens)

In [None]:
# Build one text per training row: title and body joined by a space, with the
# typographic apostrophe normalised to a plain ASCII one.
# Missing values are replaced by an empty string first; otherwise str(NaN)
# injects the literal word "nan" into the documents and into the vocabulary.
titles = train_df['title'].fillna('')
texts = train_df['text'].fillna('')
docs = [
    ' '.join([str(title), str(text)]).replace("’", "'")
    for title, text in zip(titles, texts)
]

## Vocabulário e redução de dimensionalidade por vocabulário

#### Sem Pos-Tagging, Sem Lowercase, Sem remoção Stopwords, Sem Stemming

In [None]:
# Baseline: every optional filter disabled.
tk_docs = [
    tokenize_doc(
        d,
        use_pt_filter=False,
        use_lowercase_filter=False,
        use_stopwords_filter=False,
        use_stemming_filter=False,
    )
    for d in docs
]
vocabulary = get_vocabulary_tokenized_from_docs(tk_docs)
print('Tamanho do vocabulario inicial:', len(vocabulary))

#### Com Pos-Tagging, Sem Lowercase, Sem remoção Stopwords, Sem Stemming

In [None]:
# POS-tag filter only.
tk_docs = [
    tokenize_doc(
        d,
        use_pt_filter=True,
        use_lowercase_filter=False,
        use_stopwords_filter=False,
        use_stemming_filter=False,
    )
    for d in docs
]
vocabulary = get_vocabulary_tokenized_from_docs(tk_docs)
# The label names the filter added in this step; the copy-pasted "inicial"
# made this output indistinguishable from the baseline cell.
print('Tamanho do vocabulario com pos-tagging:', len(vocabulary))

#### Com Pos-Tagging, Com Lowercase, Sem remoção Stopwords, Sem Stemming

In [None]:
# POS-tag filter + lowercasing.
tk_docs = [
    tokenize_doc(
        d,
        use_pt_filter=True,
        use_lowercase_filter=True,
        use_stopwords_filter=False,
        use_stemming_filter=False,
    )
    for d in docs
]
vocabulary = get_vocabulary_tokenized_from_docs(tk_docs)
print('Tamanho do vocabulario com lowercase:', len(vocabulary))

#### Com Pos-Tagging, Com Lowercase, Com remoção Stopwords, Sem Stemming

In [None]:
# POS-tag filter + lowercasing + stopword removal.
tk_docs = [
    tokenize_doc(
        d,
        use_pt_filter=True,
        use_lowercase_filter=True,
        use_stopwords_filter=True,
        use_stemming_filter=False,
    )
    for d in docs
]
vocabulary = get_vocabulary_tokenized_from_docs(tk_docs)
# The label names the filter added in this step; the copy-pasted
# "com lowercase" mislabelled this result.
print('Tamanho do vocabulario sem stopwords:', len(vocabulary))

#### Com Pos-Tagging, Com Lowercase, Com remoção Stopwords, Com Stemming

In [None]:
# POS-tag filter + lowercasing + stopword removal + stemming.
tk_docs = [
    tokenize_doc(
        d,
        use_pt_filter=True,
        use_lowercase_filter=True,
        use_stopwords_filter=True,
        use_stemming_filter=True,
    )
    for d in docs
]
vocabulary = get_vocabulary_tokenized_from_docs(tk_docs)
# The label names the filter added in this step; the copy-pasted
# "com lowercase" mislabelled this result.
print('Tamanho do vocabulario com stemming:', len(vocabulary))