## Importar as bibliotecas necessárias

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import wordnet as wn
import time
import re

### Definir a semente aleatória (random seed)

In [2]:
# Seed NumPy's global random generator with a fixed value.
np.random.seed(seed=500)

## Importar a base de dados

In [3]:
# Read the tab-separated training file (decoded as latin-1) into a DataFrame.
corpus_treino = pd.read_csv(
    filepath_or_buffer="data/labeledTrainData.tsv",
    sep='\t',
    encoding='latin-1',
)

In [4]:
# Preview the first five rows of the freshly loaded frame.
corpus_treino.head(n=5)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


## Preparação dos dados pré-treino

### Função para remover tags html

In [5]:
def html_cleaner(string):
    """Replace every ``<...>`` span in ``string`` with a single space.

    The match is non-greedy, so each tag is replaced on its own and the
    text between tags is kept.

    Args:
        string: Text that may contain HTML-like tags.

    Returns:
        The text with each matched tag replaced by one space character.
    """
    return re.sub('<.*?>', ' ', string)

In [6]:
# Strip HTML tags from every review, element by element.
corpus_treino['review'] = corpus_treino['review'].map(html_cleaner)

In [7]:
# Preview the first five rows after the HTML clean-up.
corpus_treino.head(n=5)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


### Remover linhas nulas

In [8]:
# Drop rows whose review is missing. Calling dropna(inplace=True) on the
# selected column does not remove those rows from the DataFrame itself, so the
# frame is filtered on that column and rebound instead.
# The index is reset so the row labels stay contiguous (0..n-1) after rows
# are removed.
corpus_treino = corpus_treino.dropna(subset=['review']).reset_index(drop=True)

### Palavras em minúsculo

In [9]:
# Lower-case every review so later comparisons are case-insensitive.
corpus_treino['review'] = corpus_treino['review'].map(lambda review_text: review_text.lower())

### "Tokenization"

In [10]:
# word_tokenize needs the Punkt tokenizer data. This cell runs before the
# cell that downloads the NLTK resources, so the data is fetched here to keep
# a fresh-kernel "Run All" working.
# NOTE(review): newer NLTK releases may look up 'punkt_tab' instead - confirm
# against the installed NLTK version.
nltk.download('punkt', quiet=True)

# Split each review into a list of tokens.
corpus_treino['review'] = [word_tokenize(review_text) for review_text in corpus_treino['review']]

### Remoção de stop words e lematização

In [11]:
# Fetch the NLTK resources used by the tokenization, POS-tagging,
# lemmatization and stop-word steps.
for resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)
# Kept as the last expression so the cell still displays the call's result.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/mynssem/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mynssem/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/mynssem/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mynssem/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
inicio = time.time()

# WordNetLemmatizer needs a WordNet POS constant to know whether a word is a
# noun, verb, adjective or adverb. The first letter of the tag returned by
# pos_tag selects it; any other tag falls back to noun.
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# Loop-invariant work is done once: the stop-word list is built a single time
# (as a set, for fast membership tests) instead of once per token, and one
# lemmatizer instance is reused for every review.
english_stopwords = set(stopwords.words('english'))
word_Lemmatized = WordNetLemmatizer()

# One processed entry per review, in row order.
reviews_final = []
for entry in corpus_treino['review']:
    # Keep only purely alphabetic tokens that are not stop words, lemmatized
    # according to their POS tag.
    Final_words = [
        word_Lemmatized.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(entry)
        if word.isalpha() and word not in english_stopwords
    ]
    # Stored as the string form of the token list, as before.
    reviews_final.append(str(Final_words))

# Assign the whole 'review_final' column at once, by position, so the result
# does not depend on the frame's index labels matching 0..n-1.
corpus_treino['review_final'] = reviews_final
fim = time.time()
print("Tempo para lematização %.2f horas"%((fim - inicio)/3600))

## Salvar a base tratada

In [None]:
# Write the processed corpus to disk as a semicolon-separated file.
corpus_treino.to_csv(path_or_buf='corpus_treino_regex.csv', sep=';')