In [1]:
# uncomment the line below to install the package
#!pip install --user -U nltk

In [2]:
import nltk
# uncomment the lines below to install necessary files for the pipeline
# nltk.download('punkt')
# nltk.download('brown')
# nltk.download('names')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('rslp')

### Tokenization

Generates a list of tokenized words

In [3]:
from nltk.tokenize import word_tokenize

with open ('preprocessing.txt', encoding = 'utf-8') as fin:
    tokens = word_tokenize(fin.read())

In [4]:
tokens

['An',
 'explosion',
 'targeting',
 'a',
 'tourist',
 'bus',
 'has',
 'injured',
 'at',
 'least',
 '16',
 'people',
 'near',
 'the',
 'Grand',
 'Egyptian',
 'Museum',
 ',',
 'next',
 'to',
 'the',
 'pyramids',
 'in',
 'Giza',
 ',',
 'security',
 'sources',
 'say',
 'E.U',
 '.',
 'South',
 'African',
 'tourists',
 'are',
 'among',
 'the',
 'injured',
 '.',
 'Most',
 'of',
 'those',
 'hurt',
 'suffered',
 'minor',
 'injuries',
 ',',
 'while',
 'three',
 'were',
 'treated',
 'in',
 'hospital',
 ',',
 'N.A.T.O',
 '.',
 'say',
 '.',
 'http',
 ':',
 '//www.ibiblio.org/pub/docs/books/gutenberg/etext01',
 '@',
 'nickname',
 'of',
 'twitter',
 'user',
 'and',
 'his',
 'email',
 'is',
 'email',
 '@',
 'gmail.com',
 '.',
 'A',
 'device',
 'went',
 'off',
 'close',
 'to',
 'the',
 'museum',
 'fence',
 'as',
 'the',
 'bus',
 'was',
 'passing',
 'on',
 '16/02/2012',
 '.',
 'On',
 'the',
 '13',
 'Feb.',
 '2007',
 ',',
 'Theresa',
 'May',
 'announced',
 'on',
 'MTV',
 'news',
 'that',
 'the',
 'rate',

### Cleaning

#### Lower casing

In [5]:
cleaning = open('preprocessing.txt', 'rt', encoding = 'utf-8').read().lower()

In [6]:
print(cleaning)

an explosion targeting a tourist bus has injured at least 16 people near the grand egyptian museum, 
next to the pyramids in giza, security sources say e.u.

south african tourists are among the injured. most of those hurt suffered minor injuries, 
while three were treated in hospital, n.a.t.o. say.

http://www.ibiblio.org/pub/docs/books/gutenberg/etext01

@nickname of twitter user and his email is email@gmail.com . 

a device went off close to the museum fence as the bus was passing on 16/02/2012.

on the 13 feb. 2007, theresa may announced on mtv news that the rate of childhod obesity had risen from 7.3-9.6 % in just 3 years , costing the n.a.t.o £20m


#### Removal of URLs

In [7]:
# to work with regular expressions
import re

cleaning = re.sub(r'(https|http|ftp)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', cleaning, flags=re.MULTILINE)

In [8]:
print(cleaning)

an explosion targeting a tourist bus has injured at least 16 people near the grand egyptian museum, 
next to the pyramids in giza, security sources say e.u.

south african tourists are among the injured. most of those hurt suffered minor injuries, 
while three were treated in hospital, n.a.t.o. say.



@nickname of twitter user and his email is email@gmail.com . 

a device went off close to the museum fence as the bus was passing on 16/02/2012.

on the 13 feb. 2007, theresa may announced on mtv news that the rate of childhod obesity had risen from 7.3-9.6 % in just 3 years , costing the n.a.t.o £20m


#### Removal of punctuations

In [9]:
# uncomment the lines below to install the packages
#!pip install spacy
#!python -m spacy download en_core_web_sm

In [10]:
import spacy
import en_core_web_sm
import string

In [11]:
nlp = en_core_web_sm.load()

In [12]:
doc = nlp(cleaning)
tokens = [t.text for t in doc]

In [13]:
tokens_without_punct_python = [t for t in tokens if t not in string.punctuation]

In [14]:
# tokens after removal of punctuation
tokens_without_punct_python

['an',
 'explosion',
 'targeting',
 'a',
 'tourist',
 'bus',
 'has',
 'injured',
 'at',
 'least',
 '16',
 'people',
 'near',
 'the',
 'grand',
 'egyptian',
 'museum',
 '\n',
 'next',
 'to',
 'the',
 'pyramids',
 'in',
 'giza',
 'security',
 'sources',
 'say',
 'e.u',
 '\n\n',
 'south',
 'african',
 'tourists',
 'are',
 'among',
 'the',
 'injured',
 'most',
 'of',
 'those',
 'hurt',
 'suffered',
 'minor',
 'injuries',
 '\n',
 'while',
 'three',
 'were',
 'treated',
 'in',
 'hospital',
 'n.a.t.o',
 'say',
 '\n\n\n\n',
 '@nickname',
 'of',
 'twitter',
 'user',
 'and',
 'his',
 'email',
 'is',
 'email@gmail.com',
 '\n\n',
 'a',
 'device',
 'went',
 'off',
 'close',
 'to',
 'the',
 'museum',
 'fence',
 'as',
 'the',
 'bus',
 'was',
 'passing',
 'on',
 '16/02/2012',
 '\n\n',
 'on',
 'the',
 '13',
 'feb',
 '2007',
 'theresa',
 'may',
 'announced',
 'on',
 'mtv',
 'news',
 'that',
 'the',
 'rate',
 'of',
 'childhod',
 'obesity',
 'had',
 'risen',
 'from',
 '7.3',
 '9.6',
 'in',
 'just',
 '3',


In [15]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

#detokenize to string
cleaning = TreebankWordDetokenizer().detokenize(tokens_without_punct_python)

In [16]:
print(cleaning)

an explosion targeting a tourist bus has injured at least 16 people near the grand egyptian museum 
 next to the pyramids in giza security sources say e.u 

 south african tourists are among the injured most of those hurt suffered minor injuries 
 while three were treated in hospital n.a.t.o say 



 @nickname of twitter user and his email is email@gmail.com 

 a device went off close to the museum fence as the bus was passing on 16/02/2012 

 on the 13 feb 2007 theresa may announced on mtv news that the rate of childhod obesity had risen from 7.3 9.6 in just 3 years costing the n.a.t.o £ 20 m


#### Removal of stop words

In [17]:
nlp = en_core_web_sm.load()

In [18]:
text_without_stop_words = [t.text for t in nlp(cleaning) if not t.is_stop]

In [19]:
cleaning = TreebankWordDetokenizer().detokenize(text_without_stop_words)

In [20]:
print(cleaning)

explosion targeting tourist bus injured 16 people near grand egyptian museum 
  pyramids giza security sources e.u 

  south african tourists injured hurt suffered minor injuries 
  treated hospital n.a.t.o 



  @nickname twitter user email email@gmail.com 

  device went close museum fence bus passing 16/02/2012 

  13 feb 2007 theresa announced mtv news rate childhod obesity risen 7.3 9.6 3 years costing n.a.t.o £ 20 m


#### Removal of frequent words

In [21]:
cleaning_iliad = open('The-Iliad-of-Homer.txt', 'rt', encoding = 'utf-8').read().lower()

In [22]:
#Preprocessing: removal of punctuation
cleaning_iliad = cleaning_iliad.translate(str.maketrans('', '', string.punctuation))

In [23]:
tokens_iliad = nltk.word_tokenize(cleaning_iliad)

In [24]:
fdist_iliad = nltk.FreqDist(tokens_iliad)

In [25]:
frequent_words = [*fdist_iliad][:50] #50 most frequent words

In [26]:
frequent_words

['the',
 'and',
 'of',
 'to',
 'his',
 'in',
 'he',
 'him',
 'that',
 'a',
 'with',
 'for',
 'but',
 'from',
 'i',
 'thou',
 'on',
 'they',
 'all',
 'them',
 'so',
 'son',
 'not',
 'then',
 'as',
 'was',
 'is',
 'it',
 'me',
 'their',
 'be',
 'my',
 'even',
 'when',
 'now',
 'spake',
 'by',
 'thee',
 'were',
 'achaians',
 'men',
 'trojans',
 'her',
 'will',
 'man',
 'ships',
 'zeus',
 'at',
 'thy',
 'hector']

In [27]:
resultwords  = [word for word in re.split("\W+",cleaning_iliad) if word.lower() not in frequent_words]

In [28]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

cleaning_iliad = TreebankWordDetokenizer().detokenize(resultwords)

In [29]:
print(cleaning_iliad)

project gutenberg etext iliad homer done into english prose andrew lang ma walter leaf littd ernest myers ma copyright laws are changing over world sure check laws your country before redistributing these files please take look important information this header we encourage you keep this file your own disk keeping an electronic path open next readers do remove this this should first thing seen anyone opens book do change or edit without written permission words are carefully chosen provide users information need about what can legally do texts welcome world free plain vanilla electronic texts etexts readable both humans computers since 1971 these etexts prepared hundreds volunteers donations information contacting project gutenberg get etexts further information included below we need your donations presently contributions are only being solicited people texas nevada idaho montana wyoming colorado south dakota iowa indiana vermont requirements other states are met additions this list m

#### Removal of rare words

In [30]:
tokens_iliad = nltk.word_tokenize(cleaning_iliad)

In [31]:
fdist_iliad = nltk.FreqDist(tokens_iliad)

In [32]:
rare_words = [*fdist_iliad][-50:] #50 less frequent words

In [33]:
rare_words

['chafe',
 'prime',
 'crierherald',
 'felloes',
 'wellspun',
 'unworthy',
 'rightful',
 'cunninglywrought',
 'opposite',
 'sipylos',
 'couchingplaces',
 'acheloos',
 'skinned',
 'courtyardclose',
 'bedstead',
 'mantles',
 'clothing',
 'meaning',
 'reference',
 'custom',
 'tellest',
 'recess',
 'firmwrought',
 'slumbered',
 'unespied',
 'saffron',
 'crier',
 'unendurable',
 'minstrel',
 'dirge',
 'guardedst',
 'keptest',
 'voyaging',
 'memorable',
 'evermore',
 'smoking',
 'keenedged',
 'unending',
 'troylandwould',
 'twentieth',
 'despiteful',
 'palacehalls',
 'motherbut',
 'ownthen',
 'soothe',
 'gentleness',
 'watchers',
 'zeusfostered',
 'homers',
 'translated']

In [34]:
resultwords  = [word for word in re.split("\W+",cleaning_iliad) if word.lower() not in rare_words]

In [35]:
cleaning_iliad = TreebankWordDetokenizer().detokenize(resultwords)

In [36]:
print(cleaning_iliad)

project gutenberg etext iliad homer done into english prose andrew lang ma walter leaf littd ernest myers ma copyright laws are changing over world sure check laws your country before redistributing these files please take look important information this header we encourage you keep this file your own disk keeping an electronic path open next readers do remove this this should first thing seen anyone opens book do change or edit without written permission words are carefully chosen provide users information need about what can legally do texts welcome world free plain vanilla electronic texts etexts readable both humans computers since 1971 these etexts prepared hundreds volunteers donations information contacting project gutenberg get etexts further information included below we need your donations presently contributions are only being solicited people texas nevada idaho montana wyoming colorado south dakota iowa indiana vermont requirements other states are met additions this list m

#### Spelling correction

In [37]:
from spellchecker import SpellChecker

In [38]:
spell = SpellChecker(language='pt')

In [39]:
bras_cubas = open('bras_cubas.txt', 'rt', encoding = 'utf-8').read().lower()

In [40]:
print(bras_cubas)

em vedade vos digo que toda a sabdoria humana não vale um par de botas cuurtas. o vício é muitaas vezes o extrume da virtude. o que não 
inpede que a virtude seja uma flor cheiroosa e sã. eu, que meditava ir ter com a moorte, não ouzei fitá-la quando ela veio ter commigo.


In [41]:
tokens_bras_cubas = nltk.word_tokenize(bras_cubas)

In [42]:
correction = []

for word in tokens_bras_cubas:
    # Get the one `most likely` answer
    correction.append(spell.correction(word))

In [43]:
correction = TreebankWordDetokenizer().detokenize(correction)
print(correction)

em verdade vos digo que toda a sabedoria humana não vale um par de botas curtas . o vício é muitas vezes o estrume da virtude . o que não impede que a virtude seja uma flor cheirosa e sã . eu, que meditar ir ter com a morte, não ousei fitá-la quando ela veio ter comigo.


#### Removal of emojis

In [44]:
import pandas as pd

In [45]:
tweets = pd.read_csv('sample.csv')

In [46]:
pd.set_option('display.max_colwidth', -1)

print(tweets["text"].loc[[0]])

0    @AppleSupport causing the reply to be disregarded and the tapped notification under the keyboard is opened😡😡😡
Name: text, dtype: object


  """Entry point for launching an IPython kernel.


In [47]:
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

In [48]:
tweets["text_without_emojis"] = tweets["text"].apply(lambda text: remove_emoji(text))

In [49]:
print(tweets["text_without_emojis"].loc[[0]])

0    @AppleSupport causing the reply to be disregarded and the tapped notification under the keyboard is opened
Name: text_without_emojis, dtype: object


#### Removal of emoticons

In [50]:
tweets["text"].loc[[79]]

79    @105856 Can you DM me your full name, address and email? I'd be happy to look into this further for you :) Thanks - Mike
Name: text, dtype: object

In [51]:
# uncomment the line below to install the package
#!pip install emot --upgrade

In [52]:
from emot.emo_unicode import UNICODE_EMO, EMOTICONS

In [53]:
def remove_emoticons(text):
    emoticon_pattern = re.compile(u'(' + u'|'.join(k for k in EMOTICONS) + u')')
    return emoticon_pattern.sub(r'', text)

In [54]:
tweets["text_without_emoticons"] = tweets["text"].apply(lambda text: remove_emoticons(text))

In [55]:
tweets["text_without_emoticons"].loc[[79]]

79    @105856 Can you DM me your full name, address and email? I'd be happy to look into this further for you  Thanks - Mike
Name: text_without_emoticons, dtype: object

#### Removal of HTML tags/markups

In [56]:
# uncomment the line below to install the package
#!pip install beautifulsoup4

In [57]:
raw_html = open('w3_schools.txt', 'rt', encoding = 'utf-8').read().lower()

In [58]:
print(raw_html)

easy learning with html "try it yourself"

with our "try it yourself" editor, you can edit the html code and view the result:
example
<!doctype html>
<html>
<head>
<title>page title</title>
</head>
<body>

<h1>this is a heading</h1>
<p>this is a paragraph.</p>

</body>
</html>

click on the "try it yourself" button to see how it works.


In [59]:
from bs4 import BeautifulSoup

cleantext = BeautifulSoup(raw_html, "lxml").text

In [60]:
print(cleantext)

easy learning with html "try it yourself"

with our "try it yourself" editor, you can edit the html code and view the result:
example


page title
this is a heading
this is a paragraph.




click on the "try it yourself" button to see how it works.


#### Chat words conversion

In [61]:
chat_words_str = """
AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace
BB=Baby
BAK=Back At Keyboard
BBL=Be Back Later
BBS=Be Back Soon
BFN=Bye For Now
B4N=Bye For Now
BRB=Be Right Back
BRT=Be Right There
BTW=By The Way
B4=Before
B4N=Bye For Now
CU=See You
CUL8R=See You Later
CYA=See You
FAQ=Frequently Asked Questions
FC=Fingers Crossed
FWIW=For What It's Worth
FYI=For Your Information
GAL=Get A Life
GG=Good Game
GN=Good Night
GMTA=Great Minds Think Alike
GR8=Great!
GTG=Got to Go
G9=Genius
IC=I See
ICQ=I Seek you (also a chat program)
ILU=I Love You
IMHO=In My Honest/Humble Opinion
IMO=In My Opinion
IOW=In Other Words
IRL=In Real Life
KISS=Keep It Simple, Stupid
LDR=Long Distance Relationship
LMAO=Laugh My A.. Off
LOL=Laughing Out Loud
LTNS=Long Time No See
L8R=Later
MTE=My Thoughts Exactly
M8=Mate
NRN=No Reply Necessary
OIC=Oh I See
PITA=Pain In The A..
PRT=Party
PRW=Parents Are Watching
ROFL=Rolling On The Floor Laughing
ROFLOL=Rolling On The Floor Laughing Out Loud
ROTFLMAO=Rolling On The Floor Laughing My A.. Off
SK8=Skate
STATS=Your sex and age
STFU=Shut the F... Up
ASL=Age, Sex, Location
THX=Thank You
TTFN=Ta-Ta For Now!
TTYL=Talk To You Later
U=You
U2=You Too
U4E=Yours For Ever
WB=Welcome Back
WTF=What The F...
WTG=Way To Go!
WUF=Where Are You From?
W8=Wait...
7K=Sick:-D Laugher
"""

In [62]:
chat = open('urban_dictionary.txt', 'rt', encoding = 'utf-8').read().replace('\n', ' ')

In [63]:
chat = chat.translate(str.maketrans('', '', string.punctuation)) #Preprocessing: removal of punctuation

In [64]:
chat_words_map_dict = {}
chat_words_list = []
for line in chat_words_str.split("\n"):
    if line != "":
        cw = line.split("=")[0]
        cw_expanded = line.split("=")[1]
        chat_words_list.append(cw)
        chat_words_map_dict[cw] = cw_expanded
chat_words_list = set(chat_words_list)

def chat_words_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words_list:
            new_text.append(chat_words_map_dict[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [65]:
chat_words_conversion(chat)

"When will the page be back As Far As I Know in 2 hours I was kicked for being Away From Keyboard in the game This work has to be made As Soon As Possible Some twitter bots actually have real people At The Keyboard Sorry Im busy At The Moment i would love to catch up with you Anytime, Anywhere, Anyplace person1 were are you person2 I were Away From Keyboard but now Im Back At Keyboard Dave Hi Josh do you want to go online in a sec Josh Yeah sure but Be Right Back going to get some food Dave OK 8 Hours Later Josh Im Back Dave Where the hell have you been To the other side of the planet and back Josh No I went to watch about 20 movies and never mentioned it to you because I couldnt be asked telling you Dave Ok Come and join this server oke ill Be Right There You bin there Before bro ya wit You Bye For Now Ill See You the restaurant See You later Houses Urbandictionary needs an Frequently Asked Questions page For What It's Worth the Supra was the best sports car Toyota ever made For Your 

### Normalization

#### Dates, numbers, currency, percent to text

In [66]:
normalization = open('preprocessing.txt', 'rt', encoding = 'utf-8').read()

In [67]:
punctuation = '!"#&\'()*+,-/:;<=>?[\\]^_`{|}~' #Preprocessing: removal of punctuation

In [68]:
nlp = en_core_web_sm.load()

In [69]:
doc = nlp(normalization)
tokens = [t.text for t in doc]

In [70]:
norm_without_punct_python = [t for t in tokens if t not in punctuation]

In [71]:
normalization = TreebankWordDetokenizer().detokenize(norm_without_punct_python)

In [72]:
print(normalization)

An explosion targeting a tourist bus has injured at least 16 people near the Grand Egyptian Museum 
 next to the pyramids in Giza security sources say E.U. 

 South African tourists are among the injured . Most of those hurt suffered minor injuries 
 while three were treated in hospital N.A.T.O. say . 

 http://www.ibiblio.org/pub/docs/books/gutenberg/etext01 

 @nickname of twitter user and his email is email@gmail.com . 

 A device went off close to the museum fence as the bus was passing on 16/02/2012 . 

 On the 13 Feb. 2007 Theresa May announced on MTV news that the rate of childhod obesity had risen from 7.3 9.6% in just 3 years costing the N.A.T.O £ 20 m


In [73]:
# uncomment the line below to install the package
#!pip install normalise

In [74]:
from normalise import normalise

user_abbr = {
    "N.A.T.O": "North Atlantic Treaty Organization"
}

normalized_tokens = normalise(word_tokenize(normalization), user_abbrevs=user_abbr, verbose=False)



In [75]:
normalization = TreebankWordDetokenizer().detokenize(normalized_tokens)

In [76]:
print(normalization)

An explosion marketing a tourist bus has injured at least sixteen people near the Grand Egyptian Museum next to the pyramids in Giza security sources say E U . South African tourists are among the injured . Most of those hurt suffered minor injuries while three were treated in hospital North Atlantic Treaty Organization . say . help: W W W dot i bib l io dot org pub does books gutenberg text one @ nickname of twitter user and his email is email @ gmail.com . A device went off close to the museum fence as the bus was passing on the sixteenth of February twenty twelve . On the thirteenth of February two thousand and seven Theresa May announced on M T V news that the rate of childhood obesity had risen from seven point three nine point six% in just three years costing the North Atlantic Treaty Organization £ twenty metres


#### Conversion of emoticons to words

In [77]:
def convert_emoticons(text):
    for emot in EMOTICONS:
        text = re.sub(u'('+emot+')', "_".join(EMOTICONS[emot].replace(",","").split()), text)
    return text

In [78]:
tweets["text"].loc[[79]]

79    @105856 Can you DM me your full name, address and email? I'd be happy to look into this further for you :) Thanks - Mike
Name: text, dtype: object

In [79]:
tweets["emoticons_to_text"] = tweets["text"].apply(lambda text: convert_emoticons(text))

In [80]:
tweets["emoticons_to_text"].loc[[79]]

79    @105856 Can you DM me your full name, address and email? I'd be happy to look into this further for you Happy_face_or_smiley Thanks - Mike
Name: emoticons_to_text, dtype: object

#### Conversion of emojis to words

In [81]:
def convert_emojis(text):
    for emot in UNICODE_EMO:
        text = text.replace(emot, "_".join(UNICODE_EMO[emot].replace(",","").replace(":","").split()))
    return text

In [82]:
tweets["text"].loc[[0]]

0    @AppleSupport causing the reply to be disregarded and the tapped notification under the keyboard is opened😡😡😡
Name: text, dtype: object

In [83]:
tweets["emojis_to_text"] = tweets["text"].apply(lambda text: convert_emojis(text))

In [84]:
tweets["emojis_to_text"].loc[[0]]

0    @AppleSupport causing the reply to be disregarded and the tapped notification under the keyboard is openedpouting_facepouting_facepouting_face
Name: emojis_to_text, dtype: object

### Lemmatization

In [85]:
#(PYTORCH NEEDED) In Anaconda Prompt: conda install pytorch=1.3.0 -c pytorch
# OR
#pip install pip install pytorch==1.3.0
# uncomment the line below to install necessary files for the pipeline
#!pip install stanza

In [86]:
import stanza

In [87]:
# uncomment the line below to install necessary files for the pipeline
#stanza.download('pt')
nlp = stanza.Pipeline('pt')

2020-10-19 02:08:07 INFO: Loading these models for language: pt (Portuguese):
| Processor | Package |
-----------------------
| tokenize  | bosque  |
| mwt       | bosque  |
| pos       | bosque  |
| lemma     | bosque  |
| depparse  | bosque  |

2020-10-19 02:08:08 INFO: Use device: gpu
2020-10-19 02:08:08 INFO: Loading: tokenize
2020-10-19 02:08:11 INFO: Loading: mwt
2020-10-19 02:08:11 INFO: Loading: pos
2020-10-19 02:08:14 INFO: Loading: lemma
2020-10-19 02:08:14 INFO: Loading: depparse
2020-10-19 02:08:17 INFO: Done loading processors!


In [88]:
capivara = open('capivara-pt.txt', 'rt', encoding = 'utf-8').read()

In [89]:
lemma = ""

In [90]:
for sent in nlp(capivara).sentences:
    for word in sent.words:
        lemma += word.lemma + " "

In [91]:
# lemmatizated text
print(lemma)

o capivara ( nome científico : hydrochoeru hydrochaeri ) ser um espécie de mamífero roedor de o família caviidae e subfamília hydrochoerinae . algum autor considerar que dever ser classificado em um família próprio . estar incluir em o mesmo grupo de roedor a o qual se classificar o paca , cutia , o preá e o porquinho-da-índia . ocorrer por todo o américa de o sul a o leste de o ande em habitat associar a rio , lago e pântano , de o nível de o mar até 1 300 m de altitude . extremamente adaptável , poder ocorrer em ambiente altamente alterar por o ser humano . ser o grande roedor de o mundo , pesar até 91 kg e medir até 1,2 m de comprimento e 60 cm de altura . o pelagem ser denso , de cor avermelhar a marrom escuro . ser possível distinguir o macho por conta de o presença de um glândula proeminente em o focinho apesar de o dimorfismo sexual não ser aparente . existir um série de adaptação em o sistema digestório a o herbivoria , principalmente em o ceco . alcançar o maturidade sexual co

#### Stemming

In [92]:
stemmer = nltk.stem.RSLPStemmer()

In [93]:
capivara_tokens = nltk.word_tokenize(capivara)

In [94]:
from nltk.stem import RSLPStemmer

def Stemming(sentence):
    stemmer = RSLPStemmer()
    phrase = []
    for word in sentence:
        phrase.append(stemmer.stem(word.lower()))
    return phrase

In [95]:
capivara_tokens = Stemming(capivara_tokens)

In [96]:
capivara_tokens = TreebankWordDetokenizer().detokenize(capivara_tokens)

In [97]:
# stemmed text
print(capivara_tokens)

a capiv (nom científ: hydrochoeru hydrochaeril) é uma espéci de mamífer roed da famíl caviida e subfamíl hydrochoerina . algum autor consid que dev ser classific em uma famíl própr . est inclu no mesm grup de roed ao qual se classific as pac, cut, os pre e o porquinho-da-índ . ocorr por tod a amér do sul ao lest do and em habitat associ a rio, lag e pânt, do nível do mar até 1 300 m de altitud . extrem adapt, pod ocorr em ambi alt alter pel ser human . é o mai roed do mund, pes até 91 kg e med até 1,2 m de compr e 60 cm de altur . a pel é dens, de cor avermelh a marrom escur . é possível distingu os mach por cont da presenç de uma glândul proemin no foc apes do dimorf sex não ser aparent . exist uma séri de adapt no sistem digestóri à herbiv, princip no cec . alcanç a matur sex com cerc de 1,5 ano de idad, e as fême dão à luz geral a quatr filhot por vez, pes até 1,5 kg e já nasc com pel e dent permanent . em cativ, pod viv até 12 ano de idad.


### Text data to vectors of numbers

#### Word counts

In [98]:
# uncomment the line below to install necessary files for the pipeline
# pip install -U scikit-learn
# OR
# conda install scikit-learn (if using Anaconda)

In [99]:
from sklearn.feature_extraction.text import CountVectorizer

In [100]:
vectorizer = CountVectorizer()

In [101]:
# the text must be converted to a iterable
cleaning_iliad = [cleaning_iliad]

# learn the vocabulary from text
vectorizer.fit(cleaning_iliad)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [102]:
#print learned vocabulary from text
print(vectorizer.vocabulary_)

{'project': 5351, 'gutenberg': 3139, 'etext': 2334, 'iliad': 3601, 'homer': 3463, 'done': 1990, 'into': 3707, 'english': 2238, 'prose': 5377, 'andrew': 293, 'lang': 3938, 'ma': 4231, 'walter': 7571, 'leaf': 3987, 'littd': 4112, 'ernest': 2306, 'myers': 4578, 'copyright': 1501, 'laws': 3974, 'are': 381, 'changing': 1198, 'over': 4891, 'world': 7936, 'sure': 6757, 'check': 1232, 'your': 8028, 'country': 1535, 'before': 680, 'redistributing': 5569, 'these': 6983, 'files': 2598, 'please': 5186, 'take': 6832, 'look': 4149, 'important': 3637, 'information': 3669, 'this': 7015, 'header': 3263, 'we': 7652, 'encourage': 2216, 'you': 8021, 'keep': 3809, 'file': 2596, 'own': 4918, 'disk': 1919, 'keeping': 3812, 'an': 285, 'electronic': 2183, 'path': 5002, 'open': 4808, 'next': 4660, 'readers': 5530, 'do': 1967, 'remove': 5620, 'should': 6112, 'first': 2616, 'thing': 7000, 'seen': 5968, 'anyone': 335, 'opens': 4814, 'book': 877, 'change': 1195, 'or': 4824, 'edit': 2151, 'without': 7885, 'written':

In [103]:
# encode the text as a vector
vector = vectorizer.transform(cleaning_iliad)

In [104]:
# vocabulary length
print(vector.shape)

(1, 8045)


In [105]:
# a sparse vector, to make it possible to handle a lot of zeros
print(type(vector))

<class 'scipy.sparse.csr.csr_matrix'>


In [106]:
import numpy
import sys

numpy.set_printoptions(threshold=sys.maxsize)
# print the vector
print(vector.toarray())

[[  1   1   1   1   2   1   2   1   1   1   1   1   2   1   1   1   1   1
    1   1   1   1   1   1   1   1   1   2   1   2   1   1   1   1   1   1
    1   4   1   1   2   1   2  49   4   1   4   3   1   2   3   1  21   1
    2 163  63   1   1   1   1   1   3   3   1   2  11   1   2   1   1  19
   13   1   4   3   2   5   1   3   1   2   1   8  30   1   1   1   2   1
    1   1 387   1   1  31   5   3   1   4   2   1   3   5   1   1   5   1
    2   2   1   3   1   4   3   2   2   1   1   1   1   1  10  27   1  31
    1   4   1   2   4   4   1   1   1   1   7  21   1 108   2   7   2 167
    1 207 162   5   1   1   1   1  20   8   3   2   2  13   1   1   1   4
    2   1  23   1  14   9  29 124  26   1   1   1   1   1   1   1   1   1
    1   1   3  14   3  77   1   1   1   1   1  19   2   3   2   1   3   1
    1   1   2   1   6   5   1   1   4   1   1   1   4   3   1  19   1  41
    3   2  14  23   1   5   1   5   3   2   1   1  29   2   1   1   1   2
    1   1   6  57  42  12   1   1  55 

#### Term Frequency-Inverse Document Frequencies (TF-IDF)

In [107]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [108]:
tfidf_vectorizer = TfidfVectorizer()

In [109]:
#we handle more than one text when using TF-IDF; each one will be the element of an array
text = []

noticia1 = open('noticia1.txt', 'rt', encoding = 'utf-8').read().lower()

noticia1.translate(str.maketrans('', '', string.punctuation))

text.append(noticia1)

In [110]:
noticia2 = open('noticia2.txt', 'rt', encoding = 'utf-8').read().lower()

noticia2.translate(str.maketrans('', '', string.punctuation))

text.append(noticia2)

In [111]:
noticia3 = open('noticia3.txt', 'rt', encoding = 'utf-8').read().lower()

noticia3.translate(str.maketrans('', '', string.punctuation))

text.append(noticia3)

In [112]:
# array of texts
text

['em entrevista à globonews, hoje, o promotor lincoln gakiya, do gaeco (grupo de atuação especial de combate ao crime organizado), \ndo ministério público de são paulo, disse que a liberdade de andré do rap é uma ameaça a mais à sua vida, mas também à sociedade. o \npromotor, que também é responsável pelos processos de execuções criminais (análise do cumprimento da pena) dos presos da penitenciária de \npresidente venceslau 2, está ameaçado de morte desde de novembro de 2018, quando pediu a transferência dos líderes do pcc para presídios \nfederais, entre eles marco willians herbas camacho, o marcola.\nmarcola e 21 líderes do pcc acabaram transferidos para presídios federais em fevereiro de 2019 por medo de retaliação do pcc, conforme \nrevelado pelo uol ano passado. ao ser perguntado se a libertação de andré do rap significava uma ameaça a mais contra ele, o promotor \nrespondeu: "sem sombra de dúvida".\nem seguida, explicou que é ameaçado desde o pedido de transferência e emendou: "e

In [113]:
tfidf_vectorizer.fit(text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [114]:
print(tfidf_vectorizer.vocabulary_)

{'em': 72, 'entrevista': 76, 'globonews': 110, 'hoje': 114, 'promotor': 192, 'lincoln': 124, 'gakiya': 107, 'do': 66, 'gaeco': 106, 'grupo': 112, 'de': 58, 'atuação': 19, 'especial': 81, 'combate': 38, 'ao': 14, 'crime': 51, 'organizado': 148, 'ministério': 136, 'público': 195, 'são': 227, 'paulo': 157, 'disse': 65, 'que': 198, 'liberdade': 122, 'andré': 10, 'rap': 199, 'uma': 249, 'ameaça': 8, 'mais': 127, 'sua': 222, 'vida': 254, 'mas': 130, 'também': 230, 'sociedade': 219, 'responsável': 201, 'pelos': 164, 'processos': 190, 'execuções': 90, 'criminais': 52, 'análise': 12, 'cumprimento': 56, 'da': 57, 'pena': 165, 'dos': 68, 'presos': 184, 'penitenciária': 166, 'presidente': 183, 'venceslau': 252, 'está': 87, 'ameaçado': 9, 'morte': 137, 'desde': 61, 'novembro': 143, '2018': 1, 'quando': 196, 'pediu': 162, 'transferência': 245, 'líderes': 126, 'pcc': 160, 'para': 151, 'presídios': 186, 'federais': 95, 'entre': 75, 'eles': 71, 'marco': 128, 'willians': 257, 'herbas': 113, 'camacho': 2

In [115]:
print(tfidf_vectorizer.idf_)

[1.69314718 1.28768207 1.69314718 1.69314718 1.69314718 1.69314718
 1.69314718 1.69314718 1.28768207 1.28768207 1.         1.69314718
 1.69314718 1.69314718 1.69314718 1.28768207 1.69314718 1.69314718
 1.69314718 1.69314718 1.28768207 1.28768207 1.69314718 1.69314718
 1.69314718 1.69314718 1.28768207 1.69314718 1.69314718 1.69314718
 1.69314718 1.69314718 1.69314718 1.69314718 1.         1.69314718
 1.28768207 1.69314718 1.69314718 1.69314718 1.69314718 1.69314718
 1.69314718 1.69314718 1.69314718 1.69314718 1.69314718 1.69314718
 1.69314718 1.69314718 1.69314718 1.28768207 1.69314718 1.
 1.69314718 1.28768207 1.69314718 1.28768207 1.         1.28768207
 1.69314718 1.28768207 1.69314718 1.69314718 1.69314718 1.
 1.         1.69314718 1.69314718 1.69314718 1.         1.69314718
 1.         1.69314718 1.69314718 1.69314718 1.         1.28768207
 1.69314718 1.69314718 1.69314718 1.69314718 1.28768207 1.69314718
 1.         1.69314718 1.69314718 1.28768207 1.69314718 1.
 1.69314718 1.69314

In [116]:
vector1 = tfidf_vectorizer.transform([text[0]])

In [117]:
vector2 = tfidf_vectorizer.transform([text[1]])

In [118]:
vector3 = tfidf_vectorizer.transform([text[2]])

In [119]:
print(vector1.shape)

(1, 258)


In [120]:
print(vector2.shape)

(1, 258)


In [121]:
print(vector3.shape)

(1, 258)


In [122]:
#text 1 vector
print(vector1.toarray())

[[0.         0.04096305 0.05386149 0.05386149 0.05386149 0.
  0.         0.         0.08192611 0.08192611 0.06362293 0.05386149
  0.05386149 0.         0.10772299 0.04096305 0.         0.
  0.         0.05386149 0.04096305 0.04096305 0.         0.
  0.         0.         0.04096305 0.         0.05386149 0.
  0.         0.         0.         0.         0.06362293 0.
  0.08192611 0.         0.05386149 0.         0.         0.
  0.05386149 0.         0.         0.05386149 0.         0.
  0.         0.         0.         0.04096305 0.05386149 0.03181147
  0.         0.04096305 0.05386149 0.16385222 0.54079492 0.08192611
  0.         0.08192611 0.         0.         0.         0.06362293
  0.34992613 0.         0.10772299 0.05386149 0.06362293 0.05386149
  0.15905733 0.05386149 0.         0.05386149 0.03181147 0.04096305
  0.         0.         0.         0.05386149 0.04096305 0.
  0.03181147 0.         0.         0.04096305 0.         0.06362293
  0.05386149 0.         0.04096305 0.0409630

In [123]:
#text 2 vector
print(vector2.toarray())

[[0.         0.06838364 0.         0.         0.         0.
  0.08991627 0.         0.06838364 0.06838364 0.106212   0.
  0.         0.         0.         0.06838364 0.         0.
  0.         0.         0.06838364 0.06838364 0.         0.
  0.         0.         0.06838364 0.         0.         0.
  0.         0.         0.         0.         0.106212   0.
  0.13676729 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.053106
  0.         0.06838364 0.         0.13676729 0.37174199 0.13676729
  0.         0.06838364 0.         0.         0.         0.053106
  0.42484799 0.08991627 0.         0.         0.053106   0.
  0.21242399 0.         0.         0.         0.053106   0.06838364
  0.         0.         0.         0.         0.06838364 0.
  0.053106   0.         0.         0.         0.         0.106212
  0.         0.         0.         0.06838364 0.06838364 0.
  0.08

In [124]:
#text 3 vector
print(vector3.toarray())

[[0.04236292 0.         0.         0.         0.         0.04236292
  0.         0.04236292 0.         0.         0.10008088 0.
  0.         0.04236292 0.         0.         0.04236292 0.04236292
  0.04236292 0.         0.         0.         0.04236292 0.04236292
  0.12708875 0.08472583 0.         0.04236292 0.         0.04236292
  0.04236292 0.04236292 0.04236292 0.04236292 0.05004044 0.04236292
  0.         0.5507179  0.         0.04236292 0.04236292 0.04236292
  0.         0.04236292 0.08472583 0.         0.04236292 0.04236292
  0.04236292 0.04236292 0.04236292 0.03221809 0.         0.02502022
  0.08472583 0.         0.         0.         0.2502022  0.
  0.04236292 0.         0.04236292 0.04236292 0.04236292 0.05004044
  0.2502022  0.         0.         0.         0.17514154 0.
  0.1251011  0.         0.04236292 0.         0.05004044 0.
  0.04236292 0.04236292 0.04236292 0.         0.         0.04236292
  0.02502022 0.04236292 0.04236292 0.03221809 0.04236292 0.05004044
  0.        

#### Tri-grams

In [125]:
from nltk.corpus import stopwords

In [126]:
from nltk.corpus import machado

raw_text = machado.raw('miscelanea/mams03.txt') # "Carta ao Sr. Bispo do RJ (1862)", from 'machado' corpus on NLTK

raw_text = [raw_text]

In [127]:
#Preprocessing before extract the n-grams: removal of special characters and stopwords
def remove_string_special_characters(s):  
    stripped = re.sub('[^a-zA-z\s]', '', s) 
    stripped = re.sub('_', '', stripped) 
      
    stripped = re.sub('\s+', ' ', stripped) 
       
    stripped = stripped.strip() 
    if stripped != '': 
            return stripped.lower()

In [128]:
stop_words = set(stopwords.words('portuguese'))  
for i, line in enumerate(raw_text): 
    raw_text[i] = ' '.join([x for 
        x in nltk.word_tokenize(line) if 
        ( x not in stop_words )]) 

In [129]:
trigram_vectorizer = CountVectorizer(ngram_range = (3,3)) 
X1 = trigram_vectorizer.fit_transform(raw_text)  
features = (trigram_vectorizer.get_feature_names()) 
print("\n\nTrigrams: \n", features) 
print("\n\nX1: \n", X1.toarray()) 



Trigrams: 
 ['04 1862 exma', '18 04 1862', '1862 carta sr', '1862 exma revma', '1994 publicado originalmente', 'abala se grande', 'absolvido atenta pureza', 'acho apreciar ignorância', 'acima interesses falsos', 'acontecendo assim invectivas', 'acusado no meio', 'adiantando situação supremo', 'admirar raras elevadas', 'admirável história redenção', 'afaz desde infância', 'afeito desde infância', 'afetada flagelo indiferença', 'afetavam algum modo', 'afrouxarem diminuírem crescem', 'agora veja exa', 'aguilar iii 1994', 'ainda hoje exmo', 'algum modo sublimidade', 'alguma coisa fizer', 'algumas palavras atavios', 'alma tranqüilidade pouco', 'almas verdadeiramente cristãs', 'altar exa receberá', 'altar tornou se', 'altas funções prelado', 'anima os põe', 'antes verdadeira única', 'aos espíritos menos', 'aos menos rústicos', 'apareceu colunas folhas', 'aplicando bela expressão', 'apreciar ignorância dessa', 'apreciei perto creio', 'apresenta se fórmulas', 'aqueles discípulos filho', 'aqu

In [130]:
trigram_vectorizer_tfidf = TfidfVectorizer(ngram_range = (3,3)) 
X2 = trigram_vectorizer_tfidf.fit_transform(raw_text) 
scores = (X2.toarray()) 
print("\n\nTri-gram Scores: \n", scores)



Tri-gram Scores: 
 [[0.03452383 0.03452383 0.03452383 0.03452383 0.03452383 0.03452383
  0.03452383 0.03452383 0.03452383 0.03452383 0.03452383 0.03452383
  0.03452383 0.03452383 0.03452383 0.03452383 0.03452383 0.03452383
  0.03452383 0.03452383 0.03452383 0.03452383 0.03452383 0.03452383
  0.03452383 0.03452383 0.03452383 0.03452383 0.03452383 0.03452383
  0.03452383 0.03452383 0.03452383 0.03452383 0.03452383 0.03452383
  0.03452383 0.03452383 0.03452383 0.03452383 0.03452383 0.03452383
  0.03452383 0.03452383 0.03452383 0.03452383 0.03452383 0.03452383
  0.03452383 0.03452383 0.03452383 0.03452383 0.03452383 0.03452383
  0.03452383 0.03452383 0.03452383 0.03452383 0.03452383 0.03452383
  0.03452383 0.03452383 0.03452383 0.03452383 0.03452383 0.03452383
  0.03452383 0.03452383 0.03452383 0.03452383 0.03452383 0.03452383
  0.03452383 0.06904767 0.03452383 0.03452383 0.03452383 0.03452383
  0.03452383 0.03452383 0.03452383 0.03452383 0.03452383 0.03452383
  0.03452383 0.03452383 0.0

In [131]:
sums = X2.sum(axis = 0) 
data1 = [] 
for col, term in enumerate(features): 
    data1.append( (term, sums[0,col] )) 
ranking = pd.DataFrame(data1, columns = ['term','rank']) 
words = (ranking.sort_values('rank', ascending = False)) 
print ("\n\nTri-gram Ranking: \n", words.head(10))



Tri-gram Ranking: 
                            term      rank
73   carta sr bispo              0.069048
0    04 1862 exma                0.034524
574  pode contestar lado         0.034524
551  parte clérigos evita        0.034524
552  parte secular eclesiástica  0.034524
553  partido clerical pretexto   0.034524
554  passou procissões ainda     0.034524
555  paulo escritas tinta        0.034524
556  país grotescas ridículas    0.034524
557  países católicos sido       0.034524


#### Bi-grams

In [132]:
#With the same text used in the 3-gram code
bigram_vectorizer = CountVectorizer(ngram_range =(2, 2)) 
X1 = bigram_vectorizer.fit_transform(raw_text)  
features = (bigram_vectorizer.get_feature_names()) 
print("\n\nBi-grams: \n", features)
print("\n\nX1: \n", X1.toarray()) 



Bi-grams: 
 ['04 1862', '18 04', '1862 carta', '1862 exma', '1994 publicado', 'abala se', 'absolvido atenta', 'acho apreciar', 'acima interesses', 'acontecendo assim', 'acusado no', 'adiantando situação', 'admirar raras', 'admirável história', 'afaz desde', 'afeito desde', 'afetada flagelo', 'afetavam algum', 'afrouxarem diminuírem', 'agora veja', 'aguilar iii', 'ainda hoje', 'algum modo', 'alguma coisa', 'algumas palavras', 'alma tranqüilidade', 'almas verdadeiramente', 'altar exa', 'altar tornou', 'altas funções', 'anima os', 'antes verdadeira', 'aos espíritos', 'aos menos', 'apareceu colunas', 'aplicando bela', 'apreciar ignorância', 'apreciei perto', 'apresenta se', 'aqueles discípulos', 'aqui sacerdócio', 'articulista provar', 'artigo pedindo', 'as conseqüências', 'as queixas', 'assim ex', 'assim invectivas', 'assis rio', 'assistir sob', 'atavios polêmica', 'atenta pureza', 'atual chefe', 'autoridade eclesiástica', 'auxílio estado', 'aí correm', 'balcão evangelho', 'base moral',

In [133]:
bigram_vectorizer_tfidf = TfidfVectorizer(ngram_range = (2, 2)) 
X2 = bigram_vectorizer_tfidf.fit_transform(raw_text) 
scores = (X2.toarray()) 
print("\n\nScores: \n", scores)



Scores: 
 [[0.03363364 0.03363364 0.03363364 0.03363364 0.03363364 0.03363364
  0.03363364 0.03363364 0.03363364 0.03363364 0.03363364 0.03363364
  0.03363364 0.03363364 0.03363364 0.03363364 0.03363364 0.03363364
  0.03363364 0.03363364 0.03363364 0.03363364 0.03363364 0.03363364
  0.03363364 0.03363364 0.03363364 0.03363364 0.03363364 0.03363364
  0.03363364 0.03363364 0.03363364 0.03363364 0.03363364 0.03363364
  0.03363364 0.03363364 0.03363364 0.03363364 0.03363364 0.03363364
  0.03363364 0.03363364 0.03363364 0.03363364 0.03363364 0.03363364
  0.03363364 0.03363364 0.03363364 0.03363364 0.03363364 0.03363364
  0.03363364 0.03363364 0.03363364 0.03363364 0.03363364 0.03363364
  0.03363364 0.03363364 0.03363364 0.03363364 0.03363364 0.03363364
  0.03363364 0.03363364 0.03363364 0.03363364 0.03363364 0.03363364
  0.03363364 0.06726728 0.03363364 0.03363364 0.03363364 0.03363364
  0.03363364 0.03363364 0.03363364 0.03363364 0.03363364 0.03363364
  0.03363364 0.03363364 0.03363364 0

In [134]:
sums = X2.sum(axis = 0) 
data1 = [] 
for col, term in enumerate(features): 
    data1.append( (term, sums[0, col] )) 
ranking = pd.DataFrame(data1, columns = ['term', 'rank']) 
words = (ranking.sort_values('rank', ascending = False)) 
print ("\n\nBi-gram Ranking: \n", words.head(7)) 



Bi-gram Ranking: 
                   term      rank
301  exmo sr            0.168168
666  rio janeiro        0.100901
355  grande descrédito  0.067267
575  possa contar       0.067267
73   carta sr           0.067267
378  idéia religiosa    0.067267
288  exa não            0.067267


#### Binary

In [135]:
cv = CountVectorizer(cleaning_iliad,binary=True)

In [136]:
cv.fit(cleaning_iliad)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8',
        input=['project gutenberg etext iliad homer done into english prose andrew lang ma walter leaf littd ernest myers ma copyright laws are changing over world sure check laws your country before redistributing these files please take look important information this header we encourage you keep this fil...palace priam king thus held funeral tamer horses end project gutenberg etext iliad lang leaf myers'],
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [137]:
print(cv.vocabulary_)

{'project': 5351, 'gutenberg': 3139, 'etext': 2334, 'iliad': 3601, 'homer': 3463, 'done': 1990, 'into': 3707, 'english': 2238, 'prose': 5377, 'andrew': 293, 'lang': 3938, 'ma': 4231, 'walter': 7571, 'leaf': 3987, 'littd': 4112, 'ernest': 2306, 'myers': 4578, 'copyright': 1501, 'laws': 3974, 'are': 381, 'changing': 1198, 'over': 4891, 'world': 7936, 'sure': 6757, 'check': 1232, 'your': 8028, 'country': 1535, 'before': 680, 'redistributing': 5569, 'these': 6983, 'files': 2598, 'please': 5186, 'take': 6832, 'look': 4149, 'important': 3637, 'information': 3669, 'this': 7015, 'header': 3263, 'we': 7652, 'encourage': 2216, 'you': 8021, 'keep': 3809, 'file': 2596, 'own': 4918, 'disk': 1919, 'keeping': 3812, 'an': 285, 'electronic': 2183, 'path': 5002, 'open': 4808, 'next': 4660, 'readers': 5530, 'do': 1967, 'remove': 5620, 'should': 6112, 'first': 2616, 'thing': 7000, 'seen': 5968, 'anyone': 335, 'opens': 4814, 'book': 877, 'change': 1195, 'or': 4824, 'edit': 2151, 'without': 7885, 'written':

In [138]:
bin_vector = cv.transform(cleaning_iliad)

In [139]:
print(bin_vector.shape)

(1, 8045)


In [140]:
print(type(vector))

<class 'scipy.sparse.csr.csr_matrix'>


In [141]:
numpy.set_printoptions(threshold=sys.maxsize)
print(bin_vector.toarray())

[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

### Github:

URL: https://github.com/higopires/PLN_UFMA_2020_2_Projeto_1

Used files:
 - bras_cubas.txt: plain text file
 - capivara-pt.txt: plain text file
 - noticia1.txt: plain text file
 - noticia2.txt: plain text file
 - noticia3.txt: plain text file
 - preprocessing.txt: plain text file
 - sample.csv: CSV dataset with tweets
 - The-Iliad-of-Homer.txt: plain text file
 - urban_dictionary.txt: plain text file
 - w3_schools.txt: plain text file