In [1]:
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [6]:
from bs4 import BeautifulSoup

### Downloading NLTK resources

The first time you run anything using NLTK, you'll want to go ahead and download the additional resources that aren't distributed directly with the NLTK package. Upon running the nltk.download() command below, the NLTK Downloader window will pop up. In the Collections tab, select "all" and click on Download. As mentioned earlier, this may take several minutes depending on your network connection speed, but you'll only ever need to run it a single time.

Alternatively, you can download the resources you need from the interactive shell:
- nltk.download_shell()
- hit d and type in all to download all packages 
- install averaged_perceptron_tagger
- install maxent_ne_chunker
- install words

#### Extracting text from HTML

In [10]:
from urllib.request import urlopen

# Article used as the running example for the rest of the notebook.
url = "http://venturebeat.com/2014/07/04/facebooks-little-social-experiment-got-you-bummed-out-get-over-it/"

# Read the raw page bytes inside a `with` block so the HTTP response (and its
# underlying connection) is closed as soon as the download finishes.
with urlopen(url) as response:
    html = response.read()

#### Stripping-out HTML formatting

Readability pulls the main body content out of an HTML document and subsequently "cleans it up."

In [21]:
from readability import Document
from bs4 import BeautifulSoup

# Wrap the downloaded page once and ask the same Document for both the title
# and the extracted main content.
article = Document(html)
readable_title = article.title()
readable_article = article.summary()

# The extracted content is fed to BeautifulSoup (lxml parser) and reduced to
# plain text, which is what the tokenizers further down operate on.
soup = BeautifulSoup(readable_article, "lxml")
body = soup.get_text()

# Show the title and the first 200 characters of the text as a sanity check.
print('*** TITLE *** \n\"', readable_title, '\"\n', sep='')
print('*** CONTENT *** \n\"', body[:200], '[...]\"', sep='')


*** TITLE *** 
"Facebook’s little social experiment got you bummed out? Get over it | VentureBeat | Social | by Simon Cohen"

*** CONTENT *** 
"
OP-ED — You would think by the reaction some are having to it that Facebook’s recent admission that it experimented with some people’s feeds is tantamount to Watergate.
You would think there had been[...]"


## Frequency Analysis

#### Tokenize

In [40]:
## split the article text into sentences and display the first one
sentences = sent_tokenize(body)
sentences[0]

'\nOP-ED — You would think by the reaction some are having to it that Facebook’s recent admission that it experimented with some people’s feeds is tantamount to Watergate.'

In [43]:
## split the article text into word-level tokens and display the first ten
words = word_tokenize(body)
words[:10]

['OP-ED', '—', 'You', 'would', 'think', 'by', 'the', 'reaction', 'some', 'are']

In [38]:
from collections import Counter
# Tokenize sentence by sentence and collect every word token in one flat list.
tokens = []
for sentence in sent_tokenize(body):
    tokens.extend(word_tokenize(sentence))

# Count how often each token occurs and display the ten most frequent ones.
vocab_count = Counter(tokens)
vocab_count.most_common(10)

[('.', 43),
 ('the', 38),
 ('’', 32),
 ('of', 29),
 (',', 28),
 ('to', 22),
 ('that', 18),
 ('s', 18),
 ('Facebook', 17),
 ('it', 16)]

##### Stemming and lemmatization 

In [49]:
## Stemming: the result is not necessarily a real word
## (the sample below prints 'wolv' for 'wolves').
porter_stemmer = PorterStemmer()
stem_demo = porter_stemmer.stem('wolves')
print('stemming:', stem_demo)

## Lemmatizing: the same sample word comes back as 'wolf'.
lemmatizer = WordNetLemmatizer()
lemma_demo = lemmatizer.lemmatize('wolves')
print('lemmatizing:', lemma_demo)

## Apply both normalizations to every token of the article.
stemmed_tokens = list(map(porter_stemmer.stem, tokens))
lemmatized_tokens = list(map(lemmatizer.lemmatize, tokens))

stemming: wolv
lemmatizing: wolf


#### Stop words

In [53]:
## NLTK's English stop word list (kept as a list so it can still be sliced).
stopw = stopwords.words('english')

## you can also load your own stopword list from a file, e.g. the one at
## http://www.lextek.com/manuals/onix/stopwords1.html
## (assign it to `stopw`, not `stopwords`, to avoid shadowing the NLTK import)
#  stopw = [w.rstrip() for w in open('stopwords.txt')]

## filter stop words
## - the NLTK English entries are lower case while the tokens keep their
##   original casing, so compare on the lower-cased token; otherwise
##   capitalised stop words such as 'You' or 'The' are not removed
## - look tokens up in a set instead of scanning the list for every token
stopw_lookup = set(stopw)
lemmatized_tokens_no_stop = [t for t in lemmatized_tokens if t.lower() not in stopw_lookup]

## preview a few stop words (last expression, so the notebook displays it)
stopw[:5]

#### Pos tagging

In [56]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
## part-of-speech tagging demo: each token is returned paired with its tag
example_tokens = "Machine learning is great".split()
nltk.pos_tag(example_tokens)

[('Machine', 'NN'), ('learning', 'NN'), ('is', 'VBZ'), ('great', 'JJ')]

#### Tagging Tokens

In [59]:
### simple graph to show sentence structure
s = "Albert Einstein was born on March 14, 1879"
#s = "Chengyu Huang is a chinese name"
sample_words = s.split()            ## whitespace split only, so "14," keeps its comma
tags = nltk.pos_tag(sample_words)   ## (word, tag) pairs for the sample sentence
## The result is stored and drawn rather than printed: the original author
## noted that printing it raised an error for them.
result = nltk.ne_chunk(tags)
result.draw()                       ## renders the chunked sentence as a diagram

#### Chunking with regex

In [60]:
## `tagged` was never created by any visible cell, so this cell failed with a
## NameError on a fresh kernel. Build it here from the article text: one list
## of (word, tag) pairs per sentence.
## NOTE(review): the original data behind `tagged` is unknown; confirm that
## the article body is the intended input.
tagged = [nltk.pos_tag(word_tokenize(sent)) for sent in sent_tokenize(body)]

## this is just a basic chunk:
## any adverbs (RB*), then any verbs (VB*), then one or more proper nouns
## (NNP), optionally followed by a single noun (NN)
chunkGram = r"""chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
chunkParser = nltk.RegexpParser(chunkGram)

## chunk the first tagged sentence and draw the resulting tree
chunked = chunkParser.parse(tagged[0])
chunked.draw()

#### Chinking with regex

In [44]:
## remove things from chunks
## first chunk everything, then chink (cut out) the unwanted tags
chinkGram = r"""chunk: {<.*>+}
                        }<VB.?|IN|DT|TO>{"""   ## remove any VB* or IN or DT or TO from chunk

## Build the parser from chinkGram. The original passed chunkGram here, so the
## chink rule defined above was never applied.
chunkParser = nltk.RegexpParser(chinkGram)

## `tagged` is not created in this cell; it must already exist in the kernel
## as a sequence whose first item is a POS-tagged sentence.
chunked = chunkParser.parse(tagged[0])
chunked.draw()