# Introduction to NLTK
- https://katacoda.com/basiafusinska/courses/nlp-with-python/nltk-intro

In [1]:
import nltk
nltk.download("movie_reviews", quiet=True)
#from nltk.corpus import movie_reviews

def load_movie_reviews():

    # movie_reviews is a sizeable corpus to import, so only load it if we have to
    from nltk.corpus import movie_reviews
    try:
        movie_reviews.categories()
    except:
        import nltk
        print('This appears to be your first time using the NLTK Movie Reviews corpus. We will first download the necessary corpus (this is a one-time download that might take a little while')
        nltk.download('movie_reviews')
        from nltk.corpus import movie_reviews

    raw_data = []

    # NLTK's corpus is structured in an interesting way
    # first iterate through the two categories (pos and neg)
    for category in movie_reviews.categories():

        if category == 'pos':
            pretty_category_name = 'positive'
        elif category == 'neg':
            pretty_category_name = 'negative'

        # each of these categories is just fileids, so grab those
        for fileid in movie_reviews.fileids(category):

            # then each review is a NLTK class where each item in that class instance is a word
            review_words = movie_reviews.words(fileid)
            review_text = ''

            for word in review_words:
                review_text += ' ' + word

            review_dictionary = {
                'text': review_text,
                'sentiment': pretty_category_name
            }

            raw_data.append(review_dictionary)

    return raw_data 

documents=load_movie_reviews()
documents=[document['text'] for document in documents]

In [2]:
documents[1]

' the happy bastard \' s quick movie review damn that y2k bug . it \' s got a head start in this movie starring jamie lee curtis and another baldwin brother ( william this time ) in a story regarding a crew of a tugboat that comes across a deserted russian tech ship that has a strangeness to it when they kick the power back on . little do they know the power within . . . going for the gore and bringing on a few action sequences here and there , virus still feels very empty , like a movie going for all flash and no substance . we don \' t know why the crew was really out in the middle of nowhere , we don \' t know the origin of what took over the ship ( just that a big pink flashy thing hit the mir ) , and , of course , we don \' t know why donald sutherland is stumbling around drunkenly throughout . here , it \' s just " hey , let \' s chase these people around with some robots " . the acting is below average , even from the likes of curtis . you \' re more likely to get a kick out of 

In [12]:
from nltk import sent_tokenize
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
sent_tokenize(documents[0])

[' plot : two teen couples go to a church party , drink and then drive .',
 'they get into an accident .',
 'one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares .',
 "what ' s the deal ?",
 'watch the movie and " sorta " find out .',
 '.',
 '.',
 'critique : a mind - fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package .',
 "which is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn ' t snag this one correctly .",
 'they seem to have taken this pretty neat concept , but executed it terribly .',
 'so what are the problems with the movie ?',
 "well , its main problem is that it ' s simply too jumbled .",
 'it starts off " normal " but then downshifts into this " fantasy " world 

In [16]:
from nltk import word_tokenize

In [17]:
word_tokenize(documents[0])

['plot',
 ':',
 'two',
 'teen',
 'couples',
 'go',
 'to',
 'a',
 'church',
 'party',
 ',',
 'drink',
 'and',
 'then',
 'drive',
 '.',
 'they',
 'get',
 'into',
 'an',
 'accident',
 '.',
 'one',
 'of',
 'the',
 'guys',
 'dies',
 ',',
 'but',
 'his',
 'girlfriend',
 'continues',
 'to',
 'see',
 'him',
 'in',
 'her',
 'life',
 ',',
 'and',
 'has',
 'nightmares',
 '.',
 'what',
 "'",
 's',
 'the',
 'deal',
 '?',
 'watch',
 'the',
 'movie',
 'and',
 '``',
 'sorta',
 '``',
 'find',
 'out',
 '.',
 '.',
 '.',
 'critique',
 ':',
 'a',
 'mind',
 '-',
 'fuck',
 'movie',
 'for',
 'the',
 'teen',
 'generation',
 'that',
 'touches',
 'on',
 'a',
 'very',
 'cool',
 'idea',
 ',',
 'but',
 'presents',
 'it',
 'in',
 'a',
 'very',
 'bad',
 'package',
 '.',
 'which',
 'is',
 'what',
 'makes',
 'this',
 'review',
 'an',
 'even',
 'harder',
 'one',
 'to',
 'write',
 ',',
 'since',
 'i',
 'generally',
 'applaud',
 'films',
 'which',
 'attempt',
 'to',
 'break',
 'the',
 'mold',
 ',',
 'mess',
 'with',
 'you

In [18]:
from nltk.probability import FreqDist

In [28]:
words_counts=FreqDist(word_tokenize(documents[0]))

In [30]:
words_counts.most_common(20)

[(',', 44),
 ('the', 38),
 ('.', 34),
 ('it', 25),
 ('and', 20),
 ('to', 16),
 ('of', 16),
 ("'", 16),
 ('a', 14),
 ('that', 13),
 ('are', 13),
 ('is', 12),
 ('but', 10),
 ('``', 10),
 ('-', 10),
 ('this', 10),
 ('there', 10),
 ('10', 10),
 ('s', 9),
 ('(', 9)]

In [35]:
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [37]:
pos_tag("Mary was wearing the hat while walking down the street")

[('M', 'NNP'),
 ('a', 'DT'),
 ('r', 'NN'),
 ('y', 'NN'),
 (' ', 'NNP'),
 ('w', 'VBZ'),
 ('a', 'DT'),
 ('s', 'JJ'),
 (' ', 'NN'),
 ('w', 'NN'),
 ('e', 'VBZ'),
 ('a', 'DT'),
 ('r', 'NN'),
 ('i', 'NN'),
 ('n', 'VBP'),
 ('g', 'NN'),
 (' ', 'NNP'),
 ('t', 'NN'),
 ('h', 'NN'),
 ('e', 'NN'),
 (' ', 'NNP'),
 ('h', 'VBZ'),
 ('a', 'DT'),
 ('t', 'NN'),
 (' ', 'NNP'),
 ('w', 'NN'),
 ('h', 'NN'),
 ('i', 'JJ'),
 ('l', 'NN'),
 ('e', 'NN'),
 (' ', 'NNP'),
 ('w', 'VBZ'),
 ('a', 'DT'),
 ('l', 'NN'),
 ('k', 'NN'),
 ('i', 'JJ'),
 ('n', 'VBP'),
 ('g', 'JJ'),
 (' ', 'NNP'),
 ('d', 'NN'),
 ('o', 'NN'),
 ('w', 'NN'),
 ('n', 'JJ'),
 (' ', 'NNP'),
 ('t', 'NN'),
 ('h', 'NN'),
 ('e', 'NN'),
 (' ', 'NNP'),
 ('s', 'NN'),
 ('t', 'NN'),
 ('r', 'NN'),
 ('e', 'NN'),
 ('e', 'NN'),
 ('t', 'NN')]