In [40]:
from nltk.corpus import movie_reviews

In [83]:
movie_reviews.categories()

['neg', 'pos']

In [42]:
movie_reviews.fileids()[3]

'neg/cv003_12683.txt'

In [43]:
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [44]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [45]:
import random
random.shuffle(documents)
documents[0:5]

[(['once', 'upon', 'a', 'time', 'a', 'solitary', 'ogre', ...], 'pos'),
 (['this', 'is', 'one', 'of', 'the', 'worst', 'big', '-', ...], 'neg'),
 (['up', 'until', 'about', 'a', 'year', 'ago', ',', ...], 'pos'),
 (['dark', 'city', 'is', 'such', 'a', 'rare', 'treat', ...], 'pos'),
 (['"', 'book', '"', 'should', 'have', 'remained', 'in', ...], 'neg')]

In [46]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [47]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [48]:
from nltk import pos_tag
w = 'better'
pos_tag([w]) #pos_tag takes an array, otherwise it return for every letter

[('better', 'RBR')]

In [49]:
from nltk.corpus import stopwords
import string
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)

In [50]:
def clean_review(words):
    op_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag(w)
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            op_words.append(clean_word.lower())
    return op_words

In [51]:
documents = [(clean_review(document), category) for document, category in documents]

In [53]:
training_doc = documents[0:1500]
testing_doc = documents[1500:]

In [54]:
all_words = []
for doc in training_doc:
    all_words += doc[0]

In [55]:
import nltk

In [64]:
freq = nltk.FreqDist(all_words)
common = freq.most_common(3000)
features = [i[0] for i in common]

In [67]:
#features

In [68]:
#documents[0]

In [69]:
def get_feature_dict(words):
    current_features = {}
    word_set = set(words)
    for w in features:
        current_features[w] = w in word_set
    return current_features

In [85]:
op = get_feature_dict(training_doc[0][0])
op

{'film': True,
 'movie': True,
 'one': True,
 'character': True,
 'like': True,
 'make': True,
 'time': True,
 'get': True,
 'even': False,
 'good': True,
 'story': True,
 'would': True,
 'much': True,
 'take': False,
 'well': False,
 'life': True,
 'also': False,
 'two': False,
 'see': False,
 '--': False,
 'first': True,
 'year': False,
 'go': False,
 'way': False,
 'come': True,
 'thing': False,
 'say': False,
 'really': True,
 'plot': False,
 'little': False,
 'know': True,
 'people': False,
 'man': True,
 'could': True,
 'scene': False,
 'work': True,
 'bad': True,
 'never': False,
 'new': True,
 'best': False,
 'performance': False,
 'end': False,
 'director': False,
 'look': True,
 'many': False,
 'actor': False,
 'action': False,
 'scenes': False,
 'play': False,
 'want': False,
 'watch': False,
 'give': True,
 'role': False,
 'great': True,
 'love': False,
 'star': False,
 'another': True,
 'find': False,
 'show': True,
 'still': False,
 'us': False,
 'back': True,
 'big': Fal

In [79]:
training_data = [(get_feature_dict(doc), category) for doc, category in training_doc]
testing_data = [(get_feature_dict(doc), category) for doc, category in testing_doc]

In [74]:
from nltk import NaiveBayesClassifier

In [75]:
classifier = NaiveBayesClassifier.train(training_data)

In [80]:
nltk.classify.accuracy(classifier, testing_data)

0.806

In [81]:
classifier.show_most_informative_features(15)

Most Informative Features
               ludicrous = True              neg : pos    =     13.1 : 1.0
                    anna = True              pos : neg    =     10.9 : 1.0
                  finest = True              pos : neg    =     10.6 : 1.0
             outstanding = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.5 : 1.0
                  seagal = True              neg : pos    =      8.4 : 1.0
                  alicia = True              neg : pos    =      8.4 : 1.0
            respectively = True              pos : neg    =      7.8 : 1.0
                    lame = True              neg : pos    =      7.4 : 1.0
             wonderfully = True              pos : neg    =      7.4 : 1.0
                     era = True              pos : neg    =      7.2 : 1.0
              ridiculous = True              neg : pos    =      6.4 : 1.0
                   mulan = True              pos : neg    =      6.3 : 1.0