In [1]:
from nltk.corpus import movie_reviews

In [2]:
movie_reviews.categories()

['neg', 'pos']

In [3]:
movie_reviews.fileids()

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt',
 'neg/cv010_29063.txt',
 'neg/cv011_13044.txt',
 'neg/cv012_29411.txt',
 'neg/cv013_10494.txt',
 'neg/cv014_15600.txt',
 'neg/cv015_29356.txt',
 'neg/cv016_4348.txt',
 'neg/cv017_23487.txt',
 'neg/cv018_21672.txt',
 'neg/cv019_16117.txt',
 'neg/cv020_9234.txt',
 'neg/cv021_17313.txt',
 'neg/cv022_14227.txt',
 'neg/cv023_13847.txt',
 'neg/cv024_7033.txt',
 'neg/cv025_29825.txt',
 'neg/cv026_29229.txt',
 'neg/cv027_26270.txt',
 'neg/cv028_26964.txt',
 'neg/cv029_19943.txt',
 'neg/cv030_22893.txt',
 'neg/cv031_19540.txt',
 'neg/cv032_23718.txt',
 'neg/cv033_25680.txt',
 'neg/cv034_29446.txt',
 'neg/cv035_3343.txt',
 'neg/cv036_18385.txt',
 'neg/cv037_19798.txt',
 'neg/cv038_9781.txt',
 'neg/cv039_5963.txt',
 'neg/cv040_8829.txt',
 'neg/cv041_22364.txt',


In [4]:
len(movie_reviews.fileids())

2000

In [5]:
len(movie_reviews.fileids('pos'))

1000

In [6]:
movie_reviews.words(movie_reviews.fileids()[1])

['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...]

In [7]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
documents[-1:]

[(['truman', '(', '"', 'true', '-', 'man', '"', ')', ...], 'pos')]

In [8]:
import random
random.shuffle(documents)
documents[:5]

[(['remember', 'back', 'in', 'the', 'mid', '1990s', ...], 'neg'),
 (['i', 'know', 'it', 'already', 'opened', 'in', ...], 'pos'),
 (['don', "'", 't', 'let', 'this', 'movie', 'fool', ...], 'neg'),
 (['here', 'is', 'a', 'film', 'that', 'is', 'so', ...], 'pos'),
 (['written', 'by', 'alex', 'cox', ',', 'tod', 'davies', ...], 'neg')]

In [9]:
from nltk.stem import WordNetLemmatizer
Lemmatizer = WordNetLemmatizer()


In [10]:
from nltk.corpus import wordnet
from nltk import pos_tag
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADJ
    else:
        return wordnet.NOUN
    

In [11]:
from nltk.corpus import stopwords
stops = set(stopwords.words('english'))
import string
punc = list(string.punctuation)
stops.update(punc)

In [12]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = Lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [13]:
documents = [(clean_review(document), category) for document, category in documents]

In [14]:
documents[0]

(['remember',
  'back',
  'mid',
  '1990s',
  'crime',
  'macabre',
  'movie',
  'rage',
  'pulp',
  'fiction',
  'fargo',
  'manage',
  'get',
  'oscar',
  'nomination',
  'best',
  'picture',
  'surprisingly',
  'slew',
  'rip',
  'offs',
  'follow',
  'year',
  'thereafter',
  'fad',
  'seem',
  'come',
  'christopher',
  'mcquarrie',
  'write',
  'direct',
  'way',
  'gun',
  'first',
  'glance',
  'look',
  'like',
  'another',
  'wanna',
  'upon',
  'closer',
  'inspection',
  'look',
  'like',
  'anything',
  'comprehendable',
  'crime',
  'story',
  'wrap',
  'little',
  'world',
  'claustrophobic',
  'atmosphere',
  'film',
  'allow',
  'story',
  'much',
  'room',
  'expand',
  'outside',
  'handful',
  'character',
  'somehow',
  'still',
  'manages',
  'turn',
  'simple',
  'premise',
  'something',
  'complex',
  'ridiculous',
  'complexity',
  'come',
  'surprise',
  'anyone',
  'familiar',
  'mcquarrie',
  'write',
  'usual',
  'suspect',
  'become',
  'cult',
  'favorit

In [15]:
training_documents = documents[0:1500]
testing_documents = documents[1500:]


In [16]:
all_words = []
for doc in training_documents:
    all_words+=doc[0]

In [17]:
import nltk

In [18]:
freq = nltk.FreqDist(all_words)

In [19]:
common = freq.most_common(3000)
features = [i[0] for i in common]
features

['film',
 'movie',
 'one',
 'make',
 'like',
 'character',
 'get',
 'good',
 'see',
 'go',
 'time',
 'even',
 'scene',
 'story',
 'take',
 'would',
 'much',
 'come',
 'also',
 'bad',
 'life',
 'two',
 'end',
 'well',
 'give',
 'way',
 'look',
 'first',
 '--',
 'know',
 'seem',
 'year',
 'work',
 'thing',
 'plot',
 'say',
 'really',
 'play',
 'show',
 'little',
 'people',
 'man',
 'could',
 'star',
 'great',
 'try',
 'love',
 'never',
 'best',
 'director',
 'new',
 'performance',
 'many',
 'big',
 'want',
 'action',
 'actor',
 'find',
 'watch',
 'u',
 'think',
 'role',
 'another',
 'act',
 'back',
 'something',
 'audience',
 'still',
 'turn',
 'day',
 'world',
 'old',
 'set',
 'use',
 'however',
 'though',
 'feel',
 'comedy',
 'begin',
 'cast',
 'guy',
 'every',
 'part',
 'enough',
 'point',
 'last',
 'real',
 'write',
 'around',
 'run',
 'interest',
 'funny',
 'may',
 'young',
 'right',
 'long',
 'friend',
 'actually',
 'woman',
 'minute',
 'fact',
 'script',
 'name',
 'almost',
 'noth

In [20]:
def get_features_dict(words):
    current_features ={}
    word_set = set(words)
    for w in features:
        current_features[w] = w in word_set 
    return current_features

In [21]:
output = get_features_dict(training_documents[0][0])
output

{'film': True,
 'movie': True,
 'one': True,
 'make': True,
 'like': True,
 'character': True,
 'get': True,
 'good': True,
 'see': True,
 'go': True,
 'time': True,
 'even': True,
 'scene': True,
 'story': True,
 'take': True,
 'would': True,
 'much': True,
 'come': True,
 'also': True,
 'bad': False,
 'life': False,
 'two': True,
 'end': False,
 'well': True,
 'give': True,
 'way': True,
 'look': True,
 'first': True,
 '--': False,
 'know': True,
 'seem': True,
 'year': True,
 'work': True,
 'thing': True,
 'plot': False,
 'say': True,
 'really': True,
 'play': False,
 'show': True,
 'little': True,
 'people': True,
 'man': True,
 'could': True,
 'star': False,
 'great': True,
 'try': False,
 'love': True,
 'never': True,
 'best': True,
 'director': False,
 'new': True,
 'performance': False,
 'many': True,
 'big': True,
 'want': True,
 'action': False,
 'actor': True,
 'find': False,
 'watch': False,
 'u': True,
 'think': True,
 'role': False,
 'another': True,
 'act': True,
 'back'

In [22]:
training_data = [(get_features_dict(doc), category) for doc, category in training_documents]

In [23]:
testing_data = [(get_features_dict(doc), category) for doc, category in testing_documents]

In [24]:
from nltk import NaiveBayesClassifier 

In [25]:
classifier = NaiveBayesClassifier.train(training_data)

In [26]:
nltk.classify.accuracy(classifier, testing_data)

0.782

In [27]:
classifier.show_most_informative_features(15)

Most Informative Features
             outstanding = True              pos : neg    =     19.2 : 1.0
               stupidity = True              neg : pos    =      9.9 : 1.0
                   mulan = True              pos : neg    =      8.3 : 1.0
                    pitt = True              pos : neg    =      8.3 : 1.0
                 idiotic = True              neg : pos    =      8.2 : 1.0
                   jolie = True              neg : pos    =      7.7 : 1.0
             magnificent = True              pos : neg    =      7.5 : 1.0
               ludicrous = True              neg : pos    =      7.3 : 1.0
                   damon = True              pos : neg    =      7.2 : 1.0
                  seagal = True              neg : pos    =      7.1 : 1.0
                  castle = True              pos : neg    =      6.9 : 1.0
                 fincher = True              pos : neg    =      6.9 : 1.0
             wonderfully = True              pos : neg    =      6.9 : 1.0

In [28]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier


In [29]:
svc = SVC()
classifier_sklearn = SklearnClassifier(svc)

In [30]:
classifier_sklearn.train(training_data)

<SklearnClassifier(SVC())>

In [31]:
nltk.classify.accuracy(classifier_sklearn, testing_data)

0.858

In [32]:
from sklearn.ensemble import RandomForestClassifier

In [33]:
rfc = RandomForestClassifier()
classify_sklearn = SklearnClassifier(rfc)

In [34]:
classify_sklearn.train(training_data)

<SklearnClassifier(RandomForestClassifier())>

In [35]:
nltk.classify.accuracy(classify_sklearn, testing_data)

0.81

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

In [37]:
train_set = {"the sky is blue", "the sun sun is bright"}
count_vec = CountVectorizer(max_features = 3)
a = count_vec.fit_transform(train_set)
a

<2x3 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [38]:
a.todense()

matrix([[1, 0, 1],
        [1, 2, 1]])

In [39]:
count_vec.get_feature_names()

['is', 'sun', 'the']

In [40]:
documents[0]

(['remember',
  'back',
  'mid',
  '1990s',
  'crime',
  'macabre',
  'movie',
  'rage',
  'pulp',
  'fiction',
  'fargo',
  'manage',
  'get',
  'oscar',
  'nomination',
  'best',
  'picture',
  'surprisingly',
  'slew',
  'rip',
  'offs',
  'follow',
  'year',
  'thereafter',
  'fad',
  'seem',
  'come',
  'christopher',
  'mcquarrie',
  'write',
  'direct',
  'way',
  'gun',
  'first',
  'glance',
  'look',
  'like',
  'another',
  'wanna',
  'upon',
  'closer',
  'inspection',
  'look',
  'like',
  'anything',
  'comprehendable',
  'crime',
  'story',
  'wrap',
  'little',
  'world',
  'claustrophobic',
  'atmosphere',
  'film',
  'allow',
  'story',
  'much',
  'room',
  'expand',
  'outside',
  'handful',
  'character',
  'somehow',
  'still',
  'manages',
  'turn',
  'simple',
  'premise',
  'something',
  'complex',
  'ridiculous',
  'complexity',
  'come',
  'surprise',
  'anyone',
  'familiar',
  'mcquarrie',
  'write',
  'usual',
  'suspect',
  'become',
  'cult',
  'favorit

In [43]:
categories = [category for document, category in documents]
categories

['neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'neg',
 'pos',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'neg',
 'neg',
 'pos',
 'pos',
 'neg',
 'pos',
 'pos',
 'neg',
 'neg',
 'neg',
 'neg',
 'pos',
 'pos',
 'pos',
 'neg',
 'pos',
 'neg',


In [44]:
text_documents = [" ".join(document) for document, category in documents]

In [45]:
text_documents

['remember back mid 1990s crime macabre movie rage pulp fiction fargo manage get oscar nomination best picture surprisingly slew rip offs follow year thereafter fad seem come christopher mcquarrie write direct way gun first glance look like another wanna upon closer inspection look like anything comprehendable crime story wrap little world claustrophobic atmosphere film allow story much room expand outside handful character somehow still manages turn simple premise something complex ridiculous complexity come surprise anyone familiar mcquarrie write usual suspect become cult favorite despite little critical acclaim film great story well direct elaborate confuse screenplay leaf many viewer scratch head repeat viewing wonder mcquarrie really everything mapped go hollywood idea really confuse criminal cop make fascinate way gun mcquarrie seem capitalize idea time story far less commercial shame maybe would help film tell story two drifter ambition barely reason live hell bent death either

In [46]:
from sklearn.model_selection import train_test_split

In [47]:
X_train, X_test, Y_train, Y_test = train_test_split(text_documents, categories)

In [52]:
count_vec = CountVectorizer(max_features = 2000)
X_train_features = count_vec.fit_transform(X_train)
X_train_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [1, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [53]:
count_vec.get_feature_names()

['000',
 '10',
 '100',
 '13',
 '15',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '2001',
 '30',
 '50',
 '60',
 '70',
 '80',
 '90',
 'abandon',
 'ability',
 'able',
 'absolutely',
 'academy',
 'accent',
 'accept',
 'accident',
 'accomplish',
 'achieve',
 'achievement',
 'across',
 'act',
 'action',
 'actor',
 'actress',
 'actual',
 'actually',
 'ad',
 'adam',
 'adapt',
 'adaptation',
 'add',
 'addition',
 'admit',
 'adult',
 'adventure',
 'affair',
 'affleck',
 'afraid',
 'african',
 'age',
 'agent',
 'ago',
 'agree',
 'ahead',
 'aid',
 'aim',
 'air',
 'al',
 'ala',
 'alan',
 'alex',
 'alice',
 'alien',
 'alive',
 'allen',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'amaze',
 'america',
 'american',
 'among',
 'amount',
 'amuse',
 'amy',
 'anderson',
 'andrew',
 'angel',
 'angle',
 'angry',
 'animal',
 'animate',
 'animation',
 'anna',
 'anne',
 'annie',
 'annoy',
 'another',
 'answer',
 'anthony',
 'anti',
 'anyone',
 'anyt

In [55]:
X_test_features = count_vec.transform(X_test)
X_test_features

<500x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 86200 stored elements in Compressed Sparse Row format>

In [56]:
from sklearn.svm import SVC

In [57]:
svc = SVC()
svc.fit(X_train_features, Y_train)

SVC()

In [58]:
svc.score(X_test_features, Y_test)

0.834

In [66]:
count_vec = CountVectorizer(max_features = 2000, ngram_range=(2, 3))
X_train_features = count_vec.fit_transform(X_train)
X_train_features
count_vec.get_feature_names()

['10 10',
 '10 minute',
 '10 scale',
 '10 thing',
 '10 thing hate',
 '10 year',
 '100 million',
 '13th warrior',
 '14 year',
 '15 minute',
 '17 year',
 '1999 eugene',
 '1999 eugene novikov',
 '19th century',
 '20 minute',
 '20 year',
 '2001 space',
 '2001 space odyssey',
 '20th century',
 '30 minute',
 '90 minute',
 'able make',
 'absolutely nothing',
 'academy award',
 'ace ventura',
 'act ability',
 'act film',
 'act good',
 'act like',
 'act one',
 'act talent',
 'action adventure',
 'action comedy',
 'action film',
 'action flick',
 'action hero',
 'action movie',
 'action packed',
 'action scene',
 'action sequence',
 'action star',
 'action thriller',
 'actor film',
 'actor play',
 'actually get',
 'actually quite',
 'actually work',
 'adam sandler',
 'african american',
 'al pacino',
 'albert brook',
 'alec baldwin',
 'alien film',
 'alien resurrection',
 'almost always',
 'almost entirely',
 'almost every',
 'along line',
 'along way',
 'already know',
 'also direct',
 'also fe

In [67]:
X_test_features = count_vec.transform(X_test)
X_test_features

<500x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 12855 stored elements in Compressed Sparse Row format>

In [68]:
svc = SVC()
svc.fit(X_train_features, Y_train)

SVC()

In [69]:
svc.score(X_test_features, Y_test)

0.706