In [1]:
from nltk.corpus import movie_reviews

In [2]:
movie_reviews.categories()

['neg', 'pos']

In [3]:
movie_reviews.fileids()[3]

'neg/cv003_12683.txt'

In [4]:
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [5]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [6]:
import random
random.shuffle(documents)
documents[0:5]

[(['a', 'big', 'surprise', 'to', 'me', '.', 'the', ...], 'pos'),
 (['deserves', 'recognition', 'for', ':', 'achieving', ...], 'neg'),
 (['in', 'the', 'continuation', 'of', 'warner', ...], 'neg'),
 (['upon', 'first', 'viewing', 'of', 'this', 'movie', ...], 'neg'),
 (['>', 'from', 'the', 'commercials', ',', 'this', ...], 'pos')]

In [7]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [8]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [9]:
from nltk import pos_tag
w = 'better'
pos_tag([w]) #pos_tag takes an array, otherwise it return for every letter

[('better', 'RBR')]

In [10]:
from nltk.corpus import stopwords
import string
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)

In [11]:
def clean_review(words):
    op_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag(w)
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            op_words.append(clean_word.lower())
    return op_words

In [48]:
documents = [(clean_review(document), category) for document, category in documents]
documents[0]

KeyboardInterrupt: 

In [13]:
training_doc = documents[0:1500]
testing_doc = documents[1500:]

In [14]:
all_words = []
for doc in training_doc:
    all_words += doc[0]

In [15]:
import nltk

In [16]:
freq = nltk.FreqDist(all_words)
common = freq.most_common(3000)
features = [i[0] for i in common]

In [17]:
#features

In [18]:
#documents[0]

In [19]:
def get_feature_dict(words):
    current_features = {}
    word_set = set(words)
    for w in features:
        current_features[w] = w in word_set
    return current_features

In [33]:
op = get_feature_dict(training_doc[0][0])


In [34]:
training_data = [(get_feature_dict(doc), category) for doc, category in training_doc]
testing_data = [(get_feature_dict(doc), category) for doc, category in testing_doc]

In [35]:
from nltk import NaiveBayesClassifier

In [36]:
classifier = NaiveBayesClassifier.train(training_data)

In [37]:
nltk.classify.accuracy(classifier, testing_data)

0.8

In [38]:
classifier.show_most_informative_features(15)

Most Informative Features
               ludicrous = True              neg : pos    =     18.6 : 1.0
             outstanding = True              pos : neg    =     14.1 : 1.0
                  seagal = True              neg : pos    =     10.8 : 1.0
                  finest = True              pos : neg    =     10.0 : 1.0
                   mulan = True              pos : neg    =      8.5 : 1.0
                  poorly = True              neg : pos    =      7.5 : 1.0
                  turkey = True              neg : pos    =      7.4 : 1.0
                   anger = True              pos : neg    =      7.1 : 1.0
                   jolie = True              neg : pos    =      6.9 : 1.0
                   damon = True              pos : neg    =      6.8 : 1.0
               stupidity = True              neg : pos    =      6.6 : 1.0
               laughable = True              neg : pos    =      6.6 : 1.0
                   inept = True              neg : pos    =      6.4 : 1.0

In [40]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier

In [41]:
svc = SVC()
classifier_sklearn = SklearnClassifier(svc)

In [42]:
classifier_sklearn.train(training_data)



<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))>

In [43]:
nltk.classify.accuracy(classifier_sklearn, testing_data)

0.76

In [44]:
from sklearn.ensemble import RandomForestClassifier

In [45]:
rfc = RandomForestClassifier()
classifier_sklearn1 = SklearnClassifier(rfc)

In [51]:
classifier_sklearn1.train(training_data)

<SklearnClassifier(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))>

In [53]:
nltk.classify.accuracy(classifier_sklearn1, testing_data)

0.664

In [54]:
from sklearn.feature_extraction.text import CountVectorizer

In [63]:
train_set = {"the sky sky is blue", "the sun is bright"}
count_vec = CountVectorizer(max_features = 3)
a = count_vec.fit_transform(train_set)
a.todense()

matrix([[1, 2, 1],
        [1, 0, 1]], dtype=int64)

In [64]:
count_vec.get_feature_names()

['is', 'sky', 'the']

In [65]:
categories = [category for document, category in documents]

In [67]:
text_documents = [" ".join(document) for document, category in documents]

In [68]:
text_documents[0]

'big surprise good trailer hinted pulled impossible making blue brother movie without jake blue john belushi dumb idea really hope replaced fine john goodman much anything brilliant actor joe morton really sing fourth blue brother j evan bonifant ten another fear film really good dancer play mean harmonica although may dubbed thing intellectually bothered like mission god everyone richer filmed chicago -- give problem quite pleased le car pile ups meant le music john landis seems lost interest whole thing early crash one huge pile stop music one problem first cab calloway song good actually blue brother look dull problem music good ever ton great musician showing -- exception johnny lang sing musician great job real problem special effect superfluous waste money since film well could mean possibility another sequel want see bluegrass version rider sky even better rawhide -- http // www geocities com hollywood academy 8034 remove spam reply drive carefully recklessly mama child toy exce

In [70]:
from sklearn.model_selection import train_test_split

In [71]:
x_train, x_test, y_train, y_test = train_test_split(text_documents, categories)

In [95]:
count_vec = CountVectorizer(max_features = 2000, ngram_range = (1,3))
x_train_features = count_vec.fit_transform(x_train)
x_train_features.todense()

matrix([[0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [96]:
#count_vec.get_feature_names()

In [97]:
x_test_features = count_vec.transform(x_test)

In [98]:
x_test_features

<500x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 83270 stored elements in Compressed Sparse Row format>

In [99]:
from sklearn.svm import SVC

In [100]:
svc = SVC()
svc.fit(x_train_features, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [101]:
svc.score(x_test_features, y_test)

0.83