In [1]:
from nltk.corpus import movie_reviews

movie_reviews.categories()

movie_reviews.fileids('neg')

documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))

import random
random.shuffle(documents)

from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
lemmatizer = WordNetLemmatizer()

from nltk.corpus import wordnet
def get_simple_pos(tag):
    
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

from nltk.corpus import stopwords
import string
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)
stops, string.punctuation

def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

documents = [(clean_review(document), category) for document, category in documents]

# Getting the data in format what sklearn requires and just using nltk for cleaning purpose only

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
train_set = {"the sky sky is blue", "the sun is bright"}
count_vec = CountVectorizer(max_features = 3)
a = count_vec.fit_transform(train_set)
#count vectorizer is going to go through all the data and
# choose the best possible features.(most frequent features)
# it wont worry about the stop words, that is something we are suppose to take 
# care of while cleaning the data
#it will find the no of features ==max_features
#it will convert each document to a row which shows frequency of the choosen features in the document
#we can use .todense() or .toarray()
#difference is toarray returns an ndarray; todense returns a matrix. 
#If you want a matrix, use todense ; otherwise, use toarray .

a.todense()

matrix([[1, 2, 1],
        [1, 0, 1]], dtype=int64)

In [4]:
#sparse matrix means a matrix having a lot of zeros
# as all documents wont have all the features so there are going to be 
#a lot of zero entries
a

<2x3 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

#### to get feature names choosen by countvectorizer

In [5]:
count_vec.get_feature_names()

['is', 'sky', 'the']

In [6]:
a = ["ad", "is"]
" ".join(a)

'ad is'

In [7]:
" ".join(documents[0][0])

'vannesa kensington austin smoke sex austin power know baby never look begin journey anticipate sequel summer season austin power 2 sequel sleeper hit 1997 fill brim uproarious sight gag lurid toilet joke make keel hilarity mind mike myers obviously bizarre place myers return swing 60 spy arch nemesis bald head dr evil give much spotlight early scene dr evil son scott seth green appear jerry springer segment entitle dad evil want take world host springer talk show gag spoof everything oprah regis kathie lee longer funny happily exception especially fight break dr evil guest sample dialogue come back mother #@%$^ want piece audience stitch many good thing sequel dr evil assist mini verne troyer pint size clone result experiment go awry mini perfect new character fan could ask troyer terrific job mimic everything big brother pinkie mindy sterling return frau farbissina evil loud mouth assassin sidekick treat brief romantic liaison two underground bedroom chamber work surprisingly well af

In [8]:
categories = [category for document, category in documents]

text_documents = [" ".join(document) for document, category in documents]

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(text_documents, categories)

### we use count vectorizer only on training data

In [9]:
count_vec = CountVectorizer(max_features = 2000)
x_train_features = count_vec.fit_transform(x_train)
x_train_features.todense()

matrix([[0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 2, 0],
        ...,
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [10]:
count_vec.get_feature_names()

['000',
 '10',
 '100',
 '13',
 '15',
 '17',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '30',
 '50',
 '60',
 '70',
 '80',
 '90',
 'abandon',
 'ability',
 'able',
 'absolutely',
 'academy',
 'accent',
 'accept',
 'accident',
 'accompany',
 'accomplish',
 'achieve',
 'across',
 'act',
 'action',
 'actor',
 'actress',
 'actual',
 'actually',
 'ad',
 'adam',
 'adaptation',
 'add',
 'addition',
 'admit',
 'adult',
 'adventure',
 'affair',
 'affleck',
 'afraid',
 'african',
 'age',
 'agent',
 'ago',
 'agree',
 'ahead',
 'aid',
 'aim',
 'air',
 'al',
 'ala',
 'alan',
 'alex',
 'alice',
 'alien',
 'alive',
 'allen',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'amaze',
 'america',
 'american',
 'among',
 'amount',
 'amuse',
 'amy',
 'anderson',
 'andrew',
 'angel',
 'angle',
 'angry',
 'animal',
 'animate',
 'animation',
 'anne',
 'annoy',
 'another',
 'answer',
 'anthony',
 'anti',
 'anyone',
 'anything',
 'anyway',
 'apart',
 'apa

In [11]:
len(count_vec.get_feature_names())

2000

In [12]:
x_test_features = count_vec.transform(x_test)

In [13]:
x_test_features

<500x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 84966 stored elements in Compressed Sparse Row format>

In [14]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train_features, y_train)
svc.score(x_test_features, y_test)

0.806

# using n grams
#### ngrams means combination of n words as a feature
#### 1gram means one word, 2gram(bigram) means combination of 2 words such as sky blue in above example . ngram helps in scenarios like not good, where combination of 2 changes meaning
#### ngrams can increase or decrease frequency. depends on experimentation.
##### syntax is ngram_range=(a,b) where everything betwen a and b and both a,b are included in range



In [15]:
count_vec = CountVectorizer(max_features = 2000, ngram_range=(1,2))
x_train_features = count_vec.fit_transform(x_train)
x_train_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 1],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 2],
        ...,
        [0, 0, 0, ..., 1, 0, 1],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
count_vec.get_feature_names()

['000',
 '10',
 '100',
 '13',
 '15',
 '17',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '30',
 '50',
 '60',
 '70',
 '80',
 '90',
 'abandon',
 'ability',
 'able',
 'absolutely',
 'academy',
 'accent',
 'accept',
 'accident',
 'accomplish',
 'achieve',
 'across',
 'act',
 'action',
 'action film',
 'action movie',
 'action scene',
 'action sequence',
 'actor',
 'actress',
 'actual',
 'actually',
 'ad',
 'adam',
 'adaptation',
 'add',
 'addition',
 'admit',
 'adult',
 'adventure',
 'affair',
 'affleck',
 'afraid',
 'african',
 'age',
 'agent',
 'ago',
 'agree',
 'ahead',
 'aid',
 'aim',
 'air',
 'alan',
 'alex',
 'alice',
 'alien',
 'alive',
 'allen',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'along way',
 'already',
 'also',
 'although',
 'always',
 'amaze',
 'america',
 'american',
 'among',
 'amount',
 'amuse',
 'amy',
 'anderson',
 'andrew',
 'angel',
 'angle',
 'angry',
 'animal',
 'animate',
 'animation',
 'anne',
 'annoy',
 'another',
 'answer',
 'anthony',
 'an

In [17]:
len(count_vec.get_feature_names())

2000

In [18]:
x_test_features = count_vec.transform(x_test)
x_test_features

<500x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 86111 stored elements in Compressed Sparse Row format>

In [19]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train_features, y_train)
svc.score(x_test_features, y_test)

0.806

# using maxdf and mindf

In [20]:
count_vec = CountVectorizer(max_features = 2000, max_df=.95,min_df=.1)
x_train_features = count_vec.fit_transform(x_train)
x_test_features = count_vec.transform(x_test)

In [21]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train_features, y_train)
svc.score(x_test_features, y_test)

0.784

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer as TIV

t=TIV(max_features=2000)

x_train_features=t.fit_transform(x_train)
x_test_features=t.transform(x_test)
# len(t.get_feature_names())

from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train_features, y_train)
svc.score(x_test_features, y_test)

0.832