# Basic Sentiment Analysis

## 1. Exploring the `movie_reviews` corpus

In [None]:
from nltk.corpus import movie_reviews # These are movie reviews already separated as positive and negative.
print(movie_reviews.readme())

In [None]:
movie_reviews.fileids()

In [None]:
len(movie_reviews.fileids())

In [None]:
print(movie_reviews.raw("neg/cv000_29416.txt"))

<br>
<br>
<hr>
<br>
<br>

## 2. Building and testing the classifier

### Make a Stop Words Exclusion List

In [None]:
from nltk.corpus import stopwords

# Punctuation marks and contraction fragments (e.g. "n't", "'s") to ignore in
# addition to NLTK's English stop words. Each token is its own list entry
# rather than a field of a comma-joined string, so the comma needs no special
# handling and no token is listed twice.
extra_stops = [
    '.', '[', ']', '(', ')', ';', '/', '-', "'", '?', '"', ':', '<', '>',
    "n't", '|', '#', "'s", "'re", "'ve", "'ll", "'d", ',',
]

stops = stopwords.words('english')
stops.extend(extra_stops)
stops

### Movie Review `Featurizer`
This function takes the tokens of a review and keeps only the alphabetic words that are not stop words; the classifier uses these words as features to derive sentiment.

In [None]:
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

# Our word `Featurizer` makes sure we only pass legitimate words to our classifier.
def word_feats(words, stop_words=None):
    """Build a bag-of-words feature dict from an iterable of tokens.

    Args:
        words: Iterable of string tokens.
        stop_words: Optional collection of tokens to leave out. When omitted,
            the notebook-level ``stops`` list is used.

    Returns:
        A dict mapping each kept token to ``True``. A token is kept when it
        is purely alphabetic and is not a stop word.
    """
    # Build the set once per call so every membership test is a hash lookup
    # instead of a linear scan of the stop-word list.
    excluded = set(stops if stop_words is None else stop_words)
    return {word: True for word in words if word.isalpha() and word not in excluded}

# Collect the review file ids of each sentiment category.
pos_ids, neg_ids = (movie_reviews.fileids(label) for label in ('pos', 'neg'))

# Show how many reviews each category holds, one count per line.
print(len(pos_ids), len(neg_ids), sep='\n')

### Construct `pos` and `neg` features

In [None]:
# For every review, featurize its words and pair the resulting feature dict
# with the review's sentiment label; one list is kept per label.
pos_feats = [
    (word_feats(movie_reviews.words(fileids=[review_id])), 'pos')
    for review_id in pos_ids
]
neg_feats = [
    (word_feats(movie_reviews.words(fileids=[review_id])), 'neg')
    for review_id in neg_ids
]

In [None]:
print(len(pos_feats))
pos_feats[0]

In [None]:
print(len(neg_feats))
neg_feats[0]

### Split `pos_feats` and `neg_feats` into `train` (3/4) and `test` (1/4) sets

In [1]:
# Number of feature sets per class that go to training: 3/4 of each list,
# rounded down.
pos_len_train = len(pos_feats) * 3 // 4
neg_len_train = len(neg_feats) * 3 // 4

print(pos_len_train, neg_len_train, sep='\n')

NameError: name 'pos_feats' is not defined

In [None]:
# neg_feats[:neg_len_train][1]

### Train Classifier

In [None]:
# The training set is the leading slice of each class (negatives first);
# whatever remains of each class is held out for testing.
train_feats = [*neg_feats[:neg_len_train], *pos_feats[:pos_len_train]]
test_feats = [*neg_feats[neg_len_train:], *pos_feats[pos_len_train:]]

# Fit a NaiveBayesClassifier on the training feature sets.
classifier = NaiveBayesClassifier.train(train_feats)

### Test Classifier

In [None]:
print('Accuracy: ', accuracy(classifier, test_feats))

### What were the important features?

In [None]:
# We can see which words fit best in each class.
classifier.show_most_informative_features()

<br>
<br>
<hr>
<br>
<br>

## 3. Classifying new data

In [None]:
from nltk import word_tokenize, pos_tag

#### Make a new Sentence

In [None]:
sentence = "I feel so miserable, it makes me amazing"
sentence

#### Tokenize

In [None]:
tokens = [word for word in word_tokenize(sentence) if word not in stops]
tokens

#### Transform `tokens` into `features`

In [None]:
# `word_feats` iterates over its argument, so the token list is passed as-is
# instead of being wrapped in a pass-through generator.
feats = word_feats(tokens)
feats

#### Classify

In [None]:
classifier.classify(feats)

<br>
<br>

#### Make a new Sentence

In [2]:
sentence2 = "You are a pathetic fool, a terrible excuse for a human being."
sentence2

'You are a pathetic fool, a terrible excuse for a human being.'

#### Tokenize

In [None]:
tokens2 = [word for word in word_tokenize(sentence2) if word not in stops]
tokens2

#### Transform `tokens` into `features`

In [None]:
# `word_feats` iterates over its argument, so the token list is passed as-is
# instead of being wrapped in a pass-through generator.
feats2 = word_feats(tokens2)
feats2

#### Classify

In [None]:
classifier.classify(feats2)