In [28]:
from html.parser import HTMLParser
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics
from utils import display_evaluation_metrics, display_confusion_matrix, display_classification_report
import html
import nltk
import numpy as np
import pandas as pd
import os
import re
import string
import unicodedata

ImportError: No module named 'utils'

In [None]:
stopword_list = nltk.corpus.stopwords.words('english')
wnl = WordNetLemmatizer()

In [29]:
class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_html(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

In [30]:
_CONTRACTION_DICT = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

In [31]:
def get_contraction_dict():
    """Returns CONTRACTION DICT"""
    return _CONTRACTION_DICT

In [32]:
def expand_contractions(sentence, contraction_mapping):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), flags=re.IGNORECASE|re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match) if contraction_mapping.get(match) else contraction_mapping.get(match.lower())
        expanded_contraction = first_char+expanded_contraction[1:]

        return expanded_contraction

    expanded_sentence = contractions_pattern.sub(expand_match, sentence)
    return expanded_sentence

In [33]:
# lemmatize text based on POS tags
def lemmatize_text(text):
    pos_tagged_text = pos_tag_text(text)
    lemmatized_tokens = [wnl.lemmatize(word, pos_tag) if pos_tag
                         else word
                         for word, pos_tag in pos_tagged_text]
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text

def remove_special_characters(text):
    tokens = tokenize_text(text)
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = filter(None, [pattern.sub('', token) for token in
                                    tokens])
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

def tokenize_text(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    return tokens

def remove_stopwords(text):
    tokens = tokenize_text(text)
    filtered_tokens = [token for token in tokens if token not in
                       stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

def keep_text_characters(text):
    filtered_tokens = []
    tokens = tokenize_text(text)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [34]:
# Annotate text tokens with POS tags
def pos_tag_text(text):
    # convert Penn treebank tag to wordnet tag

    def penn_to_wn_tags(pos_tag):
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            return None
    text = word_tokenize(text)
    tagged_text = pos_tag(text)
    tagged_lower_text = [(word.lower(), penn_to_wn_tags(pos_tag))
                         for word, pos_tag in
                         tagged_text]
    return tagged_lower_text

In [35]:
def normalize_accented_characters(text):
    text = unicodedata.normalize('NFKD', text.encode().decode('utf-8')).encode('ascii', 'ignore')
    return text

In [36]:
def normalize_corpus(corpus, lemmatize=True, only_text_chars=False, tokenize=False):
    normalized_corpus = []
    for index, text in enumerate(corpus):
        text = normalize_accented_characters(text)
        text = str(text)
        text = html.unescape(text)
        text = strip_html(text)
        text = expand_contractions(text, get_contraction_dict())
        if lemmatize:
            text = lemmatize_text(text)
        else:
            text = text.lower()
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if only_text_chars:
            text = keep_text_characters(text)
        
        if tokenize:
            text = tokenize_text(text)
            normalized_corpus.append(text)
        else:
            normalized_corpus.append(text)
    return normalized_corpus

In [37]:
def build_feature_matrix(documents, feature_type='frequency', ngram_range=(1, 1), min_df=0.0, max_df=1.0):
    feature_type = feature_type.lower().strip()
    
    if feature_type == 'binary':
        vectorizer = CountVectorizer(binary=True, min_df=min_df,
        max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=min_df,
        max_df=max_df, ngram_range=ngram_range)
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df,
        ngram_range=ngram_range)
    else:
        raise Exception("Wrong feature type entered. Possible values:'binary', 'frequency', 'tfidf'")
    feature_matrix = vectorizer.fit_transform(documents).astype(float)

    return vectorizer, feature_matrix

In [38]:
def display_evaluation_metrics(true_labels, predicted_labels, positive_class=1):
    print('Accuracy:', np.round(metrics.accuracy_score(true_labels, predicted_labels), 2))
    print('Precision:', np.round(metrics.precision_score(true_labels, predicted_labels, 
                                                         pos_label=positive_class, average='binary'), 2))
    print('Recall:', np.round(metrics.recall_score(true_labels, predicted_labels, 
                                                   pos_label=positive_class, average='binary'), 2))
    print('F1 Score:', np.round(metrics.f1_score(true_labels, predicted_labels, 
                                                 pos_label=positive_class, average='binary'), 2))

In [39]:
def display_confusion_matrix(true_labels, predicted_labels, classes=[1,0]):
    cm = metrics.confusion_matrix(y_true=true_labels, y_pred=predicted_labels, labels=classes)
    cm_frame = pd.DataFrame(data=cm, columns=pd.MultiIndex(levels=[['Predicted:'], classes], labels=[[0,0],[0,1]]), 
                            index=pd.MultiIndex(levels=[['Actual:'], classes], labels=[[0,0],[0,1]]))
    print(cm_frame)

In [40]:
def display_classification_report(true_labels, predicted_labels, classes=[1,0]):
    report = metrics.classification_report(y_true=true_labels,
    y_pred=predicted_labels,
    labels=classes)
    print(report)

In [41]:
# Load dataset (http://ai.stanford.edu/%7Eamaas/data/sentiment/)
datasets_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))+'/datasets/imdb/'
imdb_dataset = 'movie_reviews.csv'
# load movie reviews data
dataset = pd.read_csv(datasets_path+imdb_dataset, encoding='utf-8-sig')

#with open(datasets_path+imdb_dataset, 'r') as f:
#    dataset = pd.read_csv(f)

In [42]:
# print sample data
print(dataset.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [43]:
# prepare training and testing datasets
train_data = dataset[:35000]
test_data = dataset[35000:]

In [44]:
train_reviews = np.array(train_data['review'])
train_sentiments = np.array(train_data['sentiment'])
test_reviews = np.array(test_data['review'])
test_sentiments = np.array(test_data['sentiment'])

In [45]:
# prepare sample dataset for experiments
sample_docs = [100, 5817, 7626, 7356, 1008, 7155, 3533, 13010]
sample_data = [(test_reviews[index], test_sentiments[index]) for index in sample_docs]

## We have taken a total of 35,000 reviews out of the 50,000 to be our training dataset and we will evaluate our models and test them on the remaining 15,000 reviews. 

## This is in line with a typical 70:30 separation used for training and testing dataset building.

## We have also extracted a total of eight reviews from the test dataset and we will be looking closely at the results for these documents as well as evaluating the model performance on the complete test dataset

In [20]:
# normalize train reviews
norm_train_reviews = normalize_corpus(train_reviews, lemmatize=True, only_text_chars=True)

# feature extraction
vectorizer, train_features = build_feature_matrix(documents=norm_train_reviews, feature_type='tfidf', 
                                                  ngram_range=(1, 1), min_df=0.0, max_df=1.0)

In [21]:
# normalize test reviews
norm_test_reviews = normalize_corpus(test_reviews, lemmatize=True, only_text_chars=True)
# extract features
test_features = vectorizer.transform(norm_test_reviews)

In [22]:
# We will now build our model using the support vector machine (SVM) algorithm

In [23]:
from sklearn.linear_model import SGDClassifier
# build the model
svm = SGDClassifier(loss='hinge', n_iter=200)
svm.fit(train_features, train_sentiments)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=200, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

## Now that we have our features for the entire test dataset, before we predict the sentiment and measure model prediction performance for the entire test datase

In [27]:
# predict sentiment for sample docs from test data
for doc_index in sample_docs:
    print('Review:-')
    print(test_reviews[doc_index])
    print('Actual Labeled Sentiment:', test_sentiments[doc_index])
    doc_features = test_features[doc_index]
    predicted_sentiment = svm.predict(doc_features)[0]
    print('Predicted Sentiment:', predicted_sentiment)
    print()

Review:-
Worst movie, (with the best reviews given it) I've ever seen. Over the top dialog, acting, and direction. more slasher flick than thriller.With all the great reviews this movie got I'm appalled that it turned out so silly. shame on you martin scorsese
Actual Labeled Sentiment: negative
Predicted Sentiment: negative

Review:-
I hope this group of film-makers never re-unites.
Actual Labeled Sentiment: negative
Predicted Sentiment: negative

Review:-
no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT!
Actual Labeled Sentiment: negative
Predicted Sentiment: negative

Review:-
Add this little gem to your list of holiday regulars. It is<br /><br />sweet, funny, and endearing
Actual Labeled Sentiment: positive
Predicted Sentiment: positive

Review:-
a mesmerizing film that certainly keeps your attention... Ben Daniels is fascinating (and courageous) to watch.
Actual Labeled Sentiment: positive
Predicted Sentiment: positive

Review:-
This mov

### You can look at each review, its actual labeled sentiment, and our predicted sentiment in the preceding output and see that we have some negative and positive reviews, and our model is able to correctly identify the sentiment for most of the sampled reviews except the last two reviews. If you look closely at the last two reviews, some part of the review has a negative sentiment ( "worst horror film" , "voted this movie to be bad" ) but the general sentiment or opinion of the person who wrote the review was intended positive.


### These are the examples I mentioned earlier about the overlap of positive and negative emotions , which makes it difficult for the model to predict the actual sentiment!

### Let us now predict the sentiment for all our test dataset reviews and evaluate our model performance:

In [46]:
# predict the sentiment for test dataset movie reviews
predicted_sentiments = svm.predict(test_features)

In [47]:
# evaluate model prediction performance
display_evaluation_metrics(true_labels=test_sentiments, predicted_labels=predicted_sentiments, 
                           positive_class='positive')

Accuracy: 0.89
Precision: 0.88
Recall: 0.9
F1 Score: 0.89


In [48]:
# show confusion matrix
display_confusion_matrix(true_labels=test_sentiments, predicted_labels=predicted_sentiments, 
                         classes=['positive', 'negative'])

                 Predicted:         
                   positive negative
Actual: positive       6767      743
        negative        924     6566


In [49]:
# show detailed per-class classification report
display_classification_report(true_labels=test_sentiments, predicted_labels=predicted_sentiments, 
                              classes=['positive', 'negative'])

             precision    recall  f1-score   support

   positive       0.88      0.90      0.89      7510
   negative       0.90      0.88      0.89      7490

avg / total       0.89      0.89      0.89     15000



### The preceding outputs show the various performance metrics that depict the performance of our SVM model with regard to predicting sentiment for movie reviews.

### We have an average sentiment prediction accuracy of 89 percent, which is really good if you compare it with standard baselines for text classification using supervised techniques.

### The classification report also shows a per-class detailed report, and we see that our F1-score (harmonic mean of precision and recall) is 89 percent for both positive and negative sentiment. The support metric shows the number of reviews having positive (7510) sentiment and negative (7490) sentiment. The confusion matrix shows how many reviews for which we predicted the correct sentiment ( positive : 6770/7510, negative : 6578/7490) and the number of reviews for which we predicted the wrong sentiment ( positive : 740/7510, negative : 912/7490). 

### Do try out building more models with different features and different supervised learning algorithms. Can you get a better model which predicts sentiment more accurately?

## References

### Text Analytics with Python: A Practical Real-World Approach to Gaining Actionable Insights from Your Data, By Dipanjan Sarkar (Chapter 7: Semantic and Sentiment Analysis)