### Imports

In [1]:
# make compatible with Python 2 and Python 3
from __future__ import print_function, division, absolute_import

In [2]:
# Remove warnings
import warnings

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd

### Loading data

In [3]:
# Location of the reviews CSV. The default is the original author's local path;
# set the SENTIMENT_DATASET_CSV environment variable to run the notebook on
# another machine without editing this cell.
import os

DATASET_CSV = os.environ.get("SENTIMENT_DATASET_CSV", r'C:\Users\jetin\Desktop\dataset.csv')
dataset = pd.read_csv(DATASET_CSV, encoding='latin-1')

In [4]:
# Preview the first rows to confirm the SentimentText / Sentiment columns loaded as expected
dataset.head()

Unnamed: 0,SentimentText,Sentiment
0,"first think another Disney movie, might good, ...",1
1,"Put aside Dr. House repeat missed, Desperate H...",0
2,"big fan Stephen King's work, film made even gr...",1
3,watched horrid thing TV. Needless say one movi...,0
4,truly enjoyed film. acting terrific plot. Jeff...,1


In [5]:
# import packages

import bs4 as bs
import nltk

# nltk.download('all')
from nltk.tokenize import sent_tokenize  # tokenizes sentences
import re

from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Make sure the NLTK stopword corpus is available locally
nltk.download("stopwords")

# Module-level list of English stopwords (review_cleaner builds its own local set as well)
eng_stopwords = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jetin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from nltk.corpus import stopwords
from nltk.util import ngrams


# Module-level stemmer and lemmatizer instances.
# NOTE(review): review_cleaner creates its own local `ps` and `wnl`, so these
# globals appear unused in the visible cells — confirm before removing.
ps = PorterStemmer()
wnl = WordNetLemmatizer()


def review_cleaner(reviews, lemmatize=True, stem=False):
    """
    Clean and preprocess a collection of reviews.

    Each review goes through the following steps:

    1. Strip HTML markup.
    2. Collect emoticons such as ``:)``, ``;-(`` or ``=D`` (done before
       punctuation is removed, otherwise they would be lost).
    3. Replace every non-letter character with a space.
    4. Lower-case the text and split it on whitespace.
    5. Drop English stopwords, then optionally lemmatize and/or stem the
       remaining words.
    6. Re-join the words, followed by the collected emoticons, into one string.

    Parameters
    ----------
    reviews : iterable of str
        Raw review texts to clean (e.g. ``dataset["SentimentText"]``).
    lemmatize : bool, default True
        Apply the WordNet lemmatizer to every kept word.
    stem : bool, default False
        Apply the Porter stemmer to every kept word. When both flags are set
        the word is lemmatized first and the result is stemmed.

    Returns
    -------
    list of str
        One cleaned, space-separated string per input review, in input order.
    """
    ps = PorterStemmer()
    wnl = WordNetLemmatizer()

    # Loop-invariant helpers: build them once instead of once per review.
    eng_stopwords = set(stopwords.words("english"))
    emoticon_pattern = re.compile(r"(?::|;|=)(?:-)?(?:\)|\(|D|P)")

    cleaned_reviews = []
    # Iterate over the argument, not the global dataset, so the function
    # cleans whatever collection the caller passes in.
    for i, review in enumerate(reviews):
        # print progress
        if (i + 1) % 500 == 0:
            print("Done with %d reviews" % (i + 1))

        # 1. Remove HTML tags. The parser is named explicitly so the result
        #    does not depend on which optional parsers happen to be installed.
        text = bs.BeautifulSoup(review, "html.parser").text

        # 2. Use regex to find emoticons
        emoticons = emoticon_pattern.findall(text)

        # 3. Remove punctuation (anything that is not a letter)
        text = re.sub("[^a-zA-Z]", " ", text)

        # 4. Tokenize into words (all lower case)
        words = text.lower().split()

        # 5. Remove stopwords, then normalise the remaining words
        clean_review = []
        for word in words:
            if word in eng_stopwords:
                continue
            if lemmatize:
                word = wnl.lemmatize(word)
            if stem:
                # The token "oed" is dropped instead of stemmed
                # (presumably a workaround for a stemmer failure — TODO confirm).
                if word == "oed":
                    continue
                word = ps.stem(word)
            clean_review.append(word)

        # 6. Join the review to one sentence, emoticons last
        cleaned_reviews.append(" ".join(clean_review + emoticons))

    return cleaned_reviews


In [7]:
from sklearn.ensemble import RandomForestClassifier

# CountVectorizer can actually handle a lot of the preprocessing for us
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics  # for confusion matrix, accuracy score etc
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix


# Seed NumPy's global random number generator so a fresh run starts from a known state
np.random.seed(0)


def train_predict_sentiment(
    cleaned_reviews, y=None, ngram=1, max_features=1000
):
    """
    Train and evaluate a bag-of-words random forest sentiment classifier.

    The reviews are split 80/20 into train and validation sets, vectorised
    with ``CountVectorizer`` (fitted on the training split only), and used to
    fit a 50-tree ``RandomForestClassifier``. Training/validation accuracy,
    the validation confusion matrix and the ten most important features are
    printed.

    Parameters
    ----------
    cleaned_reviews : sequence of str
        Pre-processed review texts, one string per review.
    y : array-like, optional
        Sentiment labels aligned with ``cleaned_reviews``. Defaults to
        ``dataset["Sentiment"]``, looked up when the function is called.
    ngram : int, default 1
        Upper bound of the n-gram range; n-grams of size 1..ngram are used.
    max_features : int, default 1000
        Maximum vocabulary size kept by the vectorizer.

    Returns
    -------
    None
        All results are reported via ``print``.
    """
    if y is None:
        # Resolve the default at call time so a reloaded or modified `dataset`
        # is picked up instead of the object that existed when the function
        # was defined.
        y = dataset["Sentiment"]

    print("Creating the bag of words model!\n")
    # CountVectorizer is scikit-learn's bag of words tool, here we show more keywords
    vectorizer = CountVectorizer(
        ngram_range=(1, ngram),
        analyzer="word",
        tokenizer=None,
        preprocessor=None,
        stop_words=None,
        max_features=max_features,
    )

    X_train, X_test, y_train, y_test = train_test_split(
        cleaned_reviews, y, random_state=0, test_size=0.2
    )

    # Fit the vocabulary on the training split only, then reuse it for the test split
    train_bag = vectorizer.fit_transform(X_train).toarray()
    test_bag = vectorizer.transform(X_test).toarray()

    print("Training the random forest classifier!\n")
    # Initialize a Random Forest classifier with 50 trees. A fixed random_state
    # makes every call reproducible regardless of how many times, or in which
    # order, the notebook cells were executed.
    forest = RandomForestClassifier(n_estimators=50, random_state=0)
    # Fit the forest to the training set, using the bag of words as
    # features and the sentiment labels as the target variable
    forest = forest.fit(train_bag, y_train)

    train_predictions = forest.predict(train_bag)
    test_predictions = forest.predict(test_bag)

    train_acc = metrics.accuracy_score(y_train, train_predictions)
    valid_acc = metrics.accuracy_score(y_test, test_predictions)
    print(
        " The training accuracy is: ",
        train_acc,
        "\n",
        "The validation accuracy is: ",
        valid_acc,
    )
    print()
    print("CONFUSION MATRIX:")
    print("         Predicted")
    print("          neg pos")
    print(" Actual")
    # Rows are the true labels, columns the predicted labels
    c = confusion_matrix(y_test, test_predictions)
    print("     neg  ", c[0])
    print("     pos  ", c[1])

    # Extract feature importance
    print("\nTOP TEN IMPORTANT FEATURES:")
    importances = forest.feature_importances_
    top_10 = np.argsort(importances)[::-1][:10]
    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); support both and build the
    # vocabulary list once rather than once per reported feature.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = get_names()
    print([str(feature_names[ind]) for ind in top_10])

In [8]:
# Baseline experiment: reviews are cleaned with neither lemmatizing nor stemming,
# then modelled with unigrams and a 1000-word vocabulary
original_clean_reviews = review_cleaner(
    reviews=dataset["SentimentText"],
    stem=False,
    lemmatize=False,
)
train_predict_sentiment(
    original_clean_reviews,
    dataset["Sentiment"],
    max_features=1000,
    ngram=1,
)

Done with 500 reviews
Done with 1000 reviews
Done with 1500 reviews
Done with 2000 reviews
Done with 2500 reviews
Done with 3000 reviews
Done with 3500 reviews
Done with 4000 reviews
Done with 4500 reviews
Done with 5000 reviews
Done with 5500 reviews
Done with 6000 reviews
Done with 6500 reviews
Done with 7000 reviews
Done with 7500 reviews
Done with 8000 reviews
Done with 8500 reviews
Done with 9000 reviews
Done with 9500 reviews
Done with 10000 reviews
Done with 10500 reviews
Done with 11000 reviews
Done with 11500 reviews
Done with 12000 reviews
Done with 12500 reviews
Done with 13000 reviews
Done with 13500 reviews
Done with 14000 reviews
Done with 14500 reviews
Done with 15000 reviews
Done with 15500 reviews
Done with 16000 reviews
Done with 16500 reviews
Done with 17000 reviews
Done with 17500 reviews
Done with 18000 reviews
Done with 18500 reviews
Done with 19000 reviews
Done with 19500 reviews
Done with 20000 reviews
Done with 20500 reviews
Done with 21000 reviews
Done with 21

In [9]:
# Original (neither lemmatized nor stemmed) reviews, unigrams, 1000 max_features
original_clean_reviews = review_cleaner(
    reviews=dataset["SentimentText"],
    stem=False,
    lemmatize=False,
)
train_predict_sentiment(
    original_clean_reviews,
    dataset["Sentiment"],
    max_features=1000,
    ngram=1,
)

Done with 500 reviews
Done with 1000 reviews
Done with 1500 reviews
Done with 2000 reviews
Done with 2500 reviews
Done with 3000 reviews
Done with 3500 reviews
Done with 4000 reviews
Done with 4500 reviews
Done with 5000 reviews
Done with 5500 reviews
Done with 6000 reviews
Done with 6500 reviews
Done with 7000 reviews
Done with 7500 reviews
Done with 8000 reviews
Done with 8500 reviews
Done with 9000 reviews
Done with 9500 reviews
Done with 10000 reviews
Done with 10500 reviews
Done with 11000 reviews
Done with 11500 reviews
Done with 12000 reviews
Done with 12500 reviews
Done with 13000 reviews
Done with 13500 reviews
Done with 14000 reviews
Done with 14500 reviews
Done with 15000 reviews
Done with 15500 reviews
Done with 16000 reviews
Done with 16500 reviews
Done with 17000 reviews
Done with 17500 reviews
Done with 18000 reviews
Done with 18500 reviews
Done with 19000 reviews
Done with 19500 reviews
Done with 20000 reviews
Done with 20500 reviews
Done with 21000 reviews
Done with 21

In [10]:
# Lemmatized reviews, unigrams, 1000 max_features
wnl_clean_reviews = review_cleaner(
    reviews=dataset["SentimentText"],
    stem=False,
    lemmatize=True,
)
train_predict_sentiment(
    wnl_clean_reviews,
    dataset["Sentiment"],
    max_features=1000,
    ngram=1,
)

Done with 500 reviews
Done with 1000 reviews
Done with 1500 reviews
Done with 2000 reviews
Done with 2500 reviews
Done with 3000 reviews
Done with 3500 reviews
Done with 4000 reviews
Done with 4500 reviews
Done with 5000 reviews
Done with 5500 reviews
Done with 6000 reviews
Done with 6500 reviews
Done with 7000 reviews
Done with 7500 reviews
Done with 8000 reviews
Done with 8500 reviews
Done with 9000 reviews
Done with 9500 reviews
Done with 10000 reviews
Done with 10500 reviews
Done with 11000 reviews
Done with 11500 reviews
Done with 12000 reviews
Done with 12500 reviews
Done with 13000 reviews
Done with 13500 reviews
Done with 14000 reviews
Done with 14500 reviews
Done with 15000 reviews
Done with 15500 reviews
Done with 16000 reviews
Done with 16500 reviews
Done with 17000 reviews
Done with 17500 reviews
Done with 18000 reviews
Done with 18500 reviews
Done with 19000 reviews
Done with 19500 reviews
Done with 20000 reviews
Done with 20500 reviews
Done with 21000 reviews
Done with 21

In [11]:
# Stemmed reviews, unigrams, 1000 max_features
ps_clean_reviews = review_cleaner(
    reviews=dataset["SentimentText"],
    stem=True,
    lemmatize=False,
)
train_predict_sentiment(
    ps_clean_reviews,
    dataset["Sentiment"],
    max_features=1000,
    ngram=1,
)

Done with 500 reviews
Done with 1000 reviews
Done with 1500 reviews
Done with 2000 reviews
Done with 2500 reviews
Done with 3000 reviews
Done with 3500 reviews
Done with 4000 reviews
Done with 4500 reviews
Done with 5000 reviews
Done with 5500 reviews
Done with 6000 reviews
Done with 6500 reviews
Done with 7000 reviews
Done with 7500 reviews
Done with 8000 reviews
Done with 8500 reviews
Done with 9000 reviews
Done with 9500 reviews
Done with 10000 reviews
Done with 10500 reviews
Done with 11000 reviews
Done with 11500 reviews
Done with 12000 reviews
Done with 12500 reviews
Done with 13000 reviews
Done with 13500 reviews
Done with 14000 reviews
Done with 14500 reviews
Done with 15000 reviews
Done with 15500 reviews
Done with 16000 reviews
Done with 16500 reviews
Done with 17000 reviews
Done with 17500 reviews
Done with 18000 reviews
Done with 18500 reviews
Done with 19000 reviews
Done with 19500 reviews
Done with 20000 reviews
Done with 20500 reviews
Done with 21000 reviews
Done with 21

In [12]:
# Reviews cleaned with BOTH lemmatize=True and stem=True, unigrams, 1000 max_features.
# Note: this reassigns ps_clean_reviews, replacing the stem-only result of the previous cell.
ps_clean_reviews = review_cleaner(
    reviews=dataset["SentimentText"],
    stem=True,
    lemmatize=True,
)
train_predict_sentiment(
    ps_clean_reviews,
    dataset["Sentiment"],
    max_features=1000,
    ngram=1,
)

Done with 500 reviews
Done with 1000 reviews
Done with 1500 reviews
Done with 2000 reviews
Done with 2500 reviews
Done with 3000 reviews
Done with 3500 reviews
Done with 4000 reviews
Done with 4500 reviews
Done with 5000 reviews
Done with 5500 reviews
Done with 6000 reviews
Done with 6500 reviews
Done with 7000 reviews
Done with 7500 reviews
Done with 8000 reviews
Done with 8500 reviews
Done with 9000 reviews
Done with 9500 reviews
Done with 10000 reviews
Done with 10500 reviews
Done with 11000 reviews
Done with 11500 reviews
Done with 12000 reviews
Done with 12500 reviews
Done with 13000 reviews
Done with 13500 reviews
Done with 14000 reviews
Done with 14500 reviews
Done with 15000 reviews
Done with 15500 reviews
Done with 16000 reviews
Done with 16500 reviews
Done with 17000 reviews
Done with 17500 reviews
Done with 18000 reviews
Done with 18500 reviews
Done with 19000 reviews
Done with 19500 reviews
Done with 20000 reviews
Done with 20500 reviews
Done with 21000 reviews
Done with 21

### SENTIMENT ANALYSIS USING RNN

In [1]:
from __future__ import print_function

from tensorflow.keras.datasets import imdb

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models  import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.utils import normalize

In [5]:
import numpy as np
import matplotlib.pyplot as plt
# Apply the 'ggplot' style to matplotlib figures drawn after this point
plt.style.use('ggplot')

In [6]:
max_features = 10000  # Only include top 10,000 words in the vocabulary (also the Embedding input size)
maxlen = 500  # Every review is padded or truncated to exactly 500 tokens
batch_size = 32  # Batch size passed to model.fit

In [7]:
# Load the IMDB review dataset shipped with Keras, restricted to a vocabulary of max_features words
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = max_features)

In [8]:
# Shapes before padding: one variable-length sequence per review
X_train.shape, X_test.shape


((25000,), (25000,))

In [10]:
# Pad or truncate every review so that each sequence has exactly `maxlen` (500) tokens.
# NOTE(review): with pad_sequences' default settings (padding='pre', truncating='pre')
# short reviews get zeros prepended and long reviews keep their LAST 500 tokens —
# confirm this is the intended behaviour.
from tensorflow.keras.preprocessing.sequence import pad_sequences
X_train = pad_sequences(X_train, maxlen = maxlen)
X_test = pad_sequences(X_test, maxlen = maxlen)


In [11]:
# Shapes after padding: every review is now a fixed-length row of maxlen tokens
X_train.shape, X_test.shape

((25000, 500), (25000, 500))

In [16]:

# Sentiment classifier: word embedding -> single LSTM layer -> one sigmoid output unit
model = Sequential(
    [
        Embedding(max_features, 128),
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid'),
    ]
)

In [17]:
# Print the layer-by-layer architecture and parameter counts
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         1280000   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 1,411,713
Trainable params: 1,411,713
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Binary classification setup: binary cross-entropy loss, RMSprop optimizer, accuracy as the reported metric
model.compile(loss = 'binary_crossentropy', optimizer = 'rmsprop', metrics = ['acc'])

In [19]:
# Train for 15 epochs, reserving 20% of the training data for validation;
# the returned History object is kept in `history`
history = model.fit(X_train, y_train, batch_size = batch_size, epochs = 15, validation_split = 0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [20]:
# Evaluate on the held-out test set; the result is [loss, accuracy]
model.evaluate(X_test, y_test)



[0.3697262788462639, 0.88292]