In [1]:
# https://medium.com/@annabiancajones/sentiment-analysis-of-reviews-text-pre-processing-6359343784fb
import unicodedata, re, string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Input, GRU, Embedding, Dropout, Activation
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

sns.set(color_codes=True)
%matplotlib inline

## Reading files

In [2]:
# Load the training CSV into a DataFrame.
# NOTE(review): absolute Windows path — this only resolves on a machine with the same
# D:/ layout; point it at your own copy of train.csv before running.
df_train = pd.read_csv('D:/Data_set/IMDB/train.csv')


In [3]:
df_train.head()

Unnamed: 0,text,sentiment
0,For a movie that gets no respect there sure ar...,0
1,Bizarre horror movie filled with famous faces ...,0
2,"A solid, if unremarkable film. Matthau, as Ein...",0
3,It's a strange feeling to sit alone in a theat...,0
4,"You probably all already know this by now, but...",0


In [4]:
# English stop-word list from NLTK, loaded once here so remove_stopwords() can reuse it
# instead of calling stopwords.words('english') for every token.
stop_words = stopwords.words('english')

In [5]:
def remove_non_ascii(words):
    """Strip non-ASCII characters from each token.

    Every token is NFKD-normalized first, so an accented letter is split into
    its base letter plus combining marks; the ASCII encode step then keeps the
    base letter and discards everything that has no ASCII representation.

    Args:
        words: iterable of token strings.

    Returns:
        list of str: one cleaned token per input token, in the same order.
        Tokens made up solely of non-ASCII characters become empty strings
        (they are not removed here).
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

def to_lowercase(words):
    """Lower-case every token.

    Args:
        words: iterable of token strings.

    Returns:
        list of str: the lower-cased tokens, in input order.
    """
    return [token.lower() for token in words]
def remove_punctuation(words):
    """Delete punctuation from each token and drop tokens that become empty.

    A character is removed when it is neither a word character (letter, digit,
    underscore) nor whitespace.

    Args:
        words: iterable of token strings.

    Returns:
        list of str: the surviving tokens with punctuation stripped, in input order.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def remove_numbers(words):
    """Strip digit runs from each token and drop tokens that become empty.

    Digits are deleted, not replaced with a textual representation: 'abc123'
    becomes 'abc' and a purely numeric token such as '42' is removed.

    Args:
        words: iterable of token strings.

    Returns:
        list of str: the surviving tokens with all digits removed, in input order.
    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence
    # (a warning today, an error in future Python versions).
    digit_run = re.compile(r"\d+")
    stripped = (digit_run.sub("", token) for token in words)
    return [token for token in stripped if token != '']
def remove_stopwords(words):
    """Remove stop words from list of tokenized words.

    Filters against the notebook-level `stop_words` collection. Matching is
    exact and case-sensitive, so tokens should be lower-cased first.

    Args:
        words: iterable of token strings.

    Returns:
        list of str: the tokens that are not stop words, in input order.
    """
    # Hash the stop words once per call: each membership test becomes O(1)
    # instead of a linear scan of the list for every token.
    blocked = set(stop_words)
    return [token for token in words if token not in blocked]

def stem_words(words):
    """Stem words in list of tokenized words.

    Args:
        words: iterable of token strings.

    Returns:
        list of str: the stem of each token as produced by LancasterStemmer,
        in input order.
    """
    # LancasterStemmer is provided by `from nltk.stem import ...` in the import cell.
    stemmer = LancasterStemmer()
    return [stemmer.stem(token) for token in words]

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words.

    Each token is lemmatized with part-of-speech fixed to verb (pos='v').

    Args:
        words: iterable of token strings.

    Returns:
        list of str: the lemma of each token as produced by WordNetLemmatizer,
        in input order.
    """
    # WordNetLemmatizer is provided by `from nltk.stem import ...` in the import cell.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, pos='v') for token in words]
def normalize(words):
    """Run the token-cleaning pipeline over one tokenized text.

    Steps are applied in this order, each consuming the previous step's output:
    non-ASCII removal, lower-casing, punctuation removal, digit removal,
    stop-word removal.

    Args:
        words: list of token strings.

    Returns:
        list of str: the cleaned tokens.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        remove_numbers,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words

In [6]:
def clean_sentence1(text):
    sentences=text.apply(nltk.word_tokenize)
    clean_sentence=[]
    for sentence in sentences:
        #print(sentence)
        clean_words=normalize(sentence)
        clean_sentence.append(" ".join(clean_words))
    return clean_sentence

In [7]:
# Add a 'clean_text' column holding the tokenized + normalized version of each review.
df_train['clean_text']=clean_sentence1(df_train['text'])

In [8]:
len(df_train)

25000

In [9]:
# keep=False drops EVERY row whose clean_text occurs more than once (no copy survives).
# NOTE(review): if one copy of each duplicated review should be kept, keep='first'
# is the usual choice — confirm the intent.
remove_duplicate=df_train.drop_duplicates(subset=['clean_text'], keep=False)

In [10]:
# Drop the raw 'text' column; only the cleaned text and the label are used from here on.
Final_data=remove_duplicate.drop(['text'], axis=1)

In [11]:
Final_data.head()


Unnamed: 0,sentiment,clean_text
0,0,movie gets respect sure lot memorable quotes l...
1,0,bizarre horror movie filled famous faces stole...
2,0,solid unremarkable film matthau einstein wonde...
3,0,strange feeling sit alone theater occupied par...
4,0,probably already know additional episodes neve...


## Model parameters

In [12]:
# config values
embed_size = 300 # intended size of each word vector. NOTE(review): not referenced in the visible cells; the RNN's Embedding layer hard-codes 100 — confirm which is intended
max_features = 50000 # how many unique words to use (passed to the Tokenizer as num_words)
maxlen = 400 # number of tokens every review is padded/truncated to by pad_sequences

## Tokenization, padding, and train/test split

In [13]:
tok = tf.keras.preprocessing.text.Tokenizer(num_words=max_features) # Keras tokenizer limited to the max_features most frequent words

In [14]:
tok.fit_on_texts(list(Final_data['clean_text'])) # build the word -> integer-id vocabulary from the cleaned reviews

In [15]:
print(len(tok.word_index))
# word_index lists every distinct token seen by fit_on_texts, but texts_to_sequences
# only emits ids below num_words (= max_features). Size the embedding table to the
# ids that can actually occur; the +1 accounts for id 0, which is used for padding.
vocab_size = min(max_features, len(tok.word_index) + 1)

92825


In [16]:
# NOTE: despite its name, train_df is not a DataFrame — it holds the integer-encoded reviews.
train_df = tok.texts_to_sequences(list(Final_data['clean_text'])) # each cleaned review -> list of integer word ids
train_df = tf.keras.preprocessing.sequence.pad_sequences(train_df, maxlen=maxlen) # pad/truncate every sequence to maxlen ids

In [17]:
len(train_df)

24804

In [None]:
#train_df[1]

In [18]:
# Train test split
from sklearn.model_selection import train_test_split

In [19]:
# Target labels; row-aligned with train_df because both are derived from Final_data.
y=Final_data['sentiment']

In [20]:
# Hold out 10% of the rows; random_state pins the shuffle so the split is repeatable.
# NOTE(review): no stratify= is passed, so class balance in the hold-out set is left to chance.
X_train, X_test, y_train, y_test = train_test_split(train_df,y , test_size=0.1, random_state=42)

In [21]:
type(X_train)

numpy.ndarray

In [22]:
from tensorflow import keras

## Model Build

In [23]:
class RNN(keras.Model):
    """Two-layer GRU classifier over padded sequences of word ids.

    Pipeline: Embedding -> GRU (returns the full sequence) -> GRU (returns the
    final state) -> Dense(1). The output is one raw score per example; no
    sigmoid is applied inside the model, so pair it with a loss configured with
    from_logits=True.

    Args:
        units: hidden size of each GRU layer.
        num_classes: accepted for interface compatibility but not used; the
            head is always a single-unit Dense layer.
        num_layers: accepted for interface compatibility but not used; exactly
            two GRU layers are always built.
    """

    def __init__(self, units, num_classes, num_layers):
        super(RNN, self).__init__()

        # First GRU emits an output per time step so the second GRU can consume
        # the whole sequence; the second GRU emits only its final output.
        self.rnn = keras.layers.GRU(units, return_sequences=True)
        self.rnn2 = keras.layers.GRU(units)

        # Maps each word id to a 100-dimensional vector. `vocab_size` is the
        # notebook-level value computed after fitting the tokenizer; 400 is the
        # declared input sequence length.
        self.embedding = keras.layers.Embedding(vocab_size, 100, input_length=400)
        self.fc = keras.layers.Dense(1)

    def call(self, inputs, training=None, mask=None):
        """Forward pass.

        Args:
            inputs: batch of padded word-id sequences, [batch, sentence len].
            training: unused by this implementation.
            mask: unused by this implementation.

        Returns:
            Tensor of shape [batch, 1] holding one raw (un-activated) score per example.
        """
        # [batch, sentence len] => [batch, sentence len, word embedding]
        x = self.embedding(inputs)
        # [batch, sentence len, word embedding] => [batch, sentence len, units]
        x = self.rnn(x)
        # [batch, sentence len, units] => [batch, units]
        x = self.rnn2(x)
        # [batch, units] => [batch, 1]
        return self.fc(x)

In [24]:
units = 64  # hidden size of each GRU layer
num_classes = 2  # passed to RNN, which does not use it (the head is always Dense(1))
batch_size = 32
epochs = 10
# num_layers is likewise not used by RNN.__init__; two GRU layers are always built.
model = RNN(units, num_classes, num_layers=2)


In [25]:
# The model's Dense(1) head emits raw logits (no sigmoid), hence from_logits=True on the
# loss. The accuracy metric must treat the outputs as logits too: a logit of 0.0
# corresponds to probability 0.5, so threshold at 0.0. The plain 'accuracy' shortcut
# does not know the outputs are logits and would score them against the wrong cut-off.
# name='accuracy' keeps the metric's key in the training logs unchanged.
model.compile(
    optimizer=keras.optimizers.Adam(0.001),
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)],
)

In [26]:
# Train the model; the hold-out split is passed as validation data so its loss and
# accuracy are reported after every epoch.
model.fit(
    np.array(X_train),
    np.array(y_train),
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(np.array(X_test), np.array(y_test)),
    verbose=1,
)

(None, 1)
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 22323 samples, validate on 2481 samples
Epoch 1/10
(None, 1)
(None, 1)
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2bb93b7a88>

In [27]:
# Evaluate the model on the test split. `results` holds the values returned by
# model.evaluate (loss followed by the compiled metrics).
# NOTE(review): this is the same split that was passed to fit() as validation_data,
# so it is not an independent hold-out if it also guided any tuning.
results = model.evaluate(np.array(X_test),np.array(y_test))


