In [1]:
from pathlib import Path

import pandas as pd

# Build the location from path components instead of a backslash-separated
# literal, which only resolves on Windows.
train = pd.read_csv(Path('..') / '0.data' / 'raw' / 'imdb_train.csv')

# Row count, followed by a preview of the first rows.
print(len(train))
print(train.head())

17500
      id  labels                                               text
0   1288       0  We saw this on the shelf at the local video st...
1   2064       0  Well, you'd better if you plan on sitting thro...
2  18997       1  This is my favorite Jackie Chan movie and in a...
3  10448       0  The long list of "big" names in this flick (in...
4  16133       1  The great and underrated Marion Davies shows h...


In [2]:
# Display the column labels of the training frame.
train.columns

Index(['id', 'labels', 'text'], dtype='object')

## Clean Data (Tokenization, Lemmatization, Punctuation Removal and Lowercasing)

In [3]:
from nltk.tokenize import RegexpTokenizer
from nltk import stem

def clean_paragraph(para):
    """Return ``para`` as a single string of lower-cased, lemmatized tokens.

    The text is split into runs of word characters (so punctuation is
    dropped), each token is lower-cased and passed through the WordNet
    lemmatizer, and the results are joined with single spaces.
    """
    word_splitter = RegexpTokenizer(r'\w+')
    lemmatizer = stem.WordNetLemmatizer()

    lemmas = []
    for raw_token in word_splitter.tokenize(para):
        lemmas.append(lemmatizer.lemmatize(raw_token.lower()))

    return ' '.join(lemmas)

In [4]:
import time

# perf_counter() is the clock intended for measuring elapsed time;
# time.time() follows the wall clock and can jump if the system time changes.
t0 = time.perf_counter()

# Replace every raw review with its cleaned form produced by clean_paragraph.
train['text'] = train['text'].map(clean_paragraph)

# Elapsed seconds for cleaning the whole training set.
print(time.perf_counter() - t0)

58.212397813797


## Build Count Vector on Training Data

In [5]:
import sklearn, nltk
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
import numpy as np

# Tokens are produced by NLTK's word tokenizer; min_df=2 drops tokens that
# appear in fewer than two documents.
# dtype=np.float32 halves the memory of the dense matrix built below compared
# with the default int64 counts, while shape and vocabulary stay the same.
vectorizer = CountVectorizer(
    min_df=2,
    tokenizer=nltk.word_tokenize,
    dtype=np.float32,
)

# Learn the vocabulary on the training reviews and densify the count matrix.
training_vectorized_data = vectorizer.fit_transform(train.text).toarray()

In [7]:
# Expected: 17500 docs, 34325 unique tokens.
# Prints the matrix shape, its container type, and the vocabulary index
# assigned to the token 'underrated'.
print(
    training_vectorized_data.shape,
    type(training_vectorized_data),
    vectorizer.vocabulary_.get('underrated'),
    sep='\n',
)

(17500, 34325)
<class 'numpy.ndarray'>
31896


In [8]:
# Container type of the labels, then the length of the first feature row.
print(
    type(train.labels),
    len(training_vectorized_data[0]),
    sep='\n',
)

<class 'pandas.core.series.Series'>
34325


## Running the model on test data

In [9]:
from pathlib import Path

# Load the test split. The location is built from path components instead of
# a backslash-separated literal, which only resolves on Windows.
test = pd.read_csv(Path('..') / '0.data' / 'raw' / 'imdb_test.csv')
print(len(test))
print(test.head())

# Clean the test reviews with the same routine used for the training reviews.
test['text'] = [clean_paragraph(para) for para in test['text']]

# Vectorize with the already-fitted vectorizer (transform only, no refit) so
# the test columns line up with the training vocabulary.
test_vectorized_data = vectorizer.transform(test['text']).toarray()

7500
      id  labels                                               text
0  20594       1  I am decidedly not in the target audience for ...
1    602       0  Detective Russell Logan(Lou Diamond Phillips)h...
2     29       0  I had some expectation for the movie, since it...
3  20342       1  I think that this movie is very neat. You eith...
4   6230       0  Well I just gave away 95 minutes and 47 second...


In [10]:
print(test_vectorized_data.shape)

# Make the intended sanity checks explicit instead of evaluating a shape and
# discarding it: both matrices must share the same feature columns, and every
# training row needs a label.
assert test_vectorized_data.shape[1] == training_vectorized_data.shape[1], \
    "train/test feature counts differ"
assert training_vectorized_data.shape[0] == len(train.labels), \
    "number of training rows and labels differ"

len(train.labels)

(7500, 34325)


17500

In [11]:
from keras import models, layers

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [12]:
# Take the input width from the feature matrix instead of hard-coding the
# vocabulary size, so the model still matches if the vectorizer settings or
# the data change.
n_features = training_vectorized_data.shape[1]

# Two 16-unit ReLU hidden layers followed by a single sigmoid output unit.
network = models.Sequential()
network.add(layers.Dense(16, activation='relu', input_shape=(n_features,)))
network.add(layers.Dense(16, activation='relu'))
network.add(layers.Dense(1, activation='sigmoid'))






In [None]:
# Configure training: RMSprop optimizer, binary cross-entropy loss, and
# accuracy reported as the metric.
network.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit on the vectorized training reviews for 4 epochs in batches of 512.
history = network.fit(
    x=training_vectorized_data,
    y=train.labels,
    epochs=4,
    batch_size=512,
)



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/4
Epoch 2/4

In [None]:
# Score the trained network on the vectorized test reviews; evaluate() yields
# the loss followed by the accuracy metric configured at compile time.
test_loss, test_acc = network.evaluate(
    x=test_vectorized_data,
    y=test.labels,
)
print('test_accuracy: ', test_acc)

## The accuracy using CountVectorizer and deep learning is now 89%; earlier it was 85%.