In [20]:
import os, re, string, numpy
import nltk
import keras
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def load_docs(filename):
  file = open(filename, 'r')
  text = file.read()
  file.close()
  return text

def clean_doc(filename):
  text = load_docs(filename)
  tokens = text.split()
  tokens = [t.lower() for t in tokens]
  tokens = [re.sub('[^a-zA-Z]', ' ', t) for t in tokens]
  tokens = [t for t in tokens if t.isalpha()]
  sw = set(stopwords.words('english'))
  tokens = [t for t in tokens if t not in sw]
  tokens = [t for t in tokens if len(t)>1]
  return tokens

In [3]:
def doc_to_line(filename, vocab):
  tokens = clean_doc(filename)
  tokens = [w for w in tokens if w in vocab]
  return ' '.join(tokens)

def process(filename, vocab, istrain):
  lines = list()
  for dir in os.listdir(filename):
    if istrain and dir.startswith('cv9'):
      continue
    if not istrain and not dir.startswith('cv9'):
      continue
    path = filename + '/' + dir
    line = doc_to_line(path, vocab)
    lines.append(line)
  return lines

In [4]:
def load_clean_dataset(vocab, istrain, n_dir, p_dir):
  neg = process(n_dir, vocab, istrain)
  pos = process(p_dir, vocab, istrain)
  docs = neg+pos
  labels = [0 for _ in range(len(neg))]+[1 for _ in range(len(pos))]
  return docs, labels

def create_tokenizer(lines):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(lines)
  return tokenizer

In [11]:
def create_model( n_words):
  model = keras.models.Sequential([
                                   keras.layers.Dense(50, input_shape=(n_words,), activation='relu'),
                                   keras.layers.Dense(1, activation='sigmoid')
  ])
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

In [26]:
def clean_review(review):
  tokens = review.split()
  tokens = [t.lower() for t in tokens]
  tokens = [re.sub('[^a-zA-Z]', ' ', t) for t in tokens]
  tokens = [t for t in tokens if t.isalpha()]
  sw = set(stopwords.words('english'))
  tokens = [t for t in tokens if t not in sw]
  tokens = [t for t in tokens if len(t)>1]
  return tokens

def predict_sentiment(model, vocab, tokenizer, review):
  tokens = clean_review(review)
  tokens = [w for w in tokens if w in vocab]
  line = ' '.join(tokens)
  encoded = tokenizer.texts_to_matrix([line], mode='binary')
  y = model.predict(encoded, verbose=0)
  pos_percentage = y[0, 0]
  if(round(pos_percentage)==0):
    return 1-pos_percentage, 'NEGATIVE'
  return pos_percentage, 'POSITIVE'

In [6]:
n_dir = '/content/drive/MyDrive/Colab Notebooks/NLP/data sets/txt_sentoken/neg'
p_dir = '/content/drive/MyDrive/Colab Notebooks/NLP/data sets/txt_sentoken/pos'
v_dir = '/content/drive/MyDrive/Colab Notebooks/NLP/data sets/txt_sentoken/vocab.txt'

In [21]:
vocab = load_docs(v_dir)
vocab = set(vocab.split())

train_docs, ytrain = load_clean_dataset(vocab, True, n_dir, p_dir)
test_docs, ytest = load_clean_dataset(vocab, False, n_dir, p_dir)

tokenizer = create_tokenizer(train_docs)

X_train = tokenizer.texts_to_matrix(train_docs, mode='binary')
X_test = tokenizer.texts_to_matrix(test_docs, mode='binary')

ytrain = numpy.array(ytrain)
ytesy = numpy.array(ytest)

In [22]:
n_words = X_train.shape[1]
model = create_model(n_words)
model.fit(X_train, ytrain, epochs=10, verbose=2)

Epoch 1/10
57/57 - 1s - loss: 0.4960 - accuracy: 0.7672
Epoch 2/10
57/57 - 1s - loss: 0.0894 - accuracy: 0.9856
Epoch 3/10
57/57 - 0s - loss: 0.0246 - accuracy: 1.0000
Epoch 4/10
57/57 - 0s - loss: 0.0094 - accuracy: 1.0000
Epoch 5/10
57/57 - 0s - loss: 0.0049 - accuracy: 1.0000
Epoch 6/10
57/57 - 0s - loss: 0.0029 - accuracy: 1.0000
Epoch 7/10
57/57 - 0s - loss: 0.0019 - accuracy: 1.0000
Epoch 8/10
57/57 - 0s - loss: 0.0013 - accuracy: 1.0000
Epoch 9/10
57/57 - 0s - loss: 9.8731e-04 - accuracy: 1.0000
Epoch 10/10
57/57 - 0s - loss: 7.6204e-04 - accuracy: 1.0000


<keras.callbacks.History at 0x7fe65fd34c50>

In [30]:
p_review = 'best movie ever, it was great. i definitely recomend it'
percent, senti = predict_sentiment(model, vocab, tokenizer, p_review)
print('review: [%s] \nSentiment: %s (%.3f%%)'%(p_review, senti, percent*100))

n_review = 'this movie is so boring'
per, sen = predict_sentiment(model, vocab, tokenizer, n_review)
print('review: [%s] \nSentiment: %s (%.3f%%)'%(n_review, sen, per*100))

review: [best movie ever, it was great. i definitely recomend it] 
Sentiment: POSITIVE (61.225%)
review: [this movie is so boring] 
Sentiment: NEGATIVE (66.642%)
