In [0]:
!pip install contractions
import contractions
import re
import nltk
import spacy
import pandas as pd

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')



In [0]:
def remove_html(text):
  cleanr = re.compile('<.*?>')
  return re.sub(cleanr, '', text)

def decontract(text):
  return contractions.fix(text)

def tokenize(text):
  # makes text lowercase, removes all non-alphabetic chars and tokenizes it
  words = word_tokenize(text.lower())
  words = [word for word in words if word.isalpha()]
  return words

def remove_stopwords(text):
  stop_words = set(stopwords.words('english')) 
  words = [word for word in text if not word in stop_words]
  return words

def lemmatize(text):
  # Initialize spacy 'en' model, keeping only tagger component needed for lemmatization
  nlp = spacy.load('en', disable=['parser', 'ner'])
  text = ' '.join(text)  
  doc = nlp(text)
  lemmas = [word.lemma_ if word.lemma_ != "-PRON-" else word.lower_ for word in doc]
  return lemmas

def text_preprocessing(text):
  html = remove_html(text)
  decontracted = decontract(html)
  tokenized = tokenize(decontracted)
  stopword = remove_stopwords(tokenized)
  lemmatized = lemmatize(stopword)
  clean = ' '.join(lemmatized)
  return clean
  

In [0]:
from keras.datasets import imdb
from keras.preprocessing import sequence 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


In [0]:
from google.colab import files
uploaded = files.upload()

In [0]:
def preprocess_text(text):
  #standarize text to lowercase
  text = text.lower()

  #remove html tags
  remove_tags = re.compile('<.*?>')
  text= re.sub(remove_tags, '', text)

  #remove special characters and numbers
  remove_special_char = re.compile('[^a-z ]', re.IGNORECASE)
  text = remove_special_char.sub('', text)

  #expand decontractions
  text = contractions.fix(text)

  #setting stopwords
  stop_words = set(stopwords.words('english')) 

  #for every word in the review, if it's not a stopword, replace it with it's lemma, otherwise go for next word
  processed_text = []
  text = text.split()
  lemmatizer = WordNetLemmatizer()

  for word in text:
    if word in stop_words:
      continue
      
    word = lemmatizer.lemmatize(word)
    word = lemmatizer.lemmatize(word, 'v')
    processed_text.append(word)

  text = ' '.join(processed_text)

  return text

In [0]:
preprocess_text("for the best lemmas ever, i don't know which one to use")

In [0]:
import io
imdb = pd.read_csv(io.BytesIO(uploaded['IMDB Dataset.csv']))

In [0]:

x = []

for review in list(imdb['review']):
  clean = preprocess_text(review)
  x.append(clean)
  if (x.index(clean) % 1000 == 0):
    print(x.index(clean))


In [0]:
y = [1 if sentiment=='positive' else 0 for sentiment in list(imdb['sentiment'])]


In [0]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(x)

In [0]:
x_tokenized = pad_sequences(tokenizer.texts_to_sequences(x), maxlen=100)


In [112]:
print(x[49999], y[49999])
print(x_tokenized[49999])

one expect star trek movie high art fan expect movie good best episode unfortunately movie muddle implausible plot leave cringe far worst nine far movie even chance watch well know character interact another movie save movie include goofy scene kirk spock mccoy yosemitei would say movie worth rental hardly worth watch however true fan need see movie rent movie way see even cable channel avoid movie 0
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    3  170   82 1711    1  235
  345  114  170    1    9   53  177  414    1 3682 3171   47   99 2432
  141  162 2657  141    1   14  424   13   22   24   11 3465   76    1
  279    1  234 2296   19 3326 3898   10   21    1  185 1860  842  185
   13  116  197  114  112    6    1  393    1   38    6   14 1633  838
  570    1]


In [0]:
max_words = 100
vocabulary_size = 5000

# Build the model
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

embedding_size=32
model=Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

print(model.summary())



In [101]:
model.compile(loss='binary_crossentropy', 
             optimizer='adam', 
             metrics=['accuracy'])



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [103]:
batchSize = 64
epochs = 3
hist = model.fit(x_tokenized, y, batch_size=batchSize, epochs=epochs, verbose=1, shuffle=True, validation_split=0.5)



Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [126]:
predicted = ["This is very horrible, I can't believe I bought this service"]
predicted_tokenized = pad_sequences(tokenizer.texts_to_sequences(predicted), maxlen=100)
model.predict_classes(predicted_tokenized)

array([[0]], dtype=int32)

In [127]:
predicted = ["Although I have some doubts, I would be excited to try this when it comes out"]
predicted_tokenized = pad_sequences(tokenizer.texts_to_sequences(predicted), maxlen=100)
model.predict_classes(predicted_tokenized)

array([[0]], dtype=int32)

In [128]:
predicted = ["To be honest, one of the rarest and best services out there"]
predicted_tokenized = pad_sequences(tokenizer.texts_to_sequences(predicted), maxlen=100)
model.predict_classes(predicted_tokenized)

array([[1]], dtype=int32)