In [1]:
import nltk
import pandas as pd

In [2]:
# Movie-review dataset used throughout the notebook: a bz2-compressed CSV
# fetched directly from GitHub. Later cells read its 'review' and
# 'sentiment' columns.
reviews_csv_url = r'https://github.com/dipanjanS/nlp_workshop_dhs18/raw/master/Unit%2011%20-%20Sentiment%20Analysis%20-%20Unsupervised%20Learning/movie_reviews.csv.bz2'
dataset = pd.read_csv(filepath_or_buffer=reviews_csv_url)

In [3]:
# build train and test datasets
# A single fixed index separates the two splits: rows before it are used
# for training, rows from it onward are held out for testing.
split_at = 35000
reviews, sentiments = dataset['review'].values, dataset['sentiment'].values

train_reviews, test_reviews = reviews[:split_at], reviews[split_at:]
train_sentiments, test_sentiments = sentiments[:split_at], sentiments[split_at:]

In [4]:
import contractions
from bs4 import BeautifulSoup
import numpy as np
import re
import tqdm
import unicodedata


def strip_html_tags(text):
  """Return the visible text of an HTML fragment.

  <iframe> and <script> elements are removed (together with their contents)
  before the text is extracted, and every run of carriage returns / line
  feeds in the result is collapsed into a single newline.

  Args:
    text: Markup (or plain text) to clean.

  Returns:
    The extracted text as a str.
  """
  soup = BeautifulSoup(text, "html.parser")
  # Drop embedded frames and scripts so their contents do not end up in the
  # extracted text. A plain loop is used because only the side effect matters.
  for tag in soup(['iframe', 'script']):
    tag.extract()
  stripped_text = soup.get_text()
  # Inside a character class '|' is a literal, not an alternation: the former
  # pattern '[\r|\n|\r\n]+' also rewrote pipe characters as newlines.
  stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
  return stripped_text

def remove_accented_chars(text):
  """Reduce text to plain ASCII, keeping the base letter of accented characters.

  The text is NFKD-normalized, which splits an accented letter into its base
  letter plus combining marks; every character that cannot be encoded as
  ASCII is then discarded.

  Args:
    text: The str to convert.

  Returns:
    A str containing only ASCII characters.
  """
  decomposed = unicodedata.normalize('NFKD', text)
  # 'ignore' silently drops whatever has no ASCII representation
  # (combining marks, symbols, non-Latin letters).
  ascii_bytes = decomposed.encode('ascii', 'ignore')
  return ascii_bytes.decode('utf-8', 'ignore')

def pre_process_corpus(docs):
  """Normalize a collection of raw text documents.

  Each document is processed in this order: HTML is stripped, newlines, tabs
  and carriage returns become spaces, the text is lower-cased, accents are
  removed, the text is passed through contractions.fix, every character that
  is not an ASCII letter, digit or whitespace is deleted, runs of spaces are
  collapsed to one, and leading/trailing whitespace is trimmed.

  Args:
    docs: Iterable of str documents; a tqdm progress bar is shown while
      iterating over it.

  Returns:
    A list with one normalized str per input document, in input order.
  """
  # The translation table does not depend on the document, so build it once.
  whitespace_to_space = str.maketrans("\n\t\r", "   ")
  norm_docs = []
  for doc in tqdm.tqdm(docs):
    doc = strip_html_tags(doc)
    doc = doc.translate(whitespace_to_space)
    doc = doc.lower()
    doc = remove_accented_chars(doc)
    doc = contractions.fix(doc)
    # remove special characters (anything but letters, digits, whitespace).
    # The flags must be passed by keyword: the 4th positional argument of
    # re.sub is `count`, so passing re.I|re.A there capped the number of
    # removals at 258 per document and applied no flags at all.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I|re.A)
    # collapse runs of spaces left behind by the previous steps
    doc = re.sub(' +', ' ', doc)
    doc = doc.strip()
    norm_docs.append(doc)

  return norm_docs

In [8]:
# Run both splits through the same normalization pipeline; each call shows
# its own progress bar.
norm_train_reviews = pre_process_corpus(docs=train_reviews)
norm_test_reviews = pre_process_corpus(docs=test_reviews)

100%|██████████| 35000/35000 [00:24<00:00, 1409.55it/s]
100%|██████████| 15000/15000 [00:11<00:00, 1354.20it/s]


In [7]:
import gensim
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Activation, Dense
from sklearn.preprocessing import LabelEncoder

In [9]:
le = LabelEncoder()
# tokenize train reviews & encode train labels
tokenized_train = [nltk.word_tokenize(text)
                       for text in norm_train_reviews]
# the label -> integer mapping is learned from the training labels only
y_train = le.fit_transform(train_sentiments)
# tokenize test reviews & encode test labels
tokenized_test = [nltk.word_tokenize(text)
                       for text in norm_test_reviews]
# Reuse the mapping fitted on the training labels. Calling fit_transform here
# would re-fit the encoder on the test labels, which can silently assign
# different integers if the set of test labels differs from the training one.
y_test = le.transform(test_sentiments)


In [11]:
# Inspect the tokenized test reviews (a list holding one token list per review).
# NOTE(review): this echoes the entire tokenized corpus into the cell output;
# consider a slice such as tokenized_test[0][:20] if only a preview is needed.
tokenized_test

[['just',
  'do',
  'not',
  'bother',
  'i',
  'thought',
  'i',
  'would',
  'see',
  'a',
  'movie',
  'with',
  'great',
  'supspense',
  'and',
  'actionbut',
  'it',
  'grows',
  'boring',
  'and',
  'terribly',
  'predictable',
  'after',
  'the',
  'interesting',
  'start',
  'in',
  'the',
  'middle',
  'of',
  'the',
  'film',
  'you',
  'have',
  'a',
  'little',
  'social',
  'drama',
  'and',
  'all',
  'tension',
  'is',
  'lost',
  'because',
  'it',
  'slows',
  'down',
  'the',
  'speed',
  'towards',
  'the',
  'end',
  'the',
  'it',
  'gets',
  'better',
  'but',
  'not',
  'really',
  'great',
  'i',
  'think',
  'the',
  'director',
  'took',
  'this',
  'movie',
  'just',
  'too',
  'serious',
  'in',
  'such',
  'a',
  'kind',
  'of',
  'a',
  'movie',
  'even',
  'if',
  'you',
  'do',
  'not',
  'care',
  'about',
  'the',
  'plot',
  'at',
  'least',
  'you',
  'want',
  'some',
  'nice',
  'action',
  'i',
  'nearly',
  'dozed',
  'off',
  'in',
  'the',
  '

In [12]:
# Inspect the integer-encoded test labels produced by the LabelEncoder.
y_test

array([0, 1, 0, ..., 0, 0, 0])