<a href="https://colab.research.google.com/github/lilianabs/nlp-basics/blob/main/Simple_RNN_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2024-02-09 04:11:48--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-02-09 04:11:48--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-02-09 04:11:49--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [18]:
!unzip glove*.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [None]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from keras.preprocessing.text import one_hot, Tokenizer
from keras.models import Sequential
from keras.layers import Activation, Dropout, Dense
from keras.layers import LSTM, SimpleRNN, Embedding
from keras.preprocessing.sequence import pad_sequences

In [None]:
df = pd.read_csv('IMDB Dataset.csv')

In [None]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Data preparation

In [None]:
def preprocess_text(sen):
  # Remove Stopwords
  pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
  sen = pattern.sub('', sen)

  # Remove html tags
  sen = re.sub(r'<[^>]+>', ' ', sen)

  # Remove punctuations and numbers
  sen = re.sub(r'[^\w\s]', ' ', sen)

  # Remove single characters
  sen = re.sub(r"\s+[a-zA-Z]\s+", ' ', sen)

  # Remove multiple spaces
  sen = re.sub(r'\s+', ' ', sen)

  # Lowercase
  sen = sen.lower()

  return sen

In [None]:
X = []
sentences = list(df['review'])

for sen in sentences:
  X.append(preprocess_text(sen))

In [None]:
y = np.array(list(map(lambda x: 1 if x=="positive" else 0, df['sentiment'])))

In [None]:
X[1]

'a wonderful little production the filming technique unassuming old time bbc fashion gives comforting sometimes discomforting sense realism entire piece the actors extremely well chosen michael sheen got polari voices pat you truly see seamless editing guided references williams diary entries well worth watching terrificly written performed piece masterful production one great master comedy life the realism really comes home little things fantasy guard rather use traditional dream techniques remains solid disappears it plays knowledge senses particularly scenes concerning orton halliwell sets particularly flat halliwell murals decorating every surface terribly well done '

In [None]:
len(X[1])

678

In [None]:
# Tokenize words into numerical sequences
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X)

X = word_tokenizer.texts_to_sequences(X)

In [None]:
len(X[1])

92

In [None]:
vocab_lenght = len(word_tokenizer.word_index) + 1

In [None]:
vocab_lenght

101696

In [None]:
# Padding
max_len = 100

X = pad_sequences(X, padding='post', maxlen=max_len)

In [None]:
X[1]

array([  562,   302,    46,   262,     1,  1295,  2914, 17675,    71,
           9,  2185,  1497,   321, 13240,   444, 25698,   189,  1761,
         351,   322,     1,    66,   476,    15,  2202,   398,  4116,
         100, 63085,  2219,  3222,    96,   279,    14, 13241,   697,
        9708,  1758,  1654,  7311,  6578,    15,   185,    65, 36440,
         316,  2328,   322,  4285,   262,     4,    20,  1057,   110,
          41,     1,  1761,    13,   179,   251,    46,    88,   933,
        2825,   159,   268,  2124,   879,  3071,  1197,  1101,  4885,
           6,   207,  1757,  4511,   491,    59,  3646, 18574, 24663,
         636,   491,   962, 24663, 44259, 26908,    83,  2222,  1895,
          15,   128,     0,     0,     0,     0,     0,     0,     0,
           0], dtype=int32)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Create embedding matrix

In [21]:
embeddings_dict = dict()
glove_file = open('glove.6B.100d.txt', encoding='utf8')

for line in glove_file:
  records = line.split()
  word = records[0]
  vector_dimensions = np.asarray(records[1:], dtype='float32')
  embeddings_dict[word] = vector_dimensions

glove_file.close()

In [22]:
len(embeddings_dict)

400000

In [27]:
embedding_matrix = np.zeros((vocab_lenght, 100))
for word, index in word_tokenizer.word_index.items():
  embedding_vector = embeddings_dict.get(word)
  if embedding_vector is not None:
    embedding_matrix[index] = embedding_vector

In [28]:
embedding_matrix.shape

(101696, 100)

## RNN model

In [None]:
rnn = Sequential()
embedding_layer = Embedding(vocab_lenght, 100, weights=[embedding_matrix],
                            input_length=max_len, trainable=False)