In [3]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/news/

Mounted at /content/drive
/content/drive/MyDrive/news


In [23]:
import pandas as pd
import numpy as np
from numpy import array
import nltk
from nltk.corpus import stopwords
import re
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from numpy import asarray
from numpy import zeros
from keras.layers import LSTM, Embedding
from keras.models import Sequential
from keras.layers.core import Dense

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
imdb_reviews = pd.read_csv("imdb_reviews.csv")
# safety check
imdb_reviews.shape

(50000, 2)

preprocessing

In [6]:
def preprocess(s):
    s = s.lower()

    s = re.compile(r'<[^>]+>').sub('', s)

    # remove punctuations/numbers
    s = re.sub('[^a-zA-Z]', ' ', s)

    # remove single letters
    s = re.sub(r"\s+[a-zA-Z]\s+", ' ', s)

    # remove places with more than one space
    s = re.sub(r'\s+', ' ', s)

    # remove nltk stopwords (regex makes it go wayyy faster than the line below)
    # s = ' '.join([word for word in s.split() if word not in (stopwords.words('english'))])
    s = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*').sub('', s)

    return s

In [7]:
print(preprocess("i use ` rust for BLLLLLAZINGLY fast code    ... its amazing"))

use rust blllllazingly fast code amazing


In [8]:
# preprocessing all reviews
preprocessed_text = []

for review in list(imdb_reviews['review']):
    preprocessed_text.append(preprocess(review))

print(preprocessed_text[1])

wonderful little production filming technique unassuming old time bbc fashion gives comforting sometimes discomforting sense realism entire piece actors extremely well chosen michael sheen got polari voices pat truly see seamless editing guided references williams diary entries well worth watching terrificly written performed piece masterful production one great master comedy life realism really comes home little things fantasy guard rather use traditional dream techniques remains solid disappears plays knowledge senses particularly scenes concerning orton halliwell sets particularly flat halliwell murals decorating every surface terribly well done 


In [10]:
# converting each review's "sentiment" column value to 1 and 0 instead of "positive" and "negative"
sentiments = []

for sentiment in imdb_reviews['sentiment']:
    if sentiment == "positive":
        sentiments.append(1)
    else:
        sentiments.append(0)

sentiments = np.array(sentiments)

print(sentiments[1])

1


In [11]:
# spliting data into training and test set data for the model
X_train, X_test, y_train, y_test = train_test_split(preprocessed_text, sentiments, test_size=0.25, random_state=42)

making embedded layer

In [12]:
# transform raw text into numerical representations suitable for feeding into the model
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [13]:
# our corpus has 90,094 words in it
# also adding one more to the mix for words that dont have a word embedding (90,095 total)
corpus_length = len(tokenizer.word_index) + 1
print(corpus_length)

90095


In [15]:
# padding reviews to 100 characters
X_train = pad_sequences(X_train, padding='post', maxlen=100)
X_test = pad_sequences(X_test, padding='post', maxlen=100)

In [17]:
# creating dict that will contain word embeddings for words found in the glove_word_embeddings.txt
word_embeddings_dict = {}
glove_word_embeddings = open('glove_word_embeddings.txt', encoding="utf8")

for line in glove_word_embeddings:
    values = line.split()
    word = values[0]
    embedding_vector = np.asarray(values[1:], dtype='float32')
    word_embeddings_dict[word] = embedding_vector
glove_word_embeddings.close()

In [18]:
# each row corresponds to the index of the word in the corpus
# the matrix has 100 columns, where each column contains the glove embeddings for the words in the corpus
# matrix can now be used as an initial embedding layer when training the neural network
embedding_matrix = np.zeros((corpus_length, 100))

for word, i in tokenizer.word_index.items():
    embedding_vector = word_embeddings_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [20]:
print(embedding_matrix.shape)

(90095, 100)
