# Movie reviews

In [1]:
import pandas as pd
import numpy as np


In [2]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Embedding

Using TensorFlow backend.


In [3]:
from keras.layers import SpatialDropout1D

## data exploration

In [4]:
# Ratings table for the negative training reviews. Its row count determines
# how many neg/<i>.txt files data_loader() reads, and its 'rating' column is
# one-hot encoded later for the rating-prediction model.
data_neg = pd.read_csv("./movie_reviews/movie_reviews/train/neg_rating.csv")

In [5]:
# Ratings table for the positive training reviews. Its row count determines
# how many pos/<i>.txt files data_loader() reads, and its 'rating' column is
# one-hot encoded later for the rating-prediction model.
data_pos = pd.read_csv("./movie_reviews/movie_reviews/train/pos_rating.csv")

Load the pre-trained GloVe word vectors (glove.6B, 300-dimensional)

In [7]:
# Lookup table: token -> GloVe vector (float32 numpy array).
embeddings_index = {}
# The GloVe vocabulary contains non-ASCII tokens, so decode explicitly as
# UTF-8 rather than relying on the platform default encoding. The context
# manager guarantees the handle is closed even if parsing fails midway.
with open('./glove.6B/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        # Each line is "<token> <v1> <v2> ... <vN>", separated by single spaces.
        values = line.rstrip('\n').split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


In [8]:
def data_loader(data, sentiment, base_dir="./movie_reviews/movie_reviews/train"):
    """Read one review text file per row of ``data``.

    Files are read from ``<base_dir>/<sentiment>/<i>.txt`` for
    ``i = 0 .. data.shape[0] - 1``.

    Parameters
    ----------
    data : object exposing ``shape``
        Only ``data.shape[0]`` is used, as the number of files to read.
    sentiment : str
        Sub-directory name (the notebook uses ``"neg"`` and ``"pos"``).
    base_dir : str, optional
        Directory containing the sentiment sub-directories. Defaults to the
        path that was previously hard-coded.

    Returns
    -------
    list of list of str
        For each file, the list of lines returned by ``readlines()``.

    Raises
    ------
    OSError
        Propagated from ``open`` if a file is missing or unreadable.
    """
    train = []
    for i in range(data.shape[0]):
        path = "{0}/{1}/{2}.txt".format(base_dir, sentiment, i)
        # Close each handle immediately; the original leaked one descriptor
        # per review, which can exhaust the process limit on large datasets.
        with open(path, "r") as file:
            train.append(file.readlines())
    return train

In [9]:
# Raw negative reviews: one list of lines per file, in file-index order.
d = data_loader(data_neg, "neg")

In [10]:
# Raw positive reviews: one list of lines per file, in file-index order.
d_pos = data_loader(data_pos, "pos")

In [29]:
# Negatives first, then positives. The label vector and the concatenated
# ratings table built later both rely on this ordering.
data = d+d_pos

In [26]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ira/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
import re
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))
def removing_stop_words(data, stop_words=None):
    """Lower-case, tokenise and drop stop words from every review.

    Parameters
    ----------
    data : iterable
        Each element is either a list of text lines (as produced by
        ``readlines()``) or a single string.
    stop_words : collection of str, optional
        Words to discard. Defaults to the module-level ``stop`` set.

    Returns
    -------
    list of list of str
        One list of remaining lower-cased word tokens per review, in the
        original order. A review with no text yields an empty list.
    """
    if stop_words is None:
        stop_words = stop
    lines_without_stopwords = []
    for seq in data:
        # Use the whole review: the original read only seq[0], which dropped
        # every line after the first and raised IndexError on an empty file.
        text = seq if isinstance(seq, str) else " ".join(seq)
        seq_by_words = re.findall(r'(?:\w+)', text.lower(), flags=re.UNICODE)
        lines_without_stopwords.append(
            [word for word in seq_by_words if word not in stop_words]
        )
    return lines_without_stopwords

In [30]:
# NOTE: this overwrites `data` (raw lines -> token lists). Re-running this
# cell without first re-running the `data = d+d_pos` cell feeds already
# tokenised reviews back into removing_stop_words().
data = removing_stop_words(data)

In [31]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
MAX_NUM_WORDS = 1000
MAX_SEQUENCE_LENGTH = 100
def prep_with_tokenize_pad(texts):
    """Turn ``texts`` into padded integer sequences with a Keras ``Tokenizer``.

    A ``Tokenizer`` created with ``num_words=MAX_NUM_WORDS`` is fitted on
    ``texts``, the texts are converted to index sequences, and the sequences
    are passed through ``pad_sequences`` with ``maxlen=MAX_SEQUENCE_LENGTH``.
    The vocabulary size and the padded array's shape are printed.

    Returns
    -------
    tuple
        ``(word_index, data)``: the tokenizer's ``word_index`` mapping and
        the padded array.
    """
    tok = Tokenizer(num_words=MAX_NUM_WORDS)
    tok.fit_on_texts(texts)
    encoded = tok.texts_to_sequences(texts)

    vocab = tok.word_index
    print('Found %s unique tokens.' % len(vocab))

    padded = pad_sequences(encoded, maxlen=MAX_SEQUENCE_LENGTH)
    print("data shape: ", padded.shape)

    return vocab, padded

In [32]:
# NOTE: this overwrites `data` again (token lists -> padded index array), so
# the cell is not idempotent: re-run the preceding preprocessing cells first.
word_index_data, data = prep_with_tokenize_pad(data)

Found 76906 unique tokens.
data shape:  (25000, 100)


In [33]:
# Reviews were concatenated negatives-first (data = d + d_pos), so the labels
# are one 0 per negative row followed by one 1 per positive row. Taking the
# counts from the frames avoids hard-coding the 12500 class boundary.
labels = [0] * data_neg.shape[0] + [1] * data_pos.shape[0]

In [101]:
# Sanity check: the sum of the 0/1 labels is the number of positive examples.
sum(labels)

12500

In [22]:
from keras.initializers import Constant
def embed(word_index, data):
    """Build the embedding weight matrix for the tokenizer vocabulary.

    Row ``i`` holds the vector stored in the module-level ``embeddings_index``
    for the word whose index is ``i``. Rows for words missing from
    ``embeddings_index`` (and row 0) stay all-zero. Words with an index above
    ``MAX_NUM_WORDS`` are skipped.

    Parameters
    ----------
    word_index : dict
        Mapping word -> integer index.
    data
        Not used by this function; kept for call compatibility.

    Returns
    -------
    tuple
        ``(embedding_matrix, num_words)`` where ``num_words`` is
        ``min(MAX_NUM_WORDS, len(word_index)) + 1`` and the matrix has
        ``num_words`` rows.
    """
    # The vector width is taken from the stored entry for the token 'a'.
    dim = embeddings_index.get('a').shape[0]
    num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
    matrix = np.zeros((num_words, dim))
    for token, idx in word_index.items():
        if idx <= MAX_NUM_WORDS:
            vector = embeddings_index.get(token)
            if vector is not None:
                matrix[idx] = vector
    return matrix, num_words

In [19]:
# As we have fake labels for train, split it into train and test sets
# ourselves: 20% is held out for testing, with a fixed seed for repeatability.
from sklearn.model_selection import train_test_split
train, test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

In [112]:
# `train` is passed only to satisfy embed()'s signature; embed() does not
# read its second argument.
embedding_matrix_t, num_words_t = embed(word_index_data, train)

## Binary classification (positive vs. negative)

In [85]:

# Binary sentiment model: frozen GloVe embedding -> dropout -> Conv1D ->
# max-pooling -> LSTM -> single sigmoid unit.
model = Sequential()
# Use the values returned by embed() for this split (num_words_t and
# embedding_matrix_t); the previous `num_words` / `embedding_matrix` names
# exist only as locals inside embed() and raise NameError on a fresh kernel.
# The input length must equal the padded sequence length produced by
# pad_sequences (MAX_SEQUENCE_LENGTH), not 400.
model.add(Embedding(num_words_t, embedding_matrix_t.shape[1],
                    input_length=MAX_SEQUENCE_LENGTH,
                    weights=[embedding_matrix_t], trainable=False))

model.add(Dropout(0.2))
model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [115]:
# Train for 6 epochs, holding out 10% of the training split for validation.
# y_train is wrapped in np.array before being passed to Keras.
model.fit(train, np.array(y_train), validation_split=0.1, epochs = 6)

Train on 18000 samples, validate on 2000 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f8e21075c50>

In [116]:
# [loss, accuracy] on the training split. The labels are converted to an
# array, matching the fit() call, instead of being passed as a Python list.
scores = model.evaluate(train, np.asarray(y_train), verbose=0)
print(scores)

[0.10220872561708093, 0.9625]


In [117]:
# [loss, accuracy] on the held-out test split. The labels are converted to an
# array, matching the fit() call, instead of being passed as a Python list.
scores = model.evaluate(test, np.asarray(y_test), verbose=0)
print(scores)

[0.40757103457450866, 0.8658]


# Rating prediction


One-hot encoding of the rating labels

In [13]:
 from sklearn.preprocessing import OneHotEncoder

In [14]:
# Stack the rating tables negatives-first, the same order used when the
# review texts were concatenated (data = d+d_pos).
data_cocn = pd.concat([data_neg, data_pos])

In [15]:
def convert_to_one_hot(ratings=None):
    """One-hot encode rating values.

    Parameters
    ----------
    ratings : pandas.Series or array-like, optional
        Values to encode. Defaults to ``data_cocn['rating']``, so existing
        zero-argument calls behave as before.

    Returns
    -------
    numpy.ndarray
        Float32 array with one row per input value and one column per
        distinct value, as produced by ``pd.get_dummies``.
    """
    if ratings is None:
        ratings = data_cocn['rating']
    dummies = pd.get_dummies(ratings)
    # get_dummies yields uint8 or bool depending on the pandas version; cast
    # to a float dtype so the training targets are numeric either way.
    return dummies.values.astype('float32')

In [37]:
# One-hot rating targets, split with the same test_size and random_state as
# the binary-classification split.
Y = convert_to_one_hot()
train_m, test_m, y_train_m, y_test_m = train_test_split(data, Y, test_size=0.2, random_state=42)

In [39]:
# embed() does not read its second argument, so this matrix is built from
# word_index_data alone.
embedding_matrix_m, num_words_m = embed(word_index_data, train_m)

In [54]:
# Rating model: frozen GloVe embedding -> spatial dropout -> LSTM -> softmax.
model = Sequential()
# Take the embedding width and the input length from the data instead of
# hard-coding 300 and 100, so the layer stays consistent if the GloVe file or
# MAX_SEQUENCE_LENGTH changes.
model.add(Embedding(num_words_m, embedding_matrix_m.shape[1],
                    input_length=MAX_SEQUENCE_LENGTH,
                    weights=[embedding_matrix_m], trainable=False))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# One output unit per one-hot column, i.e. per distinct rating in the data.
model.add(Dense(y_train_m.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


batch_size = 64

# Hold out 10% of the training split for validation during fitting.
fit = model.fit(train_m, y_train_m, epochs=20, batch_size=batch_size, validation_split=0.1)

Train on 18000 samples, validate on 2000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [55]:
# [loss, accuracy] of the rating model on the held-out test split.
scores = model.evaluate(test_m, y_test_m, verbose=0)
print(scores)

[1.6258691785812378, 0.426]
