In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')

import spacy;

Using TensorFlow backend.


In [2]:
yelp_reviews=pd.read_csv("review.csv",usecols=["stars","text"])
yelp_reviews.head(2)

Unnamed: 0,text,stars
0,"Love the staff, love the meat, love the place....",5
1,Super simple place but amazing nonetheless. It...,5


In [50]:
reviews = yelp_reviews[yelp_reviews['text'].map(len) > 1000]
reviews = reviews[:100000]

In [51]:
reviews['labels'] = reviews['stars'].apply(lambda x: 2 if x > 3 else 1 if x == 3 else 0)
texts = reviews["text"].values
labels = reviews["labels"].values

In [52]:
MAX_NUM_WORDS=2000 # how many unique words to use (i.e num rows in embedding vector)
MAX_SEQUENCE_LENGTH=150 # max number of words in a review to use


tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Found 162245 unique tokens.
Shape of data tensor: (100000, 150)
Shape of label tensor: (100000, 3)


In [53]:
VALIDATION_SPLIT=0.2
data_sample = data[:10]
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]


In [7]:
import os
embeddings_index = {}
f = open('glove.6B.50d.txt',encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [8]:
EMBEDDING_DIM = 50 # how big is each word vector

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [9]:
from keras.layers import Embedding

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [10]:
from keras.layers import Bidirectional, GlobalMaxPool1D,Conv1D
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation

from keras.models import Model


inp = Input(shape=(MAX_SEQUENCE_LENGTH,))
x = embedded_sequences = embedding_layer(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(3, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [11]:
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=2, batch_size=128);

Train on 80000 samples, validate on 20000 samples
Epoch 1/2
Epoch 2/2


In [44]:
Sample = reviews.reset_index()


In [49]:
sample = Sample[:10].drop(columns='index')
sample

Unnamed: 0,text,stars
0,I thought Tidy's Flowers had a great reputatio...,1
1,really excited to hear of this restaurant comi...,1
2,was always intrigued of this place when I pass...,3
3,Terrible service and not so great drinks.\n\nW...,1
4,My boyfriend took me here for my birthday. At ...,4
5,Came here for a friend's birthday. She had loo...,1
6,I started going to this nail salon last summer...,4
7,I give up on this place! I have gotten burrito...,1
8,I think I may have been let down by this place...,2
9,This whole place basically sucks and it's even...,1


In [54]:
y_pred = model.predict(data_sample)

In [55]:
y_pred

array([[ 0.85938323,  0.10467435,  0.06266689],
       [ 0.79175264,  0.1833213 ,  0.05845479],
       [ 0.161616  ,  0.23502216,  0.56462204],
       [ 0.49099258,  0.3069886 ,  0.17850322],
       [ 0.21618278,  0.41344553,  0.3733429 ],
       [ 0.56409454,  0.29904711,  0.14731939],
       [ 0.6026184 ,  0.09928344,  0.2727541 ],
       [ 0.824471  ,  0.130411  ,  0.08463027],
       [ 0.74627399,  0.28959319,  0.05876447],
       [ 0.76918137,  0.19572897,  0.08854329]], dtype=float32)