In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

In [2]:
file_path = {'yelp' : '/content/drive/My Drive/NLP/yelp_labelled.txt',
             'amazon' : '/content/drive/My Drive/NLP/amazon_cells_labelled.txt',
             'imdb' : '/content/drive/My Drive/NLP/imdb_labelled.txt'}
data_list = []
for source, filepath in file_path.items():
  data = pd.read_csv(filepath, names = ['sentence', 'label'], sep = '\t')
  data['source'] = source
  data_list.append(data)
data = pd.concat(data_list)
data.head()

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp


In [3]:
data.shape

(2748, 3)

In [4]:
review = data['sentence'].values
label = data['label'].values
x_train, x_test, y_train, y_test = train_test_split(review, label, test_size = 0.25, random_state = 1000)

In [None]:
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(x_train)
xcnn_train = tokenizer.texts_to_sequences(x_train)
xcnn_test = tokenizer.texts_to_sequences(x_test)
vocab_size = len(tokenizer.word_index) + 1
print(x_train[1])
print(xcnn_train[1])

There was a warm feeling with the service and I felt like their guest for a special treat.
[43, 10, 4, 607, 323, 15, 1, 47, 2, 3, 350, 37, 109, 1908, 12, 4, 279, 1236]


In [None]:
max_len = 100
xcnn_train = pad_sequences(xcnn_train, padding = 'post', maxlen = max_len)
xcnn_test = pad_sequences(xcnn_test, padding = 'post', maxlen = max_len)
print(xcnn_train[0, :])

[ 278  295  212 1907   39  349    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [None]:
embedding_dim = 200
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length = max_len))
model.add(layers.Conv1D(128, 5, activation = 'relu'))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation = 'relu'))
model.add(layers.Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [None]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 200)          920600    
                                                                 
 conv1d_1 (Conv1D)           (None, 96, 128)           128128    
                                                                 
 global_max_pooling1d_1 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_2 (Dense)             (None, 10)                1290      
                                                                 
 dense_3 (Dense)             (None, 1)                 11        
                                                                 
Total params: 1,050,029
Trainable params: 1,050,029
Non-trainable params: 0
____________________________________________

In [None]:
model.fit(xcnn_train, y_train, epochs = 10, verbose = False, validation_data = (xcnn_test, y_test), batch_size = 10)
loss, accuracy = model.evaluate(xcnn_train, y_train, verbose = False)
print("Training accuracy - ", accuracy)
print("Training loss - ", loss)

Training accuracy -  1.0
Training loss -  0.00022707736934535205


In [None]:
loss, accuracy = model.evaluate(xcnn_test, y_test, verbose = False)
print("Testing accuracy - ", accuracy*100)
print("Testing loss - ", loss)

Testing accuracy -  83.84279608726501
Testing loss -  0.6307385563850403
