In [2]:
# Load the libraries

from keras.models import Sequential
from keras import layers
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
import pandas as pd
from sklearn.model_selection import train_test_split

#Load the path for the dataset

filepath_dict = {'yelp':   'data/sentiment_analysis/yelp_labelled.txt',
                 'amazon': 'data/sentiment_analysis/amazon_cells_labelled.txt',
                 'imdb':   'data/sentiment_analysis/imdb_labelled.txt'}

# Create an empty list  to loop through the dataset
df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source  # Add another column filled with the source name
    df_list.append(df)

#Concatenate the list to a dataframe
df = pd.concat(df_list)
print(df)

#Assigning values to labels and spliting the dataset into train, test
df_yelp = df[df['source']=='yelp']
sentences = df_yelp['sentence'].values
y = df_yelp['label'].values
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size = 0.25, random_state = 1000)

# Generating the dataset
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)


X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

# Checking the dataset
print(sentences_train[2])
print(X_train[2])

embedding_dim = 100
maxlen = 100

X_train = pad_sequences(X_train, padding = 'post', maxlen= maxlen)
X_test = pad_sequences(X_test, padding = 'post', maxlen= maxlen)

# Developing the model with Convolution of 1d
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

# Training the model with dataset
history= model.fit(X_train, y_train,
                  epochs = 10,
                  verbose = False,
                  validation_data = (X_test, y_test),
                  batch_size = 10)

# Evaluating the model to obtain the accuracy and loss for the testing and training
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))


                                              sentence  label source
0                             Wow... Loved this place.      1   yelp
1                                   Crust is not good.      0   yelp
2            Not tasty and the texture was just nasty.      0   yelp
3    Stopped by during the late May bank holiday of...      1   yelp
4    The selection on the menu was great and so wer...      1   yelp
..                                                 ...    ...    ...
743  I just got bored watching Jessice Lange take h...      0   imdb
744  Unfortunately, any virtue in this film's produ...      0   imdb
745                   In a word, it is embarrassing.        0   imdb
746                               Exceptionally bad!        0   imdb
747  All in all its an insult to one's intelligence...      0   imdb

[2748 rows x 3 columns]
Of all the dishes, the salmon was the best, but all were great.
[11, 43, 1, 171, 1, 283, 3, 1, 47, 26, 43, 24, 22]
Model: "sequential_1"
__________