In [1]:
# Core data / ML stack used throughout the notebook.
import pandas as pd
import numpy as np
import tensorflow as tf
# NOTE: `time` is bound to the module here, but the later
# `from time import time` rebinds the same name to the function time.time().
import nltk, re, time
from nltk.corpus import stopwords

# BeautifulSoup is used by clean_text() to strip markup from the reviews.
from bs4 import BeautifulSoup 

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

# Fetch the NLTK stop-word corpus that stopwords.words("english") reads.
nltk.download('stopwords')
from time import time
import warnings
# Blanket filter: every warning category is silenced from this point on,
# including deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to /home/ideis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Both files share one layout: tab-separated, header on the first row.
# quoting=3 is csv.QUOTE_NONE, so quote characters are kept as literal text.
_tsv_options = dict(header=0, delimiter="\t", quoting=3)
train = pd.read_csv("data/labeledTrainData.tsv", **_tsv_options)
test = pd.read_csv("data/testData.tsv", **_tsv_options)

# Report how many reviews each split contains.
n_train, n_test = train["review"].size, test["review"].size
print("Train: %d, Test: %d\n" % (n_train, n_test))

Train: 25000, Test: 25000



In [3]:
def _english_stopwords():
    '''Return the NLTK English stop-word set, loading it only on first use.'''
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def clean_text(text, remove_stopwords=True):
    '''Strip markup from a review and reduce it to lower-case words.

    Args:
        text: Raw review text, possibly containing HTML markup.
        remove_stopwords: When True, drop NLTK English stop words. The
            comparison happens on whitespace-separated tokens *before*
            punctuation is stripped, so a token such as "the," is kept.

    Returns:
        A string made only of the letters a-z, with words separated by a
        single space and no leading or trailing space.
    '''
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and no "guessed parser"
    # warning is raised).
    text = BeautifulSoup(text, "html.parser").get_text()
    words = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    text = " ".join(words)

    # Everything that is not a lower-case ASCII letter becomes a space.
    text = re.sub(r"[^a-z]", " ", text)
    # Collapse space runs of ANY length; fixed-width replacements leave
    # doubles behind for longer runs.
    text = re.sub(r" +", " ", text).strip()

    return text

def extract_sentiment(s):
    '''Derive a binary label from the second number embedded in ``s``.

    Every non-digit character is treated as a separator; the second run of
    digits is read as the score. Scores of 5 or lower map to 0, higher
    scores map to 1. An IndexError is raised when ``s`` contains fewer than
    two numbers.
    '''
    numbers = re.sub(r"[^0-9]", " ", s).split()
    score = int(numbers[1])
    return 0 if score <= 5 else 1
# Work on the column itself rather than building a row object per record with
# DataFrame.apply(axis=1): same values, less overhead, and an empty frame
# still yields an (empty) Series.
train['review'] = train['review'].apply(clean_text)
test['review'] = test['review'].apply(clean_text)
# The test file carries no sentiment column; the label is derived from the
# number embedded in each review id.
test['sentiment'] = test['id'].apply(extract_sentiment)

In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

maxlen = 300  # Every review is padded/truncated to this many tokens
max_features = 10000  # Number of words to consider as features (sizes the Embedding layer)
# The tokenizer's vocabulary limit must agree with max_features; tie the two
# so that editing one can never leave the other out of step.
max_words = max_features  # Number of words to consider in the dataset

# Build the vocabulary from the training reviews only, then encode them as
# lists of word indices.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train["review"])
sequences = tokenizer.texts_to_sequences(train["review"])

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(train["sentiment"])

# Guard against stale kernel state: the padded matrix must match the
# constants above and line up with the labels one-to-one.
assert data.shape == (len(sequences), maxlen), data.shape
assert labels.shape[0] == data.shape[0], "reviews and labels are misaligned"

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Found 74216 unique tokens.
Shape of data tensor: (25000, 200)
Shape of label tensor: (25000,)


In [5]:
training_samples = 20000
validation_samples = 5000
split_seed = 42  # Fixed so the train/validation split is identical on every run

# Fail loudly instead of silently producing a short validation set.
assert training_samples + validation_samples <= data.shape[0], \
    "not enough rows for the requested train/validation split"

# A private RandomState makes the shuffle reproducible without touching the
# global NumPy generator. `data` and `labels` are indexed rather than
# overwritten, so re-running this cell always yields the same split.
indices = np.random.RandomState(split_seed).permutation(data.shape[0])
train_idx = indices[:training_samples]
val_idx = indices[training_samples: training_samples + validation_samples]

x_train = data[train_idx]
y_train = labels[train_idx]
x_val = data[val_idx]
y_val = labels[val_idx]

In [6]:
from keras.models import Sequential
from keras.layers import Embedding, Dense, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout
from keras.optimizers import RMSprop

# 1-D convolutional classifier, declared as one ordered list of layers:
# embedding -> conv/pool/dropout -> conv/dropout -> global max pool -> sigmoid.
model = Sequential([
    # Maps each of the max_features word indices to a 32-dimensional vector.
    Embedding(max_features, 32, input_length=maxlen),

    # First convolutional stage: windows of 7 tokens, then halve the length.
    Conv1D(32, kernel_size=7, activation='relu'),
    MaxPooling1D(2),
    Dropout(0.3),

    # Second convolutional stage, reduced to one value per filter.
    Conv1D(64, kernel_size=7, activation='relu'),
    Dropout(0.3),
    GlobalMaxPooling1D(),

    # Single sigmoid unit for the binary sentiment output.
    Dense(1, activation='sigmoid'),
])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 32)           32000     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 194, 32)           7200      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 97, 32)            0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 97, 32)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 91, 64)            14400     
_________________________________________________________________
dropout_2 (Dropout)          (None, 91, 64)            0         
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
__________

In [None]:
# Binary classification set-up; the 'acc' metric name determines the keys
# ('acc' / 'val_acc') that end up in history.history.
optimizer = RMSprop(lr=1e-4)
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['acc'])

# Train on the training slice and score the validation slice after each epoch.
history = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    batch_size=128,
    epochs=10,
)

In [None]:
import matplotlib.pyplot as plt

def _plot_history_curves(epochs, train_values, val_values, metric, ylabel, title, filename):
    '''Draw one metric's training/validation curves and save the figure.

    Args:
        epochs: x-axis values (epoch numbers).
        train_values: Per-epoch values measured on the training data.
        val_values: Per-epoch values measured on the validation data.
        metric: Short metric name used in the legend entries.
        ylabel: Label for the y axis.
        title: Figure title.
        filename: Path the figure is written to.

    Returns:
        The matplotlib Figure that was created.
    '''
    fig, ax = plt.subplots()
    ax.plot(epochs, train_values, 'bo', label='Training ' + metric)
    ax.plot(epochs, val_values, 'b', label='Validation ' + metric)
    ax.set_title(title)
    # Label both axes so each saved image can be read on its own.
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.legend()
    fig.savefig(filename)
    return fig


acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

_plot_history_curves(epochs, acc, val_acc, 'acc', 'Accuracy',
                     'Training and validation accuracy', 'cnn-acc.png')
_plot_history_curves(epochs, loss, val_loss, 'loss', 'Loss',
                     'Training and validation loss', 'cnn-loss.png')

plt.show()

In [None]:
# Encode the held-out reviews with the tokenizer fitted on the training set
# and pad them to the length the network was built for.
sequences = tokenizer.texts_to_sequences(test["review"])
x_test = pad_sequences(sequences=sequences, maxlen=maxlen)
y_test = np.asarray(test["sentiment"])

# evaluate() returns the loss followed by the compiled metrics, so the
# accuracy sits at index 1.
scores = model.evaluate(x=x_test, y=y_test)
print("Accuracy:", scores[1])

In [None]:
from sklearn.metrics import roc_auc_score
# Sequential.predict_proba is deprecated and absent from newer Keras
# releases; predict() returns the same sigmoid outputs for this model.
# ravel() turns the (n, 1) output into the 1-D score vector the metric expects.
y_pred = model.predict(x_test).ravel()
print("ROC-AUC:",roc_auc_score(y_test, y_pred))