In [0]:
import pandas as pd

In [2]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import np_utils

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Lambda
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, SimpleRNN, GRU

from keras.layers.convolutional import Conv1D

from keras.callbacks import EarlyStopping

from keras import backend as K

Using TensorFlow backend.


In [3]:
df = pd.read_csv("ccl_clean.csv", index_col=0)

IOError: ignored

In [0]:
df.shape

(460, 2)

In [0]:
train = df[df.index < '2016-01-01']
test = df[df.index > '2015-12-31']

y_train = train["rise_in_next_week"].values
y_test = test["rise_in_next_week"].values

In [0]:
trainheadlines = [str(i) for i in train["text"].values]
testheadlines = [str(i) for i in test["text"].values]

# LSTM

In [0]:
max_features = 10000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.1
maxlen = 200
batch_size = 10
nb_classes = 2

In [0]:
tokenizer = Tokenizer(nb_words=max_features)
tokenizer.fit_on_texts(trainheadlines)
sequences_train = tokenizer.texts_to_sequences(trainheadlines)
sequences_test = tokenizer.texts_to_sequences(testheadlines)



In [0]:
print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(sequences_train, maxlen=maxlen)
X_test = sequence.pad_sequences(sequences_test, maxlen=maxlen)

Y_train = np_utils.to_categorical(y_train, nb_classes)
Y_test = np_utils.to_categorical(y_test, nb_classes)


print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

Pad sequences (samples x time)
X_train shape: (355, 200)
X_test shape: (105, 200)


In [0]:
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128, dropout=0.2))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2)) 
model.add(Dense(nb_classes))
model.add(Activation('softmax'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
early = EarlyStopping(monitor='val_loss', min_delta=0, patience=1, verbose=1, mode='auto')
model.fit(X_train, Y_train, batch_size=batch_size, epochs=3,
          validation_data=(X_test, Y_test),
         callbacks=[early])
score, acc = model.evaluate(X_test, Y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)


print("Generating test predictions...")
preds_lstm = model.predict_classes(X_test, verbose=0)
acc_lstm = accuracy_score(test["rise_in_next_week"], preds_lstm)

Build model...


  app.launch_new_instance()


Train...
Train on 355 samples, validate on 105 samples
Epoch 1/3
Epoch 2/3
Epoch 00002: early stopping
Test score: 0.6747690439224243
Test accuracy: 0.5619047638915834
Generating test predictions...


# CNN

In [0]:
nb_filter = 120
filter_length = 2
hidden_dims = 120
nb_epoch = 2

In [0]:
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128, dropout=0.2))
# we add a Convolution1D, which will learn nb_filter
# word group filters of size filter_length:
model.add(Conv1D(activation="relu", filters=120, kernel_size=2, strides=1, padding="valid"))

def max_1d(X):
    return K.max(X, axis=1)

model.add(Lambda(max_1d, output_shape=(nb_filter,)))
model.add(Dense(hidden_dims)) 
model.add(Dropout(0.2)) 
model.add(Activation('relu'))
model.add(Dense(nb_classes))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

Build model...


  app.launch_new_instance()


In [0]:
print('Train...')
model.fit(X_train, Y_train, batch_size=32, epochs=5,
          validation_data=(X_test, Y_test),
          callbacks=[early])
score, acc = model.evaluate(X_test, Y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)


print("Generating test predictions...")
preds16 = model.predict_classes(X_test, verbose=0)
acc16 = accuracy_score(test['rise_in_next_week'], preds16)

Train...
Train on 355 samples, validate on 105 samples
Epoch 1/5
Epoch 2/5
Epoch 00002: early stopping
Test score: 0.6639595996765864
Test accuracy: 0.6190476247242519
Generating test predictions...


# CNN+LSTM

In [0]:
print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128, dropout=0.2))
model.add(Conv1D(activation="relu", filters=120, kernel_size=2, strides=1, padding="valid"))
model.add(Dropout(0.2))
model.add(Conv1D(activation="relu", filters=120, kernel_size=2, strides=1, padding="valid"))
model.add(Dropout(0.2))
model.add(Conv1D(activation="relu", filters=120, kernel_size=2, strides=1, padding="valid"))
model.add(Dropout(0.2))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)) 
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)) 
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2)) 
model.add(Dense(nb_classes))
model.add(Activation('softmax'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

Build model...


  app.launch_new_instance()


In [0]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 128)         1280000   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 120)         30840     
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 120)         0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, None, 120)         28920     
_________________________________________________________________
dropout_3 (Dropout)          (None, None, 120)         0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, None, 120)         28920     
_________________________________________________________________
dropout_4 (Dropout)          (None, None, 120)         0         
__________

In [0]:
print('Train...')
early = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=1, mode='auto')
model.fit(X_train, Y_train, batch_size=batch_size, epochs=10,
          validation_data=(X_test, Y_test),
          callbacks=[early])
score, acc = model.evaluate(X_test, Y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)


print("Generating test predictions...")
preds_cnn_lstm = model.predict_classes(X_test, verbose=0)
acc_cnn_lstm = accuracy_score(test["rise_in_next_week"], preds_cnn_lstm)

Train...
Train on 355 samples, validate on 105 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test score: 2.426414069675264
Test accuracy: 0.6000000046832221
Generating test predictions...
