In [1]:
'''
Source: http://mindmech.net
'''

import csv
import numpy as np

def process_msg(message, vocab):
    '''
    Convert a message string into an array of vocabulary indices.

    message:    the message string to classify.
    vocab:      a dict of unique integers assigned to unique words. It
                must contain an '<unk>' entry whenever the message may
                hold words that are not in the vocabulary.

    Preprocessing: the message is lowercased, every non-alphabetic
    character is treated as a separator (so punctuation and digits are
    dropped), and words missing from vocab are mapped to the '<unk>'
    index. Insert any additional preprocessing here.

    Returns a 1-D integer numpy array with one index per word. A message
    without any alphabetic characters yields an empty integer array.
    Raises KeyError if an unknown word is met and vocab has no '<unk>'.
    '''
    # Replace anything that is not a letter with a space, then split on
    # whitespace to obtain the word tokens.
    cleaned = "".join((char if char.isalpha() else " ") for char in message.lower())
    tokenized = cleaned.split()

    # The '<unk>' lookup stays lazy so that a vocab without that key still
    # works for messages made up entirely of known words.
    msg_arr = [vocab[word] if word in vocab else vocab['<unk>'] for word in tokenized]

    # Force an integer dtype: np.asarray([]) would otherwise default to
    # float64, making empty messages inconsistent with non-empty ones.
    return np.asarray(msg_arr, dtype=int)
    

def get_vocab(train_fname):
    '''
    Creates a vocabulary from a CSV file (must have "message" column), by
    assigning a unique integer, starting at 1, to each word that occurs
    more than once in the file. Words occurring only once are left out so
    that they fall back to the '<unk>' tag, which gives the network the
    capability to process unknown words.

    Messages are tokenized the same way process_msg tokenizes them
    (lowercased, every non-alphabetic character treated as a separator),
    so that every vocabulary key can actually be matched at lookup time.

    train_fname:    path to the UTF-8 encoded training CSV file.

    Returns a dict mapping word -> integer index, plus an '<unk>' entry
    holding the largest index.
    Raises ValueError if the header row has no "message" column.
    '''
    print("Reading vocab from:", train_fname)
    freqs = {}

    # The with-block closes the file even on errors; newline='' is the
    # mode the csv module asks for so quoted line breaks are parsed intact.
    with open(train_fname, 'r', encoding='utf-8', newline='') as csv_file:
        reader = csv.reader(csv_file)
        header = next(reader)
        # Resolve the column once instead of on every row.
        msg_col = header.index('message')

        for row in reader:
            if row == []:
                continue
            message = row[msg_col]
            tokenized = "".join(
                (char if char.isalpha() else " ") for char in message.lower()
            ).split()

            # Count every occurrence of every word in the message.
            for word in tokenized:
                freqs[word] = freqs.get(word, 0) + 1

    vocab = {}
    vocab_idx = 1
    for word, count in freqs.items():
        if count > 1:
            vocab[word] = vocab_idx
            vocab_idx += 1

    # '<unk>' takes the next free index, i.e. the largest one.
    vocab['<unk>'] = vocab_idx

    return vocab
    

def get_xy(csv_fname, vocab):
    '''
    Read the "x" and "y" data from a CSV file.

    csv_fname:     filename for a UTF-8 CSV with columns "message"
                   (string) and "annotation" (int).
    vocab:         a dict of unique integers assigned to unique words

    Each message is converted by process_msg into an array of word
    integers from the vocabulary; these arrays form "x". The "y" data is
    simply the annotation of each message, parsed as an int. Empty CSV
    rows are skipped.

    Returns (x, y): x is a 1-D object array holding one integer array per
    message (messages differ in length, so they cannot form a regular 2-D
    array), y is a 1-D integer array of annotations.
    Raises ValueError if a required column is missing from the header or
    an annotation is not an integer.
    '''
    print("Getting x and y data from file", csv_fname)

    x = []
    y = []

    # The with-block closes the file even on errors; newline='' is the
    # mode the csv module asks for so quoted line breaks are parsed intact.
    with open(csv_fname, 'r', encoding='utf-8', newline='') as csv_file:
        reader = csv.reader(csv_file)
        header = next(reader)
        # Resolve the column positions once instead of on every row.
        msg_col = header.index('message')
        ann_col = header.index('annotation')

        for row in reader:
            if row == []:
                continue
            x.append(process_msg(row[msg_col], vocab))
            y.append(int(row[ann_col]))

    # Build the ragged container explicitly: np.asarray() on sequences of
    # different lengths raises ValueError on NumPy >= 1.24. Filling slot by
    # slot also keeps the result 1-D when all messages share one length.
    x_arr = np.empty(len(x), dtype=object)
    for idx, msg_x in enumerate(x):
        x_arr[idx] = msg_x

    return x_arr, np.asarray(y, dtype=int)
    
    
def load_data(train_fname, test_fname):
    '''
    Build the vocabulary from the training CSV, then encode both CSV
    files with it.

    train_fname:    path of the training CSV (also the vocabulary source).
    test_fname:     path of the test CSV.

    Returns (x_train, y_train), (x_test, y_test), vocab. The vocabulary
    is returned as well for later use by the live tool.
    '''
    vocab = get_vocab(train_fname)

    x_train, y_train = get_xy(train_fname, vocab)
    x_test, y_test = get_xy(test_fname, vocab)

    train_split = (x_train, y_train)
    test_split = (x_test, y_test)
    return train_split, test_split, vocab

In [4]:
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
import sys, pickle

# set parameters (max_features is derived from the vocabulary below):
maxlen = 400
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 2

print('Loading data...')
# The data comes from our own CSV files (not the imdb dataset bundled
# with keras). Edit DATA_DIR to point at your copy of the files.
DATA_DIR = "/Users/aliosha/Development/nlp/blog/02_keras_txt_classifier"
train_path = DATA_DIR + "/train.csv"
test_path = DATA_DIR + "/test.csv"

(x_train, y_train), (x_test, y_test), vocab = load_data(train_path, test_path)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

# The Embedding layer needs input_dim = largest word index + 1 (index 0 is
# used by pad_sequences for padding). A hard-coded size would leave word
# indices out of range as soon as the vocabulary outgrows it.
max_features = max(vocab.values()) + 1

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.2))

# we add a Conv1D layer, which will learn `filters`
# word group filters, each spanning kernel_size words:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])


Loading data...
Reading vocab from: /Users/aliosha/Development/nlp/blog/02_keras_txt_classifier/train.csv
Getting x and y data from file /Users/aliosha/Development/nlp/blog/02_keras_txt_classifier/train.csv
Getting x and y data from file /Users/aliosha/Development/nlp/blog/02_keras_txt_classifier/test.csv
45035 train sequences
4965 test sequences
Pad sequences (samples x time)
x_train shape: (45035, 400)
x_test shape: (4965, 400)
Build model...


In [5]:
# Display the model object as the cell output to confirm that it was built.
model

<keras.models.Sequential at 0x124889cc0>

In [6]:
# Train the classifier; the test split is passed as validation data so
# its loss and accuracy are reported after each epoch.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

classifier_fname = 'classifier.h5'
vocab_fname = 'vocab.pkl'

print("Saving classifier to:", classifier_fname)
model.save(classifier_fname)
print("Saving vocab to:", vocab_fname)
# The with-block guarantees the pickle is flushed and the file closed,
# instead of relying on garbage collection of an anonymous file handle.
with open(vocab_fname, 'wb') as vocab_file:
    pickle.dump(vocab, vocab_file)

Train on 45035 samples, validate on 4965 samples
Epoch 1/2
Epoch 2/2
Saving classifier to: classifier.h5
Saving vocab to: vocab.pkl
