In [1]:
import pandas as pd
import numpy as np
import itertools
import time
import functools
import threading
import os.path
import queue
import pickle
from keras.preprocessing import sequence
from keras.optimizers import SGD
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape
from keras.layers.convolutional import MaxPooling2D, Conv2D
from keras.callbacks import ModelCheckpoint
from bs4 import BeautifulSoup
from tqdm import tqdm

Using TensorFlow backend.


In [2]:
ALPHABET = list("abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+ =<>()[]{}")
FEATURE_LEN = 1014
BATCH_SIZE = 128
NUM_FILTERS = 256
EPOCHS = 10

In [3]:
character_hash = pd.DataFrame(np.identity(len(ALPHABET), dtype='bool'), columns=ALPHABET)
character_hash.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,Unnamed: 12,=,<,>,(,),[,],{,}
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [4]:
# Create tag dic (top 50 labels)
labels = list(set(itertools.chain(*pd.read_csv('populartags.csv')['PopularTags'].str.split(' ').values)))
print(len(labels))
labels_toidx = {l:i for i,l in enumerate(labels)}
idx_tolabels = {i:l for i,l in enumerate(labels)}
labels[:10]

50


['oracle',
 'forms',
 'json',
 'image',
 'css',
 'ajax',
 'django',
 'multithreading',
 'git',
 '.net']

In [5]:
# X-data
# Remove html, lower, trim to n chars, reverse-order
df = pd.read_csv('stackoverflow_38mill.csv', encoding='latin-1')
X_data = df.Body.apply(lambda x: BeautifulSoup(x, "lxml").get_text().lower()[:FEATURE_LEN][::-1])

In [6]:
# y-data
stack_y = []
for tg in tqdm(df['PopularTags'].values):
    targets = np.zeros(len(labels))
    for t in tg.split(' '):
        targets[labels_toidx[t]] = 1
    stack_y.append(targets)
# Array
y_data = np.array(stack_y, np.uint8)

100%|████████████████████████████| 3887653/3887653 [00:11<00:00, 348725.08it/s]


In [7]:
def load_data_frame(X_data, y_data, batch_size=128, shuffle=False):

    if shuffle:
        idx = X_data.index
        assert len(idx) == len(y_data)
        rnd = np.random.permutation(idx)
        X_data = X_data.reindex(rnd)
        y_data = y_data[rnd]

    # Dictionary to create character vectors
    character_hash = pd.DataFrame(np.identity(len(ALPHABET), dtype='bool'), columns=ALPHABET)
    
    def feature_extractor(dta, val):
        # Yield mini-batch amount of character vectors
        # Input_shape = (samples, rows, cols, channels)
        X_split = np.zeros([batch_size, FEATURE_LEN, len(ALPHABET), 1], dtype=np.uint8)
        for ti, tx in enumerate(dta):
            chars = list(tx)
            for ci, ch in enumerate(chars):
                if ch in ALPHABET:
                    X_split[ti % batch_size][ci] = np.array(character_hash[ch], dtype=np.uint8)[..., np.newaxis]
            # No padding -> only complete batches processed
            if (ti + 1) % batch_size == 0:
                yield X_split, val[ti + 1 - batch_size:ti + 1]
                X_split = np.zeros([batch_size, FEATURE_LEN, len(ALPHABET), 1], dtype=np.uint8)

    # Yield one mini-batch at a time and asynchronously process to keep 4 in queue
    while True:
        print("Fresh run of data ...")
        for Xsplit, ysplit in feature_extractor(X_data, y_data):
            yield Xsplit, ysplit

In [8]:
def create_crepe():
    print('Build model...')
    
    model = Sequential()
    #Input = #alphabet x 1014
    #kernel= width, height
    #Input_shape = (samples, rows, cols, channels)
    model.add(Conv2D(filters=NUM_FILTERS, kernel_size=(7,69), input_shape=(FEATURE_LEN, 69, 1)))

    model.add(MaxPooling2D(pool_size=(3,1)))
    #Input = 336 x 256
    model.add(Conv2D(NUM_FILTERS, (7,1)))
    model.add(MaxPooling2D(pool_size=(3,1)))
    #Input = 110 x 256
    model.add(Conv2D(NUM_FILTERS, (3,1)))
    #Input = 108 x 256
    model.add(Conv2D(NUM_FILTERS, (3,1)))
    #Input = 106 x 256
    model.add(Conv2D(NUM_FILTERS, (3,1)))
    #Input = 104 X 256
    model.add(Conv2D(NUM_FILTERS, (3,1)))
    model.add(MaxPooling2D(pool_size=(3,1)))
    model.add(Flatten())
    
    #Fully Connected Layers
    fully_connected = [1024, 1024, len(labels)]
    #Input is 8704 Output is 1024 
    model.add(Dense(fully_connected[0]))
    model.add(Dropout(0.5))
    model.add(Activation('relu'))
    #Input is 1024 Output is 1024
    model.add(Dense(fully_connected[1]))
    model.add(Dropout(0.5))
    model.add(Activation('relu'))
    #Input is 1024 Output is n-binomial distributions
    model.add(Dense(fully_connected[2]))
    model.add(Activation('sigmoid'))
    
    #Stochastic gradient parameters as set by paper
    sgd = SGD(lr=0.01, decay=1e-5, momentum=0.9, nesterov=True)
    model.compile(loss='binary_crossentropy', optimizer=sgd, class_mode="binary", metrics=['accuracy'])
    return model,sgd

In [9]:
model, sgd = create_crepe()

Build model...


In [10]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 1008, 1, 256)      123904    
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 336, 1, 256)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 330, 1, 256)       459008    
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 110, 1, 256)       0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 108, 1, 256)       196864    
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 106, 1, 256)       196864    
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 104, 1, 256)       196864    
__________

In [11]:
# Train/test split
trainsize = int(len(X_data)*0.9)
x_train, x_test = X_data[:trainsize], X_data[trainsize:]
y_train, y_test = y_data[:trainsize], y_data[trainsize:]

In [12]:
# Test dimensions from generator
for X_batch, y_batch in load_data_frame(x_train, y_train):
    print(X_batch.shape, y_batch.shape)
    break

Fresh run of data ...
(128, 1014, 69, 1) (128, 50)


In [None]:
# checkpoint
filepath="crepe_{epoch:02d}_{val_acc:.2f}.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=False, mode='max')
callbacks_list = [checkpoint]

In [None]:
# Fit on custom generator
model.fit_generator(load_data_frame(x_train, y_train),
                    steps_per_epoch=(len(x_train)//BATCH_SIZE), 
                    epochs=15,
                    verbose=2,
                    max_q_size=50,
                    validation_data=load_data_frame(x_test, y_test),
                    validation_steps=(len(x_test)//BATCH_SIZE),
                    callbacks=callbacks_list)

kwargs passed to function are ignored with Tensorflow backend


Fresh run of data ...Epoch 1/15

