In [13]:
import numpy as np
import pandas as pd
from collections import defaultdict
import re

from bs4 import BeautifulSoup
import sys
import os

os.environ['KERAS_BACKEND']='theano'
import keras
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model
from keras.models import load_model

from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers

MAX_SENT_LENGTH = 100
MAX_SENTS = 50
MAX_NB_WORDS = 50000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\\", "", string)    
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)   
    return string.strip().lower()

dtype = {
    'id': str,
    'teacher_id': str,
    'teacher_prefix': str,
    'school_state': str,
    'project_submitted_datetime': str,
    'project_grade_category': str,
    'project_subject_categories': str,
    'project_subject_subcategories': str,
    'project_title': str,
    'project_essay_1': str,
    'project_essay_2': str,
    'project_essay_3': str,
    'project_essay_4': str,
    'project_resource_summary': str,
    'teacher_number_of_previously_posted_projects': int,
    'project_is_approved': np.uint8,
}
#data_path = os.path.join('..', 'input')
train = pd.read_csv('data/train.csv', dtype=dtype, low_memory=True)
test = pd.read_csv( 'data/test.csv', dtype=dtype, low_memory=True)

test['project_is_approved'] = 1

train['text'] = train.apply(lambda row: ' '.join([
    str(row['project_title']), 
    str(row['project_resource_summary']), 
    str(row['project_essay_1']), 
    str(row['project_essay_2']), 
    str(row['project_essay_3']), 
    str(row['project_essay_4'])]), axis=1)
test['text'] = test.apply(lambda row: ' '.join([
    str(row['project_title']), 
    str(row['project_resource_summary']), 
    str(row['project_essay_1']), 
    str(row['project_essay_2']), 
    str(row['project_essay_3']), 
    str(row['project_essay_4'])]), axis=1)

train = train.drop([
    'teacher_id',
    'teacher_prefix',
    'school_state',
    'project_submitted_datetime',
    'project_grade_category',
    'project_subject_categories',
    'project_subject_subcategories',
    'project_title',
    'project_essay_1',
    'project_essay_2',
    'project_essay_3',
    'project_essay_4',
    'project_resource_summary',
    'teacher_number_of_previously_posted_projects'], axis=1)
test = test.drop([
    'teacher_id',
    'teacher_prefix',
    'school_state',
    'project_submitted_datetime',
    'project_grade_category',
    'project_subject_categories',
    'project_subject_subcategories',
    'project_title',
    'project_essay_1',
    'project_essay_2',
    'project_essay_3',
    'project_essay_4',
    'project_resource_summary',
    'teacher_number_of_previously_posted_projects'], axis=1)

data_train = pd.concat([train,test],axis = 0).reset_index()

import nltk
from nltk import tokenize

reviews = []
labels = []
texts = []
instance_inputs = []
comment_id = []

#Return dimension of data_train.review([0]=row)
for idx in range(data_train.text.shape[0]):
    sys.stdout.write("\rProcessing ---- %d"%idx)
    sys.stdout.flush()
    comment_id.append(data_train.id[idx])
    text = ''.join(data_train.text[idx])
    #parse the sentences into beautifulsoup object
    #print text
    text = BeautifulSoup(text)
    text = clean_str(text.get_text().encode('ascii','ignore'))
    #insert clear text into texts array
    texts.append(text)
    #Return a sentence-tokenized copy of text( divide string into substring by punkt)
    sentences = tokenize.sent_tokenize(text)
    reviews.append(sentences)
    labels.append(data_train.project_is_approved[idx])

#Class for vectorizing texts (Tokenizer)
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
#list of texts to train on
tokenizer.fit_on_texts(texts)

#New 3D array filled with zero with (length,15,100) length= num of char
data = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
word_len = np.zeros(10000)
#enumerate produce a tuple(index)
for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences):
        word_len[len(sentences)] +=1
        if j< MAX_SENTS:
        #Split sentence into a list of words
            wordTokens = text_to_word_sequence(sent)
            k=0
            for _, word in enumerate(wordTokens):
                if k<MAX_SENT_LENGTH and tokenizer.word_index[word]<MAX_NB_WORDS:
            #dictionary mapping word to their rank/index (int)
                    data[i,j,k] = tokenizer.word_index[word]
                    k=k+1                    
                    
word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

#Converts a class vector (integers) to binary class matrix
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

nb_validation_samples = 78035
#split training and validation set
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]
comment_id = comment_id[-nb_validation_samples:]

print('Number of positive and negative reviews in traing and validation set')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

GLOVE_DIR = ".~/ubuntu/"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    #split the vector of 100d
    values = line.split()
    #word at values[0]
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True)

class AttLayer(Layer):
    def __init__(self, **kwargs):
        self.init = initializers.get('normal')
        super(AttLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape)==3
        self.W = self.add_weight(name='kernel', 
                                  shape=(input_shape[-1],),
                                  initializer='normal',
                                  trainable=True)
        super(AttLayer, self).build(input_shape) 

    def call(self, x, mask=None):
        eij = K.tanh(K.dot(x, self.W))     
        ai = K.exp(eij)
        weights = ai/K.sum(ai, axis=1).dimshuffle(0,'x')
        weighted_input = x*weights.dimshuffle(0,1,'x')
        return weighted_input.sum(axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        config = {}
        base_config = super(AttLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

print('Shape of data tensor:', data.shape)
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32', name='main_input')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
l_dense = TimeDistributed(Dense(200))(l_lstm)
l_att = AttLayer()(l_dense)
sentEncoder = Model(sentence_input, l_att)
review_input = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)
l_lstm_sent = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
l_dense_sent = TimeDistributed(Dense(200))(l_lstm_sent)
l_att_sent = AttLayer()(l_dense_sent)
preds = Dense(2, activation='softmax')(l_att_sent)
model = Model(review_input, preds)

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("model fitting - Hierachical attention network")
print(model.summary())
model.fit(x_train, y_train, validation_data=(x_val, y_val),nb_epoch=3, batch_size=100, verbose=2)

Processing ---- 0



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


TypeError: cannot use a string pattern on a bytes-like object