In this notebook we implement a convolutional neural network to classify the authors from some writtten text (https://arxiv.org/abs/1408.5882) on top of a pretrained GloVe embedding matrix (https://nlp.stanford.edu/projects/glove/).

In [1]:
from keras import regularizers, optimizers
from keras.models import Sequential, Model
from keras.layers import Input, Dense, LSTM, Flatten, Conv1D, MaxPooling1D, Dropout, Bidirectional, Concatenate
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

import pandas as pd
import numpy as np
import re
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, train_test_split
### Read in the data
# read our data into a dataframe
texts = pd.read_csv("../input/spooky-author-identification/train.csv")

def clean_str(string):
    """
    From:
    https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

texts.text = texts.text.transform(clean_str)

num_classes = len(set(texts.author))
# 
one_hot_labels = np.zeros((len(texts), num_classes))
one_hot_labels[np.arange(one_hot_labels.shape[0]),texts.author.astype('category').cat.codes] = 1

Using TensorFlow backend.


In [2]:
# Load the pretrained GloVec embedding matrix
embedding_size = 100

pretrained_embedding = {}
with open('../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt') as f:
    for line in f:
        values = line.split(' ')
        pretrained_embedding[values[0]] = np.asarray(values[1:], dtype='float32')
        
def find_closest_word(word, vocab, pretrained_embedding):
    if word not in pretrained_embedding: return 'UNK'
    items = [(k,v) for k,v in pretrained_embedding.items() if k!=word and k in vocab]
    idx = np.argmin([np.dot(v-pretrained_embedding[word], v-pretrained_embedding[word]) for k,v in items])
    return items[idx][0]        

In [3]:
X_train, X_test, y_train, y_test = train_test_split(texts.text, one_hot_labels, random_state=20171030, train_size=0.8)

# Verify that test and train are well stratified
print(np.mean(y_train, axis = 0))
print(np.mean(y_test, axis = 0))

[ 0.40247718  0.28998276  0.30754006]
[ 0.40755873  0.27911134  0.31332993]




In [4]:
countvect = CountVectorizer(max_features=1000000, strip_accents='unicode').fit(X_train)

if 'UNK' not in countvect.vocabulary_:
    countvect.vocabulary_['UNK'] = max(countvect.vocabulary_.values()) + 1
# Special
for c in ',ia':
    if c not in countvect.vocabulary_: 
        countvect.vocabulary_[c] = max(countvect.vocabulary_.values()) + 1

In [5]:
# transform input strings to list of index, padded to the same length
def text2idx(s, vocab, maxlen):
    words = s.split(' ')
    for i,w in enumerate(words): 
        if w not in vocab: 
            #words[i] = find_closest_word(w, vocab, pretrained_embedding)
            words[i] = 'UNK'
    return np.array([vocab[x] for x in words] + [vocab['UNK']]*(maxlen - len(words)))

padded_len = max(X_train.str.len().max(), X_test.str.len().max())
X_train_keras = X_train.apply(text2idx, vocab=countvect.vocabulary_, maxlen=padded_len)
X_train_keras = np.array([x for x in X_train_keras])

X_test_keras = X_test.apply(text2idx, vocab=countvect.vocabulary_, maxlen=padded_len)
X_test_keras = np.array([x for x in X_test_keras])

In [6]:
# Build the embedding matrix. Any word not found in the pretrained will be represented by all zero vector
embedding_matrix = np.zeros((len(countvect.vocabulary_), embedding_size))
for w, i in countvect.vocabulary_.items():
    embedding_vector = pretrained_embedding.get(w)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        print(w) # show the words not in pretrained

pardonne
butefulle
contree
glueing
guiltlessly
legrasse
maillardet
allus
maelzel
mazurewicz
ulthar
onct
besprinkled
perpendicularity
lyeh
relume
demoniacally
loathsomely
crotala
undulantly
unhallowed
personated
uncurl
dogless
niggerless
throbbings
whar
taown
slackenin
westwardly
ruddier
appals
gaieties
oxydracae
relick
paralized
hypothenuse
templed
thalarion
firesides
partizans
engroses
unvaried
palsying
bestrewen
rodosto
chaldaea
monstrum
horrendum
ademptum
matutinal
satinet
helseggen
teuffel
rhythmed
deathful
rumgudgeon
univarsal
overspread
desideratum
exculpated
clerval
haggardness
furtiveness
terraqueous
palings
fishily
gaspingly
interminableness
contriver
irresistably
roarings
minuteness
palpability
schoolfellows
superabundant
islanded
recreant
unphilosophical
skeert
inquietude
iranon
sothoth
gamesome
nephren
hadoth
saleve
juras
pitchy
desolating
succouring
dinned
figgurs
queerest
ebber
syllabification
hatheg
awaked
cotters
mahometans
effluence
miasmal
daown
saoundin
adjure
antago

ungloved
upthrew
obtrude
trelliswork
gyptus
aricina
nemorensis
effulgently
unsearchableness
demureness
sanctimoniousness
resistlessly
phosphorescently
tiaraed
cawed
asseverations
prepossess
unappeasable
choler
plaguy
breathin
clamorously
misproportioned
fleetly
unadapted
ngranek
deestrick
suddent
unrefreshing
circumjacent
unfaded
truckmen
bethumbed
beseamed
besmeared
pennyless
meutes
piny
audacities
curvetted
maounds
southwardly
salsafette
nourjahad
ognor
frementi
necessitous
intentness
vospicus
sarmatic
proxenoi
unparticipated
sesquipedalian
presentiments
lanscape
esquimau
daemonologist
scantiness
frenziedly
upspringing
cuspidors
refinedly
espritism
theatricalism
ridgepoles
servox
tantalization
purloiner
sterb
immedicable
supposititious
embosomed
bossieux
enkindle
engross
gualtier
uprearing
fitten
traipsin
presentant
sixthly
unaccommodating
zaimi
compeers
heigho
pericranium
lionship
mudler
petrovitch
houseless
centurio
primipilus
habiliment
uneraseable
moralisers
concealedly
assemblan

In [7]:
embedding_length = 100
filter_sizes = [3,4,5,6]

model_input = Input(shape=(padded_len,), dtype='int32', name='embed_input') 
x = Embedding(len(countvect.vocabulary_), embedding_length, weights=[embedding_matrix], trainable=False)(model_input)

convs = []
for filter_size in filter_sizes: 
    conv = Conv1D(8, filter_size, padding="valid", activation="relu")(x) # sharing the input x
    conv = MaxPooling1D(4)(conv)
    conv = Flatten()(conv)
    convs.append(conv)

# merge the branches
x = Concatenate()(convs)
x = Dropout(0.5)(x)
model_output = Dense(num_classes, activation='softmax')(x)
model = Model(model_input, model_output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

print(model.summary())
model.fit(X_train_keras, y_train, validation_data=(X_test_keras,  y_test), epochs=10, batch_size=128) 

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embed_input (InputLayer)         (None, 4670)          0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 4670, 100)     2300000     embed_input[0][0]                
____________________________________________________________________________________________________
conv1d_1 (Conv1D)                (None, 4668, 8)       2408        embedding_1[0][0]                
____________________________________________________________________________________________________
conv1d_2 (Conv1D)                (None, 4667, 8)       3208        embedding_1[0][0]                
___________________________________________________________________________________________

<keras.callbacks.History at 0x7efdfc450668>