In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Conv1D, MaxPooling1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
import re

In [4]:
df = pd.read_csv('dataset/datafinal.csv', encoding='latin-1')
df

Unnamed: 0.1,Unnamed: 0,text,author,text no stopwords,top text no stopwords
0,0,ou have time to listen i will give you the ent...,1,ou time listen give entire story say may form ...,one time row give may city boat effect interes...
1,1,wish for solitude he was twenty years of age a...,1,wish solitude twenty years age possession perf...,one time row give may city boat effect interes...
2,2,and the skirt blew in perfect freedom about th...,1,skirt blow perfect freedom upper part wear hat...,one time row give may city boat effect interes...
3,3,of san and the rows of shops opposite impresse...,1,san row shop opposite impress upon vision last...,one time row give may city boat effect interes...
4,4,an hour s walk was as tiresome as three in a s...,1,hour walk tiresome three sensible english town...,one time row give may city boat effect interes...
...,...,...,...,...,...
53673,53673,after surrounding and searching the whole plac...,50,surround search whole place could discover tra...,one time row give may city boat effect interes...
53674,53674,giant who could make a young earthquake or a w...,50,giant could make young earthquake angry accord...,one time row give may city boat effect interes...
53675,53675,waters of the lake at the bottom of the hill c...,50,water lake bottom hill curl kindness sympathy ...,one time row give may city boat effect interes...
53676,53676,fingers and thumb in it exactly as it came out...,50,finger thumb exactly come hand never mind say ...,one time row give may city boat effect interes...


In [5]:
df['text'] = df['text no stopwords']
df

Unnamed: 0.1,Unnamed: 0,text,author,text no stopwords,top text no stopwords
0,0,ou time listen give entire story say may form ...,1,ou time listen give entire story say may form ...,one time row give may city boat effect interes...
1,1,wish solitude twenty years age possession perf...,1,wish solitude twenty years age possession perf...,one time row give may city boat effect interes...
2,2,skirt blow perfect freedom upper part wear hat...,1,skirt blow perfect freedom upper part wear hat...,one time row give may city boat effect interes...
3,3,san row shop opposite impress upon vision last...,1,san row shop opposite impress upon vision last...,one time row give may city boat effect interes...
4,4,hour walk tiresome three sensible english town...,1,hour walk tiresome three sensible english town...,one time row give may city boat effect interes...
...,...,...,...,...,...
53673,53673,surround search whole place could discover tra...,50,surround search whole place could discover tra...,one time row give may city boat effect interes...
53674,53674,giant could make young earthquake angry accord...,50,giant could make young earthquake angry accord...,one time row give may city boat effect interes...
53675,53675,water lake bottom hill curl kindness sympathy ...,50,water lake bottom hill curl kindness sympathy ...,one time row give may city boat effect interes...
53676,53676,finger thumb exactly come hand never mind say ...,50,finger thumb exactly come hand never mind say ...,one time row give may city boat effect interes...


In [6]:
max_words = 100000
max_length = 1000
embedding_dim = 100
tokenizer = Tokenizer(num_words=max_words, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df.text)
print("tokens:", tokenizer.word_index)

tokens: {'ã¢': 1, 'say': 2, 'one': 3, 'would': 4, 'go': 5, 'know': 6, 'come': 7, 'make': 8, 'think': 9, 'mr': 10, 'could': 11, 'see': 12, 'man': 13, 'time': 14, 'look': 15, 'take': 16, 'like': 17, 'little': 18, 'upon': 19, 'well': 20, 'good': 21, 'give': 22, 'never': 23, 'much': 24, 'get': 25, 'hand': 26, 'must': 27, 'tell': 28, 'great': 29, 'old': 30, 'two': 31, 'mrs': 32, 'find': 33, 'even': 34, 'day': 35, 'eye': 36, 'may': 37, 'leave': 38, 'way': 39, 'life': 40, 'us': 41, 'might': 42, 'first': 43, 'long': 44, 'seem': 45, 'back': 46, 'hear': 47, 'every': 48, 'men': 49, 'house': 50, 'though': 51, 'place': 52, 'away': 53, 'love': 54, 'shall': 55, 'without': 56, 'nothing': 57, 'head': 58, 'face': 59, 'last': 60, 'young': 61, 'many': 62, 'turn': 63, 'speak': 64, 'ever': 65, 'yet': 66, 'work': 67, 'sir': 68, 'ask': 69, 'call': 70, 'mind': 71, 'miss': 72, 'still': 73, 'word': 74, 'room': 75, 'new': 76, 'mean': 77, 'stand': 78, 'want': 79, 'put': 80, 'let': 81, 'feel': 82, 'night': 83, 'peo

"data = pad_sequences(data, max_length)" in this line, it should be max_length, i used max_words

In [7]:
labels = to_categorical(df.author - 1)
data = tokenizer.texts_to_sequences(df.text)
data = pad_sequences(data, max_length)
train_data, test_data, train_label, test_label = train_test_split(data, labels, test_size = 0.10, random_state = 30)

In [8]:
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=train_data.shape[1]))
model.add(Conv1D(filters = 32, kernel_size = 5, padding="same", activation="relu"))
model.add(MaxPooling1D(pool_size = 2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(50, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1000, 100)         10000000  
_________________________________________________________________
conv1d (Conv1D)              (None, 1000, 32)          16032     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 500, 32)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               53200     
_________________________________________________________________
dense (Dense)                (None, 50)                5050      
Total params: 10,074,282
Trainable params: 10,074,282
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
epochs = 5
batch_size = 64
model.fit(train_data, train_label, epochs=epochs, batch_size=batch_size,validation_split=0.1)

Epoch 1/5