In [None]:
import keras
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D, Activation
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Training hyperparameters.
batch_size = 32  # samples per gradient update
num_classes = 4  # labels in the CSV run 1-4 and are shifted to 0-3 below
epochs = 2  # full passes over the training data

# Load the train and test sets from their own files. Forward slashes keep the
# paths portable (a backslash separator only resolves on Windows).
csv_columns = ['label', 'title', 'description']
train_df = pd.read_csv("ag_news_csv/train.csv", names=csv_columns)
# The held-out set must come from test.csv: reading train.csv here would make
# the final evaluation score the model on data it was trained on.
test_df = pd.read_csv("ag_news_csv/test.csv", names=csv_columns)
print("retrieved data")

# subtract 1 from labels to make them 0-3
train_df['label'] -= 1
test_df['label'] -= 1

# Split each frame into inputs (x) and labels (y). Title and description are
# joined with a space so the last word of the title and the first word of the
# description stay separate tokens.
y_train = train_df['label']
x_train = train_df['title'].map(str) + ' ' + train_df['description'].map(str)
y_test = test_df['label']
x_test = test_df['title'].map(str) + ' ' + test_df['description'].map(str)

# split train into train/val (an integer test_size is an absolute sample count)
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=10000, shuffle=True
)

print(x_train.shape[0], 'train samples')
print(x_val.shape[0], 'val samples')
print(x_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_val = keras.utils.to_categorical(y_val, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

# tokenize data
print("beginning to process data")
t = Tokenizer()
t.fit_on_texts(x_train)  # the vocabulary is built from the training text only
encoded_train = t.texts_to_sequences(x_train)
encoded_val = t.texts_to_sequences(x_val)
encoded_test = t.texts_to_sequences(x_test)

# find max length: pad_sequences counts tokens, not characters, so the cutoff
# is the 95th percentile of the *token* counts of the training sequences.
# (Measuring characters would pad every sample several times longer than
# needed.) Clamp to at least 1 so the model never gets a zero-length input.
train_token_counts = [len(seq) for seq in encoded_train]
max_len = max(1, int(np.quantile(train_token_counts, 0.95)))

# Pad (or truncate) every split to the same length so they can be batched.
padded_train = pad_sequences(encoded_train, maxlen=max_len, padding='post')
padded_val = pad_sequences(encoded_val, maxlen=max_len, padding='post')
padded_test = pad_sequences(encoded_test, maxlen=max_len, padding='post')

# Number of distinct words seen while fitting the tokenizer.
vocab = len(t.word_counts)
print("finished processing data")

# constants
l2weight = 0.0001  # NOTE: not referenced anywhere in the model below
embedding_dim = 8

# full model: embedding -> flatten -> softmax over the classes.
# The embedding gets vocab + 1 rows; presumably the extra row is for index 0,
# which the 'post' padding fills in.
model = Sequential([
    Embedding(vocab + 1, embedding_dim, input_length=max_len),
    Flatten(),
    Dense(num_classes, activation='softmax'),
])

# build the graph
model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.categorical_crossentropy,
    metrics=['accuracy'],
)

model.summary()

# train, reporting validation metrics after every epoch
validation_pair = (padded_val, y_val)
model.fit(
    padded_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=validation_pair,
)

# score the model on the test set; evaluate() yields [loss, accuracy]
score = model.evaluate(padded_test, y_test, verbose=0)
test_loss = score[0]
test_accuracy = score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)