In [1]:
# ========================Load data=========================
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model

In [2]:
train_data_source = './data/ag_news_csv/train.csv'
test_data_source = './data/ag_news_csv/test.csv'

train_df = pd.read_csv(train_data_source, header=None)
test_df = pd.read_csv(test_data_source, header=None)

In [5]:
test_df

Unnamed: 0,0,1,2
0,3,Fears for T N pension after talks,Unions representing workers at Turner Newall...
1,4,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,4,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...
3,4,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...
4,4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...
...,...,...,...
7595,1,Around the world,Ukrainian presidential candidate Viktor Yushch...
7596,2,Void is filled with Clement,With the supply of attractive pitching options...
7597,2,Martinez leaves bitter,Like Roger Clemens did almost exactly eight ye...
7598,3,5 of arthritis patients in Singapore take Bext...,SINGAPORE : Doctors in the United States have ...


In [6]:
# concatenate column 1 and column 2 as one text
for df in [train_df, test_df]:
    df[1] = df[1] + df[2]
    df = df.drop([2], axis=1)




In [9]:
df

Unnamed: 0,0,1
0,3,Fears for T N pension after talksUnions repres...
1,4,The Race is On: Second Private Team Sets Launc...
2,4,Ky. Company Wins Grant to Study Peptides (AP)A...
3,4,Prediction Unit Helps Forecast Wildfires (AP)A...
4,4,Calif. Aims to Limit Farm-Related Smog (AP)AP ...
...,...,...
7595,1,Around the worldUkrainian presidential candida...
7596,2,Void is filled with ClementWith the supply of ...
7597,2,Martinez leaves bitterLike Roger Clemens did a...
7598,3,5 of arthritis patients in Singapore take Bext...


In [10]:
# convert string to lower case
train_texts = train_df[1].values
train_texts = [s.lower() for s in train_texts]

test_texts = test_df[1].values
test_texts = [s.lower() for s in test_texts]

In [None]:
train_texts

In [12]:
# =======================Convert string to index================
# Tokenizer
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(train_texts)
# If we already have a character list, then replace the tk.word_index
# If not, just skip below part

In [14]:
tk.word_index

{'UNK': 1,
 ' ': 2,
 'e': 3,
 'a': 4,
 't': 5,
 'i': 6,
 's': 7,
 'o': 8,
 'n': 9,
 'r': 10,
 'l': 11,
 'd': 12,
 'h': 13,
 'c': 14,
 'u': 15,
 'p': 16,
 'm': 17,
 'g': 18,
 'f': 19,
 'y': 20,
 'w': 21,
 'b': 22,
 '.': 23,
 'v': 24,
 'k': 25,
 ',': 26,
 '-': 27,
 ';': 28,
 '3': 29,
 '0': 30,
 'x': 31,
 '9': 32,
 'j': 33,
 'q': 34,
 '#': 35,
 '1': 36,
 '(': 37,
 ')': 38,
 '2': 39,
 "'": 40,
 'z': 41,
 '\\': 42,
 '&': 43,
 ':': 44,
 '/': 45,
 '5': 46,
 '4': 47,
 '6': 48,
 '"': 49,
 '7': 50,
 '$': 51,
 '8': 52,
 '=': 53,
 '?': 54,
 '!': 55,
 '_': 56,
 '*': 57}

In [15]:
# -----------------------Skip part start--------------------------
# construct a new vocabulary
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
char_dict = {}
for i, char in enumerate(alphabet):
    char_dict[char] = i + 1

# Use char_dict to replace the tk.word_index
tk.word_index = char_dict.copy()
# Add 'UNK' to the vocabulary
tk.word_index[tk.oov_token] = max(char_dict.values()) + 1
# -----------------------Skip part end----------------------------

In [16]:
# Convert string to index
train_sequences = tk.texts_to_sequences(train_texts)
test_texts = tk.texts_to_sequences(test_texts)

In [None]:
test_texts

In [21]:
# Padding
train_data = pad_sequences(train_sequences, maxlen=1014, padding='post')
test_data = pad_sequences(test_texts, maxlen=1014, padding='post')

In [22]:
# Convert to numpy array
train_data = np.array(train_data, dtype='float32')
test_data = np.array(test_data, dtype='float32')

In [26]:
train_df

Unnamed: 0,0,1,2
0,3,Wall St. Bears Claw Back Into the Black (Reute...,"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters...,Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."
...,...,...,...
119995,1,Pakistan's Musharraf Says Won't Quit as Army C...,KARACHI (Reuters) - Pakistani President Perve...
119996,2,Renteria signing a top-shelf dealRed Sox gener...,Red Sox general manager Theo Epstein acknowled...
119997,2,Saban not going to Dolphins yetThe Miami Dolph...,The Miami Dolphins will put their courtship of...
119998,2,Today's NFL gamesPITTSBURGH at NY GIANTS Time:...,PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...


In [28]:
# =======================Get classes================
train_classes = train_df[0].values


train_classes


array([3, 3, 3, ..., 2, 2, 2])

In [32]:
train_class_list = [x -1 for x in train_classes]

In [33]:

test_classes = test_df[0].values
test_class_list = [x - 1 for x in test_classes]

from keras.utils import to_categorical

train_classes = to_categorical(train_class_list)
test_classes = to_categorical(test_class_list)

In [39]:
test_classes[1]

array([0., 0., 0., 1.], dtype=float32)

In [37]:
test_class_list[1]

3

In [40]:
# =====================Char CNN=======================
# parameter
input_size = 1014
vocab_size = len(tk.word_index)
embedding_size = 69
conv_layers = [[256, 7, 3],
               [256, 7, 3],
               [256, 3, -1],
               [256, 3, -1],
               [256, 3, -1],
               [256, 3, 3]]

fully_connected_layers = [1024, 1024]
num_of_classes = 4
dropout_p = 0.5
optimizer = 'adam'
loss = 'categorical_crossentropy'


In [41]:

# Embedding weights
embedding_weights = []  # (70, 69)
embedding_weights.append(np.zeros(vocab_size))  # (0, 69)

for char, i in tk.word_index.items():  # from index 1 to 69
    onehot = np.zeros(vocab_size)
    onehot[i - 1] = 1
    embedding_weights.append(onehot)

embedding_weights = np.array(embedding_weights)
print('Load')

Load


In [42]:

# Embedding layer Initialization
embedding_layer = Embedding(vocab_size + 1,
                            embedding_size,
                            input_length=input_size,
                            weights=[embedding_weights])

# Model Construction
# Input
inputs = Input(shape=(input_size,), name='input', dtype='int64')  # shape=(?, 1014)
# Embedding
x = embedding_layer(inputs)

In [43]:
# Conv
for filter_num, filter_size, pooling_size in conv_layers:
    x = Conv1D(filter_num, filter_size)(x)
    x = Activation('relu')(x)
    if pooling_size != -1:
        x = MaxPooling1D(pool_size=pooling_size)(x)  # Final shape=(None, 34, 256)
x = Flatten()(x)  # (None, 8704)
# Fully connected layers
for dense_size in fully_connected_layers:
    x = Dense(dense_size, activation='relu')(x)  # dense_size == 1024
    x = Dropout(dropout_p)(x)
# Output Layer
predictions = Dense(num_of_classes, activation='softmax')(x)
# Build model
model = Model(inputs=inputs, outputs=predictions)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])  # Adam, categorical_crossentropy
model.summary()


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 1014)]            0         
_________________________________________________________________
embedding (Embedding)        (None, 1014, 69)          4830      
_________________________________________________________________
conv1d (Conv1D)              (None, 1008, 256)         123904    
_________________________________________________________________
activation (Activation)      (None, 1008, 256)         0         
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 336, 256)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 330, 256)          459008    
_________________________________________________________________
activation_1 (Activation)    (None, 330, 256)          0     

In [44]:
# # 1000 training samples and 100 testing samples
# indices = np.arange(train_data.shape[0])
# np.random.shuffle(indices)
#
# x_train = train_data[indices][:1000]
# y_train = train_classes[indices][:1000]
#
# x_test = test_data[:100]
# y_test = test_classes[:100]

indices = np.arange(train_data.shape[0])
np.random.shuffle(indices)

x_train = train_data[indices]
y_train = train_classes[indices]

x_test = test_data
y_test = test_classes


In [47]:
train_classes[0]

array([0., 0., 1., 0.], dtype=float32)

In [None]:
# Training
model.fit(x_train, y_train,
          validation_data=(x_test, y_test),
          batch_size=128,
          epochs=10,
          verbose=2)