In [1]:
from keras import layers
from keras import models

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
model = models.Sequential()

### Load Data

In [3]:
import pandas as pd
import numpy as np

In [4]:
data = pd.read_csv('data/uci-news-aggregator.csv')

In [30]:
import re
import string

def clean_text(s):
    s = s.lower()
    for ch in string.punctuation:                                                                                                     
        s = s.replace(ch, " ") 
    s = re.sub("[0-9]+", "||DIG||",s)
    s = re.sub(' +',' ', s)        
    return s

data['TEXT'] = [clean_text(s) for s in data['TITLE']]

In [31]:
data[['CATEGORY', 'TITLE', 'TEXT']].head()

Unnamed: 0,CATEGORY,TITLE,TEXT
0,b,"Fed official says weak data caused by weather,...",fed official says weak data caused by weather ...
1,b,Fed's Charles Plosser sees high bar for change...,fed s charles plosser sees high bar for change...
2,b,US open: Stocks fall after Fed official hints ...,us open stocks fall after fed official hints a...
3,b,"Fed risks falling 'behind the curve', Charles ...",fed risks falling behind the curve charles plo...
4,b,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,fed s plosser nasty weather has curbed job growth


### First create a baseline

In [6]:
data_by_category = data.groupby('CATEGORY')

In [7]:
category_count = data_by_category['ID'].agg(['count'])

In [8]:
sorted_count = category_count.sort_values('count', ascending=False)

In [9]:
# take the highest occuring item and suppose that we predicted all records to be this category
max_value = sorted_count.max()
total_count = len(data)

In [10]:
base_line_value = max_value / total_count

In [11]:
base_line_accuracy = base_line_value[0]
print('Baseline accuracy to beat: {:f}%'.format((base_line_accuracy * 100)))

Baseline accuracy to beat: 36.094257%


### Define targets and data

In [56]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

In [72]:
max_len = 30
max_features = 10000
max_word_vector_length = 100

In [73]:
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(data['TEXT'])
total_words = x.shape[-1]
print('Without setting a limit to the number of words, we get {} words'.format(total_words))

Without setting a limit to the number of words, we get 49771 words


In [35]:
encoder = LabelEncoder()
y = encoder.fit_transform(data['CATEGORY'])

In [77]:
vectorizer = CountVectorizer(max_features=max_features)
x = vectorizer.fit_transform(data['TEXT'])

In [65]:
lb = LabelBinarizer()
y = lb.fit_transform(data['CATEGORY'])

In [36]:
from sklearn.model_selection import train_test_split

In [66]:
y

array([[1, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       ...,
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 0, 1, 0]])

In [68]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

### Create simple dense neural network

In [92]:
model = models.Sequential()

In [93]:
model.add(layers.Dense(32, activation='relu', input_shape=(x_train.shape[-1],)))
model.add(layers.Dense(len(category_count), activation='softmax'))

In [94]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [None]:
model.fit(
    x_train,
    y_train,
    epochs = 3,
    batch_size=512,
    validation_split=0.2
)

Train on 270348 samples, validate on 67587 samples
Epoch 1/3
Epoch 2/3

### Create Recurrent Neural Network

In [22]:
max_len = 30
max_features = 10000
max_word_vector_length = 100

In [23]:
from keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(data['TITLE'].values)

In [83]:
x = pad_sequences(x, max_len)

TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]

In [79]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [80]:
from keras.layers import Embedding
model = models.Sequential()
model.add(Embedding(max_features, max_word_vector_length, input_length=max_len))
model.add(layers.LSTM(
    32,
    dropout=0.2,
    recurrent_dropout=0.2,
))
model.add(layers.Dense(len(category_count), activation='softmax'))

In [81]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [82]:
history = model.fit(
    x_train,
    y_train,
    epochs = 3,
    batch_size=512,
    validation_split=0.2
)

ValueError: Error when checking input: expected embedding_2_input to have shape (30,) but got array with shape (10000,)

In [88]:
pad_sequences(np.array(x), max_len)

TypeError: iteration over a 0-d array

In [91]:
type(x)

scipy.sparse.csr.csr_matrix