In [1]:
import os
from sklearn.datasets import fetch_20newsgroups
# Load the complete 20 Newsgroups corpus (subset='all') through scikit-learn.
news = fetch_20newsgroups(subset='all')

# Corpus size: number of posts, then number of categories
# (18846 and 20 in the recorded run).
print(len(news.data))
print(len(news.target_names))

# Category names; a post's integer label indexes into this list.
print(news.target_names)

# Preview the first ten posts: category name plus the first line found
# within the post's first 100 characters.
for document, label_id in zip(news.data[:10], news.target[:10]):
    first_line = document[:100].split('\n')[0]
    category = news.target_names[label_id]
    print('[%s]:\t\t "%s ..."' % (category, first_line))
 

18846
20
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
[rec.sport.hockey]:		 "From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu> ..."
[comp.sys.ibm.pc.hardware]:		 "From: mblawson@midway.ecn.uoknor.edu (Matthew B Lawson) ..."
[talk.politics.mideast]:		 "From: hilmi-er@dsv.su.se (Hilmi Eren) ..."
[comp.sys.ibm.pc.hardware]:		 "From: guyd@austin.ibm.com (Guy Dawson) ..."
[comp.sys.mac.hardware]:		 "From: Alexander Samuel McDiarmid <am2o+@andrew.cmu.edu> ..."
[sci.electronics]:		 "From: tell@cs.unc.edu (Stephen Tell) ..."
[comp.sys.mac.hardware]:		 "From: lpa8921@tamuts.tamu.edu (Louis Paul Adams) ..."
[rec.sport.hockey]:		 "From: dchhabra@stpl.

In [9]:
import spacy
# Load spaCy's English pipeline; a later cell calls `nlp(word).vector` to get
# one vector per vocabulary word.
# NOTE(review): 'en' is a shortcut name — which installed model it resolves to
# is not visible here; its vector width must equal EMBEDDING_DIM. Confirm.
nlp = spacy.load('en')

In [11]:
from __future__ import print_function

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model


# Length (in token ids) every post is brought to; passed as `maxlen` to
# pad_sequences and as the model's input length.
MAX_SEQUENCE_LENGTH = 1000
# Vocabulary cap handed to the Tokenizer and used to size the embedding matrix.
MAX_NUM_WORDS = 20000
# Width of one word vector; must match the length of `nlp(word).vector`.
EMBEDDING_DIM = 384
# Fraction of the samples held out as the validation set.
VALIDATION_SPLIT = 0.2


In [2]:


# Build the vocabulary: fit a Keras Tokenizer on the raw newsgroup posts.
# This cell only learns the word -> id mapping; the posts are converted to
# integer sequences and padded into a 2D tensor in later cells.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(news.data)


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Processing text dataset


In [4]:
# Encode every post as a list of integer word ids using the fitted tokenizer.
sequences = tokenizer.texts_to_sequences(news.data)


In [5]:

# word -> integer id mapping learned by the tokenizer. It holds every token
# seen during fitting (179209 in the recorded run), i.e. more entries than
# the MAX_NUM_WORDS cap.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))


Found 179209 unique tokens.


In [6]:
# One-hot encode the integer class labels: one row per post, one column per
# newsgroup.
labels = to_categorical(np.asarray(news.target))

# Bring every id sequence to exactly MAX_SEQUENCE_LENGTH entries so the posts
# stack into a single 2D array.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)


Shape of data tensor: (18846, 1000)
Shape of label tensor: (18846, 20)


In [7]:

# Split the data into a training set and a validation set.
# A fixed seed makes the shuffle -- and therefore the split and the reported
# validation metrics -- reproducible across kernel restarts.
SPLIT_SEED = 42
indices = np.random.RandomState(SPLIT_SEED).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Slice at an explicit non-negative boundary. The negative-index form
# (`data[:-n]`) yields an EMPTY training set when n == 0, because `-0 == 0`,
# which happens whenever the validation split rounds down to zero samples.
split_at = data.shape[0] - num_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]


In [12]:
print('Preparing embedding matrix.')

# Prepare the embedding matrix: row i holds the vector of the word whose
# tokenizer id is i.
# Tokenizer ids start at 1 (id 0 is the padding value), so a vocabulary of
# N words needs N + 1 rows. Without the "+ 1" the highest id indexes past the
# end of the matrix whenever the vocabulary is smaller than MAX_NUM_WORDS.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    # Skip every id that has no row in the matrix.
    if i >= num_words:
        continue
    embedding_vector = nlp(word).vector
    # NOTE(review): `nlp(word).vector` appears to always be an array, so this
    # guard is purely defensive -- confirm against the spaCy version in use.
    if embedding_vector is not None:
        # Rows that are never assigned (row 0 and any skipped id) stay
        # all-zeros.
        embedding_matrix[i] = embedding_vector


Preparing embedding matrix.


In [14]:
# Persist the embedding matrix so it can be reloaded instead of recomputed.
# Create the target directory first: opening 'data/matrix.npy' for writing
# fails when the 'data' directory does not exist yet.
if not os.path.isdir('data'):
    os.makedirs('data')
with open('data/matrix.npy', 'wb') as fp:
    np.save(fp, embedding_matrix)

In [18]:
# with open('data/matrix.npy', 'rb') as fp:
#     embedding_matrix = np.load(fp)


In [17]:
# Sanity check: the matrix should be (num_words, EMBEDDING_DIM).
embedding_matrix.shape

(20000, 384)

In [20]:

# Wrap the pre-computed word vectors in an Embedding layer.
# trainable=False freezes the weights so fitting leaves the vectors untouched.
embedding_layer = Embedding(
    num_words,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

print('Training model.')

# 1D convnet: two Conv1D + MaxPooling1D stages, a final Conv1D followed by
# global max pooling, then a dense hidden layer and a softmax output with one
# unit per newsgroup.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

x = embedded_sequences
for _stage in range(2):
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(len(news.target_names), activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)


Training model.


In [21]:
# Train for 10 epochs in mini-batches of 128 samples, passing the held-out
# split as validation data. The call's return value (a Keras History object)
# is what the cell displays.
model.fit(x_train, y_train,
          batch_size=128,
          epochs=10,
          validation_data=(x_val, y_val))

Train on 15077 samples, validate on 3769 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x12126b0f0>

In [22]:
# Save the trained model under the relative path "kerasw2v.model".
# NOTE(review): the fitted `tokenizer` is not persisted here, yet text must be
# encoded with it before prediction -- save it as well if the model is meant
# to be reused from a fresh kernel.
model.save("kerasw2v.model")

In [31]:
# Encode and pad the first post the same way the training data was prepared,
# then display the resulting array of token ids.
test = pad_sequences(
    tokenizer.texts_to_sequences([news.data[0]]),
    maxlen=MAX_SEQUENCE_LENGTH,
)
test

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
      

In [34]:
# Score the single encoded post; the result holds one value per newsgroup.
prediction = model.predict(test)

In [35]:
# Index of the highest-scoring class. The recorded run gives 10, which is
# 'rec.sport.hockey' in news.target_names.
prediction.argmax()

10

In [49]:
def get_result(p_string):
    """Classify one text and return the name of its predicted newsgroup.

    The text is encoded with the notebook-level ``tokenizer``, padded to
    ``MAX_SEQUENCE_LENGTH`` and scored by ``model``; the position of the
    highest score is then looked up in ``news.target_names``.

    Args:
        p_string: The raw text to classify.

    Returns:
        The entry of ``news.target_names`` at the arg-max of the model output.
    """
    encoded = pad_sequences(
        tokenizer.texts_to_sequences([p_string]),
        maxlen=MAX_SEQUENCE_LENGTH,
    )
    class_scores = model.predict(encoded)
    best_class = class_scores.argmax()
    return news.target_names[best_class]

In [50]:
ambig_art = """
A program is trying to attract more women into coaching to match the strong recent growth in women's sport.

Football Federation SA (FFSA) has just held its first female-only Asian Football Confederation C-licence coaching course.

"There's a lot of interest in the women's game from the excellent job the Matildas have done, and certainly at the local level as well, and it cries out as the game's growing for the female coaching base to grow as well," FFSA's John Mundy said.

The C-licence program teaches basic skills and techniques so the coaches are able to work primarily with young players.

"It's part of a two-year project where our focus and priority is female coaching," Mr Mundy said.

"We're looking for some of these girls learning to coach to also be instructors on courses down the track."

 A group photo of soccer players.
PHOTO: Cristiano dos Santos Rodrigues (L) with some of the program participants and other soccer players. (ABC News: Loukas Founten)
Mr Mundy pushed for the course after noticing there were few female coaches in the local Women's National Premier League and no female head coaches for SA's state teams.

"We think we could get to a stage where we even see females coaching the male teams," he said.

"A good coach is a good coach so there's no reason why a female couldn't be coaching a male team, in my view."

The first intake of 10 women included some school teachers interested in coaching.

'Male-dominated industry needs balance'
Program director Cristiano dos Santos Rodrigues, a former Adelaide United striker, said women benefited from the all-female coaching group.

"They talk more about football, they express their knowledge of football a bit more than if they are mixed with men," he said.

"When they are with men they feel a bit intimidated."

Lauren Daniel, a soccer player for 12 years, got involved in the program hoping it might provide a pathway to coaching at national or international level.

"It's quite a male-dominated industry at the moment. I just think it was a lot more relaxed having just females there and some friends as well," the Adelaide teacher said.

"[It's] good to get more females involved so that young people can look up and realise women can be a coach and can be really good coaches as well."

The Matildas have been coached by a woman just once in their 40-year history, when Hesterine de Reus from the Netherlands spent 15 months in the job.

There are two female head coaches among the nine W-League teams — Heather Garriock at Canberra United and Brisbane's Mel Andreatta, this year's W-League premiership coach."""

In [51]:
# Out-of-domain probe: a soccer-coaching article, although none of the 20
# newsgroups is about soccer. The recorded run answers 'rec.sport.hockey'.
get_result(ambig_art)

'rec.sport.hockey'

In [52]:
wind = """Microsoft launched ARM-powered Windows 10 PCs with “all-day” battery life back in December. While HP, Asus, and Lenovo’s devices aren’t on sale just yet, we’re still waiting to hear more about the limitations of Windows 10 running on these new PCs. Microsoft published a full list of limitations last week, spotted first by Thurrott, that details what to expect from Windows 10 on ARM. This list must have been published by accident, as the software giant removed it over the weekend so only cached copies of the information are available.

Only ARM64 drivers are supported. Windows 10 on ARM can run x86 apps, but it can’t use x86 drivers. That shouldn’t be a problem for most hardware, but if you have some older peripherals then it’s likely that driver support won’t be available. Windows 10 on ARM driver support will be more limited, and similar to what Windows 10 S provides.
x64 apps are not supported. This is something we’ve known, but Windows 10 on ARM does not support emulation of x64 apps. Microsoft is planning to support these in the future at some point, though.
Certain games and apps don’t work. Microsoft says that games and apps that use a version of OpenGL later than 1.1 or that require hardware-accelerated OpenGL won’t work on Windows 10 on ARM. Games that use anticheat technologies also won’t run on Windows 10 on ARM.
Apps that customize the Windows experience may not work correctly. Apps like assistitive technologies or input method editors won’t work properly on Windows 10 on ARM. Also, apps that include shell extensions (icons and right-click menus in File Explorer) like Dropbox may fail. These apps will need to be compiled natively for ARM.
Apps that assume that all ARM-based devices are running a mobile version of Windows may not work correctly. Some apps that have been coded for Windows Phone won’t work correctly and could appear in the wrong orientation or have UI layout problems. This won’t be a huge amount of apps, though.
The Windows Hypervisor Platform is not supported on ARM. You won’t be able to run virtual machines using Hyper-V with Windows 10 on ARM.
It seems that for most Windows users, Windows 10 on ARM will support common apps and scenarios. Microsoft’s emulation work allows you to download most 32-bit exe files from the web and install them on ARM-powered laptops. There are clearly some limitations, outlined above, but the majority of apps should run. We’re still waiting to test an ARM-powered Windows 10 laptop to see if the battery life is what has been promised, and whether performance for desktop apps is reasonable enough."""

In [53]:
# Probe with an article about Windows 10 on ARM; the recorded run answers
# 'comp.os.ms-windows.misc'.
get_result(wind)

'comp.os.ms-windows.misc'

In [61]:
# Print "predicted actual" for the first ten posts of the corpus.
# These posts come from the same corpus the tokenizer and model were built
# on, so this is a quick sanity check rather than a held-out evaluation.
for post, true_id in zip(news.data[:10], news.target[:10]):
    print(get_result(post), news.target_names[true_id])
    

rec.sport.hockey rec.sport.hockey
sci.electronics comp.sys.ibm.pc.hardware
rec.sport.hockey talk.politics.mideast
comp.sys.ibm.pc.hardware comp.sys.ibm.pc.hardware
comp.sys.ibm.pc.hardware comp.sys.mac.hardware
sci.electronics sci.electronics
misc.forsale comp.sys.mac.hardware
rec.sport.hockey rec.sport.hockey
rec.sport.hockey rec.sport.hockey
talk.religion.misc talk.religion.misc


In [62]:
# Inspect the third post, which the recorded comparison run got wrong
# (predicted rec.sport.hockey, actual talk.politics.mideast).
news.data[2]

'From: hilmi-er@dsv.su.se (Hilmi Eren)\nSubject: Re: ARMENIA SAYS IT COULD SHOOT DOWN TURKISH PLANES (Henrik)\nLines: 95\nNntp-Posting-Host: viktoria.dsv.su.se\nReply-To: hilmi-er@dsv.su.se (Hilmi Eren)\nOrganization: Dept. of Computer and Systems Sciences, Stockholm University\n\n\n\n\n|>The student of "regional killings" alias Davidian (not the Davidian religios sect) writes:\n\n\n|>Greater Armenia would stretch from Karabakh, to the Black Sea, to the\n|>Mediterranean, so if you use the term "Greater Armenia" use it with care.\n\n\n\tFinally you said what you dream about. Mediterranean???? That was new....\n\tThe area will be "greater" after some years, like your "holocaust" numbers......\n\n\n\n\n|>It has always been up to the Azeris to end their announced winning of Karabakh \n|>by removing the Armenians! When the president of Azerbaijan, Elchibey, came to \n|>power last year, he announced he would be be "swimming in Lake Sevan [in \n|>Armeniaxn] by July".\n\t\t*****\n\tIs\'t July 

In [59]:
# Ground-truth newsgroup of the first post, for comparison with the prediction.
news.target_names[news.target[0]]

'rec.sport.hockey'

10