In [1]:
# import keras

In [5]:
'''This script loads pre-trained word embeddings (GloVe embeddings)
into a frozen Keras Embedding layer, and uses it to
train a text classification model on the 20 Newsgroup dataset
(classication of newsgroup messages into 20 different categories).
GloVe embedding data can be found at:
http://nlp.stanford.edu/data/glove.6B.zip
(source page: http://nlp.stanford.edu/projects/glove/)
20 Newsgroup data can be found at:
http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html
'''

from __future__ import print_function
import os
import numpy as np
np.random.seed(1337)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
import sys
from keras.layers.recurrent import LSTM
# keras.layers.recurrent.Recurrent(weights=None, return_sequences=False, go_backwards=False, stateful=False, unroll=False, consume_less='cpu', input_dim=None, input_length=None)


BASE_DIR = ''
GLOVE_DIR = ''
TEXT_DATA_DIR = '/home/i008/20_newsgroup'
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

# first, build index mapping words in the embeddings set
# to their embedding vector

print('Indexing word vectors.')

embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

# second, prepare text samples and their labels
print('Processing text dataset')

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                if sys.version_info < (3,):
                    f = open(fpath)
                else:
                    f = open(fpath, encoding='latin-1')
                texts.append(f.read())
                f.close()
                labels.append(label_id)

# print('Found %s texts.' % len(texts))
# # finally, vectorize the text samples into a 2D integer tensor
# tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
# tokenizer.fit_on_texts(texts)
# sequences = tokenizer.texts_to_sequences(texts)

# word_index = tokenizer.word_index
# print('Found %s unique tokens.' % len(word_index))

# data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# labels = to_categorical(np.asarray(labels))
# print('Shape of data tensor:', data.shape)
# print('Shape of label tensor:', labels.shape)

# # split the data into a training set and a validation set
# indices = np.arange(data.shape[0])
# np.random.shuffle(indices)
# data = data[indices]
# labels = labels[indices]
# nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# x_train = data[:-nb_validation_samples]
# y_train = labels[:-nb_validation_samples]
# x_val = data[-nb_validation_samples:]
# y_val = labels[-nb_validation_samples:]

# print('Preparing embedding matrix.')

# # prepare embedding matrix
# nb_words = min(MAX_NB_WORDS, len(word_index))
# embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
# for word, i in word_index.items():
#     if i >= MAX_NB_WORDS:
#         continue
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         # words not found in embedding index will be all-zeros.
#         embedding_matrix[i] = embedding_vector

# # load pre-trained word embeddings into an Embedding layer
# # note that we set trainable = False so as to keep the embeddings fixed
# embedding_layer = Embedding(nb_words,
#                             EMBEDDING_DIM,
#                             weights=[embedding_matrix],
#                             input_length=MAX_SEQUENCE_LENGTH,
#                             trainable=False)

# print('Training model.')


Indexing word vectors.
Found 400000 word vectors.
Processing text dataset


In [12]:
np.unique(np.asarray(labels)).shape

to_categorical(labels).shape

(19997, 20)

In [14]:

# train a 1D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
# x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = LSTM(1000, dropout_W=0.2, dropout_U=0.2)(embedded_sequences) # try using a GRU instead, for fun
# x = LSTM(100, dropout_W=0.2, dropout_U=0.2)(x) # try using a GRU instead, for fun
# x = LSTM(128, dropout_W=0.2, dropout_U=0.2)(x) # try using a GRU instead, for fun

# x = MaxPooling1D(5)(x)
# x = Conv1D(128, 5, activation='relu')(x)
# x = MaxPooling1D(5)(x)
# x = Conv1D(128, 5, activation='relu')(x)
# x = MaxPooling1D(35)(x)
x = Dense(128, activation='relu')(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# happy learning!
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=2, batch_size=128)

Exception: Input 0 is incompatible with layer flatten_6: expected ndim >= 3, found ndim=2

In [8]:
# LSTM and CNN for sequence classification in the IMDB dataset
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.convolutional import Convolution1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
model = Sequential()

model.add(Embedding(input_dim=nb_words, 
                    output_dim=EMBEDDING_DIM, 
                    input_length=MAX_SEQUENCE_LENGTH, 
                    weights=[embedding_matrix], 
                    trainable=True)
         )
model.add(Convolution1D(nb_filter=32, filter_length=3, border_mode='same', activation='relu'))
model.add(MaxPooling1D(pool_length=2))
model.add(LSTM(100))
model.add(Dense(len(labels_index), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
print(model.summary())
model.fit(x_train, y_train, nb_epoch=3, batch_size=128, validation_data=(x_val, y_val))
# Final evaluation of the model
# scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_5 (Embedding)          (None, 1000, 100)     2000000     embedding_input_4[0][0]          
____________________________________________________________________________________________________
convolution1d_2 (Convolution1D)  (None, 1000, 32)      9632        embedding_5[0][0]                
____________________________________________________________________________________________________
maxpooling1d_2 (MaxPooling1D)    (None, 500, 32)       0           convolution1d_2[0][0]            
____________________________________________________________________________________________________
lstm_4 (LSTM)                    (None, 100)           53200       maxpooling1d_2[0][0]             
___________________________________________________________________________________________

NameError: name 'x_test' is not defined

In [11]:
t = Tokenizer(nb_words=10)

In [13]:
t.fit_on_texts(texts)

In [24]:
len(t.texts_to_sequences([texts[10]])[0])

32

In [33]:
len(t.texts_to_sequences([texts[10]])[0])

32

In [43]:
sorted(t.word_index, key=lambda x: )

['\x02',
 '\x02ubject',
 '\x03',
 '\x03\x03\x1b',
 '\x08',
 '\x08a',
 '\x08b',
 '\x08c',
 '\x08d',
 '\x08e',
 '\x08f',
 '\x08h',
 '\x08i',
 '\x08k',
 '\x08l',
 '\x08m',
 '\x08n',
 '\x08o',
 '\x08p',
 '\x08r',
 '\x08s',
 '\x08t',
 '\x08u',
 '\x08v',
 '\x08w',
 '\x08y',
 '\x0c',
 '\r',
 '\r\r',
 '\remail',
 '\reven',
 '\rfax',
 '\rin',
 '\x10a',
 '\x10more',
 '\x10where',
 '\x18',
 '\x18\xe93\xdaj\xdd\x08\x05',
 '\x19',
 '\x1a',
 '\x1b',
 '\x1c',
 '\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e\x1e',
 "'",
 "''",
 "'''",
 "'''''",
 "''0y",
 "''1",
 "''3",
 "''4",
 "''5",
 "''5y'",
 "''83",
 "''8y",
 "''9",
 "''9265e965e9",
 "''a",
 "''a6t",
 "''after",
 "''ahmadya",
 "''america's",
 "''aq",
 "''b9",
 "''bar'",
 "''bpu",
 "''c",
 "''c4",
 "''c7'os6",
 "''close",
 "''d",
 "''dk",
 "''dz",
 "''e5f",
 "''first",
 "''i",
 "''ipx",
 "''j",
 "''ji'",
 "''k",
 "''kbl",
 "''l",
 "''m",
 "''my",
 "''n",
 "''or'",
 "''our",
 "''patterns",
 "''pv6rx",
 "''q",
 "''qs0p",
 "''s",
 "''sir",
 "''somethings''"

In [42]:
for w,i in t.word_index.items():  
    print(w)
    print(i)
    if i > 100:
        break

9304191946
108721


In [45]:
import operator

sorted_x = sorted(word_index.items(), key=operator.itemgetter(1))

In [46]:
sorted_x

[('the', 1),
 ('edu', 2),
 ('to', 3),
 ('of', 4),
 ('a', 5),
 ('and', 6),
 ('in', 7),
 ('i', 8),
 ('is', 9),
 ('that', 10),
 ("'ax", 11),
 ('cmu', 12),
 ('it', 13),
 ('for', 14),
 ('com', 15),
 ('cs', 16),
 ('you', 17),
 ('from', 18),
 ('on', 19),
 ('this', 20),
 ('news', 21),
 ('not', 22),
 ('be', 23),
 ('srv', 24),
 ('are', 25),
 ('have', 26),
 ('with', 27),
 ('as', 28),
 ('cantaloupe', 29),
 ('1', 30),
 ('or', 31),
 ('net', 32),
 ('was', 33),
 ('if', 34),
 ('but', 35),
 ('message', 36),
 ('they', 37),
 ('subject', 38),
 ('lines', 39),
 ('date', 40),
 ('id', 41),
 ('apr', 42),
 ('newsgroups', 43),
 ('path', 44),
 ('organization', 45),
 ('at', 46),
 ('by', 47),
 ('state', 48),
 ('gmt', 49),
 ('an', 50),
 ('can', 51),
 ('2', 52),
 ('my', 53),
 ('what', 54),
 ('1993', 55),
 ('all', 56),
 ('would', 57),
 ('ohio', 58),
 ('x', 59),
 ('there', 60),
 ('one', 61),
 ('will', 62),
 ('do', 63),
 ('re', 64),
 ('0', 65),
 ('writes', 66),
 ('about', 67),
 ('3', 68),
 ('we', 69),
 ('no', 70),
 ('so'

In [48]:
data[0].shape

(1000,)

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('reviews_with_language.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
lab

array(['unknown'], dtype=object)