In [6]:
from keras import Sequential
from keras.layers import Embedding
import numpy as np
import pandas as pd

In [8]:
model = Sequential()
model.add(Embedding(1000, 64, input_length=10))
# the model will take as input an integer matrix of size (batch, input_length).
# the largest integer (i.e. word index) in the input should be
# no larger than 999 (vocabulary size).
# now model.output_shape == (None, 10, 64), where None is the batch dimension.

input_array = np.random.randint(1000, size=(32, 10))

model.compile('rmsprop', 'mse')
output_array = model.predict(input_array)
assert output_array.shape == (32, 10, 64)

In [9]:
input_array

array([[762, 438, 133, 309, 988, 479, 760,   4, 276, 377],
       [102, 891, 150,  69, 297, 545, 667, 589, 400, 128],
       [441, 166, 796, 107, 946, 924, 486, 505, 210, 548],
       [823, 110, 506, 147, 148, 716, 450, 927, 387, 200],
       [340, 384, 221, 883, 911, 267, 378, 856, 636,  31],
       [655, 468, 802, 131, 839, 801, 911, 308, 257, 439],
       [330,  15, 481, 821, 646, 271, 846, 963, 294, 440],
       [ 63, 438, 286, 686, 560, 886, 236, 158, 867, 883],
       [  3, 236, 614, 575, 876, 984, 780, 491, 287, 299],
       [154, 738, 540, 460, 934, 494, 894, 378,  20, 316],
       [130, 137,  57,  77,  89, 308, 817, 520, 268, 394],
       [718,  32, 458, 384, 548, 691, 836, 677, 444, 690],
       [578, 254, 931, 802, 671, 418, 956, 320, 523, 150],
       [822, 376, 230, 692, 667, 980, 925, 336, 344, 334],
       [672, 897, 131, 882, 192, 748,  59,  89, 801,  99],
       [565, 662, 307,   6, 428, 179, 660, 329,  72, 405],
       [290, 472, 831, 324, 464, 135, 811, 747, 717, 233

In [10]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 10, 64)            64000     
Total params: 64,000
Trainable params: 64,000
Non-trainable params: 0
_________________________________________________________________


In [12]:
output_array.shape

(32, 10, 64)

In [13]:
output_array

array([[[-1.97902080e-02, -1.97851658e-02, -2.86110528e-02, ...,
          2.04533376e-02, -5.48856333e-03, -1.53464191e-02],
        [ 1.55693926e-02, -4.23086546e-02,  3.88166197e-02, ...,
          8.26353952e-03,  2.75744684e-02,  1.44720338e-02],
        [ 2.68191956e-02, -2.23554727e-02, -1.56116486e-03, ...,
         -3.28207985e-02, -1.02091581e-04,  4.83696200e-02],
        ...,
        [-1.07271671e-02, -1.55469552e-02, -1.00960582e-03, ...,
         -5.23564965e-03,  2.79647969e-02, -1.17293969e-02],
        [-4.74153757e-02, -2.68574357e-02,  3.35686915e-02, ...,
         -2.89883371e-02,  2.17423178e-02,  4.30395342e-02],
        [-3.48421223e-02, -1.21081583e-02,  4.73917648e-03, ...,
         -8.64964724e-03,  4.14612405e-02,  4.99535240e-02]],

       [[-3.87276784e-02,  3.31667103e-02,  1.43934377e-02, ...,
          7.73501396e-03, -1.15987174e-02, -4.74767573e-02],
        [-9.43536684e-03, -3.90840285e-02, -4.36153300e-02, ...,
         -1.98038667e-03,  1.53522529e

In [28]:
def load_data(train_file_name, test_file_name):
    """ load training data from kaggle dataset
    @param train_file_name: fully qualified file name of train.csv from kaggle
    housing practice competition
    @param test_file_name: fully qualified file name of test.csv from kaggle
    housing practice competition
    @return: pair of tuples of observations and responses
    """
    test_df = pd.read_csv(test_file_name, index_col='Id')    
    train_df = pd.read_csv(train_file_name, index_col='Id')

    def encode_cats(df):
        """ one-hot encode all categorical values in data frame
        @param df: a dataframe
        @return: one hot encoded equivalents of non-numerical
        columns in data frame
        """
        df_numeric = df.select_dtypes(include = np.number)
        df_numeric.fillna(df.mean(), inplace=True)
        non_numeric_columns = df.select_dtypes(include=np.object).columns
        df_one_hot =  pd.get_dummies(df[non_numeric_columns])
        df_one_hot.fillna(0, inplace=True)
        return pd.concat([df_numeric, df_one_hot], axis  = 1)
    
    return (encode_cats(train_df.iloc[:,1:-1]), train_df.iloc[:,-1] ), \
        encode_cats(test_df)

In [31]:
(train_data , train_targets), test_data = load_data('/home/jhancock2010/git/data/house/train.csv',
                                                 '/home/jhancock2010/git/data/house/test.csv')
mean = train_data.mean(axis=0)
train_data -= mean
std = train_data.std(axis=0)
train_data /= std
test_data -= mean
test_data /= std


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [10]:
train_data.shape[1]

287

In [35]:
print(train_data.max().max(),
train_data.min().min())

38.183775152957416 -38.1837751529566


In [22]:
model = Sequential()
model.add(Embedding(215245, 64, input_length=train_data.shape[1]))

In [23]:
model.compile('rmsprop', 'mse')
output_array = model.predict(train_data)

InvalidArgumentError:  indices[31,0] = -2147483648 is not in [0, 215245)
	 [[node embedding_2/embedding_lookup (defined at /home/jhancock2010/venv/fastai-2019-08-01/lib/python3.5/site-packages/tensorflow_core/python/framework/ops.py:1751) ]] [Op:__inference_keras_scratch_graph_266]

Function call stack:
keras_scratch_graph
