In [1]:
import sys
# Bare expression: evaluates the interpreter path, but nothing is displayed
# because it is not the last statement of the cell.
sys.executable

import os
# Point JAVA_HOME at the local JDK (presumably needed by the pyspark / sparknlp
# imports below - confirm). A raw string keeps the Windows backslashes literal:
# "\J" and "\j" are invalid escape sequences in a normal string literal.
# NOTE(review): machine-specific absolute path; adjust per environment.
os.environ["JAVA_HOME"] = r"C:\Java\jdk1.8.0_221"

In [2]:
# !{sys.executable} -m pip install tensorflow_datasets --trusted-host pypi.org --trusted-host files.pythonhosted.org

In [3]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.models import load_model
import tensorflow_datasets as tfds
tfds.disable_progress_bar()

import random

import numpy as np

import pandas as pd

import itertools

import pyspark
from pyspark.ml.feature import CountVectorizer, Tokenizer, RegexTokenizer, StopWordsRemover
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml import Pipeline
from pyspark.ml.stat import Summarizer
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType

import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.embeddings import *

In [4]:
# Reproducibility: pin every random number generator used by the Keras workflow below.

# Single seed shared by all generators
seed_value = 123

# 1. Export a fixed `PYTHONHASHSEED`
# NOTE(review): the running interpreter reads its hash seed at startup, so this
# presumably only affects child processes started afterwards - confirm.
os.environ['PYTHONHASHSEED'] = str(seed_value)

# 2. Python's built-in pseudo-random generator
random.seed(a=seed_value)

# 3. NumPy's global pseudo-random generator
np.random.seed(seed=seed_value)

# 4. TensorFlow's global pseudo-random generator
tf.random.set_seed(seed=seed_value)

# 5. (disabled) TF1-style global session limited to a single thread, kept for reference:
# from tensorflow.keras import backend as K
# session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
# sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
# K.set_session(sess)


In [5]:
# Toy input: a 20 x 15 matrix of random zeros and ones to pipe through a 1D CNN
# in TensorFlow (used to test a 1xN kernel)
onesAndZeros = np.random.randint(2, size=(20, 15)).astype(np.float32)
print(onesAndZeros)

# Append a trailing axis of length 1: (20, 15) -> (20, 15, 1), matching the
# input_shape=(15, 1) of the Conv1D layer defined below
X = onesAndZeros[:, :, np.newaxis]

print(X)

# Dependent variable: random 0/1 targets of shape (20, 14)
y = np.random.randint(2, size=(20, 14))
print(y)

[[0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 0.]
 [1. 0. 1. 1. 0. 0. 0. 1. 1. 1. 0. 1. 0. 0. 0.]
 [0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 1.]
 [1. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 1.]
 [0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1. 1.]
 [0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0.]
 [1. 1. 1. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 1. 1.]
 [1. 0. 1. 1. 0. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1.]
 [1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 0. 0. 1. 1.]
 [0. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 1. 1. 1. 1.]
 [0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 1. 1. 0. 1. 1.]
 [1. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1.]
 [1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0.]
 [0. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 0. 1. 0. 0.]
 [0. 1. 1. 1. 0. 1. 0. 0. 1. 1. 1. 0. 1. 1. 0.]
 [0. 0. 1. 0. 1. 1. 0. 0. 1. 1. 1. 1. 1. 0. 1.]
 [1. 1. 0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0.]
 [0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 0. 0. 0. 1.]]
[[[0.]
  [1.]
  [0.]
  [0.]
  [0.]
  [0

In [6]:
# The model can only run predictions on floating-point input (float16, bfloat16,
# float32 or float64), so confirm the element type of the input array.
print(X.dtype)

float32


In [7]:
# Pipe the toy matrix through a minimal 1D CNN: one bias-free, linear Conv1D
# filter of width 2, so that the layer output can be reproduced by hand later.

# create and compile the model
model = Sequential()
model.add(keras.layers.Conv1D(input_shape=(15, 1),  # 15 steps, 1 channel per sample
                    filters=1,
                    kernel_size=2,
                    strides=1,
                    padding='valid',  # no padding: 15 - 2 + 1 = 14 outputs per sample
                    activation=None,  # linear output
                    use_bias=False,
                    bias_initializer='zeros'))  # irrelevant while use_bias=False
model.add(keras.layers.Flatten())  # (None, 14, 1) -> (None, 14)
# model.add(keras.layers.Dense(1, activation='sigmoid'))
# NOTE(review): the model emits raw linear values (possibly negative), while
# 'binary_crossentropy' presumably expects probabilities by default - confirm
# whether a sigmoid or a from_logits loss is wanted. Left unchanged because the
# notebook only compares the convolution arithmetic, not the quality of the fit.
model.compile(optimizer='sgd',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# appended a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 14, 1)             2         
_________________________________________________________________
flatten (Flatten)            (None, 14)                0         
Total params: 2
Trainable params: 2
Non-trainable params: 0
_________________________________________________________________
None


In [8]:
# Train the model with a batch size of 1, i.e. the weights are updated after every
# sample; no epoch count is passed, so Keras' default number of passes applies.
result = model.fit(x=X, y=y, batch_size=1, verbose=1)

Train on 20 samples


In [9]:
# Score the fitted model on the same data it was trained on
scores = model.evaluate(x=X, y=y, verbose=0)
# Entry 1 of both lists is the metric requested at compile time (accuracy)
print(model.metrics_names[1], scores[1])

accuracy 0.47142857


In [10]:
# Fetch the trained weights of the first layer (the Conv1D kernel) once,
# keep them for the hand calculation below, and show them.
weights = model.layers[0].get_weights()
print(weights)

[array([[[-0.9157342 ]],

       [[ 0.26331082]]], dtype=float32)]


In [11]:
# do some math...
# Reproduce the Conv1D layer by hand: slide a [1 x kernel_size] window across every
# row of the input with a stride of 1 and multiply it by the [kernel_size x 1]
# weights matrix. For the 20 x 15 input and the 2-tap kernel above, the resulting
# matrix should be [20 x 14].

# convert array to pandas dataframe to make this task easier.
pdOnesAndZeros = pd.DataFrame(onesAndZeros)

# convert the weights to a dataframe: weights[0] is the layer's kernel (its only
# weight array, since the layer has no bias), flattened into a single column with
# one row per kernel tap. Assumes one filter and one input channel, as configured above.
b = pd.DataFrame(np.asarray(weights[0]).reshape((-1, 1)))

# derive every size from the data instead of hard-coding 20, 14 and 2
inputs = pdOnesAndZeros.values  # (n_rows, n_cols), same dtype as onesAndZeros
kernel = b.values               # (kernel_size, 1)
n_rows, n_cols = inputs.shape
kernel_size = kernel.shape[0]
n_windows = n_cols - kernel_size + 1  # 'valid' padding with a stride of 1

# create an empty matrix in which the dot products are stored
matrix = np.zeros((n_rows, n_windows))

# using a stride of 1, slide the window across the columns; each step handles all
# rows at once: (n_rows, kernel_size) . (kernel_size, 1) -> (n_rows, 1).
# Taking column 0 stores plain scalars per element; the previous code assigned a
# (1, 1) array to a scalar slot, which NumPy has deprecated.
for start in range(n_windows):
    matrix[:, start] = inputs[:, start:start + kernel_size].dot(kernel)[:, 0]



In [12]:
# Show the hand-computed convolution results stored in `matrix`, for visual
# comparison with the model predictions printed further down
print(matrix)

[[ 0.26331082 -0.91573417  0.          0.          0.          0.
   0.26331082 -0.65242338 -0.91573417  0.26331082 -0.65242338 -0.91573417
   0.26331082 -0.91573417]
 [-0.91573417  0.26331082 -0.65242338 -0.91573417  0.          0.
   0.26331082 -0.65242338 -0.65242338 -0.91573417  0.26331082 -0.91573417
   0.          0.        ]
 [ 0.26331082 -0.65242338 -0.65242338 -0.91573417  0.          0.26331082
  -0.91573417  0.          0.26331082 -0.91573417  0.26331082 -0.91573417
   0.26331082 -0.65242338]
 [-0.91573417  0.          0.          0.          0.26331082 -0.65242338
  -0.91573417  0.          0.26331082 -0.91573417  0.26331082 -0.91573417
   0.          0.26331082]
 [ 0.26331082 -0.91573417  0.26331082 -0.91573417  0.          0.
   0.          0.26331082 -0.91573417  0.          0.26331082 -0.65242338
  -0.65242338 -0.65242338]
 [ 0.          0.          0.          0.26331082 -0.65242338 -0.91573417
   0.          0.          0.          0.26331082 -0.91573417  0.26331082
 

In [13]:
# Run every observation through the trained model to get its outputs
preds = model.predict(x=X)

In [14]:
# Show the shape first (expected to match the hand-computed matrix), then the
# predictions themselves, each on its own line
print(preds.shape, preds, sep='\n')

(20, 14)
[[ 0.26331082 -0.9157342   0.          0.          0.          0.
   0.26331082 -0.6524234  -0.9157342   0.26331082 -0.6524234  -0.9157342
   0.26331082 -0.9157342 ]
 [-0.9157342   0.26331082 -0.6524234  -0.9157342   0.          0.
   0.26331082 -0.6524234  -0.6524234  -0.9157342   0.26331082 -0.9157342
   0.          0.        ]
 [ 0.26331082 -0.6524234  -0.6524234  -0.9157342   0.          0.26331082
  -0.9157342   0.          0.26331082 -0.9157342   0.26331082 -0.9157342
   0.26331082 -0.6524234 ]
 [-0.9157342   0.          0.          0.          0.26331082 -0.6524234
  -0.9157342   0.          0.26331082 -0.9157342   0.26331082 -0.9157342
   0.          0.26331082]
 [ 0.26331082 -0.9157342   0.26331082 -0.9157342   0.          0.
   0.          0.26331082 -0.9157342   0.          0.26331082 -0.6524234
  -0.6524234  -0.6524234 ]
 [ 0.          0.          0.          0.26331082 -0.6524234  -0.9157342
   0.          0.          0.          0.26331082 -0.9157342   0.26331082

In [15]:
# check for any differences between "hand calc" matrix and model predictions
print(matrix - preds)

# :)

# Back the visual check with an assertion so that a Restart & Run All fails loudly
# if the hand calculation ever drifts from the model output. The small absolute
# tolerance only allows for float32 rounding noise.
np.testing.assert_allclose(matrix, preds, rtol=0, atol=1e-6)

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
