In [3]:
!pip install keras

Collecting keras
  Downloading Keras-2.4.3-py2.py3-none-any.whl (36 kB)
Installing collected packages: keras
Successfully installed keras-2.4.3
You should consider upgrading via the '/home/swethapola/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [4]:
import nltk
from nltk.corpus import brown
from nltk.data import find
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences


In [5]:
df = pd.read_csv('data/NN_Data.csv').drop(columns=['Unnamed: 0'])
df["no_punct"] = df['text'].str.replace('[^\w\s]','')
df['str_list'] = df.no_punct.apply(lambda s: s.split(' '))

In [6]:
df.head()

Unnamed: 0,text,outcome,no_punct,str_list
0,My dad had a heart attack and spent over 7 min...,1,My dad had a heart attack and spent over 7 min...,"[My, dad, had, a, heart, attack, and, spent, o..."
1,I had dangerous open heart surgery due to mult...,1,I had dangerous open heart surgery due to mult...,"[I, had, dangerous, open, heart, surgery, due,..."
2,"In June of 2020, my Dad collapsed in our yard ...",1,In June of 2020 my Dad collapsed in our yard w...,"[In, June, of, 2020, my, Dad, collapsed, in, o..."
3,"after my twin aunts were born, my grandma beca...",1,after my twin aunts were born my grandma becam...,"[after, my, twin, aunts, were, born, my, grand..."
4,Not me but my mother is the most severe case I...,1,Not me but my mother is the most severe case I...,"[Not, me, but, my, mother, is, the, most, seve..."


In [7]:
w2v = Word2Vec(df['str_list'].tolist(), min_count=1, vector_size= 300, workers=3, window =3, sg = 1)

In [8]:
kv = w2v.wv

In [9]:
EMBEDDING_DIM = len(kv['heart'])      # we know... it's 300

# initialize embedding matrix and word-to-id map:
embedding_matrix = kv.vectors
vocab_dict = kv.key_to_index

In [10]:
embedding_matrix.shape

(20621, 300)

In [11]:
list(vocab_dict.items())[:7]

[('I', 0), ('to', 1), ('and', 2), ('the', 3), ('a', 4), ('of', 5), ('my', 6)]

In [12]:
def sents_to_ids(sentences):
    """
    converting a list of strings to a list of lists of word ids
    """
    text_ids = [vocab_dict[word] for word in sentences]
    return np.asarray(text_ids).astype(np.int64)

In [13]:
df["id_list"] = df["str_list"].apply(sents_to_ids)
# df["outcome_2d"] = df["outcome"].apply(lambda x: np.asarray([x]).astype(np.int64))
X = np.array(df["id_list"])
# y = np.array(df["outcome_2d"])
y = np.array(df['outcome'].tolist()).astype('float32')

In [14]:
df[["str_list", "id_list"]].head()

Unnamed: 0,str_list,id_list
0,"[My, dad, had, a, heart, attack, and, spent, o...","[75, 149, 24, 4, 52, 107, 2, 642, 97, 725, 443..."
1,"[I, had, dangerous, open, heart, surgery, due,...","[0, 24, 2967, 374, 52, 440, 447, 1, 746, 3114,..."
2,"[In, June, of, 2020, my, Dad, collapsed, in, o...","[372, 3523, 5, 9391, 6, 801, 2421, 9, 160, 439..."
3,"[after, my, twin, aunts, were, born, my, grand...","[104, 6, 6215, 2833, 74, 1373, 6, 691, 656, 36..."
4,"[Not, me, but, my, mother, is, the, most, seve...","[382, 16, 18, 6, 274, 14, 3, 138, 842, 568, 0,..."


In [15]:
X = pad_sequences(X).astype('float32') # zero pre-padding

In [16]:
MAX_SEQUENCE_LENGTH = X.shape[1]  # Keras' embedding layer expects a specific input length. Padding is often needed here.

embedding_layer = Embedding(embedding_matrix.shape[0],
                            embedding_matrix.shape[1],
                            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

In [18]:
try:
    del tf_model
except:
    pass

In [19]:
tf_model = tf.keras.Sequential()

tf_model.add(embedding_layer)                                        # embedding layer

tf_model.add(tf.keras.layers.Conv1D(
    filters=10, 
    kernel_size=3, 
    strides=1, 
    padding='same', 
    activation='relu', 
    use_bias=True,
    kernel_initializer='glorot_uniform', 
    bias_initializer='zeros')
            )    

tf_model.add(tf.keras.layers.GlobalMaxPooling1D())


tf_model.add(Dense(100, activation='relu'))                          # hidden layer
tf_model.add(Dense(1, activation='sigmoid'))                         # classification layer

In [20]:
tf_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 2966, 300)         6186300   
_________________________________________________________________
conv1d (Conv1D)              (None, 2966, 10)          9010      
_________________________________________________________________
global_max_pooling1d (Global (None, 10)                0         
_________________________________________________________________
dense (Dense)                (None, 100)               1100      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 6,196,511
Trainable params: 10,211
Non-trainable params: 6,186,300
_________________________________________________________________


In [23]:
tf_model.compile(optimizer='adam', loss='binary_crossentropy', metrics="accuracy")


In [24]:
tf_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=25, verbose=1)


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25

KeyboardInterrupt: 

In [None]:
[print(i.shape, i.dtype) for i in tf_model.inputs]
[print(o.shape, o.dtype) for o in tf_model.outputs]
[print(l.name, l.input_shape, l.dtype) for l in tf_model.layers]

In [None]:
type(X_train[0])

In [None]:
y_train.shape

In [None]:
type(y_train)