In [144]:
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [145]:
import numpy as np
import pandas as pd
import gc
import time

# Acquire transcript data

In [146]:
from data import preprocess_transcript
(X,y),vocab = preprocess_transcript()
X.shape,y.shape

((11936, 4, 15), (11936,))

In [147]:
from sklearn.model_selection import train_test_split
X0, X1, Y0, Y1 = train_test_split(X, y, test_size=0.15,random_state=1)
X0.shape, Y0.shape, X1.shape, Y1.shape

((10145, 4, 15), (10145,), (1791, 4, 15), (1791,))

# Benchmark RNN Model

In [148]:
from rnn import RNN,train_RNN
%aimport rnn
model = RNN(len(vocab))
model.build(X0.shape)
model.summary()

Model: "rnn_25"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_39 (Embedding)    multiple                  320000    
                                                                 
 flatten_126 (Flatten)       multiple                  0         
                                                                 
 lstm_25 (LSTM)              multiple                  98816     
                                                                 
 dense_91 (Dense)            multiple                  645000    
                                                                 
 dense_92 (Dense)            multiple                  10002     
                                                                 
Total params: 1,073,818
Trainable params: 1,073,818
Non-trainable params: 0
_________________________________________________________________


In [149]:
rnn_train_loss,rnn_train_acc =[],[]
rnn_val_loss,rnn_val_acc =[],[]

for i in range(5):
    model = RNN(len(vocab))
    history = train_RNN(model, X0, Y0, X1, Y1)
    rnn_train_loss.append(min(history.history['loss']))
    rnn_train_acc.append(max(history.history['sparse_categorical_accuracy']))
    rnn_val_loss.append(min(history.history['val_loss']))
    rnn_val_acc.append(max(history.history['val_sparse_categorical_accuracy']))
    if i == 4: model.save('../models/rnn')
    del model
    gc.collect()
    time.sleep(3)

In [150]:
np.mean(rnn_train_loss),np.mean(rnn_train_acc)

(0.23932151794433593, 0.895633316040039)

In [151]:
np.mean(rnn_val_loss),np.mean(rnn_val_acc)

(0.6593041896820069, 0.6135120034217835)

# Baseline Transformer Model

In [152]:
from transformer_encoder import TransformerEncoder,train_TransformerEncoder
%aimport transformer_encoder
embedding_size = 64
window = 15
model = TransformerEncoder(len(vocab),embedding_size,window)
model.build(X0.shape)
model.summary()

Model: "transformer_encoder_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_45 (Embedding)    multiple                  320000    
                                                                 
 transformer_block_rank_four  multiple                 16576     
 _26 (TransformerBlockRankFo                                     
 ur)                                                             
                                                                 
 dense_104 (Dense)           multiple                  7682      
                                                                 
Total params: 344,258
Trainable params: 344,258
Non-trainable params: 0
_________________________________________________________________


In [153]:
transformer_train_loss,transformer_train_acc =[],[]
transformer_val_loss,transformer_val_acc =[],[]
for i in range(5):
    model = TransformerEncoder(len(vocab),embedding_size,window)
    history = train_TransformerEncoder(model, X0, Y0, X1, Y1)
    transformer_train_loss.append(min(history.history['loss']))
    transformer_train_acc.append(max(history.history['sparse_categorical_accuracy']))
    transformer_val_loss.append(min(history.history['val_loss']))
    transformer_val_acc.append(max(history.history['val_sparse_categorical_accuracy']))
    if i == 4: model.save('../models/transformer')
    del model
    gc.collect()
    time.sleep(3)

In [154]:
np.mean(transformer_train_loss),np.mean(transformer_train_acc)

(0.2727239906787872, 0.8881419539451599)

In [155]:
np.mean(transformer_val_loss),np.mean(transformer_val_acc)

(0.718769109249115, 0.5834729194641113)

# Augmented model Transcript w. statement

In [156]:
from data import preprocess_transcript_statement
%aimport train

In [157]:
X, y, vocab = preprocess_transcript_statement()

In [158]:
X0, X1, Y0, Y1 = train_test_split(X, y, test_size=0.15,random_state=1)
X0.shape, Y0.shape, X1.shape, Y1.shape

((10145, 804, 15), (10145,), (1791, 804, 15), (1791,))

In [159]:
from transformer_encoder import TransformerEncoderStatement,train_TransformerEncoderStatement
%aimport transformer_encoder
embedding_size = 64
window = 15
model = TransformerEncoderStatement(len(vocab),embedding_size,window)
model.build(X0.shape)
model.summary()

Model: "transformer_encoder_statement_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_51 (Embedding)    multiple                  368576    
                                                                 
 transformer_block_rank_four  multiple                 16576     
 _32 (TransformerBlockRankFo                                     
 ur)                                                             
                                                                 
 transformer_block_rank_four  multiple                 16576     
 _33 (TransformerBlockRankFo                                     
 ur)                                                             
                                                                 
 dense_117 (Dense)           multiple                  1543682   
                                                                 
Total params: 1,945,410
Trainable 

In [160]:
transformer_s_train_loss,transformer_s_train_acc =[],[]
transformer_s_val_loss,transformer_s_val_acc =[],[]
for i in range(5):
    model = TransformerEncoderStatement(len(vocab),embedding_size,window)
    history = train_TransformerEncoderStatement(model, X0, Y0, X1, Y1)
    transformer_s_train_loss.append(min(history.history['loss']))
    transformer_s_train_acc.append(max(history.history['sparse_categorical_accuracy']))
    transformer_s_val_loss.append(min(history.history['val_loss']))
    transformer_s_val_acc.append(max(history.history['val_sparse_categorical_accuracy']))
    if i == 4: model.save('../models/transformer_statement')
    del model
    gc.collect()
    time.sleep(3)

In [161]:
np.mean(transformer_s_train_loss),np.mean(transformer_s_train_acc)

(1.8431342601776124, 0.6859930992126465)

In [162]:
np.mean(transformer_s_val_loss),np.mean(transformer_s_val_acc)

(1.4995854139328002, 0.7134561777114868)

# Augmented model Transcript w. statement & tone

In [163]:
X, y, vocab = preprocess_transcript_statement(add_tone=True)

In [164]:
X0, X1, Y0, Y1 = train_test_split(X, y, test_size=0.15,random_state=1)
X0.shape, Y0.shape, X1.shape, Y1.shape

((10145, 822, 15), (10145,), (1791, 822, 15), (1791,))

In [165]:
from transformer_encoder import TransformerEncoderStatementTone,train_TransformerEncoderStatementTone
%aimport transformer_encoder
embedding_size = 64
window = 15
model = TransformerEncoderStatementTone(len(vocab),embedding_size,window)
model.build(X0.shape)
model.summary()

Model: "transformer_encoder_statement_tone_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_57 (Embedding)    multiple                  368576    
                                                                 
 transformer_block_rank_four  multiple                 16576     
 _44 (TransformerBlockRankFo                                     
 ur)                                                             
                                                                 
 transformer_block_rank_four  multiple                 16576     
 _45 (TransformerBlockRankFo                                     
 ur)                                                             
                                                                 
 transformer_block_rank_thre  multiple                 945       
 e_1 (TransformerBlockRankTh                                     
 ree)                         

In [166]:
transformer_st_train_loss,transformer_st_train_acc =[],[]
transformer_st_val_loss,transformer_st_val_acc =[],[]
for i in range(5):
    model = TransformerEncoderStatement(len(vocab),embedding_size,window)
    history = train_TransformerEncoderStatement(model, X0, Y0, X1, Y1)
    transformer_st_train_loss.append(min(history.history['loss']))
    transformer_st_train_acc.append(max(history.history['sparse_categorical_accuracy']))
    transformer_st_val_loss.append(min(history.history['val_loss']))
    transformer_st_val_acc.append(max(history.history['val_sparse_categorical_accuracy']))
    if i == 0: 
        model.save('../models/transformer_statement_tone')
        break
    del model
    gc.collect()
    time.sleep(3)

In [167]:
np.mean(transformer_st_train_loss),np.mean(transformer_st_train_acc)

(1.8523714542388916, 0.6858550906181335)

In [168]:
np.mean(transformer_st_loss),np.mean(transformer_st_acc)

(1.0373454093933105, 0.7185929417610168)

# Policy words Analysis

In [169]:
tone = pd.read_csv('../input/processed/tones.csv')

In [170]:
X, y, vocabulary, statement_dict = preprocess_transcript_statement(add_tone=True,return_statement_dict=True)

In [171]:
import datetime

In [172]:
words = ['improv', 'foster', 'increas', 'moder', 'slow', 'weak', 'condit', 'anticip', 'believ']
tone_05_04_22 = np.array(tone[tone['date']=='2022-05-04'].agg({'sad': 'mean', 'angry': 'mean', 'neutral': 'mean', 'happy': 'mean', 'disgust': 'mean', 'fearful': 'mean'}))
statement_05_04_22 = statement_dict[datetime.datetime(2022, 5, 4)]

In [173]:
# 4620 is index for [QUIET]
for w in words:
    word_test = np.array( [4620 for i in range(59)]+[vocabulary[w]])
    np.random.shuffle(word_test)
    word_test = word_test.reshape(1, 4, 15)
    statement_test = np.repeat(statement_05_04_22, 15)
    statement_test = np.reshape(statement_test, (1, 800, 15))
    tone_test = np.transpose(np.tile(tone_05_04_22, (15,1,1)),[1,2,0])
    tone_test = np.repeat(tone_test, 3, axis=1)
    D_test = np.concatenate([word_test, statement_test, tone_test], axis=1)
    print(w)
    print(model(D_test))

improv
tf.Tensor([[0.80526406 0.19473594]], shape=(1, 2), dtype=float32)
foster
tf.Tensor([[0.81596005 0.18403992]], shape=(1, 2), dtype=float32)
increas
tf.Tensor([[0.81397444 0.18602556]], shape=(1, 2), dtype=float32)
moder
tf.Tensor([[0.8111978  0.18880214]], shape=(1, 2), dtype=float32)
slow
tf.Tensor([[0.8099522  0.19004774]], shape=(1, 2), dtype=float32)
weak
tf.Tensor([[0.8053697  0.19463035]], shape=(1, 2), dtype=float32)
condit
tf.Tensor([[0.80888265 0.19111733]], shape=(1, 2), dtype=float32)
anticip
tf.Tensor([[0.80831516 0.19168483]], shape=(1, 2), dtype=float32)
believ
tf.Tensor([[0.7987827  0.20121737]], shape=(1, 2), dtype=float32)
