In [10]:
import numpy as np
import pandas as pd
import tensorflow as tf
import imageParse as imgP
from audioSent_model import VAD_audio
import audioSent_train as as_train
import imageSent as imgS

In [11]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Lookup table of (valence, arousal, dominance) coordinates, one row per mood.
# Row order matters: row i of `VAD_map` is the mood at position i of the
# `VAD_pd` index below. All values lie in [0, 1].
VAD_map = np.array(
    [
        # valence, arousal, dominance
        [1.000, 0.735, 0.772],  # row 0 -> 'happy'
        [0.918, 0.610, 0.566],  # row 1 -> 'funny'
        [0.225, 0.333, 0.149],  # row 2 -> 'sad'
        [0.630, 0.520, 0.509],  # row 3 -> 'tender'
        [0.950, 0.792, 0.789],  # row 4 -> 'exciting'
        [0.122, 0.830, 0.604],  # row 5 -> 'angry'
        [0.062, 0.952, 0.528],  # row 6 -> 'scary'
    ]
)

# Same table as a DataFrame so rows can be selected by mood name
# (e.g. `VAD_pd.loc["sad"]`) and columns by VAD component.
VAD_pd = pd.DataFrame(
    data=VAD_map,
    index=["happy", "funny", "sad", "tender", "exciting", "angry", "scary"],
    columns=["valence", "arousal", "dominance"],
)

### Audio Sentiment Model

In [13]:
# Load the audio data from ../data (paths are relative to the notebook's
# working directory). Each split consists of a parquet "contexts" table and a
# .npy embeddings array; the "unbal" files are used as the training split and
# the "bal" files as the test split.
# The path literals are plain strings: they contain no placeholders, so an
# f-string prefix would be a no-op.
train_context_pd = pd.read_parquet("../data/unbal_music_contexts.parquet")
train_embeddings = np.load("../data/unbal_music_embeddings.npy")
test_context_pd = pd.read_parquet("../data/bal_music_contexts.parquet")
test_embeddings = np.load("../data/bal_music_embeddings.npy")

In [14]:
# Turn the loaded arrays/tables into model-ready inputs and targets.
#
# Inputs: embeddings are reshaped to (N, 10, 128, 1), divided by 128.0 and
# cast to float32.
# NOTE(review): the 128.0 divisor presumably rescales quantised embedding
# values -- confirm against how the embeddings were produced.
# Targets: the valence/arousal/dominance columns of the context table, float32.
# `seg_label_*` keep the raw `mood` column as a NumPy array.

# --- training split ---
audio_train = tf.cast(
    train_embeddings.reshape((-1, 10, 128, 1)) / 128.0,
    tf.float32,
)
label_train = tf.cast(
    train_context_pd.loc[:, ["valence", "arousal", "dominance"]].to_numpy(),
    tf.float32,
)

# --- test split ---
audio_test = tf.cast(
    test_embeddings.reshape((-1, 10, 128, 1)) / 128.0,
    tf.float32,
)
label_test = tf.cast(
    test_context_pd.loc[:, ["valence", "arousal", "dominance"]].to_numpy(),
    tf.float32,
)

# --- raw mood labels for both splits ---
seg_label_train = train_context_pd.loc[:, "mood"].to_numpy()
seg_label_test = test_context_pd.loc[:, "mood"].to_numpy()

In [15]:
# Build a fresh audio model and train it on the training tensors.
# Training is delegated entirely to `as_train.train_full`; the saved output of
# this cell shows one `train_loss` line per epoch.
model = VAD_audio()
numEpoch = 4  # number of epochs handed to train_full

as_train.train_full(
    model,
    numEpoch,
    audio_train,
    label_train,
)

Epoch: 1
train_loss = 0.13785018026828766 

Epoch: 2
train_loss = 0.1010199636220932 

Epoch: 3
train_loss = 0.09076208621263504 

Epoch: 4
train_loss = 0.07875846326351166 



### Image Sentiment Model

In [17]:
# Parse the "Abs" image dataset into two (X, Y) pairs, then run both image
# arrays through the shared input-preparation function. Labels are untouched.
# NOTE(review): the 0/1 suffixes presumably mean train / held-out -- only the
# `*0` pair is passed to `fit` in this notebook; confirm in imageParse.
abs_X0, abs_Y0, abs_X1, abs_Y1 = imgP.parseData_Abs()
abs_X0, abs_X1 = imgP.input_prep_fn(abs_X0), imgP.input_prep_fn(abs_X1)

In [18]:
# Parse the "Art" image dataset into two (X, Y) pairs, then run both image
# arrays through the shared input-preparation function. Labels are untouched.
# NOTE(review): the 0/1 suffixes presumably mean train / held-out -- only the
# `*0` pair is passed to `fit` in this notebook; confirm in imageParse.
art_X0, art_Y0, art_X1, art_Y1 = imgP.parseData_Art()
art_X0, art_X1 = imgP.input_prep_fn(art_X0), imgP.input_prep_fn(art_X1)

In [19]:
# Map each image's mood label to its (valence, arousal, dominance) row.
# `abs_Y1_VAD` is not used in this cell.
abs_Y0_VAD = VAD_pd.loc[abs_Y0]
abs_Y1_VAD = VAD_pd.loc[abs_Y1]

# Image sentiment model for the "Abs" dataset. `compile` receives the VAD
# lookup table in addition to the optimizer (Adam, learning rate 5e-3).
abs_model = imgS.ImageSentModel(name='abs')
abs_model.compile(
    optimizer=tf.keras.optimizers.Adam(5e-3),
    VAD_map=VAD_map,
)

# NOTE(review): the VAD targets are passed both inside the input tuple and as
# the target argument -- presumably ImageSentModel consumes them during
# training; confirm in imageSent.
# The trailing `;` suppresses the repr of fit's return value.
abs_model.fit(
    (abs_X0, abs_Y0_VAD),
    abs_Y0_VAD,
    epochs=5,
    batch_size=8,
);

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
# Map each image's mood label to its (valence, arousal, dominance) row.
# `art_Y1_VAD` is not used in this cell.
art_Y0_VAD = VAD_pd.loc[art_Y0]
art_Y1_VAD = VAD_pd.loc[art_Y1]

# Image sentiment model for the "Art" dataset. `compile` receives the VAD
# lookup table in addition to the optimizer (Adam, learning rate 5e-3).
art_model = imgS.ImageSentModel(name='art')
art_model.compile(
    optimizer=tf.keras.optimizers.Adam(5e-3),
    VAD_map=VAD_map,
)

# NOTE(review): the VAD targets are passed both inside the input tuple and as
# the target argument -- presumably ImageSentModel consumes them during
# training; confirm in imageSent.
# The trailing `;` suppresses the repr of fit's return value.
art_model.fit(
    (art_X0, art_Y0_VAD),
    art_Y0_VAD,
    epochs=5,
    batch_size=8,
);

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Matching Images to Audio

In [21]:
# Sanity check: dump the `mood` column of the test context table.
# The saved output shows integer codes 276..282 (seven distinct values).
# NOTE(review): these are integer codes, while `VAD_pd` is indexed by mood
# names -- presumably code 276 + i corresponds to row i of `VAD_map`; confirm
# before using these labels to index either table.
# NOTE(review): this prints the full array; a summary such as
# `np.unique(seg_label_test, return_counts=True)` would be more compact.
print(seg_label_test)

[276 276 276 276 276 276 276 276 276 276 276 276 276 276 276 276 276 276
 276 276 276 276 276 277 277 277 277 277 277 277 277 277 277 277 277 277
 277 277 277 277 277 277 277 277 277 277 277 277 277 278 278 278 278 278
 278 278 278 278 278 278 278 278 278 278 278 278 278 278 278 278 278 278
 278 278 278 278 278 278 278 278 278 278 278 278 278 278 278 278 278 278
 278 278 278 278 278 279 279 279 279 279 279 279 279 279 279 279 279 279
 279 279 279 279 279 279 279 279 279 279 279 279 279 280 280 280 280 280
 280 280 280 280 280 280 280 280 280 280 280 280 280 280 280 280 281 281
 281 281 281 281 281 281 281 281 281 281 281 281 281 281 281 281 281 281
 281 281 282 282 282 282 282 282 282 282 282 282 282 282 282 282 282 282
 282 282 282 282]
