## Required Packages

In [1]:
import pickle
import numpy as np

## Downloading the data

In [None]:
%cd data/

In [None]:
!curl -L -J -O "https://is.gd/tsdlIV" -L -J -O "https://is.gd/C3lSUl" -L -J -O "https://is.gd/UVXwPO"

In [None]:
%cd ..

## Loading the Data

In [2]:
# NOTE(review): pickle.load can execute arbitrary code while deserializing.
# Only open 'data/all.pkl' if the archive comes from a source you trust.
with open('data/all.pkl', 'rb') as f:
    # The pickled object is unpacked positionally into nine names, so it
    # must be a sequence of exactly nine items in this order.
    (ids, speakers, all_labels, embeddings, audio, visual,
     raw_sentences, trains_index, tests_index) = pickle.load(f)

### Sample Code

In [3]:
# How many videos are there?
len(ids)

3281

In [4]:
# How many video segments are there?
np.sum([len(segments) for segments in ids.values()])

22029

In [5]:
list(ids.keys())[:5]

['--qXJuDtHPw', '-3g5yACwYnA', '-3nNcZdcdvU', '-571d8cVauQ', '-6rXp3zJ3kc']

In [6]:
ids['-3g5yACwYnA']

['-3g5yACwYnA[0]',
 '-3g5yACwYnA[1]',
 '-3g5yACwYnA[2]',
 '-3g5yACwYnA[3]',
 '-3g5yACwYnA[4]',
 '-3g5yACwYnA[5]']

In [7]:
raw_sentences['-3g5yACwYnA'][3]

"operations key sp polymer sp brings a sp technical aspect to sp our sp operation that we don't have sp internally sp we're"

Thus we can generate the word list from a specific sentence:

In [8]:
print((raw_sentences['-3g5yACwYnA'][3]).split())

['operations', 'key', 'sp', 'polymer', 'sp', 'brings', 'a', 'sp', 'technical', 'aspect', 'to', 'sp', 'our', 'sp', 'operation', 'that', 'we', "don't", 'have', 'sp', 'internally', 'sp', "we're"]


## Modified Embeddings

In [9]:
# Load the word -> embedding-vector lookup table from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only open this file if it comes from a trusted source.
with open('data/embedding_dict.pkl', 'rb') as f:
    embedding_dict = pickle.load(f)

In [10]:
embedding_dict['word'][:10], embedding_dict['word'].shape

(array([-0.014424 , -0.0060105, -0.23573  ,  0.055827 ,  0.16621  ,
         0.08586  ,  0.056778 , -0.12082  , -0.2068   ,  1.8227   ]),
 (300,))

Each vector has 300 components, so we can read the embedding dimension directly from the dictionary:

In [11]:
# Take the embedding dimension from the first stored vector rather than
# hard-coding it. next(iter(...)) peeks at a single value without first
# copying a reference to every vector into a temporary list.
# Assumes every vector in embedding_dict has the same length -- TODO confirm.
dim_embeddings = len(next(iter(embedding_dict.values())))

In [12]:
raw_sentences['-3g5yACwYnA'][5]

'solutions sp we have many new opportunities through sp the way things have changed through the years sp looking for new niche high end value added products sp'

In [13]:
all_words = (raw_sentences['-3g5yACwYnA'][5]).split()

In [14]:
# Rebuild one sentence embedding as the mean of its known word vectors.
missed_word_counts = 0
word_counts = len(all_words)
new_embedding = np.zeros(dim_embeddings)

# Accumulate strictly left to right, one word at a time: the result is
# checked for exact equality against the stored embedding, so the floating
# point summation order must not change.
for word in all_words:
    try:
        new_embedding += embedding_dict[word]
    except KeyError:
        # No vector for this word: leave it out of the mean. (A substitute
        # token could be looked up here instead of skipping.)
        missed_word_counts += 1

# Divide by the number of words that actually contributed. When every word
# was missing the vector stays all zeros and the division is skipped.
if missed_word_counts < word_counts:
    new_embedding /= (word_counts - missed_word_counts)


In [15]:
missed_word_counts

0

In [16]:
np.array_equal(new_embedding, embeddings['-3g5yACwYnA'][5])

True

## Sample: generating sentiment data

In [17]:
# Load the lookup that maps a label name (e.g. 'sentiment') to the position
# of that value inside each label entry.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only open this file if it comes from a trusted source.
with open('data/labels_index.dict', 'rb') as f:
    index_of = pickle.load(f)

In [18]:
def get_emotion_label_dict(name, labels=None, index_of=None):
    '''
    Extract a single emotion/sentiment track from the full labels dict.

    Parameters
    ----------
    name : key into ``index_of`` naming the track to extract,
        e.g. ``'sentiment'``.
    labels : mapping of key -> list of label entries. When omitted, the
        notebook-level ``all_labels`` is used, looked up at call time.
    index_of : mapping of track name -> position inside a label entry.
        When omitted, the notebook-level ``index_of`` is used, looked up
        at call time.

    Returns
    -------
    dict mapping each key of ``labels`` to the list of
    ``label[index_of[name]]`` values, in the original order. Keys whose
    label list is empty do not appear in the result.

    Raises
    ------
    KeyError if ``name`` is not a key of ``index_of``.
    '''
    # Resolve the defaults when the function is called, not when it is
    # defined: defaults bound at definition time would keep pointing at
    # stale objects after a data-loading cell is re-run.
    if labels is None:
        labels = all_labels
    if index_of is None:
        # The parameter shadows the notebook-level name, so fetch the
        # global explicitly.
        index_of = globals()['index_of']

    label_index = index_of[name]
    label_dict = {}

    for key, label_list in labels.items():
        for label in label_list:
            # setdefault creates the per-key list on first use, so no
            # exception handling is needed to tell "new key" from "seen key".
            label_dict.setdefault(key, []).append(label[label_index])

    return label_dict

In [19]:
sentiments = get_emotion_label_dict('sentiment')

Then we can export it to a valid `.pkl` file for our network.

In [20]:
def export_data(filename, labels):
    '''
    Pickle the dataset to ``data/<filename>.pkl`` with ``labels`` in the
    label slot.

    The file receives a 9-tuple in this order: ids, speakers, labels,
    embeddings, audio, visual, raw_sentences, trains_index, tests_index.
    Every item except ``labels`` is read from the notebook-level variable
    of the same name. An existing file at the target path is overwritten.
    '''
    target = ''.join(['data/', filename, '.pkl'])
    with open(target, 'wb') as out_file:
        pickle.dump(
            (ids, speakers, labels, embeddings, audio, visual,
             raw_sentences, trains_index, tests_index),
            out_file,
        )
    

In [21]:
export_data('categorical', sentiments)

Then we can train the network:

In [22]:
# ! python train_categorical.py --lr=1e-4 --l2=1e-5 --rec-dropout=0.1 --dropout=0.5 --batch-size=128 --epochs=20 --log_dir='logs/mosei_categorical'