In [18]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import plotly.graph_objects as go
import networkx as nx

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
nltk.download('stopwords')
from keras.preprocessing.text import Tokenizer
from gensim.models import KeyedVectors
from keras.layers import Embedding
from keras.models import Sequential
import tensorflow as tf

from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.models import Model
import cv2
import os

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

from spektral.data.loaders import SingleLoader, Loader
from spektral.datasets.citation import Citation
from spektral.layers import GCNConv
from spektral.models.gcn import GCN
from spektral.transforms import LayerPreprocess
from sklearn.preprocessing import OneHotEncoder


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gsevr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv('dev_sent_emo.csv',encoding='utf-8')
df['Utterance'] = df.Utterance.str.replace('Â’',"'")
df['gender'] = df.Speaker.apply(lambda x: 'male' if x == 'Ross' or x == 'Joey' or x == 'Chandler' else 'female')
df = df.sample(frac=0.03,ignore_index=True)
df

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime,gender
0,250,I'm not reading this!,Rachel,disgust,negative,20,6,5,10,"00:21:58,317","00:22:00,109",female
1,1001,That would be no.,Ross,disgust,negative,94,2,2,2,"00:13:21,717","00:13:23,551",male
2,924,I heard it from my friend Irene who heard it f...,Rachel,neutral,neutral,85,13,8,4,"00:19:03,433","00:19:06,811",female
3,840,God. I forgot how much I love driving. I have,Rachel,joy,positive,78,0,7,22,"00:09:32,530","00:09:35,490",female
4,602,Hey Mon!,Rachel,joy,positive,57,5,3,1,"0:01:38,807","0:01:39,754",female
5,560,"3, 2,",Joey,neutral,neutral,53,6,6,10,"00:20:15,088","00:20:17,024",male
6,1089,What?,Monica,surprise,positive,104,8,5,8,"00:21:06,431","00:21:07,765",female
7,904,"He said, ""Nice to meet you Glenda."" Well, obv...",Phoebe,neutral,neutral,84,2,5,13,"00:09:41,539","00:09:48,128",female
8,111,I guess so.,Phoebe,neutral,neutral,11,0,5,19,"00:05:29,287","00:05:30,412",female
9,725,I'm okay. I gotta go down to the police statio...,Tag,sadness,negative,69,1,7,8,"00:19:40,888","00:19:45,099",female


In [3]:
def preprocess(text):
    t = text.lower()
    t = re.sub('\d+',r'',t)
    t = re.sub(r'\W+',r' ',t)
    return t

lemmatizer = WordNetLemmatizer()



df['prepro'] = [' '.join([lemmatizer.lemmatize(preprocess(txt))])
                 .strip() for txt in df['Utterance']]


texts = df.prepro.values
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=30, padding='post')

In [4]:
glove_path = 'glove.6B.300d.txt'
word_vectors = KeyedVectors.load_word2vec_format(glove_path, binary=False, encoding='utf8',no_header=True)



In [5]:
embedding_dim = 300  
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))

for word, i in tokenizer.word_index.items():
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]

embedding_layer = Embedding(
    input_dim=len(tokenizer.word_index) + 1,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=30,
    trainable=False
)


model = Sequential()
model.add(embedding_layer)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics='accuracy')


In [6]:
embedding_vectors = model.predict(padded_sequences)
embedding_vectors.shape



(33, 30, 300)

In [7]:
base_model = InceptionV3(weights='imagenet', include_top=False)

model = Model(inputs=base_model.input, outputs=base_model.layers[-2].output)

def get_video_embeddings(video_path):
    cap = cv2.VideoCapture(video_path)
    frame_embeddings = []

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        frame = cv2.resize(frame, (299, 299))
        x = np.expand_dims(frame, axis=0)

        x = preprocess_input(x)

        frame_embedding = model.predict(x)

        frame_embeddings.append(frame_embedding)

    cap.release()
    return frame_embeddings


In [8]:
video_embeddings_list = []
folder_path = 'dev_splits_complete/'

for idx,row in df.iterrows():
    file_name = 'dia' + str(row['Dialogue_ID']) + '_utt' + str(row['Utterance_ID']) + '.mp4'
    video_path = folder_path + file_name
    if os.path.isfile(video_path):
        video_embeddings_list.append(get_video_embeddings(video_path))
    else:
        print(f'File name {file_name} does not exist')

        



In [10]:
df.Emotion.unique()

array(['disgust', 'neutral', 'joy', 'surprise', 'sadness', 'anger',
       'fear'], dtype=object)

In [16]:
ohe_emotions = OneHotEncoder(categories=[['disgust', 'neutral', 'joy', 'surprise', 'sadness', 'anger','fear']])
labels_emotions = ohe_emotions.fit_transform(df.Emotion.to_numpy().reshape(-1,1)).toarray()

In [15]:
ohe_sentiment = OneHotEncoder(categories=[['negative', 'neutral', 'positive']])
labels_sentiment = ohe_sentiment.fit_transform(df.Sentiment.to_numpy().reshape(-1,1)).toarray()

In [50]:
text_embs = {}

for i, array in enumerate(embedding_vectors):
    text_embs[i] = array

In [54]:
G = nx.Graph()

# for text, video in zip(embedding_vectors,video_embeddings_list):
#     G.add_node([text,video])

for i, array in text_embs.items():
    G.add_node([array])

TypeError: unhashable type: 'list'

In [56]:
G.info()

AttributeError: 'Graph' object has no attribute 'info'