In [39]:
from google.colab import drive

# Expose Google Drive inside the Colab VM. The CSV, checkpoint and output
# paths used by the later cells all sit under this mount point.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [40]:
import torch

# Select the compute device once; later cells move tensors to `device`.
if not torch.cuda.is_available():
    # No CUDA device visible: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # At least one CUDA device is visible: tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [41]:
# Use the %pip magic rather than !pip so the package is installed into the
# environment of the running kernel, not whatever `pip` the subshell resolves.
%pip install transformers



In [42]:
from transformers import BertTokenizer

# Name of the pretrained vocabulary the tokenizer is built from.
PRETRAINED_TOKENIZER_NAME = 'bert-base-uncased'

# Load the BERT tokenizer (input text is lower-cased before tokenization).
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(
    PRETRAINED_TOKENIZER_NAME,
    do_lower_case=True,
)

Loading BERT tokenizer...


In [43]:
import pandas as pd

# Directory on the mounted Drive that holds one extracted CSV per session.
SESSION_CSV_DIR = '/content/gdrive/My Drive/Multi_Model_Emotion_Detection/IEMoCap/Extracted CSVs'

# Read Session1.csv .. Session5.csv from that directory. The five frames keep
# their individual names because the later cells refer to them one by one.
(session_1_csv,
 session_2_csv,
 session_3_csv,
 session_4_csv,
 session_5_csv) = [
    pd.read_csv('%s/Session%d.csv' % (SESSION_CSV_DIR, session_number))
    for session_number in range(1, 6)
]

In [84]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification (BERT with a single linear classification
# layer on top) from a fine-tuned checkpoint directory stored on Drive.
model = BertForSequenceClassification.from_pretrained(
    "/content/gdrive/My Drive/Multi_Model_Emotion_Detection/BERT Models/DailyDialog+MELD+IEMOCAP/4/epoch_3", # Checkpoint directory on the mounted Drive.
    num_labels = 5, # Number of output labels (five classes, cf. EMOTION_IDS).
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = True, # Whether the model returns all hidden-states;
                                 # get_embeddings reads them, so keep this on.
)

# Move the model to the device selected earlier (GPU when available, otherwise
# CPU). An unconditional model.cuda() fails on a CPU-only runtime.
model.to(device)

# The model is only used for inference here, so make sure dropout is disabled.
model.eval()

print('Model Ready')

Model Ready


In [85]:
def get_embeddings(utterance):
  """Return the last-layer hidden state of the first token for one utterance.

  The utterance is tokenized with the module-level `tokenizer` (special tokens
  added), run through the module-level `model` as a batch of one on `device`,
  and the first position of the final hidden-state layer is returned.

  Args:
    utterance: Text to embed.

  Returns:
    A NumPy array holding the hidden-state vector of the first token (the
    [CLS] position for a BERT tokenizer with special tokens enabled).
  """
  input_ids = tokenizer.encode(utterance, add_special_tokens=True)
  # Add a leading batch dimension of size 1 and move the ids to the model's device.
  input_ids = torch.LongTensor(input_ids).unsqueeze(0).to(device)
  # Inference only: skip autograd bookkeeping so no graph is built (and kept
  # alive on the device) for every utterance.
  with torch.no_grad():
    outputs = model(input_ids)
  # Index instead of tuple-unpacking: position 1 is the hidden states both when
  # the model returns a plain (logits, hidden_states) tuple and when it returns
  # a dict-like output object, whose iteration would yield keys, not values.
  hidden_states = outputs[1]
  # Last layer -> first (only) item of the batch -> first token.
  embedding = hidden_states[-1][0][0]
  return embedding.cpu().detach().numpy()

In [86]:
# Add a 'Text Embeddings' column to each session frame, embedding every
# utterance with get_embeddings. Frames are processed in session order.
for session_df in (session_1_csv, session_2_csv, session_3_csv, session_4_csv, session_5_csv):
  session_df['Text Embeddings'] = session_df['Utterance'].apply(get_embeddings)

In [87]:
# Maps the integer emotion label to its name; the position in the list is the
# label id (0 through 4).
EMOTION_IDS = dict(enumerate([
    'anger/disgust',
    'fear/surprise',
    'happiness',
    'neutral',
    'sadness',
]))

In [88]:
def get_id(self):
  """Compose the identifier string for one dataset row.

  Args:
    self: A row-like object exposing the attributes Session_Number,
      Utterance_Number, Emotion_Label, Mocap_Source, Dialogue_Type,
      Dialogue_Number and Speaker.

  Returns:
    A string of the form
    "<emotion>-test-<SS><source>-<type>-<dialogue>-<UUU>-<speaker>", where
    <emotion> is looked up in EMOTION_IDS, <SS> is the session number
    left-padded with zeros to 2 characters and <UUU> is the utterance number
    left-padded with zeros to 3 characters.
  """
  # rjust only pads on the left and leaves longer values untouched.
  padded_session = str(self.Session_Number).rjust(2, '0')
  padded_utterance = str(self.Utterance_Number).rjust(3, '0')
  fields = (
      EMOTION_IDS[self.Emotion_Label],
      'test',
      # Session number and mocap source share one field (no separator).
      padded_session + str(self.Mocap_Source),
      self.Dialogue_Type,
      self.Dialogue_Number,
      padded_utterance,
      self.Speaker,
  )
  return '-'.join(str(field) for field in fields)

In [89]:
# Add an 'ID' column to each session frame by applying get_id row by row
# (axis=1 hands each row to the function).
for session_df in (session_1_csv, session_2_csv, session_3_csv, session_4_csv, session_5_csv):
  session_df['ID'] = session_df.apply(get_id, axis=1)

In [90]:
# Stack the five session frames, in session order, into one dataset frame.
# Each frame keeps its own row index (the index is not reset).
session_frames = [
    session_1_csv,
    session_2_csv,
    session_3_csv,
    session_4_csv,
    session_5_csv,
]
dataset_df = pd.concat(session_frames)

In [91]:
import pickle

# Destination of the pickled (ID, Text Embeddings) table on the mounted Drive.
EMBEDDINGS_PKL_PATH = '/content/gdrive/My Drive/Multi_Model_Emotion_Detection/Embeddings/IEMOCAP Embeddings/DailyDialog+MELD+IEMOCAP/4/text_embeddings.pkl'

# Write through a context manager so the file handle is flushed and closed
# as soon as the dump finishes, instead of being left open.
with open(EMBEDDINGS_PKL_PATH, 'wb') as embeddings_file:
  pickle.dump(dataset_df[['ID', 'Text Embeddings']], embeddings_file)