In [1]:
import pandas as pd
import torch

from sentence_transformers import SentenceTransformer
from sentence_transformers import models
from tqdm import tqdm

In [2]:
model_path = 'tune_results/sentence-transformers'
word_embedding_model = models.Transformer(model_path, max_seq_length=64)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
model._modules["1"].pooling_mode_mean_tokens = False
model._modules["1"].pooling_mode_cls_token = True

In [3]:
train_df = pd.read_pickle('data/train.pkl')
val_df = pd.read_csv('data/multi_choice/val.csv')
test_df = pd.read_csv('data/multi_choice/test.csv')

train_df = train_df.loc[:, ['action', 'object', 'effect_sentence_list']]
train_df = train_df.explode('effect_sentence_list', ignore_index=True)
train_df.rename(columns={'effect_sentence_list': 'effect_sentence'}, inplace=True)

In [4]:
train_df.head()

Unnamed: 0,action,object,effect_sentence
0,arrange,chairs,objects are moved around in order
1,arrange,chairs,the objects are put in order
2,arrange,chairs,the objects are st in a specific order
3,arrange,chairs,objects are arranged in a specific fashion all...
4,arrange,chairs,the objects are now in a row


In [5]:
val_df.head()

Unnamed: 0,object,action,effect_sentence_1,effect_sentence_2,effect_sentence_3,effect_sentence_4,label
0,window,open,the object was left ajar,the object is made so it cannot be opened,outside brown layer of object is taken off lea...,the object is open and air is coming in,3
1,bottle,squeeze,a object is dropped into a dirty bucket,the object is now out of shape,a object is dropped into a dirty bucket,the doctor writes the word poison on a object ...,1
2,shirt,stain,the object is cut in two,the object has a mark on it,the object is hanging in the closet,the object is cut in two,1
3,knee,skin,the objects will be in a pile,the object will be scraped and the skin will b...,the objects will be in a pile,the objects will be in a pile,1
4,football,kick,the ball is being struck by a foot,the object is divided into pieces,the object is divided into pieces,the object is flying through the air,0


In [11]:
train_emb_df = pd.DataFrame(columns=['action', 'obj_emb', 'effect_emb'])
for i, row in tqdm(train_df.iterrows()):
    action = row['action']
    with torch.no_grad():
        obj_emb = model.encode(row['object'])
        effect_emb = model.encode(row['effect_sentence'])
    train_emb_df.loc[i] = [action, obj_emb, effect_emb]
    
val_emb_df = pd.DataFrame(columns=['action', 'obj_emb', 'effect_emb_1', 'effect_emb_2', 'effect_emb_3', 'effect_emb_4', 'label'])
for i, row in tqdm(val_df.iterrows()):
    action = row['action']
    label = row['label']
    with torch.no_grad():
        obj_emb = model.encode(row['object'])
        effect_emb_1 = model.encode(row['effect_sentence_1'])
        effect_emb_2 = model.encode(row['effect_sentence_2'])
        effect_emb_3 = model.encode(row['effect_sentence_3'])
        effect_emb_4 = model.encode(row['effect_sentence_4'])
    val_emb_df.loc[i] = [action, obj_emb, effect_emb_1, effect_emb_2, effect_emb_3, effect_emb_4, label]

test_emb_df = pd.DataFrame(columns=['action', 'obj_emb', 'effect_emb_1', 'effect_emb_2', 'effect_emb_3', 'effect_emb_4', 'label'])
for i, row in tqdm(test_df.iterrows()):
    action = row['action']
    label = row['label']
    with torch.no_grad():
        obj_emb = model.encode(row['object'])
        effect_emb_1 = model.encode(row['effect_sentence_1'])
        effect_emb_2 = model.encode(row['effect_sentence_2'])
        effect_emb_3 = model.encode(row['effect_sentence_3'])
        effect_emb_4 = model.encode(row['effect_sentence_4'])
    test_emb_df.loc[i] = [action, obj_emb, effect_emb_1, effect_emb_2, effect_emb_3, effect_emb_4, label]

840it [00:16, 52.05it/s]
150it [00:07, 21.07it/s]
255it [00:11, 21.19it/s]

In [None]:
train_emb_df.to_pickle('data/train_emb.pkl')
test_emb_df.to_pickle('data/multi_choice/test_emb.pkl')
val_emb_df.to_pickle('data/multi_choice/val_emb.pkl')