In [9]:
import torch
import pandas as pd
import numpy as np
import os


from judgenet.modules.preprocess import SentenceEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Root directory of the MIT interview dataset, relative to this notebook.
folder_path = "../../data/mit_interview"

# Read the three raw tables: turker ratings, transcripts, and prosodic features.
scores = pd.read_csv(f"{folder_path}/turker_scores_full_interview.csv")
transcripts = pd.read_csv(f"{folder_path}/interview_transcripts_by_turkers.csv")
prosody_features = pd.read_csv(f"{folder_path}/prosodic_features.csv")

# Print (rows, columns) of each table, in load order, as a quick sanity check.
for table in (scores, transcripts, prosody_features):
    print(table.shape)

(1378, 21)
(138, 2)
(690, 59)


# Extract aggregated scores

In [3]:
# Reduce scores to only the Overall score aggregated across turkers.
# `scores` is overwritten in place, so after the first run the "Worker" column
# is gone; the guard keeps the cell re-runnable instead of raising a KeyError.
if "Worker" in scores.columns:
    # One .loc selection (row mask + column list) instead of chained indexing.
    scores = scores.loc[scores["Worker"] == "AGGR", ["Participant", "Overall"]]
print(scores)

     Participant   Overall
9             p1  5.297316
19            p3  4.414892
29            p4  4.494494
39            p5  5.457670
49            p6  5.106512
...          ...       ...
1338        pp83  6.045748
1348        pp84  5.710073
1358        pp85  5.626074
1368        pp86  4.853881
1377        pp89  4.960084

[138 rows x 2 columns]


In [4]:
# Turn the "Overall" column into a 1-D tensor (one entry per row of `scores`).
overall_scores = scores["Overall"].values
scores_tensor = torch.tensor(overall_scores)
scores_tensor.shape

torch.Size([138])

# Aggregate VGGish features

In [14]:
# Directory of VGGish feature files, named audio_P<i>.pt / audio_PP<i>.pt.
root = "../../data/mit_interview/features/vggish/"
filenames = os.listdir(root)

# Candidate names in stacking order: all "P" ids first, then all "PP" ids,
# each in ascending numeric order.
# NOTE(review): downstream code presumably pairs these rows with `scores` by
# position — confirm this ordering matches the participant order there.
candidates = [f"audio_P{i}.pt" for i in range(90)]
candidates += [f"audio_PP{i}.pt" for i in range(90)]

# Keep only the candidates present in the directory, preserving that order.
filenames_ordered = [file for file in candidates if file in filenames]

# Load every file, stack along a new leading dimension, and save the result.
feats = [torch.load(f"{root}/{name}") for name in filenames_ordered]
audio_tensor = torch.stack(feats, dim=0)
torch.save(audio_tensor, f"{folder_path}/features/audio.pt")

torch.Size([128])

# Aggregate BERT features

In [55]:
# Directory containing one saved tensor per row index (0.pt ... 137.pt).
root = "../../data/mit_interview/features/lexical_batched"
filenames_ordered = []

# Load the files in ascending index order so row i of the result is file i.
per_row_files = [f"{i}.pt" for i in range(138)]
feats = [torch.load(f"{root}/{file}") for file in per_row_files]

# Stack along a new leading dimension and write the combined tensor to disk.
lexical_tensor = torch.stack(feats, dim=0)
torch.save(lexical_tensor, f"{folder_path}/features/lexical.pt")

In [56]:
# Display the dimensions of the stacked lexical tensor.
lexical_tensor.shape

torch.Size([138, 768])

# Extract and pool prosody features (deprecated)


In [None]:

# Pool the prosodic features per participant: rows are consumed in consecutive
# chunks of 5 (one row per question) and each chunk is averaged into one row.
id_column = "participant&question"
pooled_rows = []

for i in range(0, len(prosody_features), 5):
    chunk = prosody_features.iloc[i:i+5]

    # Compare the first and last id of the chunk to make sure it comes from a
    # single participant. A short trailing chunk is also reported as misaligned
    # rather than raising an IndexError.
    participant = chunk[id_column].iloc[0].split('Q')[0]
    if len(chunk) < 5 or participant != chunk[id_column].iloc[-1].split('Q')[0]:
        print("Misaligned chunk: ", i)
        break

    # Explicit column-wise mean: on pandas 2.x, np.mean(DataFrame) reduces over
    # both axes and returns a single scalar, which cannot be concatenated.
    # numeric_only=True skips non-numeric columns; the recorded output has 57
    # columns from a 59-column input, so presumably one non-numeric column
    # besides the id column was being dropped before — TODO confirm.
    features = chunk.drop(columns=id_column)
    pooled_rows.append(features.mean(axis=0, numeric_only=True))

# Build the frame once (one row per chunk, one column per feature) instead of
# growing it with pd.concat inside the loop.
prosodic_features_pooled = pd.DataFrame(pooled_rows)

In [None]:
# Print the pooled prosody table, then display its (rows, columns) shape.
print(prosodic_features_pooled)
prosodic_features_pooled.shape

In [37]:
# Convert the pooled prosody table's values into a tensor and show its shape.
pooled_values = prosodic_features_pooled.values
prosody_tensor = torch.tensor(pooled_values)
prosody_tensor.shape

torch.Size([138, 57])

# Prune Transcript down to just interviewee speech and extract BERT features

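Note: the kernel crashed when extracting BERT features for all rows at once, so the extraction below runs in batches of 20 rows; re-run the batching cell until every transcript has been processed.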

In [13]:
def extract_bert(transcript):
    """Encode only the interviewee's speech from a raw transcript.

    The transcript is split on the 'Interviewee:' marker. For every piece
    after the first, the text up to the next '|' is kept. The kept pieces are
    concatenated with no separator added, tokenized with the module-level
    encoder `se`, and passed to `se.encode_batched_tokens`.

    Args:
        transcript: Full transcript string.

    Returns:
        Whatever `se.encode_batched_tokens` returns for the tokenized text.
    """
    # Everything before the first 'Interviewee:' marker is discarded.
    interviewee_turns = transcript.split('Interviewee:')[1:]
    cleaned = "".join(turn.split('|')[0] for turn in interviewee_turns)
    return se.encode_batched_tokens(se.tokenize(cleaned))

# Shared encoder used by extract_bert() (via se.tokenize / se.encode_batched_tokens).
se = SentenceEncoder()
# Index of the next transcript row to encode; the batching cell reads and advances it.
current_pos = 0

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# Encode the next batch of 20 transcripts and save it to disk. Re-run this cell
# to process the following batch; `current_pos` carries progress between runs.
batch_end = current_pos + 20
lexical_features = [
    extract_bert(transcripts.iloc[i]["Transcript"])
    for i in range(current_pos, min(batch_end, len(transcripts)))
]

if lexical_features:
    stacked = torch.stack(lexical_features)
    # Files are named after the batch's nominal last row index (19, 39, ...).
    # The .pt extension is used because torch.save writes a serialized tensor,
    # not CSV, and the loading cells in this notebook read "<index>.pt" files.
    torch.save(stacked, f"{folder_path}/features/lexical_batched/{batch_end - 1}.pt")
    print(f"finished up to row {batch_end - 1}")
    # Advance only after the batch has been written, so a failed run can be retried.
    current_pos = batch_end
else:
    # All rows are already processed; torch.stack would raise on an empty list.
    print(f"no rows left to process from row {current_pos}")

finished up to row row 139


In [18]:
# Load the saved lexical batches in ascending row order.
# NOTE(review): these are read from features/lexical/, while the batching cell
# writes to features/lexical_batched/ — confirm where the batch files live.
filenames = ["19.pt", "39.pt", "59.pt", "79.pt", "99.pt", "119.pt", "139.pt"]
feats = [torch.load(f"{folder_path}/features/lexical/{name}") for name in filenames]

In [27]:
# Join the loaded batch tensors along the first dimension into one tensor.
lexical_tensor = torch.cat(feats, dim=0)

In [44]:
# Shape check for the aggregated tensors; only the audio check is enabled,
# the other three prints are commented out.
# print(f"Shape of scores: {scores_tensor.shape}")
# print(f"Shape of prosody: {prosody_tensor.shape}")
print(f"Shape of audio: {audio_tensor.shape}")
# print(f"Shape of lexical: {lexical_tensor.shape}")

Shape of audio: torch.Size([138, 128])


In [37]:
# Write the score, prosody, and lexical tensors under <folder_path>/features/.
# The saves run in order, so a failure on a later line leaves the earlier
# files already written.
torch.save(scores_tensor, f"{folder_path}/features/scores.pt")
torch.save(prosody_tensor, f"{folder_path}/features/prosody.pt")
# NOTE(review): lexical.pt is also written by the BERT aggregation cell above;
# whichever cell runs last determines the file's contents.
torch.save(lexical_tensor, f"{folder_path}/features/lexical.pt")

# Scratch under here

In [58]:
# Reload the saved tensors from disk, replacing any in-memory versions.
scores_tensor = torch.load(f"{folder_path}/features/scores.pt")
audio_tensor = torch.load(f"{folder_path}/features/audio.pt")
lexical_tensor = torch.load(f"{folder_path}/features/lexical.pt")

# Print the three shapes on one line to check the leading dimensions agree.
loaded = (scores_tensor, audio_tensor, lexical_tensor)
print(*(tensor.shape for tensor in loaded))

torch.Size([138]) torch.Size([138, 128]) torch.Size([138, 768])
