In [None]:

import numpy as np
import scipy.io as sio
import os
from os.path import basename
from util import util_feat as uft

import yaml
# Parse the drama-data YAML settings into the `config` dict; the annotation
# demo below reads config['annotInfo'] from it.
with open('./util/config__drama_data.yaml') as yml:
    config = yaml.safe_load(yml)
        

# (demo 1) Feature extraction for a text file

## Param: Setting parameters

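model_lang : 'jp' (Japanese) or 'en' (English)<br>
layer_no : layer number inserted into the model name (e.g. 24 gives `GPT2medium_en_layer24_mean`)<br>
text_path : path of the text file (~.txt)<br>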

In [None]:
# Input text file; one feature is extracted per line of this file.
text_path = './demo_files/test_annotation.txt'
# Language of the GPT-2 model: 'jp' (Japanese) or 'en' (English).
model_lang = 'en'
# Layer number that is inserted into the model name (..._layer{layer_no}_mean).
layer_no = 24

## Main: Feature extraction for each line in the text file.

In [None]:

# Build the model identifier from the language code and layer number.
# Fail fast on an unsupported language: otherwise `model_name` would be
# undefined on a fresh kernel (NameError) or, worse, silently keep a stale
# value left over from an earlier run of this cell.
if model_lang not in ('jp', 'en'):
    raise ValueError(
        "model_lang must be 'jp' or 'en', got {!r}".format(model_lang))
model_name = 'GPT2medium_{:s}_layer{:d}_mean'.format(model_lang, layer_no)

model_set = uft.initial_setup_for_model(model_name)


# Load the text file; `with` closes the handle even if reading fails.
with open(text_path) as f:
    lines = f.readlines()

# Clean each line and extract one feature per line
print('********** Feature extraction using {:s} **********'.format(model_name))
features = []
feature = None  # bound up front so the `del` below also works for an empty file
for li, line in enumerate(lines):
    line_clean = uft.text_cleaning(line) # preprocessing for each line
    print('(line ID:{:d}): {:s}'.format(li, line_clean))
    feature = uft.feature_extraction_using_GPT2(model_set, line_clean)
    # presumably a torch tensor: move to CPU, convert to a NumPy copy and keep
    # the first entry along the leading axis -- TODO confirm the returned shape
    features.append(feature.to('cpu').detach().numpy().copy()[0])

# Drop the reference to the last model output; only the NumPy copies are kept.
del feature
# Stack the per-line features into a single array (one row per line).
features = np.array(features)

# (demo 2) Feature extraction for our annotation files
## Param: Setting parameters

featCode : 0–3. We prepared 4 features corresponding to the 4 annotations (0: obj, 1: speech(jp), 2: speech(en), 3: story). <br>
saveDir : directory where the extracted feature files are saved (created if it does not exist) <br>

In [None]:
# Output directory for the extracted feature files.
saveDir = './feature/'
# Index into the annotation/model lists of the config
# (0: obj, 1: speech(jp), 2: speech(en), 3: story).
featCode = 1

## Main: Feature extraction for each line in each annotation file.

In [None]:

# Look up the annotation type and the model names selected by featCode.
annot_name = config['annotInfo']['annotTypes'][featCode]
model_name_main = config['annotInfo']['modelNames'][featCode]
model_name_sub = config['annotInfo']['modelSubNames'][model_name_main][featCode]
model_name = model_name_main + '_' + model_name_sub

# You can change the merge type ('eos', 'sum' or 'mean') by replacing it in the
# model name. str.replace returns a new string, so assign the result back:
#   merge_type = 'sum'
#   model_name = model_name.replace('mean', merge_type)

print('Feature extraction : {:s}'.format(model_name))
os.makedirs(saveDir, exist_ok=True)

# Initial setup of the model and of the input (text) / output (feature) file lists
fileNames_txt, fileNames_feat = uft.set_load_and_save_info(config, annot_name, model_name)
model_set = uft.initial_setup_for_model(model_name)

# --------------------------------------------------
for fi, path_txt in enumerate(fileNames_txt):

    print('Now processing for : {:s}'.format(path_txt))

    # Load lines; `with` closes the handle even if reading fails.
    with open(path_txt) as f:
        lines = f.readlines()

    # Preprocess every line for the selected annotation type
    texts = [uft.preproc_line(annot_name, line) for line in lines]

    # Feature extraction for all lines of this file
    features = uft.feature_extraction_using_GPT2__annotation(model_set, texts)

    # Save features under saveDir, keeping only the file name of the configured
    # output path. os.path.join avoids the doubled separator that manual
    # formatting produced when saveDir already ends with '/'.
    path_save = os.path.join(saveDir, basename(fileNames_feat[fi]))
    sio.savemat(path_save, {'stim':features})

