# In [None]:
import logging
import os

import fasttext
import pandas as pd
import torch
from datasets import Dataset
from tqdm import tqdm
from transformers import AutoConfig, AutoModel, AutoTokenizer, BertModel, BertTokenizer, \
                         BertConfig, RobertaForSequenceClassification,RobertaTokenizer

from evaluation.extract_representations import get_dataset, get_embeddings_from_model
# Emit INFO records too: without an explicit level the root logger stays at
# WARNING, so the logging.info() progress messages in this file are dropped.
logging.basicConfig(format='%(process)d-%(levelname)s-%(message)s', level=logging.INFO)



def get_vocab_from_jsondf(inputfile, basepath):
    """
    Collect the unique vocabulary entries of a dataset stored as json.

    Input:
        inputfile: Input file path, relative to ``basepath``
        basepath: Data folder
    Returns:
        lvocab: List of ``[tree_id, node name, node definition]`` entries, one
        per unique (tree_id, node name) pair, ordered by ascending tree id.

    The function reads the dataframe from the inputfile. Every row contributes
    two entries: one built from its ``father`` / ``father_definition`` columns
    and one built from its ``child`` / ``child_definition`` columns. When the
    same node name shows up several times inside a tree, the entry keeps the
    position of its first occurrence and the definition seen last.
    An empty dataframe yields an empty list.
    """
    # NOTE(review): load_json is not defined in the visible part of this file;
    # presumably it is provided elsewhere -- confirm.
    df = pd.read_json(load_json(os.path.join(basepath, inputfile)), orient="index")

    # Nothing to collect (and no "treeid" column to group on) for empty input.
    if df.empty:
        return []

    # key -> "<tree_id>-<node name>", value -> [tree_id, node name, definition]
    all_vocab = {}

    # groupby(sort=True) visits the trees in ascending id order and keeps the
    # original row order inside each tree, so the dataframe is walked once
    # instead of being re-filtered for every id between min and max.
    for tid, subtree in df.groupby("treeid", sort=True):
        tid = int(tid)  # store a plain int rather than a numpy scalar
        prefix = str(tid) + "-"
        rows = zip(
            subtree["father"],
            subtree["father_definition"],
            subtree["child"],
            subtree["child_definition"],
        )
        for father, father_def, child, child_def in rows:
            all_vocab[prefix + father] = [tid, father, father_def]
            all_vocab[prefix + child] = [tid, child, child_def]

    return list(all_vocab.values())



def get_dataset(inputfile, dataset_path):
    """
    Build a HuggingFace ``Dataset`` from the vocabulary found in a json file.

    Input:
        inputfile: json file
        dataset_path: path for the data folder

    Returns:
        Dataset with the columns 'tid', 'name' and 'definition', plus a
        'concept' column holding the phrase "<name> is defined as <definition>"
        for every entry.
    """
    vocab_rows = get_vocab_from_jsondf(inputfile, dataset_path)
    vocab_frame = pd.DataFrame(vocab_rows, columns=['tid', 'name', 'definition'])
    base = Dataset.from_pandas(vocab_frame)

    # One phrase per entry, joining the word with its definition.
    phrases = []
    for name, definition in zip(base['name'], base['definition']):
        phrases.append(name + " is defined as " + definition)

    return base.add_column('concept', phrases)




# In [None]:
# Input location. Both values were left blank in this cell and must be filled
# in; get_dataset() hands them to get_vocab_from_jsondf(), which joins them as
# os.path.join(_dataset_base, _ifile).
_ifile = None  # TODO: name of the input json file, relative to _dataset_base
_dataset_base = None  # TODO: path of the data folder
if _ifile is None or _dataset_base is None:
    raise ValueError("Set _ifile and _dataset_base before running this cell.")

logging.info("Reading dataset.")
_dataset = get_dataset(_ifile, _dataset_base)

# NOTE(review): model_name, model_source, input_type, _output_name and
# only_last_layer are not defined in the visible part of this file;
# presumably they are set elsewhere -- confirm before running.
logging.info(f"Processing model: {model_name} with file: {_ifile}.")
get_embeddings_from_model(_dataset, model_str=model_name,model_src=model_source,
                          column=input_type,output_path=_output_name,only_last_layer=only_last_layer)