In [25]:
import os

import numpy as np
import pandas as pd

# Root folder that holds one sub-folder per experiment.
rootdir = "/home/ninad/Desktop/Link-to-sem4/dsis/prob-emb/fb-poincare/data/book_data/"

# Experiment folder to convert -- remember to change this for every run.
# data_dir = rootdir + "exp1.1_pretrn_ext/"  # pre-train folder
data_dir = rootdir + "exp2.3_baseline_notaxo/"  # no-pretrain folder

# Probability threshold: rows whose probability is below it are dropped.
prob_val = 0.01

# Genre-specific files; keep this as None for no-pretrain data folders.
files_genre = None
# files_genre = ["genre_genre_master.txt", "genre_genre_eval.txt"]

# Book files inside data_dir that get converted.
files_book = [
    "book_dev.txt",
    "book_train.txt",
    "book_test.txt",
    "book_train_eval.txt",
]

In [2]:
def get_data_list(data_filepath, prob_threshold):
    """Build the list of training pairs for the poincare model and save it as csv.

    Reads a headerless, tab-separated file, keeps the rows whose conditional
    probability is at least ``prob_threshold`` and returns the surviving
    (term1, term2) pairs. As a side effect the pairs are also written to
    ``<data_filepath without its extension>_hb_gensim.csv`` with the header
    ``id1,id2,weight`` and a constant weight of 1 on every row.

    Parameters
    ----------
    data_filepath : str
        Path of the tab-separated input file. Columns 1, 2 and 3 (0-based)
        are read as term1, term2 and the conditional probability, i.e. each
        row looks like: IsA, term1, term2, prob.
    prob_threshold : float
        Threshold for the conditional probability. Only pairs whose
        probability is greater than or equal to it are kept.

    Returns
    -------
    data_list : list
        List of (term1, term2) tuples that satisfy the threshold requirement.
    """
    df = pd.read_csv(data_filepath, header=None, delimiter='\t', usecols=[1, 2, 3])
    df.columns = ['t1', 't2', 'cond_prob']
    # Filter rows and pick the two term columns in one step; this produces a
    # new frame, so no in-place drop on a filtered slice is required.
    pairs = df.loc[df['cond_prob'] >= prob_threshold, ['t1', 't2']]
    data_list = list(pairs.itertuples(index=False, name=None))

    # splitext strips only the real file extension. str.find('.txt') returned
    # -1 for paths without '.txt' (chopping the last character of the path)
    # and matched '.txt' inside directory names.
    base, _ext = os.path.splitext(data_filepath)
    outFile = base + "_hb_gensim.csv"
    # The input is decoded as UTF-8 (read_csv default), so write UTF-8 as well
    # instead of relying on the locale's default encoding.
    with open(outFile, "w", encoding="utf-8") as out:
        out.write("id1,id2,weight\n")
        out.writelines("%s,%s,1\n" % (t1, t2) for t1, t2 in data_list)
    return data_list
    

In [5]:
def write_mod_data(data_filepath, prob_threshold):
    """Write the csv training file for the fb-poincare model.

    Reads a headerless, tab-separated file, keeps the rows whose probability
    is at least ``prob_threshold`` and writes them, comma-separated and with
    the header ``id1,id2,weight``, to
    ``<data_filepath without its extension>_hb.csv``.

    Parameters
    ----------
    data_filepath : str
        Path of the tab-separated input file. Columns 1, 2 and 3 (0-based)
        are read as id1, id2 and weight, i.e. each row looks like:
        IsA, term1, term2, prob.
    prob_threshold : float
        Threshold for the conditional probability. Only rows whose
        probability is greater than or equal to it are written.

    Returns
    -------
    None
    """
    df = pd.read_csv(data_filepath, header=None, delimiter='\t', usecols=[1, 2, 3])
    df.columns = ['id1', 'id2', 'weight']
    kept = df[df['weight'] >= prob_threshold]

    # splitext strips only the real file extension. str.find('.txt') returned
    # -1 for paths without '.txt' (chopping the last character of the path)
    # and matched '.txt' inside directory names.
    base, _ext = os.path.splitext(data_filepath)
    outFile = base + "_hb.csv"
    kept.to_csv(outFile, sep=',', index=False)


In [21]:
def create_new_files(datadir, file_list, genre_list=None, prob_t=0.01):
    """Create new training files for hyperbolic models.

    Calls ``write_mod_data`` once for every file in ``file_list`` and then
    once for every file in ``genre_list`` (if any), in that order.

    Parameters
    ----------
    datadir: str
        Path of the data directory where all the training files are located.
        A trailing path separator is optional.
    file_list: list (of str)
        List of filenames in the data dir which need to be transformed.
    genre_list: list (of str)
        List of genre specific files to be modified. Default is None (if
        there are no such genre specific files in the datadir).
    prob_t: float
        Probability threshold passed on to ``write_mod_data``.
        Default value: 0.01

    Returns
    -------
    None
    """
    # os.path.join inserts the separator only when it is missing, so a
    # datadir without a trailing slash no longer yields a wrong path such as
    # "<datadir>book_dev.txt". A None or empty genre_list contributes nothing.
    for fname in list(file_list) + list(genre_list or []):
        write_mod_data(os.path.join(datadir, fname), prob_t)
    

In [26]:
# Convert every configured book file (and genre file, if any) in data_dir.
create_new_files(
    datadir=data_dir,
    file_list=files_book,
    genre_list=files_genre,
    prob_t=prob_val,
)