In [62]:
import glob
import jsonlines
import os
import pandas as pd
import re
from collections import OrderedDict

In [72]:
class sampling_dir:
    # dense_medium.logit_lens.24.sample_output.temp0_4.topk_40.txt
    rgx_toplevel = re.compile("^([^.]*)\.([^.]*).(\d*)\.sample_output(.*)\.txt$")
    rgx_temp = re.compile("^temp(\d*)")
    rgx_topk = re.compile("^topk_([0-9_]*)$")
    
    #model_dtype = pd.CategoricalDtype(models.all, ordered=True)
    
    def __init__(self, path):
        self.path = path

    @classmethod
    def parse_results_file_name(cls, file_name):
        res = OrderedDict()
        m = cls.rgx_toplevel.match(file_name)
        if m is None:
            print("WARNING: cannot parse results file '{}'".format(file_name))
            return None

        run_id = m[1]
        res['path'] = file_name
        res['model'] = m[1]
        res['layer'] = int(m[3])
        res['depth_method'] = m[2]
        
        args = filter(None, m[4].split('.'))
        for arg in args:
            m_temp = cls.rgx_temp.match(arg)
            if m_temp is not None:
                res['temp'] = float(m_temp[1].replace('_', '.'))
                continue

            m_topk = cls.rgx_topk.match(arg)
            if m_topk is not None:
                res['top_k'] = int(m_topk[1])
                continue
        
            print("WARNING: cannot parse run id '{}' argument '{}'".format(run_id, arg))

        return res

    def as_df(self):
        dict_list = []
        for file_path in glob.glob(os.path.join(self.path, "*.txt")):
            # Parse the file name and add
            meta = self.parse_results_file_name(os.path.basename(file_path))
            if meta is None:
                continue

            # Read the json file into a data frame

            with open(file_path) as f:
                try:
                    lines = [line for line in jsonlines.Reader(f)]
                except Exception as e:
                    print("WARNING: cannot load file '{}'".format(file_path))
                    continue

            for result_json in lines:
                record = dict()
                for key in result_json.keys():
                    record[key] = result_json[key]

                for meta_key in meta:
                    record[meta_key] = meta[meta_key]
                dict_list.append(record)

        res = pd.json_normalize(dict_list)
        print(res)

        def set_col_type(col_name, t):
            if col_name in res:
                res[col_name] = res[col_name].astype(t)
        set_col_type('path', str)
        set_col_type('layer', pd.Int64Dtype())
        set_col_type('temp', pd.Float32Dtype())
        set_col_type('top_k', pd.Int64Dtype())

        return res


In [73]:
df = sampling_dir("/mnt/ssd-1/igor/gpt-neox/sampling").as_df()

                                               context  \
0                               At its peak, the Roman   
1                 A robot may not injure a human being   
2    A giraffe walks into a bar and orders a beer. ...   
3               Dow Jones futures tilted higher, along   
4    Language models like GPT-3 are useful for many...   
..                                                 ...   
793  A giraffe walks into a bar and orders a beer. ...   
794             Dow Jones futures tilted higher, along   
795  Language models like GPT-3 are useful for many...   
796  Maximum temperature today near 86 degrees. A p...   
797  In this article we took a step-by-step look at...   

                                                  text  length  finished  \
0     Empire was the largest and most powerful worl...      64     False   
1    .\n\nThe first step is to create a new project...      64     False   
2    , a man named the "Jerk’s" wife, says he was a...      64     False   

In [77]:
#'context', 'text', 'length', 'finished', 'message', 'duration_seconds', 'path', 'model', 'layer', 'depth_method', 'temp', 'top_k'
df[["context", "model", "layer", "depth_method", "temp", "top_k", "text"]]

Unnamed: 0,context,model,layer,depth_method,temp,top_k,text
0,"At its peak, the Roman",dense_medium,13,extra_linear,,,Empire was the largest and most powerful worl...
1,A robot may not injure a human being,dense_medium,13,extra_linear,,,.\n\nThe first step is to create a new project...
2,A giraffe walks into a bar and orders a beer. ...,dense_medium,13,extra_linear,,,", a man named the ""Jerk’s"" wife, says he was a..."
3,"Dow Jones futures tilted higher, along",dense_medium,13,extra_linear,,,with the dollar.\n\nThe market share of the m...
4,Language models like GPT-3 are useful for many...,dense_medium,13,extra_linear,,,to be used in the same way as the standard mo...
...,...,...,...,...,...,...,...
793,A giraffe walks into a bar and orders a beer. ...,dense_medium,5,logit_lens,0.0,40,", thoughvening to changeabledashleyesterdaying..."
794,"Dow Jones futures tilted higher, along",dense_medium,5,logit_lens,0.0,40,cordlygoes one-time serviceraitslihoodsystems...
795,Language models like GPT-3 are useful for many...,dense_medium,5,logit_lens,0.0,40,"being either way, and/or\r\n\nPracticality of..."
796,Maximum temperature today near 86 degrees. A p...,dense_medium,5,logit_lens,0.0,40,rated using your own customised versionedxt m...
