In [None]:
import glob
import jsonlines
import os
import pandas as pd
import re
from collections import OrderedDict

In [None]:
class sampling_dir:
    """Loader for a directory of sampling-result files.

    File names are expected to follow the pattern
    ``<model>.<depth_method>.<layer>.sample_output[.<arg>...].txt`` where each
    optional ``<arg>`` is either ``temp<digits_with_underscore>`` (underscore
    stands for the decimal point) or ``topk_<digits>``.  The file contents are
    read as JSON lines.
    """
    # dense_medium.logit_lens.24.sample_output.temp0_4.topk_40.txt
    # Raw strings avoid invalid-escape warnings; the separator before the layer
    # is an escaped literal dot and the layer requires at least one digit so
    # that int() below can never see an empty string.
    rgx_toplevel = re.compile(r"^([^.]*)\.([^.]*)\.(\d+)\.sample_output(.*)\.txt$")
    rgx_temp = re.compile(r"^temp([0-9_]+)$")
    rgx_topk = re.compile(r"^topk_(\d+)$")

    #model_dtype = pd.CategoricalDtype(models.all, ordered=True)

    def __init__(self, path):
        """Remember the directory `path` that holds the ``*.txt`` result files."""
        self.path = path

    @classmethod
    def _parse_arg(cls, arg):
        """Return ``(key, value)`` for a recognised file-name argument, else None."""
        m_temp = cls.rgx_temp.match(arg)
        if m_temp is not None:
            try:
                return 'temp', float(m_temp[1].replace('_', '.'))
            except ValueError:
                # e.g. "temp0_4_1" -> "0.4.1": report as unparsable, don't crash.
                return None

        m_topk = cls.rgx_topk.match(arg)
        if m_topk is not None:
            return 'top_k', int(m_topk[1])

        return None

    @classmethod
    def parse_results_file_name(cls, file_name):
        """Extract run metadata from a results file name.

        Args:
            file_name: Base name of the file (no directory part).

        Returns:
            An OrderedDict with keys ``path``, ``model``, ``layer`` (int) and
            ``depth_method``, plus ``temp`` (float) and/or ``top_k`` (int) when
            those arguments are present in the name.  Returns None (after
            printing a warning) when the name does not match the pattern.
            Unrecognised or malformed arguments are reported with a warning
            and otherwise ignored.
        """
        res = OrderedDict()
        m = cls.rgx_toplevel.match(file_name)
        if m is None:
            print("WARNING: cannot parse results file '{}'".format(file_name))
            return None

        run_id = m[1]
        res['path'] = file_name
        res['model'] = m[1]
        res['layer'] = int(m[3])
        res['depth_method'] = m[2]

        # m[4] looks like ".temp0_4.topk_40"; drop the empty pieces from split.
        for arg in filter(None, m[4].split('.')):
            parsed = cls._parse_arg(arg)
            if parsed is None:
                print("WARNING: cannot parse run id '{}' argument '{}'".format(run_id, arg))
                continue
            key, value = parsed
            res[key] = value

        return res

    def as_df(self):
        """Load every parsable ``*.txt`` file in the directory into one DataFrame.

        Each JSON line becomes one row; the metadata parsed from the file name
        is added to every row of that file and overrides same-named JSON keys.
        Files whose name cannot be parsed or whose contents cannot be loaded
        are skipped with a warning.

        Returns:
            A pandas DataFrame (via ``pd.json_normalize``) in which, when
            present, ``path`` is str, ``layer``/``top_k`` are nullable Int64
            and ``temp`` is nullable Float32.
        """
        dict_list = []
        # glob order is filesystem-dependent; sorting keeps row order reproducible.
        for file_path in sorted(glob.glob(os.path.join(self.path, "*.txt"))):
            # Parse the file name and add
            meta = self.parse_results_file_name(os.path.basename(file_path))
            if meta is None:
                continue

            # Read the json file into a list of records.  open() is inside the
            # try so that an unreadable file is skipped like a malformed one.
            try:
                with open(file_path, encoding="utf-8") as f:
                    lines = list(jsonlines.Reader(f))
            except Exception as e:
                print("WARNING: cannot load file '{}'".format(file_path))
                print(e)
                continue

            for result_json in lines:
                record = dict(result_json)
                record.update(meta)
                dict_list.append(record)

        res = pd.json_normalize(dict_list)

        def set_col_type(col_name, t):
            # Columns are optional (e.g. no file carried a temp argument).
            if col_name in res:
                res[col_name] = res[col_name].astype(t)
        set_col_type('path', str)
        set_col_type('layer', pd.Int64Dtype())
        set_col_type('temp', pd.Float32Dtype())
        set_col_type('top_k', pd.Int64Dtype())

        return res


In [None]:
# Load every sampling-result file found in the results directory into `df`.
results_loader = sampling_dir("/mnt/ssd-1/igor/gpt-neox/sampling")
df = results_loader.as_df()

In [None]:
# Reference list of column names (author's note):
# 'context', 'text', 'length', 'finished', 'message', 'duration_seconds', 'path', 'model', 'layer', 'depth_method', 'temp', 'top_k'
preview_columns = ["context", "model", "layer", "depth_method", "temp", "top_k", "text"]
df[preview_columns].head(100)

In [None]:
def show_table(context, model="dense_medium", depth_method="extra_linear", temp=None, top_k=None):
    """Return the per-layer samples in the global `df` for one configuration.

    Args:
        context: Value to match in the ``context`` column.
        model: Value to match in the ``model`` column.
        depth_method: Value to match in the ``depth_method`` column.
        temp: Value to match in the ``temp`` column; None selects rows whose
            ``temp`` is missing.
        top_k: Value to match in the ``top_k`` column; None selects rows whose
            ``top_k`` is missing.

    Returns:
        A DataFrame with columns ``layer``, ``text``, ``temp`` and ``top_k``,
        sorted by ``layer``.
    """
    def matches(column, value):
        # `series == None` is False for every row, which made the default
        # arguments select nothing; treat None as "value is missing" instead.
        series = df[column]
        return series.isna() if value is None else series == value

    mask = (
        (df["context"] == context)
        & (df["model"] == model)
        & (df["depth_method"] == depth_method)
        & matches("temp", temp)
        & matches("top_k", top_k)
    )
    selected = df[mask]
    return selected[["layer", "text", "temp", "top_k"]].sort_values("layer")

In [None]:
# Inspect the samples for one context at temp=0.4 / top_k=40.
roman_table = show_table("At its peak, the Roman", temp=0.4, top_k=40)
roman_table.head(500)

In [None]:
# Distinct values of the `context` column, in order of first appearance.
context_column = df["context"]
contexts = context_column.unique()

In [None]:
# For every context, put the logit_lens and extra_linear samples side by side
# (joined on layer / temp / top_k) and write the combined table to sampling.csv.
depth_methods = ["logit_lens", "extra_linear"]
df_list = []
for context in contexts:
    dfs = []
    for depth_method in depth_methods:
        dfs.append(show_table(context, temp=0.4, top_k=40, depth_method=depth_method))

    # Only the `text` column overlaps, so it becomes text.<depth_method>.
    merge_suffixes = ["." + dm for dm in depth_methods]
    output_columns = ["context", "temp", "top_k", "layer", "text.logit_lens", "text.extra_linear"]
    df2 = (
        dfs[0]
        .merge(dfs[1], on=["layer", "temp", "top_k"], suffixes=merge_suffixes)
        .assign(context=context)
    )[output_columns]
    df_list.append(df2)

combined = pd.concat(df_list)
combined.to_csv("sampling.csv")