In [62]:
import glob
import jsonlines
import os
import pandas as pd
import re
from collections import OrderedDict

In [130]:
class sampling_dir:
    # dense_medium.logit_lens.24.sample_output.temp0_4.topk_40.txt
    rgx_toplevel = re.compile("^([^.]*)\.([^.]*).(\d*)\.sample_output(.*)\.txt$")
    rgx_temp = re.compile("^temp([0-9_]*)$")
    rgx_topk = re.compile("^topk_(\d*)$")
    
    #model_dtype = pd.CategoricalDtype(models.all, ordered=True)
    
    def __init__(self, path):
        self.path = path

    @classmethod
    def parse_results_file_name(cls, file_name):
        res = OrderedDict()
        m = cls.rgx_toplevel.match(file_name)
        if m is None:
            print("WARNING: cannot parse results file '{}'".format(file_name))
            return None

        run_id = m[1]
        res['path'] = file_name
        res['model'] = m[1]
        res['layer'] = int(m[3])
        res['depth_method'] = m[2]
        
        args = filter(None, m[4].split('.'))
        for arg in args:
            m_temp = cls.rgx_temp.match(arg)
            if m_temp is not None:
                res['temp'] = float(m_temp[1].replace('_', '.'))
                continue

            m_topk = cls.rgx_topk.match(arg)
            if m_topk is not None:
                res['top_k'] = int(m_topk[1])
                continue
        
            print("WARNING: cannot parse run id '{}' argument '{}'".format(run_id, arg))

        return res

    def as_df(self):
        dict_list = []
        for file_path in glob.glob(os.path.join(self.path, "*.txt")):
            # Parse the file name and add
            meta = self.parse_results_file_name(os.path.basename(file_path))
            if meta is None:
                continue

            # Read the json file into a data frame

            with open(file_path) as f:
                try:
                    lines = [line for line in jsonlines.Reader(f)]
                except Exception as e:
                    print("WARNING: cannot load file '{}'".format(file_path))
                    print(e)
                    continue

            for result_json in lines:
                record = dict()
                for key in result_json.keys():
                    record[key] = result_json[key]

                for meta_key in meta:
                    record[meta_key] = meta[meta_key]
                dict_list.append(record)

        res = pd.json_normalize(dict_list)

        def set_col_type(col_name, t):
            if col_name in res:
                res[col_name] = res[col_name].astype(t)
        set_col_type('path', str)
        set_col_type('layer', pd.Int64Dtype())
        set_col_type('temp', pd.Float32Dtype())
        set_col_type('top_k', pd.Int64Dtype())

        return res


In [132]:
df = sampling_dir("/mnt/ssd-1/igor/gpt-neox/sampling").as_df()

In [133]:
#'context', 'text', 'length', 'finished', 'message', 'duration_seconds', 'path', 'model', 'layer', 'depth_method', 'temp', 'top_k'
df[["context", "model", "layer", "depth_method", "temp", "top_k", "text"]].head(100)

Unnamed: 0,context,model,layer,depth_method,temp,top_k,text
0,"At its peak, the Roman",dense_medium,13,extra_linear,,,Empire was the largest and most powerful worl...
1,A robot may not injure a human being,dense_medium,13,extra_linear,,,.\n\nThe first step is to create a new project...
2,A giraffe walks into a bar and orders a beer. ...,dense_medium,13,extra_linear,,,", a man named the ""Jerk’s"" wife, says he was a..."
3,"Dow Jones futures tilted higher, along",dense_medium,13,extra_linear,,,with the dollar.\n\nThe market share of the m...
4,Language models like GPT-3 are useful for many...,dense_medium,13,extra_linear,,,to be used in the same way as the standard mo...
5,Maximum temperature today near 86 degrees. A p...,dense_medium,13,extra_linear,,,"weather forecast for Tuesday, Tuesday, Tuesda..."
6,In this article we took a step-by-step look at...,dense_medium,13,extra_linear,,,the tools of the internet to help you to get ...
7,"At its peak, the Roman",dense_medium,7,extra_linear,0.4,40.0,Empire was the most common place in the Unite...
8,A robot may not injure a human being,dense_medium,7,extra_linear,0.4,40.0,that is not a\n\nconstant. ...
9,A giraffe walks into a bar and orders a beer. ...,dense_medium,7,extra_linear,0.4,40.0,", he said, ""I know, I'm not.""\n\n""No, I'm not...."


In [127]:
def show_table(context, model="dense_medium", depth_method="extra_linear", temp=None, top_k=None):
    df2 = df[(df["context"] == context) & (df["model"]==model) & (df["depth_method"] == depth_method) & (df["temp"] == temp) & (df["top_k"] == top_k)]
    return df2[["layer", "text", "temp", "top_k"]].sort_values("layer")

In [134]:
show_table("At its peak, the Roman", temp=0.4, top_k=40).head(500)

Unnamed: 0,layer,text,temp,top_k
21,0,"Catholic Church of the same time, the last ye...",0.4,40
455,1,Empire of the most of the\n\n\n\n\nThe presen...,0.4,40
441,2,"Empire, the most common.\n\nThe first time I ...",0.4,40
854,3,"Empire, the ancient Greek and the West, and t...",0.4,40
371,4,Empire of the British Empire.\n\nThe Chinese ...,0.4,40
399,5,Empire was the most common of the world. The...,0.4,40
84,6,Empire was the most common place in the Unite...,0.4,40
7,7,Empire was the most common place in the Unite...,0.4,40
364,8,"Catholic Church was the most prominent, but t...",0.4,40
798,9,Empire was a modernized and the most importan...,0.4,40


In [140]:
contexts = df["context"].unique()

In [156]:
df_list = []
for context in contexts:
    depth_methods = ["logit_lens", "extra_linear"]
    dfs = [show_table(context, temp=0.4, top_k=40, depth_method=depth_method) for depth_method in depth_methods]

    df2 = dfs[0].merge(dfs[1], on=["layer", "temp", "top_k"], suffixes=["." + dm for dm in depth_methods])
    df2["context"] = context
    df2 = df2[["context", "temp", "top_k", "layer", "text.logit_lens", "text.extra_linear"]]
    df_list.append(df2)
pd.concat(df_list).to_csv("sampling.csv")