# The datasets

This is a quick look at the datasets generated in this project.

In [1]:
import jsonlines
from pathlib import Path

import pandas as pd

In [2]:
DATA_DIRECTORY = Path("./data/")  # change as necessary

## The traced data

This is the data which was traced by the tracing code and gathered into tables.

`tactic_instances` includes position data and a unique key for every tactic that was executed and traced (which is not every tactic in the code)

In [3]:
tactic_instances = pd.read_json(
    DATA_DIRECTORY / "raw_traced_data" / "tactic_instances.jsonl",
    orient="records",
    lines=True 
)
tactic_instances

Unnamed: 0,table,key,executed,filename,trace_pos_line,trace_pos_column,line,column,depth,index,proof,block,parent,prev,succeeded
0,tactic_instances,2001:20:2:1,True,lean/library/init/meta/interactive.lean,2033,1,2001,20,2,1,2034:4:1:1,2001:20:2:1,2034:4:1:1,2034:4:1:1,1.0
1,tactic_instances,2016:4:1:1,True,lean/library/init/meta/interactive.lean,2016,4,2016,4,1,1,2016:4:1:1,2016:4:1:1,0:0:0:0,0:0:0:0,1.0
2,tactic_instances,2010:4:1:1,True,lean/library/init/meta/interactive.lean,2010,4,2010,4,1,1,2010:4:1:1,2010:4:1:1,0:0:0:0,0:0:0:0,1.0
3,tactic_instances,2013:4:1:1,True,lean/library/init/meta/interactive.lean,2013,4,2013,4,1,1,2013:4:1:1,2013:4:1:1,0:0:0:0,0:0:0:0,1.0
4,tactic_instances,2019:4:1:1,True,lean/library/init/meta/interactive.lean,2019,4,2019,4,1,1,2019:4:1:1,2019:4:1:1,0:0:0:0,0:0:0:0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181606,tactic_instances,74:57:3:2,True,mathlib/src/category_theory/abelian/projective...,74,57,74,57,3,2,74:60:1:1,74:17:3:1,74:30:2:1,74:17:3:1,1.0
181607,tactic_instances,74:57:3:3,True,mathlib/src/category_theory/abelian/projective...,74,57,74,57,3,3,74:60:1:1,74:17:3:1,74:30:2:1,74:57:3:2,1.0
181608,tactic_instances,74:30:2:1,True,mathlib/src/category_theory/abelian/projective...,74,30,74,30,2,1,74:60:1:1,74:30:2:1,74:60:1:1,74:60:1:1,1.0
181609,tactic_instances,74:60:1:1,True,mathlib/src/category_theory/abelian/projective...,74,60,74,60,1,1,74:60:1:1,74:60:1:1,0:0:0:0,0:0:0:0,1.0


`tactic_state.json` includes basic tactic state information both before and after the tactic executes (again, only for recorded tactics which are a subset of all tactics in the code).  This can be added to.

In [4]:
tactic_state = pd.read_json(
    DATA_DIRECTORY / "raw_traced_data" / "tactic_state.jsonl",
    orient="records",
    lines=True 
)
tactic_state

Unnamed: 0,table,key,tactic_instance,filename,trace_pos_line,trace_pos_column,before_after,env_fingerprint,decl_name,open_namespaces,goal_count
0,tactic_state,2001:20:2:1:before,2001:20:2:1,lean/library/init/meta/interactive.lean,2033,1,before,3.935797e+18,list.cons.inj_eq,`interactive `expr `tactic `tactic.interactive...,1
1,tactic_state,2001:20:2:1:after,2001:20:2:1,lean/library/init/meta/interactive.lean,2033,1,after,3.935797e+18,list.cons.inj_eq,`interactive `expr `tactic `tactic.interactive...,1
2,tactic_state,2016:4:1:1:before,2016:4:1:1,lean/library/init/meta/interactive.lean,2016,4,before,3.935797e+18,psum.inl.inj_eq,`interactive `expr `tactic `tactic.interactive...,1
3,tactic_state,2016:4:1:1:after,2016:4:1:1,lean/library/init/meta/interactive.lean,2016,4,after,3.935797e+18,psum.inl.inj_eq,`interactive `expr `tactic `tactic.interactive...,0
4,tactic_state,2010:4:1:1:before,2010:4:1:1,lean/library/init/meta/interactive.lean,2010,4,before,3.935797e+18,sum.inl.inj_eq,`interactive `expr `tactic `tactic.interactive...,1
...,...,...,...,...,...,...,...,...,...,...,...
362445,tactic_state,74:30:2:1:after,74:30:2:1,mathlib/src/category_theory/abelian/projective...,74,30,after,1.800850e+19,category_theory.ProjectiveResolution.of,`category_theory.ProjectiveResolution `categor...,0
362446,tactic_state,74:60:1:1:before,74:60:1:1,mathlib/src/category_theory/abelian/projective...,74,60,before,1.800850e+19,category_theory.ProjectiveResolution.of,`category_theory.ProjectiveResolution `categor...,1
362447,tactic_state,74:60:1:1:after,74:60:1:1,mathlib/src/category_theory/abelian/projective...,74,60,after,1.800850e+19,category_theory.ProjectiveResolution.of,`category_theory.ProjectiveResolution `categor...,0
362448,tactic_state,83:18:1:1:before,83:18:1:1,mathlib/src/category_theory/abelian/projective...,83,18,before,1.769972e+19,category_theory.ProjectiveResolution.category_...,`category_theory.ProjectiveResolution `categor...,1


`tactic_state_goal.json` includes basic tactic state goal information both before and after the tactic executes.

In [5]:
tactic_state_goal = pd.read_json(
    DATA_DIRECTORY / "raw_traced_data" / "tactic_state_goal.jsonl",
    orient='records',
    lines=True
)
tactic_state_goal

Unnamed: 0,table,key,tactic_state,filename,trace_pos_line,trace_pos_column,ix,goal_hash,goal_pp
0,tactic_state_goal,2001:20:2:1:before:0,2001:20:2:1:before,lean/library/init/meta/interactive.lean,2033,1,0,1078517132,"α : Type u,\nh₁ : α,\nt₁ : list α,\nh₂ : α,\nt..."
1,tactic_state_goal,2001:20:2:1:after:0,2001:20:2:1:after,lean/library/init/meta/interactive.lean,2033,1,0,1080190384,"α : Type u,\nh₁ : α,\nt₁ : list α,\nh₂ : α,\nt..."
2,tactic_state_goal,2016:4:1:1:before:0,2016:4:1:1:before,lean/library/init/meta/interactive.lean,2016,4,0,1786353986,"α : Sort u,\nβ : Sort v,\na₁ a₂ : α\n⊢ psum.in..."
3,tactic_state_goal,2016:4:1:1:after:0,2016:4:1:1:after,lean/library/init/meta/interactive.lean,2016,4,0,0,
4,tactic_state_goal,2010:4:1:1:before:0,2010:4:1:1:before,lean/library/init/meta/interactive.lean,2010,4,0,508157654,"α : Type u,\nβ : Type v,\na₁ a₂ : α\n⊢ sum.inl..."
...,...,...,...,...,...,...,...,...,...
362445,tactic_state_goal,74:30:2:1:after:0,74:30:2:1:after,mathlib/src/category_theory/abelian/projective...,74,30,0,0,
362446,tactic_state_goal,74:60:1:1:before:0,74:60:1:1:before,mathlib/src/category_theory/abelian/projective...,74,60,0,1843829988,"C : Type u,\n_inst_1 : category_theory.categor..."
362447,tactic_state_goal,74:60:1:1:after:0,74:60:1:1:after,mathlib/src/category_theory/abelian/projective...,74,60,0,0,
362448,tactic_state_goal,83:18:1:1:before:0,83:18:1:1:before,mathlib/src/category_theory/abelian/projective...,83,18,0,1135957380,"C : Type u,\n_inst_1 : category_theory.categor..."


`tactic_param_pos` includes position data for every *interactive* tactic parameter.  There are some parameters which are not included since they don't go through the parser.

In [6]:
tactic_param_pos = pd.read_json(
    DATA_DIRECTORY / "raw_traced_data" / "tactic_param_pos.jsonl", 
    orient='records',
    lines=True
)
tactic_param_pos

Unnamed: 0,table,key,line,filename,trace_pos_line,trace_pos_column,column,end_line,end_column
0,tactic_param_pos,1985:9,1985,lean/library/init/meta/interactive.lean,1983,6,9,1985,9
1,tactic_param_pos,1998:9,1998,lean/library/init/meta/interactive.lean,1983,6,9,1998,23
2,tactic_param_pos,1999:9,1999,lean/library/init/meta/interactive.lean,1983,6,9,1999,25
3,tactic_param_pos,2001:11,2001,lean/library/init/meta/interactive.lean,1983,6,11,2001,12
4,tactic_param_pos,2001:18,2001,lean/library/init/meta/interactive.lean,1983,6,18,2001,43
...,...,...,...,...,...,...,...,...,...
210066,tactic_param_pos,72:47,72,mathlib/src/category_theory/abelian/projective...,67,16,47,72,73
210067,tactic_param_pos,73:22,73,mathlib/src/category_theory/abelian/projective...,67,16,22,73,54
210068,tactic_param_pos,74:25,74,mathlib/src/category_theory/abelian/projective...,67,16,25,74,30
210069,tactic_param_pos,74:38,74,mathlib/src/category_theory/abelian/projective...,67,16,38,74,38


`tactic_param_value` includes a pretty printed version of the value passed to the tactic argument.  This is not necessarily how it appears in the lean code, but it is how Lean internally receives the data.  A user could add more tracing code to present this in more ways.

In [7]:
tactic_param_value = pd.read_json(
    DATA_DIRECTORY / "raw_traced_data" / "tactic_param_value.jsonl",
    orient='records',
    lines=True
)
tactic_param_value

Unnamed: 0,table,key,reflected_expr_pp,filename,trace_pos_line,trace_pos_column
0,tactic_param_value,1985:9,list.nil,lean/library/init/meta/interactive.lean,1983,6
1,tactic_param_value,1998:9,``(_root_.propext),lean/library/init/meta/interactive.lean,1983,6
2,tactic_param_value,1999:9,``(_root_.iff.intro),lean/library/init/meta/interactive.lean,1983,6
3,tactic_param_value,2001:11,"some (name.mk_string ""_"" name.anonymous)",lean/library/init/meta/interactive.lean,1983,6
4,tactic_param_value,2001:34,some (),lean/library/init/meta/interactive.lean,1983,6
...,...,...,...,...,...,...
210066,tactic_param_value,72:47,``(projective.projective_over),mathlib/src/category_theory/abelian/projective...,67,16
210067,tactic_param_value,73:22,some ``(exact_d_f (projective.π Z)),mathlib/src/category_theory/abelian/projective...,67,16
210068,tactic_param_value,74:25,sum.inl\n [tactic.rcases_patt.alts\n [ta...,mathlib/src/category_theory/abelian/projective...,67,16
210069,tactic_param_value,74:38,interactive.loc.ns [none],mathlib/src/category_theory/abelian/projective...,67,16


## Extracted Proof Data

`proof_trees.json` is a list of abstract syntax trees for each proof.  It doesn't contain much data itself, but has keys to rows in the following tables.  (Unlike the other datasets, this is not a 2D table.)

In [8]:
with jsonlines.open(DATA_DIRECTORY / "extracted_proof_data" / "proof_trees.jsonl") as f:
    proof_trees = list(f)
proof_trees[0]

{'key': 'lean/library/data/buffer.lean:49:1',
 'node_type': 'proof',
 'node_subtype': 'by',
 'tactic': {'key': 'lean/library/data/buffer.lean:49:30',
  'node_type': 'tactic',
  'node_subtype': 'semicolon',
  'tactic1': {'key': 'lean/library/data/buffer.lean:49:11',
   'node_type': 'tactic',
   'node_subtype': 'semicolon',
   'tactic1': {'key': 'lean/library/data/buffer.lean:49:4',
    'node_type': 'tactic',
    'node_subtype': 'named',
    'args': [{'key': 'lean/library/data/buffer.lean:49:10',
      'node_type': 'tactic_arg',
      'node_subtype': 'expression'},
     {'key': 'lean/library/data/buffer.lean:49:11',
      'node_type': 'tactic_arg',
      'node_subtype': 'expression'}]},
   'tactic2': {'key': 'lean/library/data/buffer.lean:49:13',
    'node_type': 'tactic',
    'node_subtype': 'named',
    'args': [{'key': 'lean/library/data/buffer.lean:49:20',
      'node_type': 'tactic_arg',
      'node_subtype': 'expression'},
     {'key': 'lean/library/data/buffer.lean:49:30',
      '

`proofs.json` contains the Lean human-written text for all tactic proofs along with some other extracted information.

In [9]:
proofs = pd.read_json(
    DATA_DIRECTORY / "extracted_proof_data" / "proofs.jsonl",
    orient="records",
    lines=True
)
proofs

Unnamed: 0,key,filename,start_line,start_column,end_line,end_column,code_string,class,parent_key,parent_type,index,line,column,first_tactic_key
0,lean/library/data/buffer.lean:49:1,lean/library/data/buffer.lean,49,1,51,1,by cases b; unfold read read'; simp [array.rea...,by,,,0,49,1,lean/library/data/buffer.lean:49:30
1,lean/library/data/buffer.lean:53:1,lean/library/data/buffer.lean,53,1,55,1,by cases b; unfold write write'; simp [array.w...,by,,,0,53,1,lean/library/data/buffer.lean:53:32
2,lean/library/data/buffer/parser.lean:29:1,lean/library/data/buffer/parser.lean,29,1,36,1,"begin apply funext, intro input, apply funext,...",begin,,,0,29,1,lean/library/data/buffer/parser.lean:30:1
3,lean/library/data/buffer/parser.lean:38:1,lean/library/data/buffer/parser.lean,38,1,47,1,"begin apply funext, intro input, apply funext,...",begin,,,0,38,1,lean/library/data/buffer/parser.lean:39:1
4,lean/library/data/dlist.lean:50:26,lean/library/data/dlist.lean,50,26,50,63,"by abstract { intros, simp, rw [←h] }",by,,,0,50,26,lean/library/data/dlist.lean:50:29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41151,mathlib/src/topology/vector_bundle.lean:222:24,mathlib/src/topology/vector_bundle.lean,222,24,223,89,"by { rw [←continuous_iff_continuous_on_univ, c...",by,,,0,222,24,mathlib/src/topology/vector_bundle.lean:222:27
41152,mathlib/src/topology/vector_bundle.lean:224:25,mathlib/src/topology/vector_bundle.lean,224,25,226,23,"by { rw [←continuous_iff_continuous_on_univ, c...",by,,,0,224,25,mathlib/src/topology/vector_bundle.lean:224:28
41153,mathlib/src/topology/vector_bundle.lean:230:16,mathlib/src/topology/vector_bundle.lean,230,16,230,45,by simp only [univ_prod_univ],by,,,0,230,16,mathlib/src/topology/vector_bundle.lean:230:19
41154,mathlib/src/topology/vector_bundle.lean:237:21,mathlib/src/topology/vector_bundle.lean,237,21,242,6,"begin have : (λ (x : trivial B F b), x) = @id ...",begin,,,0,237,21,mathlib/src/topology/vector_bundle.lean:238:5


`tactics.json` contains the Lean human-written text for all tactic commands along with some other extracted information.

In [10]:
tactics = pd.read_json(
    DATA_DIRECTORY / "extracted_proof_data" / "tactics.jsonl", 
    orient='records',
    lines=True
)
tactics

Unnamed: 0,key,filename,start_line,start_column,end_line,end_column,code_string,class,parent_key,parent_type,index,line,column,proof_key,trace_key
0,lean/library/data/buffer.lean:49:4,lean/library/data/buffer.lean,49,4,49,11,cases b,named,lean/library/data/buffer.lean:49:11,tactic,0,49,4,lean/library/data/buffer.lean:49:1,49:4:3
1,lean/library/data/buffer.lean:49:13,lean/library/data/buffer.lean,49,13,49,30,unfold read read',named,lean/library/data/buffer.lean:49:11,tactic,1,49,13,lean/library/data/buffer.lean:49:1,49:13:3
2,lean/library/data/buffer.lean:49:11,lean/library/data/buffer.lean,49,4,49,30,cases b; unfold read read',semicolon,lean/library/data/buffer.lean:49:30,tactic,0,49,11,lean/library/data/buffer.lean:49:1,49:11:2
3,lean/library/data/buffer.lean:49:32,lean/library/data/buffer.lean,49,32,51,1,simp [array.read_eq_read'],named,lean/library/data/buffer.lean:49:30,tactic,1,49,32,lean/library/data/buffer.lean:49:1,49:32:2
4,lean/library/data/buffer.lean:49:30,lean/library/data/buffer.lean,49,4,51,1,cases b; unfold read read'; simp [array.read_e...,semicolon,lean/library/data/buffer.lean:49:1,proof,0,49,30,lean/library/data/buffer.lean:49:1,49:11:1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175186,mathlib/src/topology/vector_bundle.lean:238:48,mathlib/src/topology/vector_bundle.lean,238,48,238,66,"by { ext x, refl }",solve1,mathlib/src/topology/vector_bundle.lean:237:21,proof,1,238,48,mathlib/src/topology/vector_bundle.lean:237:21,
175187,mathlib/src/topology/vector_bundle.lean:239:5,mathlib/src/topology/vector_bundle.lean,239,5,241,18,"simp only [total_space.topological_space, indu...",named,mathlib/src/topology/vector_bundle.lean:237:21,proof,2,239,5,mathlib/src/topology/vector_bundle.lean:237:21,239:5:1
175188,mathlib/src/topology/vector_bundle.lean:238:53,mathlib/src/topology/vector_bundle.lean,238,53,238,58,ext x,named,mathlib/src/topology/vector_bundle.lean:238:51,tactic,0,238,53,mathlib/src/topology/vector_bundle.lean:238:48,238:53:3
175189,mathlib/src/topology/vector_bundle.lean:238:60,mathlib/src/topology/vector_bundle.lean,238,60,238,65,refl,named,mathlib/src/topology/vector_bundle.lean:238:51,tactic,1,238,60,mathlib/src/topology/vector_bundle.lean:238:48,238:60:3


`args.json` contains the Lean human-written text for all tactic arguements along with some other extracted information.

In [11]:
args = pd.read_json(
    DATA_DIRECTORY / "extracted_proof_data" / "args.jsonl", 
    orient="records",
    lines=True
)
args

Unnamed: 0,key,filename,start_line,start_column,end_line,end_column,code_string,class,parent_key,parent_type,index,line,column
0,lean/library/data/buffer.lean:49:10,lean/library/data/buffer.lean,49,10,49,11,b,expression,lean/library/data/buffer.lean:49:4,tactic,0,49,10
1,lean/library/data/buffer.lean:49:11,lean/library/data/buffer.lean,49,11,49,11,,expression,lean/library/data/buffer.lean:49:4,tactic,1,49,11
2,lean/library/data/buffer.lean:49:20,lean/library/data/buffer.lean,49,20,49,30,read read',expression,lean/library/data/buffer.lean:49:13,tactic,0,49,20
3,lean/library/data/buffer.lean:49:30,lean/library/data/buffer.lean,49,30,49,30,,expression,lean/library/data/buffer.lean:49:13,tactic,1,49,30
4,lean/library/data/buffer.lean:49:37,lean/library/data/buffer.lean,49,37,51,1,[array.read_eq_read'],expression,lean/library/data/buffer.lean:49:32,tactic,0,49,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...
211820,mathlib/src/topology/vector_bundle.lean:239:10,mathlib/src/topology/vector_bundle.lean,239,10,239,15,only,expression,mathlib/src/topology/vector_bundle.lean:239:5,tactic,0,239,10
211821,mathlib/src/topology/vector_bundle.lean:239:15,mathlib/src/topology/vector_bundle.lean,239,15,241,18,"[total_space.topological_space, induced_inf, i...",expression,mathlib/src/topology/vector_bundle.lean:239:5,tactic,1,239,15
211822,mathlib/src/topology/vector_bundle.lean:241:18,mathlib/src/topology/vector_bundle.lean,241,18,241,18,,expression,mathlib/src/topology/vector_bundle.lean:239:5,tactic,2,241,18
211823,mathlib/src/topology/vector_bundle.lean:238:57,mathlib/src/topology/vector_bundle.lean,238,57,238,58,x,expression,mathlib/src/topology/vector_bundle.lean:238:53,tactic,0,238,57


## Examples of combining data

Here we combine the above data sources to make a table containing:
* First goals in the goal stack before the tactic is executed
* The human written tactic command
* A little metadata about the type of tactic.  If a "named" tactic, then the tactic name is the first word in the tactic command.

In [12]:
df = tactic_state_goal.copy()
df = df[df['ix'] == 0]
#df['key'] = df['filename'] + ":" + df['key']
df['tactic_state_key'] = df['filename'] + ":" + df['tactic_state']
df = df[['tactic_state_key', 'goal_pp']]
df = df.set_index('tactic_state_key')

df2 = tactic_state.copy()
df2 = df2[df2['before_after'] == 'before']
df2['tactic_state_key'] = df2['filename'] + ":" + df2['key']
df2['tactic_instance_key'] = df2['filename'] + ":" + df2['tactic_instance']
df2['tactic_key'] = df2['tactic_instance_key'].apply(lambda k: ":".join(k.split(":")[:-1]))
df2 = df2[['tactic_state_key', 'tactic_instance_key', 'tactic_key']]
df2 = df2.set_index('tactic_state_key')

df = df.join(df2)
df = df.set_index('tactic_key')
df.head()

df3 = tactics.copy()
df3['tactic_key'] = df3['filename'] + ":" + df3['trace_key']
df3 = df3[['tactic_key', 'code_string', 'class']]
df3 = df3.set_index('tactic_key')
df = df.join(df3)
df = df.reset_index()
df = df[['goal_pp', 'code_string', 'class']]
df = df.rename(columns={'class': 'tactic_class', 'code_string': 'human_tactic_code'})
# some tactics are recorded by the tracing code, but correspond to tactics 
# instances in a `[...] block in, possibly, another file.  They are dropped here.
df = df.dropna()  

goals_and_tactics = df
goals_and_tactics

Unnamed: 0,goal_pp,human_tactic_code,tactic_class
0,"α : Type u,\n_inst_1 : inhabited α,\nb : buffe...",cases b; unfold read read'; simp [array.read_e...,semicolon
1,"α : Type u,\n_inst_1 : inhabited α,\nb : buffe...",cases b; unfold read read',semicolon
2,"α : Type u,\n_inst_1 : inhabited α,\ni b_fst :...",unfold read read',named
3,"α : Type u,\n_inst_1 : inhabited α,\ni b_fst :...",simp [array.read_eq_read'],named
4,"α : Type u,\n_inst_1 : inhabited α,\nb : buffe...",cases b,named
...,...,...,...
183363,"R : Type u_1,\nB : Type u_2,\nF : Type u_3,\n_...",refl,named
183364,"R : Type u_1,\nB : Type u_2,\nF : Type u_3,\n_...",refl,named
183365,"R : Type u_1,\nB : Type u_2,\nF : Type u_3,\n_...","{ ext x, refl }",solve1
183366,"R : Type u_1,\nB : Type u_2,\nF : Type u_3,\n_...","{ ext x, refl }",solve1


Each named tactic command can have a number of arguments.  Here we seperate the arguments showing both human supplied argument and the internal representation used by Lean.

In [13]:
# the lean internal values are indexed by the starting position of
# the parameter argument.  Unfortionately, because of zero length
# arguments, two different arguements can have the same starting
# position.  We add an cumulative count to address this.
df = tactic_param_value.copy()
df['key'] = df['filename'] + ":" + df['key']
df['cnt'] = df.groupby('key').cumcount()
df['key2'] = df['key'] + ":" + df['cnt'].astype(str)
df = df.set_index('key2')
df = df[['reflected_expr_pp']]

df2 = args.copy()
# args has duplicates since there is a 
# line for each tactic execution instance
df2 = df2.drop_duplicates()  
df2['cnt'] = df2.groupby('key').cumcount()
df2['key2'] = df2['key'] + ":" + df2['cnt'].astype(str)
df2 = df2.set_index('key2')
df2 = df2.rename(columns={'code_string': 'human_tactic_arg'})
df2 = df2[['parent_key', 'index', 'human_tactic_arg']]

df3 = df2.join(df).reset_index()[['parent_key', 'index', 'human_tactic_arg', 'reflected_expr_pp']]
df3 = df3.set_index('parent_key')
df3 = df3.pivot_table(
    values=['human_tactic_arg', 'reflected_expr_pp'],
    index='parent_key',
    columns='index',
    aggfunc='first'
)

df4 = tactics.copy()
df4 = df4[df4['class'] == "named"]
df4 = df4[['key', 'code_string']]
df4 = df4.rename(columns={'code_string': 'human_tactic_code'})
df4 = df4.set_index('key')

parsed_tactics = df4.join(df3)
parsed_tactics = parsed_tactics.set_index('human_tactic_code')
parsed_tactics.columns = pd.MultiIndex.from_tuples(parsed_tactics.columns)
parsed_tactics



Unnamed: 0_level_0,human_tactic_arg,human_tactic_arg,human_tactic_arg,human_tactic_arg,human_tactic_arg,reflected_expr_pp,reflected_expr_pp,reflected_expr_pp,reflected_expr_pp,reflected_expr_pp
Unnamed: 0_level_1,0,1,2,3,4,0,1,2,3,4
human_tactic_code,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
unfold read read',read read',,,,,"[name.mk_string ""read"" name.anonymous, name.mk...",interactive.loc.ns [none],,,
simp [array.read_eq_read'],[array.read_eq_read'],,,,,[tactic.simp_arg_type.expr ``(array.read_eq_re...,interactive.loc.ns [none],,,
cases b,b,,,,,"(none name, ``(b))",list.nil,,,
unfold write write',write write',,,,,"[name.mk_string ""write"" name.anonymous, name.m...",interactive.loc.ns [none],,,
simp [array.write_eq_write'],[array.write_eq_write'],,,,,[tactic.simp_arg_type.expr ``(array.write_eq_w...,interactive.loc.ns [none],,,
...,...,...,...,...,...,...,...,...,...,...
ext x,x,,,,,"[tactic.rcases_patt.one (name.mk_string ""x"" na...",none,,,
ext x,x,,,,,"[tactic.rcases_patt.one (name.mk_string ""x"" na...",none,,,
refl,,,,,,,,,,
refl,,,,,,,,,,


## Training data

Below are examples of the extacted and cleaned data used to train Lean GPT-f.

This is a combined CSV of the training data and metadata.  Note, that the train-valid-test split is deterministic based on a hash of the declaration name.  This makes it simple to combine this data with data from other sources.

In [14]:
data_and_metadata = pd.read_csv(DATA_DIRECTORY / "cleaned_training_data" / "data_and_metadata.csv")
data_and_metadata

Unnamed: 0.1,Unnamed: 0,goal_pp,decl_name,open_namespaces,filename,line,column,proof_key,human_tactic_code,tactic_class,cleaned_goal,split
0,0,"α : Type u,\n_inst_1 : inhabited α,\nb : buffe...",buffer.read_eq_read',buffer,lean/library/data/buffer.lean,49,30,lean/library/data/buffer.lean:49:1,cases b; unfold read read'; simp [array.read_e...,semicolon,"α : Type u,\t_inst_1 : inhabited α,\tb : buffe...",test
1,1,"α : Type u,\n_inst_1 : inhabited α,\nb : buffe...",buffer.read_eq_read',buffer,lean/library/data/buffer.lean,49,11,lean/library/data/buffer.lean:49:1,cases b; unfold read read',semicolon,"α : Type u,\t_inst_1 : inhabited α,\tb : buffe...",test
2,2,"α : Type u,\n_inst_1 : inhabited α,\ni b_fst :...",buffer.read_eq_read',buffer,lean/library/data/buffer.lean,49,13,lean/library/data/buffer.lean:49:1,unfold read read',named,"α : Type u,\t_inst_1 : inhabited α,\ti b_fst :...",test
3,3,"α : Type u,\n_inst_1 : inhabited α,\ni b_fst :...",buffer.read_eq_read',buffer,lean/library/data/buffer.lean,49,32,lean/library/data/buffer.lean:49:1,simp [array.read_eq_read'],named,"α : Type u,\t_inst_1 : inhabited α,\ti b_fst :...",test
4,4,"α : Type u,\n_inst_1 : inhabited α,\nb : buffe...",buffer.read_eq_read',buffer,lean/library/data/buffer.lean,49,4,lean/library/data/buffer.lean:49:1,cases b,named,"α : Type u,\t_inst_1 : inhabited α,\tb : buffe...",test
...,...,...,...,...,...,...,...,...,...,...,...,...
181882,181882,"R : Type u_1,\nB : Type u_2,\nF : Type u_3,\n_...",topological_vector_bundle.trivial_bundle.topol...,topological_vector_bundle bundle set,mathlib/src/topology/vector_bundle.lean,238,60,mathlib/src/topology/vector_bundle.lean:237:21,refl,named,"R : Type u_1,\tB : Type u_2,\tF : Type u_3,\t_...",train
181883,181883,"R : Type u_1,\nB : Type u_2,\nF : Type u_3,\n_...",topological_vector_bundle.trivial_bundle.topol...,topological_vector_bundle bundle set,mathlib/src/topology/vector_bundle.lean,238,60,mathlib/src/topology/vector_bundle.lean:238:48,refl,named,"R : Type u_1,\tB : Type u_2,\tF : Type u_3,\t_...",train
181884,181884,"R : Type u_1,\nB : Type u_2,\nF : Type u_3,\n_...",topological_vector_bundle.trivial_bundle.topol...,topological_vector_bundle bundle set,mathlib/src/topology/vector_bundle.lean,238,51,mathlib/src/topology/vector_bundle.lean:237:21,"{ ext x, refl }",solve1,"R : Type u_1,\tB : Type u_2,\tF : Type u_3,\t_...",train
181885,181885,"R : Type u_1,\nB : Type u_2,\nF : Type u_3,\n_...",topological_vector_bundle.trivial_bundle.topol...,topological_vector_bundle bundle set,mathlib/src/topology/vector_bundle.lean,238,51,mathlib/src/topology/vector_bundle.lean:238:48,"{ ext x, refl }",solve1,"R : Type u_1,\tB : Type u_2,\tF : Type u_3,\t_...",train


For each of train, valid, and test there are seperate files:

`train.src` is the source (input) data for training (one training example per line).

In [15]:
with (DATA_DIRECTORY / "cleaned_training_data" / "train.src").open() as f:
    train_src = list(f)
train_src[:10]

["α : Type u,\tb : buffer α,\ti : ℕ,\th : i < b.size,\tv : α\t⊢ b.write ⟨i, h⟩ v = b.write' i v\n",
 "α : Type u,\tb : buffer α,\ti : ℕ,\th : i < b.size,\tv : α\t⊢ b.write ⟨i, h⟩ v = b.write' i v\n",
 "α : Type u,\ti : ℕ,\tv : α,\tb_fst : ℕ,\tb_snd : array b_fst α,\th : i < buffer.size ⟨b_fst, b_snd⟩\t⊢ buffer.write ⟨b_fst, b_snd⟩ ⟨i, h⟩ v = buffer.write' ⟨b_fst, b_snd⟩ i v\n",
 "α : Type u,\ti : ℕ,\tv : α,\tb_fst : ℕ,\tb_snd : array b_fst α,\th : i < buffer.size ⟨b_fst, b_snd⟩\t⊢ ⟨b_fst, b_snd.write ⟨i, h⟩ v⟩ = ⟨b_fst, b_snd.write' i v⟩\n",
 "α : Type u,\tb : buffer α,\ti : ℕ,\th : i < b.size,\tv : α\t⊢ b.write ⟨i, h⟩ v = b.write' i v\n",
 'α : Type,\tp : parser α\t⊢ ∀ (x : char_buffer), p.bind parser.pure x = p x\n',
 'α : Type,\tp : parser α\t⊢ p.bind parser.pure = p\n',
 'α : Type,\tp : parser α,\tinput : char_buffer\t⊢ ∀ (x : ℕ), p.bind parser.pure input x = p input x\n',
 'α : Type,\tp : parser α,\tinput : char_buffer\t⊢ p.bind parser.pure input = p input\n',
 'α : Type,\tp : par

`train.tgt` is the target (output) data for training (one training example per line).

In [16]:
with (DATA_DIRECTORY / "cleaned_training_data" / "train.tgt").open() as f:
    train_tgt = list(f)
train_tgt[:10]

["cases b; unfold write write'; simp [array.write_eq_write']\n",
 "cases b; unfold write write'\n",
 "unfold write write'\n",
 "simp [array.write_eq_write']\n",
 'cases b\n',
 'intro input\n',
 'apply funext\n',
 'intro pos\n',
 'apply funext\n',
 'dunfold parser.bind\n']

`train.names` includes all declaration (theorem) names used for splitting the data.

In [17]:
with (DATA_DIRECTORY / "cleaned_training_data" / "train.names").open() as f:
    train_names = list(f)
train_names[-10:]

['urysohns.CU.approx_mem_Icc_right_left urysohns.CU urysohns filter topological_space set\n',
 'urysohns.CU.approx_le_succ urysohns.CU urysohns filter topological_space set\n',
 'urysohns.CU.lim_of_mem_C urysohns.CU urysohns filter topological_space set\n',
 'urysohns.CU.lim_of_nmem_U urysohns.CU urysohns filter topological_space set\n',
 'urysohns.CU.lim_eq_midpoint urysohns.CU urysohns filter topological_space set\n',
 'urysohns.CU.continuous_lim urysohns.CU urysohns filter topological_space set\n',
 'topological_vector_bundle.trivialization.continuous_linear_equiv_at topological_vector_bundle bundle set\n',
 "topological_vector_bundle.trivialization.continuous_linear_equiv_at_apply' topological_vector_bundle bundle set\n",
 'topological_vector_bundle.trivial_bundle_trivialization topological_vector_bundle bundle set\n',
 'topological_vector_bundle.trivial_bundle.topological_vector_bundle topological_vector_bundle bundle set\n']

`train.index` contains identifiers for each datapoint (based on the source and declaration name).  _Warning: These identifiers may not be unique._

In [18]:
with (DATA_DIRECTORY / "cleaned_training_data" / "train.index").open() as f:
    train_index = list(f)
train_index[:10]

['{"src": "\\u03b1 : Type u,\\tb : buffer \\u03b1,\\ti : \\u2115,\\th : i < b.size,\\tv : \\u03b1\\t\\u22a2 b.write \\u27e8i, h\\u27e9 v = b.write\' i v", "decl_nm": "buffer.write_eq_write\'"}\n',
 '{"src": "\\u03b1 : Type u,\\tb : buffer \\u03b1,\\ti : \\u2115,\\th : i < b.size,\\tv : \\u03b1\\t\\u22a2 b.write \\u27e8i, h\\u27e9 v = b.write\' i v", "decl_nm": "buffer.write_eq_write\'"}\n',
 '{"src": "\\u03b1 : Type u,\\ti : \\u2115,\\tv : \\u03b1,\\tb_fst : \\u2115,\\tb_snd : array b_fst \\u03b1,\\th : i < buffer.size \\u27e8b_fst, b_snd\\u27e9\\t\\u22a2 buffer.write \\u27e8b_fst, b_snd\\u27e9 \\u27e8i, h\\u27e9 v = buffer.write\' \\u27e8b_fst, b_snd\\u27e9 i v", "decl_nm": "buffer.write_eq_write\'"}\n',
 '{"src": "\\u03b1 : Type u,\\ti : \\u2115,\\tv : \\u03b1,\\tb_fst : \\u2115,\\tb_snd : array b_fst \\u03b1,\\th : i < buffer.size \\u27e8b_fst, b_snd\\u27e9\\t\\u22a2 \\u27e8b_fst, b_snd.write \\u27e8i, h\\u27e9 v\\u27e9 = \\u27e8b_fst, b_snd.write\' i v\\u27e9", "decl_nm": "buffer.w