In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import tqdm
import json

from codebook.python import *
from codebook.semantics import DSNotebooks as DSN

# Configure the codebook Evaluator before any query is run.
# NOTE(review): presumably this selects the GitHub-2017 data-science notebook
# dataset (inferred from the method name) — confirm against codebook.
Evaluator.use_ds_gh_2017()

In [2]:
import glob
import xxhash
import os

# Collect every non-blank, whitespace-stripped line from each chunk's listing.txt.
fpaths = []
for chunk_dir in glob.glob('/data/gh-2017/chunked/chunk-*'):
  listing_path = '{}/listing.txt'.format(chunk_dir)
  with open(listing_path) as listing:
    for raw_line in listing.readlines():
      entry = raw_line.strip()
      if len(entry) > 0:
        fpaths.append(entry)

def hashit(thing):
    """Return the seeded xxh64 digest of `thing` as a signed 64-bit integer.

    The digest is produced as an unsigned 64-bit value; values with the top
    bit set are mapped to their two's-complement negative counterpart.
    """
    unsigned = xxhash.xxh64(thing, seed=3235823838).intdigest()
    if unsigned >= (1 << 63):
        return unsigned - (1 << 64)
    return unsigned
  
# Key every listed path by the hash of its 'parsed/<basename>.xml.gz' name;
# when two paths produce the same key the later one wins.
fids_to_fpaths = {
  hashit('parsed/' + os.path.basename(listed_path) + '.xml.gz'): listed_path
  for listed_path in fpaths
}

# Hand the lookup table to the Evaluator.
Evaluator._fids_to_fpaths = fids_to_fpaths

In [None]:
import time 

# (name, query) pairs: `name` is only used for progress logging, `query` is a
# DSN callable invoked with no arguments whose result is expected to expose
# 'gid' and 'pretty' columns.
# Each operator must be listed exactly once: a duplicate entry re-runs the
# query and duplicates its rows in `all_ops` ('astype' was previously listed
# twice).
OPS = [
  ('filters', DSN.filters),
  ('single_compares', DSN.single_compares),
  ('projections', DSN.projections),
  ('pandas_reads', DSN.pandas_reads),
  ('likely_labels', DSN.likely_labels),
  ('likely_values', DSN.likely_values),
  ('column_additions', DSN.column_additions),
  ('sklearn_fits', DSN.sklearn_fits),
  ('train_test_splits', DSN.train_test_splits),
  ('sklearn_fit_transforms', DSN.sklearn_fit_transforms),
  ('applies', DSN.applies),
  ('copies', DSN.copies),
  ('fillna', DSN.fillna),
  ('as_matrix', DSN.as_matrix),
  ('array', DSN.array),
  ('reshape', DSN.reshape),
  ('drops', DSN.drops),
  ('astype', DSN.astype),
  ('dropna', DSN.dropna),
  ('replace', DSN.replace),
  ('new_data_frame', DSN.new_data_frame),
  ('join', DSN.join),
  ('merge', DSN.merge),
  ('reset_indices', DSN.reset_indices),
  ('get_dummies', DSN.get_dummies),
  ('maps', DSN.maps),
  ('sklearn_transforms', DSN.sklearn_transforms),
]

# Run every operator query in order, logging its wall-clock duration, and keep
# the returned frames.
res_frames = []
for op_name, run_query in OPS:
  print('Working on op:={}'.format(op_name))
  started_at = time.perf_counter()
  res_frames.append(run_query())
  elapsed_time = time.perf_counter() - started_at
  print(f"Total time: {elapsed_time:.4f}s")

# Stack the (gid, pretty) columns of all operator results into one frame.
gid_pretty_frames = [frame[['gid', 'pretty']] for frame in res_frames]
all_ops = pd.concat(gid_pretty_frames)

In [None]:
# Add "unmodeled" calls
all_calls = DSN.exec(
  call() % select('name') % 'call'
)

# Keep only the calls we don't have models for and derive the (gid, pretty)
# columns in one step. `.assign` returns a new frame, so no columns are set on
# a filtered slice (which would raise SettingWithCopyWarning).
all_calls = all_calls[~all_calls.gid_call.isin(all_ops.gid)].assign(
  pretty=lambda calls: 'Unmodeled[' + calls.out_name_call + ']',
  gid=lambda calls: calls.gid_call,
)

# Add these now as "unmodeled" operators
all_ops = pd.concat([all_ops, all_calls[['gid', 'pretty']]])



In [None]:
# Compute the flow rows that later cells persist to CSV and join with `all_ops`.
# NOTE(review): presumably these are data flows from pandas reads to sklearn
# fits (inferred from the method name) — confirm against codebook.
print('Getting flows...')
flows_df = DSN.flows_reads_to_fits()
print('  + Got flows!')

In [None]:
# Persist the flows frame as CSV; index=False leaves the row index out of the file.
flows_df.to_csv('/data/gh-2017/results/new-flows-df.csv', index=False)

In [None]:
# Persist the operator (gid, pretty) frame as CSV; index=False leaves the row index out of the file.
all_ops.to_csv('/data/gh-2017/results/new-all-ops.csv', index=False)

In [None]:
print('Joining ops and flows...')
# Look up operator labels by node gid. The lookup has to be keyed on `gid`:
# `all_ops` is a concatenation of frames and its own index is just positional,
# so joining flow gids against that index matches the wrong rows.
# NOTE(review): if one gid was matched by several operators only the first
# label is kept, so each flow row stays a single row — confirm this is the
# desired tie-break.
pretty_by_gid = all_ops.drop_duplicates(subset='gid').set_index('gid')['pretty']

# Start from the flow rows themselves so that every column used for sorting,
# grouping and aggregation below is present and row-aligned with the labels.
# NOTE(review): assumes flows_df provides fid, gid_source, gid_sink, gid_flow,
# out_to_flow, out_edge_flow, start_line_flow and start_col_flow — confirm.
new_df = flows_df.copy()

print('1')
# Label of the node each flow edge starts from (NaN when the gid has no operator).
new_df['flow_pretty'] = new_df['gid_flow'].map(pretty_by_gid)

print('2')
# Label of the node each flow edge points to (NaN when the gid has no operator).
new_df['out_to_pretty'] = new_df['out_to_flow'].map(pretty_by_gid)
print('  + Joined!')

print('Grouping by....')
# One row per (file, source, sink) triple; each selected column becomes a list
# of per-edge values ordered by source position.
implicit_flows = new_df.sort_values(
  ['start_line_flow', 'start_col_flow']
).groupby(
  ['fid', 'gid_source', 'gid_sink']
)[[
  'gid_flow', 'flow_pretty',
  'out_to_flow', 'out_to_pretty',
  'out_edge_flow'
]].agg(list)
print('  + Grouped!')

In [None]:
import networkx as nx
from networkx.drawing.nx_pydot import read_dot
from networkx.drawing.nx_agraph import to_agraph

# Build one directed graph per (fid, gid_source, gid_sink) group.
graphs = []

for _, row in implicit_flows.iterrows():
  tmp_g = nx.DiGraph()
  # Each group row holds five parallel lists. Zip them by column name so every
  # step yields the flat 5-tuple (from gid, from label, to gid, to label, edge
  # label); unpacking it as three nested items raises ValueError.
  flow_edges = zip(
    row['gid_flow'], row['flow_pretty'],
    row['out_to_flow'], row['out_to_pretty'],
    row['out_edge_flow']
  )
  for ga, opa, gb, opb, edge in flow_edges:
    tmp_g.add_node(ga, label=opa)
    tmp_g.add_node(gb, label=opb)
    tmp_g.add_edge(ga, gb, label=edge)
  graphs.append(tmp_g)

In [None]:
def cleanup_graph(target):
  """Drop redundant 'direct-use' edges from `target` in place and return it.

  For each node, the incoming edges labelled 'direct-use' and 'non-pure' are
  looked up (when several edges share a label, the last one seen is used).
  If both exist and the graph also has an edge from the direct-use source to
  the non-pure source, the direct-use edge is scheduled for removal. Removals
  are applied only after every node has been inspected.
  """

  def last_incoming_by_label(node):
    """Return the last 'direct-use' and 'non-pure' edges entering `node`."""
    direct, impure = None, None
    for src, dst in target.in_edges(node):
      tag = target.edges()[src, dst]['label']
      if tag == 'direct-use':
        direct = (src, dst)
      elif tag == 'non-pure':
        impure = (src, dst)
    return direct, impure

  redundant = []
  for node in list(target.nodes()):
    direct, impure = last_incoming_by_label(node)
    if direct is not None and impure is not None:
      if (direct[0], impure[0]) in target.edges():
        redundant.append(direct)

  for src, dst in redundant:
    target.remove_edge(src, dst)

  return target
      
  
# Clean every graph (each is modified in place and returned by cleanup_graph).
graphs = [cleanup_graph(flow_graph) for flow_graph in graphs]

In [None]:
# Serialise each graph via to_agraph, one '---'-separated block per graph,
# printing a running i/total counter.
with open('/app/flow.txt', 'w') as out_file:
  position = 0
  for flow_graph in list(graphs):
    position += 1
    print('{}/{}'.format(position, len(graphs)))
    rendered = str(to_agraph(flow_graph))
    out_file.write(rendered + '\n---\n\n')


In [3]:
# Run a codebook query and keep the result in `tmp`; the next cell reads its
# `source_text_sb` and `source_text_se` columns.
# NOTE(review): judging by the combinator names, this matches subscript
# expressions (labelled 'op') whose value is an attribute access named `loc`
# and whose first subscript is a slice with two string children (labelled
# 'sb' and 'se'), i.e. `x.loc['a':'b']` — confirm against the codebook DSL.
tmp = DSN.exec(
  subscript() % 'op'
  |where| the_value_is(
    attribute()
    |where| the_attribute_is(
      identifier(with_text('loc'))
    )
  )
  |and_w| the_first_subscript_is(
    slice_() 
    |where| the_first_child_is(string() % 'sb')
    |and_w| the_second_child_is(string() % 'se')
  )
)

  + Query already compiled (cached) `/tmp/queries/fd9ad17a774094f4ddac14f2d1bb905deaed13ec9d06d2ded217bc929f2cd70b.dl`
  - TODO: re-enable writing pre-filters.
  + File select time: 0.0000s
  + Query time: 44.9660s
  + Collation time: 2.3608s
Total time: 47.3517s


In [4]:
# Build a 'ProjectRowSlice[<start>:<end>]' label per matched row by
# concatenating the captured source text of the two slice bounds; being the
# last expression, the result is what the cell displays.
'ProjectRowSlice[' + tmp.source_text_sb + ':' + tmp.source_text_se + ']'

0                ProjectRowSlice['20130102':'20130104']
1            ProjectRowSlice['2015-01-01':'2015-06-26']
2            ProjectRowSlice['2017-05-01':'2017-05-31']
3       ProjectRowSlice["735 Dolores St":"332 Hill St"]
4                ProjectRowSlice['20130102':'20130105']
                             ...                       
2471          ProjectRowSlice['m_pred[0]':'m_pred[49]']
2472                         ProjectRowSlice['FL':'HI']
2473            ProjectRowSlice['1995-12-01':'1996-01']
2474                         ProjectRowSlice['ND':'WY']
2475                     ProjectRowSlice['1995':'1996']
Length: 2476, dtype: object