In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import json

# Notebook-wide switch forwarded as the `compile=` argument to the
# `execute(...)` calls that reference it.
COMPILE = False

In [2]:
from codebook.python import *

def qualified_call(i, c):
  """Query for calls named `c` made through an import named `i`.

  The pattern matches `call(with_name(c))` nodes (labelled 'call') whose
  call target is a use of `imports(with_name(i))`; for example
  `qualified_call('pandas', 'read_csv')`.

  Args:
    i: Name of the import to match (e.g. 'pandas').
    c: Name of the called function (e.g. 'read_csv').

  Returns:
    Whatever `execute(...)` returns for the query, with two extra columns
    added via `.assign`: `call_type` (set to `c`) and `import_type`
    (set to `i`).
  """
  # Keep the infix chain exactly as written: the DSL relies on `%` binding
  # tighter than `|` and on left-to-right evaluation of the `|op|` pairs.
  query = (
    call(with_name(c)) % 'call'
    |where| call_target()
      |isa| use_of(imports(with_name(i)))
  )
  # COMPILE is looked up when the function is called, not when it is defined.
  return execute(query, compile=COMPILE).assign(call_type=c, import_type=i)

# Run the same qualified-call query once per pandas.read_* method.
# The target names and the method names are listed in matching order.
(
  pd_read_csv, pd_read_excel, pd_read_fwf,
  pd_read_json, pd_read_pickle, pd_read_sql,
  pd_read_table,
) = [
  qualified_call('pandas', reader)
  for reader in (
    'read_csv', 'read_excel', 'read_fwf',
    'read_json', 'read_pickle', 'read_sql',
    'read_table',
  )
]

# Stack the per-method results into a single dataframe.
pd_reads = pd.concat([
  pd_read_csv,
  pd_read_excel,
  pd_read_fwf,
  pd_read_json,
  pd_read_pickle,
  pd_read_sql,
  pd_read_table,
])

  + File select time: 0.3378s
  + Query time: 5.4525s
  + Collation time: 0.0071s
Total time: 5.7989s
  + File select time: 0.3041s
  + Query time: 0.5521s
  + Collation time: 0.0042s
Total time: 0.8624s
  + File select time: 0.2626s
  + Query time: 0.4806s
  + Collation time: 0.0048s
Total time: 0.7495s
  + File select time: 0.2842s
  + Query time: 0.4843s
  + Collation time: 0.0052s
Total time: 0.7751s
  + File select time: 0.3033s
  + Query time: 0.5086s
  + Collation time: 0.0042s
Total time: 0.8175s
  + File select time: 0.2670s
  + Query time: 0.4924s
  + Collation time: 0.0037s
Total time: 0.7646s
  + File select time: 0.2690s
  + Query time: 0.7396s
  + Collation time: 0.0046s
Total time: 1.0144s


In [3]:
from codebook.python import *

# Single-column projection: the only subscript is (a use of) a string.
# Intended to capture `df['column']` as well as
# `col = 'column' ; df[col]`.
single_col_projections = (
  subscript()
  |where| the_only_subscript_is(
    use_of(string() % 'columns')
  )
  |and_w| the_value_is(
    anything() % 'target'
  )
)

# Multi-column projection: the only subscript is (a use of) a list whose
# children are all strings. Intended to capture `df[['a', 'b', 'c']]` as
# well as `my_list = [ 'a', 'b', 'c' ] ; df[my_list]`.
multi_col_projections = (
  subscript()
  |where| the_only_subscript_is(
    use_of(list_(where_every_child_has_type('string')) % 'columns')
  )
  |and_w| the_value_is(
    anything() % 'target'
  )
)

# Execute both patterns (single-column first) and stack the results.
projections = pd.concat([
  execute(pattern, compile=COMPILE)
  for pattern in (single_col_projections, multi_col_projections)
])

# Derive the "columns" column from the captured source text.
# NOTE(review): presumably this parses each source snippet into Python
# values -- confirm against Utils.source_list_to_py_list.
projections['columns'] = Utils.source_list_to_py_list(projections, 'source_text_columns')

  + File select time: 0.0004s
  + Query time: 13.0588s
  + Collation time: 0.0273s
Total time: 13.0876s
  + File select time: 0.0003s
  + Query time: 2.6840s
  + Collation time: 0.0057s
Total time: 2.6914s


In [2]:
from codebook.python import *

# Field access written as a subscript, e.g. `target["field"]`.
# The subscripted value is constrained with same_text_as('target'); the
# string subscript's text is selected as 'field'.
as_subscript = (
  subscript()
  |where| the_value_is(anything(same_text_as('target')))
  |and_w| the_subscript_is(string() % select_as('text', 'field'))
)

# Field access written as an attribute, e.g. `target.field`.
# The object is constrained with same_text_as('target'); the attribute
# identifier's text is selected as 'field'.
as_attribute = (
  attribute()
  |where| the_object_is(anything(same_text_as('target')))
  |and_w| the_attribute_is(identifier() % select_as('text', 'field'))
)

def match_single_filter(inner):
  """Build a pattern capturing `target[{inner} <cmp> rhs]`.

  The pattern matches a subscript whose only subscript is (a use of) a
  comparison labelled 'filter'. That comparison must have `inner` as its
  first child, a literal (text selected as 'rhs') as its second child, and
  no third child. The subscripted value is labelled 'target'.

  Args:
    inner: Pattern for the left-hand side of the comparison, e.g.
      `as_subscript` or `as_attribute`.

  Returns:
    The composed pattern, suitable for passing to `execute`.
  """
  # Keep the infix chain exactly as written: the DSL relies on `%` binding
  # tighter than `|` and on left-to-right evaluation of the `|op|` pairs.
  return (
    subscript()
    |where| the_only_subscript_is(use_of(
        comparison() % 'filter'
        |where| the_first_child_is(inner)
        |and_w| the_second_child_is(
          literal() % select_as('text', 'rhs')
        )
        |and_w| no_third_child()
    ))
    |and_w| the_value_is(anything() % 'target')
  )

# Run the filter query once per field-access form (subscript first, then
# attribute) and stack the results.
# NOTE(review): `compile=True` is hard-coded here, whereas other cells pass
# the notebook-level COMPILE flag -- confirm this is intentional.
single_filters = pd.concat([
  execute(match_single_filter(accessor), compile=True)
  for accessor in (as_subscript, as_attribute)
])

# Derive the comparison operator from the filter's source text and its rhs.
single_filters['op'] = Utils.get_comp_op(single_filters, 'source_text_filter', 'rhs')

# Strip surrounding quote/backtick characters from the captured field names.
single_filters['field'] = single_filters['field'].str.strip('\'"`')

# Classify the right-hand-side literal.
single_filters['rhs_type'] = Utils.get_literal_type(single_filters, 'rhs')

  + File select time: 0.0004s
  + Compile time: 17.8010s
  + Query time: 76.6141s
  + Collation time: 0.0055s
Total time: 94.4240s
  + File select time: 0.0004s
  + Compile time: 18.0743s
  + Query time: 230.8967s
  + Collation time: 0.0049s
Total time: 248.9781s


In [4]:
# Preview the first 20 filters rendered as `Filter[<field><op><rhs_type>]`.
(
  'Filter['
  + single_filters['field']
  + single_filters['op']
  + single_filters['rhs_type']
  + ']'
).head(20)

0                          Filter[label==str]
1                          Filter[label==str]
2                          Filter[label==str]
3                          Filter[label==str]
4                          Filter[label==str]
5                          Filter[class==int]
6                          Filter[class>=int]
7          Filter[background occurrences>int]
8                           Filter[lang==str]
9                      Filter[paper1_LG==int]
10                      Filter[paralog==bool]
11                      Filter[paralog==bool]
12                        Filter[contig==str]
13                        Filter[contig==str]
14                        Filter[contig==str]
15                Filter[team_long_name==str]
16    Filter[TotalEconomicVulnerability>=int]
17    Filter[TotalEconomicVulnerability>=int]
18                    Filter[heavyTruck>=int]
19                          Filter[name==str]
dtype: object