In [None]:
import json

### Load Data

In [None]:
# Ground-truth questions file; the loading cell reads it one JSON object per line.
ground_truth_questions_file = '../../updated_pure_aggregate_V2.json'
# Lattice extractions dump; the loading cell also reads it one JSON object per line.
lattice_extractions_file = '../../../data/lattice_extractions/data_nyu_2016-11-14.json'

In [None]:
# Load Ground Truth
# The file is read as JSON Lines: every non-blank line is one JSON document.
ground_truth_questions = []
with open(ground_truth_questions_file, 'r') as f:
    for line in f:
        # A blank line (e.g. stray trailing newline) is not valid JSON; skip it
        if not line.strip():
            continue
        ground_truth_questions.append(json.loads(line))

# Quick sanity check to make sure "special" feature values
# have been selected. The length guard turns a too-short file into the same
# warning instead of an IndexError on ground_truth_questions[2].
if (len(ground_truth_questions) < 3
        or 'Sayville' not in ground_truth_questions[2]['question']):
    # Single-argument print(...) behaves the same on Python 2 and Python 3
    print("Check ground truth question file")

In [None]:
# Read the lattice extractions: one JSON document per line of the file
with open(lattice_extractions_file, 'r') as f:
    lattice_extractions = [json.loads(raw_line) for raw_line in f]

# Gather the distinct keys found under 'extractions' across all documents
feature_names = set()
for document in lattice_extractions:
    feature_names.update(document['extractions'].keys())

lattice_features = list(feature_names)

In [None]:
# Lattice features that are fair game to ask about; the matching cell
# iterates over this list when it builds one_match.
valid_features = [
    'lattice-phone', 'lattice-age', 'lattice-rate',
    'lattice-name', 'lattice-email', 'lattice-location',
]
# Drop the discovered feature list now that the allowed features are fixed by hand
del lattice_features

## DIFFERENT APPROACH: Find matching depth

### Matching depth = 1

In [None]:
# Build one_match[filter_feature][filter_value][other_feature] -> list of ads.
#
# For every allowed feature, ads are grouped by the value(s) they report for
# it. For each value reported more than once, the ads are then bucketed by
# which *other* allowed features they also carry.
#
# NOTE(review): a value that occurs only once still gets an (empty) entry,
# because the assignment sits outside the `len(...) > 1` check -- confirm
# this is intended.
# NOTE(review): an ad is appended once per matching result, so an ad listing
# the same value twice counts as two occurrences -- confirm this is intended.
one_match = {}
for filt in valid_features:
    # Group ads by each value they report for this feature. Dict membership /
    # setdefault is O(1); `key in d.keys()` scans a list on Python 2.
    ads_by_value = {}
    for entry in lattice_extractions:
        extractions = entry['extractions']
        if filt in extractions:
            for val in extractions[filt]['results']:
                ads_by_value.setdefault(val['value'], []).append(entry)

    one_match[filt] = {}
    for value, ads in ads_by_value.items():
        temp_contains = {}
        if len(ads) > 1:
            # For every ad sharing this value, record which other features it contains
            for ad in ads:
                for contains_filt in valid_features:
                    if contains_filt != filt and contains_filt in ad['extractions']:
                        temp_contains.setdefault(contains_filt, []).append(ad)
        one_match[filt][value] = temp_contains

### Manually Investigate Combinations

In [None]:
# Choose a matching filter and see some value options
filt = 'lattice-age'
for entry in one_match[filt].keys()[0:10]:
    print entry

In [None]:
# Choose a filter value and see possible contains
value = '28'
for entry in one_match[filt][value].keys():
    print entry

In [None]:
# Choose a contains and see possible values
contains = 'lattice-rate'
for ad in one_match[filt][value][contains]:
    # Confirm presence of value:
    match = 0
    for result in ad['extractions'][filt]['results']:
        if result['value'] == value:
            match = 1
    if match == 1:
        for result in ad['extractions'][contains]['results']:
            if filt == 'lattice-location':
                if contains == 'lattice-location':
                    print (ad['extractions'][filt]['results'][0]['context'], result['context']['city']['name'])
                else:
                    print (ad['extractions'][filt]['results'][0]['context'], result['value'])
            else:
                if contains == 'lattice-location':
                    print (ad['extractions'][filt]['results'], result['context']['city']['name'])
                else:
                    print (ad['extractions'][filt]['results'], result['value'])
                print ad
            print ' '

### Save Chosen Combos

In [None]:
# Hand-picked filter/contains combinations.
# Each record holds:
#   'filter'   -> the feature and the value that ads must match
#   'contains' -> the feature the question is asked about
#   'type'     -> the aggregate to compute ('AVG', 'MODE', 'MIN' or 'MAX')
#
# NOTE(review): the 'Avery' record uses 'location' while the age-21 record uses
# 'lattice-location', and the last three records mix 'found-hair' (hyphen)
# with 'found_height' / 'found_weight' (underscore). Values are kept exactly
# as they were -- confirm which spellings the consumer of the JSON expects.
chosen = [
    {'filter': {'feature': 'lattice-phone', 'value': '+16187098512'},
     'contains': 'lattice-age', 'type': 'AVG'},
    {'filter': {'feature': 'lattice-phone', 'value': '+16197585452'},
     'contains': 'lattice-email', 'type': 'MODE'},
    {'filter': {'feature': 'lattice-age', 'value': '21'},
     'contains': 'lattice-location', 'type': 'MODE'},
    {'filter': {'feature': 'lattice-age', 'value': '30'},
     'contains': 'lattice-email', 'type': 'MODE'},
    {'filter': {'feature': 'lattice-rate', 'value': '150,NO DURATION'},
     'contains': 'lattice-age', 'type': 'MAX'},
    {'filter': {'feature': 'lattice-rate', 'value': '100,NO DURATION'},
     'contains': 'lattice-age', 'type': 'MIN'},
    {'filter': {'feature': 'lattice-name', 'value': 'Avery'},
     'contains': 'location', 'type': 'MODE'},
    {'filter': {'feature': 'lattice-name', 'value': 'Vanesa'},
     'contains': 'lattice-age', 'type': 'MIN'},
    {'filter': {'feature': 'lattice-email', 'value': 'annabelle@denver-escort.com'},
     'contains': 'lattice-phone', 'type': 'MODE'},
    {'filter': {'feature': 'lattice-email', 'value': 'info@bombshellescorts.com'},
     'contains': 'lattice-age', 'type': 'AVG'},
    {'filter': {'feature': 'lattice-location', 'value': 'St. Louis, Missouri'},
     'contains': 'lattice-rate', 'type': 'MAX'},
    {'filter': {'feature': 'lattice-location', 'value': 'Albany, New York'},
     'contains': 'lattice-age', 'type': 'AVG'},
    {'filter': {'feature': 'lattice-age', 'value': '44'},
     'contains': 'found-hair', 'type': 'MODE'},
    {'filter': {'feature': 'lattice-name', 'value': 'Blair'},
     'contains': 'found_height', 'type': 'AVG'},
    {'filter': {'feature': 'lattice-age', 'value': '28'},
     'contains': 'found_weight', 'type': 'MAX'},
]

# Single-argument print(...) behaves the same on Python 2 and Python 3
print(len(chosen))

In [None]:
# Persist the chosen combinations as indented JSON in the working directory
output_file = 'pure_agg_DD_structure.json'
with open(output_file, 'w') as f:
    f.write(json.dumps(chosen, indent=2))

### Will need to format these values (e.g., rate) into proper value format.

## DEBUG

In [None]:
# Peek at the extraction keys of the first ad stored for this phone -> age combination
sample_ad = one_match['lattice-phone']['+16187098512']['lattice-age'][0]
sample_ad['extractions'].keys()

### Summarize Filters in Ground Truth Questions

In [None]:
# Summarize each ground-truth question as its type plus the names found on the
# '\tqpr' lines of its first SPARQL string: lines containing '?' are recorded
# under 'contains', all other '\tqpr' lines under 'filters'.
gt_summary = []
for question in ground_truth_questions:
    question_type = question['type']
    filter_names = []
    contains_names = []
    for sparql_line in question['SPARQL'][0].split('\n'):
        if '\tqpr' not in sparql_line:
            continue
        # Text between the first ':' and the next space
        name = sparql_line.split(':')[1].split(' ')[0]
        if '?' in sparql_line:
            contains_names.append(name)
        else:
            filter_names.append(name)
    gt_summary.append({
        'type': question_type,
        'filters': filter_names,
        'contains': contains_names,
    })

# Distinct filter combinations across all questions (tuples so they are hashable)
uniq_filters = list(set(tuple(summary['filters']) for summary in gt_summary))

In [None]:
# Print the lattice feature name for each filter of the first ground-truth question.
# A dedicated loop variable is used so the notebook-level `filt` (which other
# cells use to index one_match, whose keys are all 'lattice-*' names) is not
# overwritten with an unprefixed name.
for entry in [gt_summary[0]]:
    for gt_filt in entry['filters']:
        # 'price' in the ground truth corresponds to the 'rate' lattice feature
        lattice_name = 'rate' if gt_filt == 'price' else gt_filt
        lattice_filt = 'lattice-' + lattice_name
        # Single-argument print(...) behaves the same on Python 2 and Python 3
        print(lattice_filt)

In [None]:
# This should happen after determining lattice_filt above.
# Keep the last extraction entry that carries a 'lattice-content' field;
# `output` is left untouched when no entry has one.
entries_with_content = [
    candidate for candidate in lattice_extractions
    if 'lattice-content' in candidate['extractions']
]
if entries_with_content:
    output = entries_with_content[-1]
    

# Trying to find non-extracted values

In [None]:
# Scan every ad stored under one_match[filt] and keep in `output` the last one
# whose 'lattice-content' text mentions 'asian' (case-insensitive).
# NOTE(review): relies on `filt` being set by an earlier cell to a key of
# one_match -- confirm which filter is meant here.
#
# The former extra loop over the keys of one_match[filt][filt_value] was
# immediately shadowed by the inner `entry` loop and only repeated the same
# scan once per key; dropping it leaves the final `output` unchanged.
for filt_value, ads_by_feature in one_match[filt].items():
    for sub_filt, ads in ads_by_feature.items():
        for entry in ads:
            if 'lattice-content' in entry['extractions']:
                for result in entry['extractions']['lattice-content']['results']:
                    if 'asian' in result['value'].lower():
                        output = entry