In [4]:
import ast
import json
import operator

In [5]:
def extract_query(qtype, qlines):
    """Return the queried feature of a question, read from its SPARQL text.

    The second query line (``qlines[1]``) is split on ``'?'`` and one piece
    is returned; which piece depends on the question type:

    * ``'Point Fact'``        -- the piece after the second ``'?'``
    * ``'Cluster Aggregate'`` -- the piece after the first ``'?'``

    Parameters
    ----------
    qtype : str
        Question type; must be one of the two types listed above.
    qlines : list of str
        The query split into lines.

    Returns
    -------
    str
        The selected piece, exactly as it appears in the line (no stripping).

    Raises
    ------
    ValueError
        If ``qtype`` is not a supported question type.  Previously this case
        fell through both ``if`` tests and failed with an
        ``UnboundLocalError`` on the ``return``.
    IndexError
        If ``qlines`` has fewer than two lines or the line contains too few
        ``'?'`` separators.
    """
    # Index of the '?'-delimited piece that holds the queried feature.
    piece_index = {'Point Fact': 2, 'Cluster Aggregate': 1}
    if qtype not in piece_index:
        raise ValueError(
            'extract_query does not support question type {!r}'.format(qtype))
    return qlines[1].split('?')[piece_index[qtype]]

In [6]:
def extract_filter(qtype, qlines):
    """Collect the distinct filter features mentioned in a question's query.

    Two sources contribute to the result:

    * Seed type -- for every question type except ``'Point Fact'``, the
      fifth query line (``qlines[4]``) is inspected: ``'email'`` is recorded
      when it contains ``'@'``, otherwise ``'phone'``.
    * ``qpr:`` terms -- for every question type except
      ``'Cluster Identification'``, each space-separated token containing
      ``'qpr:'`` yields the text that follows the prefix.  A fixed set of
      names (compared case-insensitively) is ignored.  Unless the question
      is a ``'Cluster Facet'``, names that also occur as a substring of
      ``qlines[1]`` are dropped because they are the queried term rather
      than a filter.

    Parameters
    ----------
    qtype : str
        Question type.
    qlines : list of str
        The query split into lines.

    Returns
    -------
    list of str
        The distinct feature names, in no particular order.
    """
    ignored_names = ('ad', '', 'cluster', '<http://istresearch.com/qpr>', 'seed')
    features = set()

    # Seed feature type.
    if qtype != 'Point Fact':
        features.add('email' if '@' in qlines[4] else 'phone')

    # Cluster Identification questions carry no extra filters.
    if qtype == 'Cluster Identification':
        return list(features)

    # For Cluster Facet there is no query term as such, so nothing is excluded.
    keep_query_terms = qtype == 'Cluster Facet'
    for query_line in qlines:
        for token in query_line.split(' '):
            if 'qpr:' not in token:
                continue
            # Swap the prefix for a sentinel and take what follows the first one.
            name = token.replace('qpr:', '*').split('*')[1]
            if name.lower() in ignored_names:
                continue
            if keep_query_terms or name not in qlines[1]:
                features.add(name)

    return list(features)

In [27]:
# Maps each question type to the file holding its questions.
# Un-comment an entry to include that question type in the reports below.
question_type = {
    'Point Fact': '../questions/post_point_fact_V3.json',
    #'Cluster Identification': '../questions/post_cluster_identification.json',
    #'Cluster Facet': '../questions/post_cluster_facet.json',
    #'Cluster Aggregate': '../questions/post_aggregate_V2.json',

    #'Pure Aggregate GT': '../questions/pure_agg_gt_questions_V3.json',
    #'Pure Aggregate DD': '../questions/post_aggregate_V2.json',
}

In [30]:
# Questions chosen for evaluation; used further down to report the feature
# distribution of just those questions.
chosen_questions_file = '../data/annotation_prep/dd_clustering/chosen_questions.json'
with open(chosen_questions_file, 'r') as f:
    # ast.literal_eval only parses Python literals (dicts, lists, strings,
    # numbers, ...), so unlike the eval() call it replaces it cannot execute
    # code that happens to be in the file.
    # NOTE(review): the file has a .json extension but was read with eval();
    # if it is strict JSON, json.load(f) is the right reader -- confirm the
    # on-disk format.
    chosen_questions = ast.literal_eval(f.read())

# Ids for the cluster question types come from the 'NYU' entry of the
# chosen-questions file; only the types enabled in `question_type` are taken.
selected_questions = {}
for qtype in question_type:
    if qtype in ['Cluster Identification', 'Cluster Facet', 'Cluster Aggregate']:
        # list(...) keeps this a plain list whether keys() returns a list or a view.
        selected_questions[qtype] = list(chosen_questions['NYU'][qtype].keys())

# Point Fact ids are listed by hand.
selected_questions['Point Fact'] = [
    '1647',
    '392',
    '1707',
    '217',
    '510',
    '799',
    '363',
    '1597',
    '1180',
    '1159',
    '1035',
    '1038',
    '2304',
    '1339',
    '284'
]

## For Default Question Distribution

In [31]:
# For every enabled question type, count how often each feature is used as a
# filter and as the queried term across ALL questions in its file, then print
# both tallies sorted by descending count.
for qtype in question_type:
    # Occurrence counts keyed by feature name, rebuilt for each question type.
    filter_features = {}
    query_features = {}

    # Obtain questions: the file holds one JSON object per line.
    file_path = question_type[qtype]
    with open(file_path, 'r') as f:
        data = [json.loads(line) for line in f]

    # Find features of question
    for question in data:
        # The last entry of the 'SPARQL' list is the query that gets analysed.
        qlines = question['SPARQL'][-1].split('\n')
        # Find what feature is queried (these two types are not passed to
        # extract_query)
        if qtype not in ['Cluster Identification', 'Cluster Facet']:
            qfeat = extract_query(qtype, qlines)
            query_features[qfeat] = query_features.get(qfeat, 0) + 1
        filt_set = extract_filter(qtype, qlines)
        for filt in filt_set:
            filter_features[filt] = filter_features.get(filt, 0) + 1

    # Format Output for Table
    # (single-argument print(...) behaves the same on Python 2 and Python 3)
    print(qtype.upper())
    print(' ')
    print('Make filter table {} rows (plus 1 for heading)'.format(len(filter_features)))
    print(' ')
    sorted_filts = sorted(filter_features.items(), key=operator.itemgetter(1), reverse=True)
    for feature, count in sorted_filts:
        print('%s %s' % (feature, count))
    print(' ')
    print('Make query table {} rows (plus 1 for heading)'.format(len(query_features)))
    print(' ')
    sorted_queries = sorted(query_features.items(), key=operator.itemgetter(1), reverse=True)
    for feature, count in sorted_queries:
        print('%s %s' % (feature, count))
    print(' ')

POINT FACT
 
Make filter table 17 rows (plus 1 for heading)
 
phone 69
name 45
content 41
age 30
location 26
email 24
height 22
title 21
post_date 18
hair_color 16
weight 15
price 12
ethnicity 11
review_site_id 5
social_media_id 3
services 3
street_address 3
 
Make query table 14 rows (plus 1 for heading)
 
location 25
age 14
name 13
phone 12
ethnicity 11
email 8
review_site_id 6
hair_color 3
post_date 3
price 2
weight 1
social_media_id 1
services 1
height 1
 


## For Specially Selected Question Distribution (in case of special eval on DD)

In [32]:
# Same report as the default distribution, restricted to the questions whose
# id is listed in `selected_questions` for the question type.
for qtype in question_type:
    # Occurrence counts keyed by feature name, rebuilt for each question type.
    filter_features = {}
    query_features = {}

    # Ids to keep, looked up once per question type instead of once per
    # question.  A question type without an entry in `selected_questions`
    # raises KeyError here, before its file is read.
    selected_ids = set(selected_questions[qtype])

    # Obtain questions: the file holds one JSON object per line.
    file_path = question_type[qtype]
    with open(file_path, 'r') as f:
        data = [json.loads(line) for line in f]

    # Find features of question
    for question in data:
        if question['id'] not in selected_ids:
            continue
        # The last entry of the 'SPARQL' list is the query that gets analysed.
        qlines = question['SPARQL'][-1].split('\n')
        # Find what feature is queried (these two types are not passed to
        # extract_query)
        if qtype not in ['Cluster Identification', 'Cluster Facet']:
            qfeat = extract_query(qtype, qlines)
            query_features[qfeat] = query_features.get(qfeat, 0) + 1
        filt_set = extract_filter(qtype, qlines)
        for filt in filt_set:
            filter_features[filt] = filter_features.get(filt, 0) + 1

    # Format Output for Table
    # (single-argument print(...) behaves the same on Python 2 and Python 3)
    print(qtype.upper())
    print(' ')
    print('Make filter table {} rows (plus 1 for heading)'.format(len(filter_features)))
    print(' ')
    sorted_filts = sorted(filter_features.items(), key=operator.itemgetter(1), reverse=True)
    for feature, count in sorted_filts:
        print('%s %s' % (feature, count))
    print(' ')
    print('Make query table {} rows (plus 1 for heading)'.format(len(query_features)))
    print(' ')
    sorted_queries = sorted(query_features.items(), key=operator.itemgetter(1), reverse=True)
    for feature, count in sorted_queries:
        print('%s %s' % (feature, count))
    print(' ')

POINT FACT
 
Make filter table 14 rows (plus 1 for heading)
 
phone 13
name 6
title 5
content 5
location 5
post_date 3
age 3
height 3
email 3
review_site_id 2
hair_color 2
weight 1
price 1
ethnicity 1
 
Make query table 11 rows (plus 1 for heading)
 
name 2
review_site_id 2
email 2
age 2
weight 1
price 1
hair_color 1
height 1
services 1
social_media_id 1
ethnicity 1
 


In [15]:
# Scratch check: show the 'id' of the first question in `data`, i.e. in
# whatever question file a distribution cell loaded last.
data[0]['id']

u'1'

In [None]:
# Scratch check: print the first entry of the first question's 'SPARQL' list.
i = 4  # NOTE(review): `i` is not read anywhere in the visible notebook
print data[0]['SPARQL'][0]

In [None]:
# Scratch check: does the second line of the most recently processed query
# (`qlines` is left over from a distribution cell) mention 'height'?
if 'height' in qlines[1]:
    print "hey"

In [None]:
# Scratch check: show the second line of the most recently processed query.
qlines[1]

In [1]:
ls

Question_Distribution.ipynb


In [3]:
ls ../data/annotation_prep/dd_clustering

Check_Chosen_Seeds.ipynb
Domain_Discovery_Clustering_and_Question_Selection.ipynb
Now_Get_ISI_Late_Submission.ipynb
OLD_Check_Chosen_Seeds2.ipynb
chosen_questions.json
cluster_annotation_data.json
seeds.xlsx
seeds_debug.xlsx


In [14]:
# Inspect the selected question ids per question type.
# NOTE(review): the saved output appears to predate the cell that adds the
# 'Point Fact' list (lower execution count) -- re-run to refresh it.
selected_questions

{'Cluster Identification': ['17', '49', '19', '37', '29', '50', '3']}