In [14]:
import csv

headers = ['name','eatType','priceRange', 'customer rating', 'near', 'food', 'area','familyFriendly']

trainset_reader = csv.DictReader(open('trainset.csv', encoding='utf-8', mode='rt'))

feature_list = []
sentences = []
foods = set()
for row in trainset_reader:
    i1 = row['mr']
    i2 = row['ref']
    kv = {}
    keywords = i1.split(',')
    for keyword in keywords:
        kidx = keyword.find('[')
        key = keyword[:kidx].strip()
        value = keyword[kidx + 1: keyword.find(']')]
        kv[key] = value
        if key == 'food':
            foods.add(value)
        
    feature_list.append(kv)
    sentences.append(i2)

len(sentences)

42061

In [3]:
lex_feature_reader = csv.DictReader(open('train_lex_features.csv', encoding='utf-8', mode='rt'), delimiter='\t')
lex_feature_list = []
for row in lex_feature_reader:
    nsentences = row['Number Of Sentences']
    fwords = row['First Words']
    fpostags = row['First POS tags']
    fphrases = row['First Phrases']
    
    fv = {}
    fv['nsentences'] = nsentences
    fv['fwords'] = fwords
    fv['fpostags'] = fpostags
    fv['fphrases'] = fphrases
    lex_feature_list.append(fv)
    
lex_feature_list
    

[{'fphrases': 'NP,NP',
  'fpostags': 'NNP,NNS',
  'fwords': 'XNAMEX,Prices',
  'nsentences': '2'},
 {'fphrases': 'ADJP,NP',
  'fpostags': 'RB,NNP',
  'fwords': 'Close,Delicious',
  'nsentences': '2'},
 {'fphrases': 'NP', 'fpostags': 'NNP', 'fwords': 'XNAMEX', 'nsentences': '1'},
 {'fphrases': 'VP', 'fpostags': 'VBN', 'fwords': 'Located', 'nsentences': '1'},
 {'fphrases': 'PP', 'fpostags': 'IN', 'fwords': 'For', 'nsentences': '1'},
 {'fphrases': 'NP,NP',
  'fpostags': 'NNP,PRP',
  'fwords': 'XNAMEX,You',
  'nsentences': '2'},
 {'fphrases': 'NP', 'fpostags': 'NNP', 'fwords': 'XNAMEX', 'nsentences': '1'},
 {'fphrases': 'NP,NP,NP,NP',
  'fpostags': 'NNP,PRP,NNS,PRP',
  'fwords': 'XNAMEX,It,Prices,It',
  'nsentences': '4'},
 {'fphrases': 'PP', 'fpostags': 'IN', 'fwords': 'In', 'nsentences': '1'},
 {'fphrases': 'NP,NP',
  'fpostags': 'NNP,PRP',
  'fwords': 'XNAMEX,They',
  'nsentences': '2'},
 {'fphrases': 'PP,NP',
  'fpostags': 'IN,PRP',
  'fwords': 'In,It',
  'nsentences': '2'},
 {'fphrase

In [4]:
#max sentence length
import operator
sentence_lengths = [x['nsentences'] for x in lex_feature_list]
idx, value = max(enumerate(sentence_lengths), key=operator.itemgetter(1))
features = feature_list[idx]
sentence = sentences[idx]

print(idx, value, features, sentence)

1214 7 {'name': 'Aromi', 'eatType': 'coffee shop', 'food': 'French', 'customer rating': 'high', 'area': 'riverside', 'familyFriendly': 'yes'} Aromi, stunning coffee shop thats kid friendly.. Yes,  high customer rating.. Yes,  beautiful view.. Riverside and the very best French food .


In [5]:
from collections import Counter
c = Counter(sentence_lengths)
c

Counter({'1': 23738,
         '2': 15126,
         '3': 2652,
         '4': 461,
         '5': 72,
         '6': 11,
         '7': 1})

In [6]:
indices_5 = [i for i, x in enumerate(sentence_lengths) if x == '3']
print(len(indices_5))
for idx in indices_5[:50]:
    print(sentences[idx])
    print()

2652
The Golden Curry offers English food at a high price. It is near The Bakers. It should be noted that it is not kids friendly.

Green Man is a French restaurant in the city centre. It is not child friendly and is located near All Bar One. It costs more than thirty pounds.

Blue Spice is a coffee shop along the river near Avalon. The prices are quite high while the customer ratings are quite low. It is not recommended to take children there.

A good coffee shop that received a customer rating of 5 out of 5 is The Eagle. It is in the riverside area near the Burger King. It costs more than 30 pounds and is kid friendly.

Alimentum is Fast food. They are children friendly. The price range is cheap and located near riverside.

Strada offers British food in the low price range. It is family friendly with a 3 out of 5 star rating. You can find it near the Rainbow Vegetarian Café.

The Rice Boat is an average rated place. It is located in the city centre. It provides French food and it isn

In [7]:
import _pickle as cPickle
phrase_labels =  [x['fphrases'].split(',') for x in lex_feature_list]
phrase_labels_flat = [x for slist in phrase_labels for x in slist]
c = Counter(phrase_labels_flat)
vocab_keys = set([x for x in phrase_labels_flat if c[x] > 50])
vocab = {}
for i, key in enumerate(vocab_keys):
    vocab[key] = i
ofile = open('vocab_phrase_tags.pkl', 'wb')
cPickle.dump(vocab, ofile)
ofile.close()


In [8]:
WHADVP_idx = [i for i, x in enumerate(phrase_labels) if x[0] == 'PRN']
for idx in WHADVP_idx[:50]:
    print(sentences[idx])
    print()

In [9]:
pos_tags = [x['fpostags'].split(',') for x in lex_feature_list]
pos_tags_flat = [x for slist in pos_tags for x in slist]
most_freq_tags = [(tag, freq) for tag, freq in Counter(pos_tags_flat).items() if freq > 100]
vocab = {tag: idx for idx, (tag, freq) in enumerate(most_freq_tags)}
for i, key in enumerate(vocab_keys):
    vocab[key] = i
ofile = open('vocab_pos_tags.pkl', 'wb')
cPickle.dump(vocab, ofile)
ofile.close()

In [13]:
first_words = [x['fwords'].split(',') for x in lex_feature_list]
first_words_flat = [x for slist in first_words for x in slist]
most_freq_fwords = {tag: freq for tag, freq in Counter(first_words_flat).items() if freq > 120}
vocab_fwords = {}
vocab_fwords = {tag: idx for idx, (tag, freq) in enumerate(most_freq_fwords.items())}
for i, key in enumerate(vocab_fwords.keys()):
    vocab_fwords[key] = i
ofile = open('vocab_fwords.pkl', 'wb')
cPickle.dump(vocab_fwords, ofile)
ofile.close()
sorted_items = ['{}\t{}'.format(x[0], x[1]) for x in sorted(most_freq_fwords.items(), key= lambda x: x[1], reverse=True)]
for s in sorted_items:
    print(s)

XNAMEX	25771
It	12646
The	3826
There	3106
A	1685
Located	1637
They	1406
Near	1290
In	1177
Its	883
For	713
This	627
With	626
If	475
Customers	322
An	262
You	256
Customer	224
Prices	211
it	195
Not	188
Rated	162
At	151
Their	150
We	133
Children	132
Although	121


In [11]:
from collections import defaultdict
first_words = [x['fwords'].split(',') for x in lex_feature_list]
fwords_for_nsent = defaultdict(lambda: [])
for first_word in first_words:
    for i, fword in enumerate(first_word):
        idx = 0 if i == 0 else 1
        fwords_for_nsent[idx].append(fword)
fwords_for_nsent = {k: {vk: vv/len(v) for vk, vv in Counter(v).items() if vv > {0: 60, 1: 100}[k]} for k, v in fwords_for_nsent.items()}
fwords_for_nsent = {k: set(v.keys()) for k, v in fwords_for_nsent.items() if k not in foods}
fwords_for_nsent0 = {x[0].upper() + x[1:]: i for i, x in enumerate(fwords_for_nsent[0]) if x not in foods}
fwords_for_nsent1 = {x[0].upper() + x[1:]: i for i, x in enumerate(fwords_for_nsent[1]) if x not in foods}

fwords_for_nsent0 = {x[0].upper() + x[1:]: i for i, x in enumerate(fwords_for_nsent0)}
fwords_for_nsent1 = {x[0].upper() + x[1:]: i for i, x in enumerate(fwords_for_nsent1)}

ofile = open('vocab_fwords_0.pkl', 'wb')
cPickle.dump(fwords_for_nsent0, ofile)
ofile.close()
ofile = open('vocab_fwords_1.pkl', 'wb')
cPickle.dump(fwords_for_nsent1, ofile)
ofile.close()
fwords_for_nsent0

{'A': 12,
 'An': 18,
 'At': 24,
 'By': 10,
 'Cheap': 21,
 'Close': 16,
 'Come': 19,
 'Family': 17,
 'Fast': 9,
 'For': 5,
 'Highly': 0,
 'If': 7,
 'In': 8,
 'Located': 22,
 'Near': 23,
 'On': 15,
 'Rated': 11,
 'Riverside': 4,
 'Serving': 3,
 'The': 1,
 'There': 13,
 'This': 20,
 'With': 14,
 'XNAMEX': 2,
 'You': 6}

[('Prices', 1),
 ('Customer', 2),
 ('XNAMEX', 3),
 ('We', 4),
 ('Customers', 5),
 ('They', 6),
 ('With', 7),
 ('Located', 8),
 ('This', 9),
 ('Its', 10),
 ('The', 11),
 ('Not', 12),
 ('You', 13),
 ('It', 14),
 ('Their', 15)]

In [35]:
import json
train_tree_feature_file = open('train_lex_features_tree.json', 'rt', encoding='utf-8')
train_tree_features = json.load(train_tree_feature_file)
dev_tree_feature_file = open('dev_lex_features_tree.json', 'rt', encoding='utf-8')
dev_tree_features = json.load(dev_tree_feature_file)
phrase_level_tags = ['ADJP','ADVP','CONJP','FRAG','INTJ','LST','NAC','NP','NP-TMP','NX','PP','PRN','PRT','QP','RRC','UCP','VP','WHADJP','WHADVP','WHNP','WHPP','X',]
clause_level_tags = ['S','SBAR','SBARQ','SINV','SQ']
word_level_tags = ['CC','CD','DT','EX','FW','IN','JJ','JJR','JJS','LS','MD','NN','NNP','NNPS','NNS','PDT','POS','PRP','PRP$','RB','RBR', 'RBS','RP','SYM','TO','UH','VB','VBD','VBG','VBN','VBP', 'VBZ','WDT','WP','WP$','WRB']  


In [61]:
total_tag_stats = Counter()
for train_tree_feature in train_tree_features:
    for sentence_feature in train_tree_feature:
        total_tag_stats.update(sentence_feature)
        
#min_threshold = 1000
#total_tag_stats = {x : total_tag_stats[x] for x in total_tag_stats if total_tag_stats[x] >= min_threshold }
prase_freq = {p: freq for p, freq in total_tag_stats.items() if p in phrase_level_tags}
pos_freq = {p: freq for p, freq in total_tag_stats.items() if p in word_level_tags}

prase_freq = sorted(prase_freq.items(), key= lambda x: x[1], reverse=True)[int(len(phrase_level_tags) * .20) : int(len(phrase_level_tags) * .70)]
pos_freq = sorted(pos_freq.items(), key=lambda x: x[1], reverse=True)[int(len(word_level_tags) * .20) : int(len(word_level_tags) * .70)]
print(prase_freq)
print(pos_freq)

phrase_vocab = {p[0]: i for i, p in enumerate(prase_freq)}
pos_vocab = {p[0]: i for i, p in enumerate(pos_freq)}

print(phrase_vocab)
print(pos_vocab)

json.dump(phrase_vocab, open('phrase_vocab.json', 'wt', encoding='utf-8'))
json.dump(pos_vocab, open('pos_vocab.json', 'wt', encoding='utf-8'))

[('WHNP', 8164), ('ADVP', 7011), ('QP', 5816), ('PRT', 685), ('UCP', 561), ('FRAG', 470), ('WHADVP', 295), ('NX', 252), ('PRN', 233), ('INTJ', 229), ('NP-TMP', 166)]
[('VBN', 28290), ('RB', 24804), ('PRP', 20784), ('CC', 19628), ('NNS', 16608), ('WDT', 8070), ('VBP', 6464), ('VBG', 5460), ('JJR', 5099), ('VB', 4497), ('EX', 4159), ('TO', 3613), ('VBD', 2951), ('PRP$', 2487), ('MD', 1802), ('NNPS', 1539), ('RP', 691), ('POS', 324)]
{'WHNP': 0, 'ADVP': 1, 'QP': 2, 'PRT': 3, 'UCP': 4, 'FRAG': 5, 'WHADVP': 6, 'NX': 7, 'PRN': 8, 'INTJ': 9, 'NP-TMP': 10}
{'VBN': 0, 'RB': 1, 'PRP': 2, 'CC': 3, 'NNS': 4, 'WDT': 5, 'VBP': 6, 'VBG': 7, 'JJR': 8, 'VB': 9, 'EX': 10, 'TO': 11, 'VBD': 12, 'PRP$': 13, 'MD': 14, 'NNPS': 15, 'RP': 16, 'POS': 17}


In [57]:
lens = {
    'nsent',
    'fout_word_vectors',
    'fout_phrase_vectors',
    'fout_pos_vectors',
    'fword_vectors',
    'fphrase_vectors',
    'fpos_vectors',
    'pos_tag_feature',
    'phrase_tag_feature',
}
lens.keys()

NameError: name 'max_sentences' is not defined