In [3]:
import csv

# Slot names that can occur in a meaning representation (not referenced in this cell).
headers = ['name','eatType','priceRange', 'customer rating', 'near', 'food', 'area','familyFriendly']

TRAINSET_PATH = 'trainset.csv'


def parse_mr(mr):
    """Turn a meaning-representation string into a ``{slot: value}`` dict.

    The string is split on commas and each piece is read as ``key[value]``,
    e.g. ``'name[The Vaults], eatType[pub]'`` gives
    ``{'name': 'The Vaults', 'eatType': 'pub'}``.

    Pieces without an opening bracket (empty or malformed slots) are skipped
    instead of producing a garbage key. A missing closing bracket keeps the
    whole remainder as the value.
    """
    slots = {}
    for slot in mr.split(','):
        key, opening, rest = slot.partition('[')
        if not opening:
            # No '[' at all: nothing sensible to store for this piece.
            continue
        slots[key.strip()] = rest.partition(']')[0]
    return slots


feature_list = []
sentences = []
# `with` closes the file deterministically; newline='' is what the csv module
# expects so that line breaks inside quoted fields are handled by the reader.
with open(TRAINSET_PATH, encoding='utf-8', mode='rt', newline='') as trainset_file:
    trainset_reader = csv.DictReader(trainset_file)
    for row in trainset_reader:
        feature_list.append(parse_mr(row['mr']))
        sentences.append(row['ref'])

# Preview only the first records rather than dumping the whole list into the output.
feature_list[:10]

[{'customer rating': '5 out of 5',
  'eatType': 'pub',
  'name': 'The Vaults',
  'near': 'Café Adriatic',
  'priceRange': 'more than £30'},
 {'eatType': 'pub',
  'food': 'English',
  'name': 'The Cambridge Blue',
  'near': 'Café Brazil',
  'priceRange': 'cheap'},
 {'area': 'riverside',
  'customer rating': 'low',
  'eatType': 'coffee shop',
  'familyFriendly': 'yes',
  'food': 'Japanese',
  'name': 'The Eagle',
  'near': 'Burger King',
  'priceRange': 'less than £20'},
 {'area': 'riverside',
  'eatType': 'coffee shop',
  'food': 'French',
  'name': 'The Mill',
  'near': 'The Sorrento',
  'priceRange': '£20-25'},
 {'area': 'riverside',
  'customer rating': 'high',
  'food': 'French',
  'name': 'Loch Fyne',
  'near': 'The Rice Boat'},
 {'area': 'riverside',
  'food': 'English',
  'name': 'Bibimbap House',
  'near': 'Clare Hall',
  'priceRange': 'moderate'},
 {'area': 'riverside',
  'customer rating': 'average',
  'familyFriendly': 'no',
  'food': 'French',
  'name': 'The Rice Boat'},
 {'

In [4]:
LEX_FEATURES_PATH = 'train_lex_features.csv'

# Short feature name -> column header in the tab-separated lexical feature file.
LEX_COLUMNS = {
    'nsentences': 'Number Of Sentences',
    'fwords': 'First Words',
    'fpostags': 'First POS tags',
    'fphrases': 'First Phrases',
}

# `with` closes the file deterministically; newline='' is what the csv module
# expects so that line breaks inside quoted fields are handled by the reader.
with open(LEX_FEATURES_PATH, encoding='utf-8', mode='rt', newline='') as lex_feature_file:
    lex_feature_reader = csv.DictReader(lex_feature_file, delimiter='\t')
    # One dict per row, keeping the raw string values under the short names.
    lex_feature_list = [
        {short_name: row[column] for short_name, column in LEX_COLUMNS.items()}
        for row in lex_feature_reader
    ]

# Preview only the first records rather than dumping the whole list into the output.
lex_feature_list[:10]
    

[{'fphrases': 'NP,NP',
  'fpostags': 'NNP,NNS',
  'fwords': 'XNAMEX,Prices',
  'nsentences': '2'},
 {'fphrases': 'ADJP,NP',
  'fpostags': 'RB,NNP',
  'fwords': 'Close,Delicious',
  'nsentences': '2'},
 {'fphrases': 'NP', 'fpostags': 'NNP', 'fwords': 'XNAMEX', 'nsentences': '1'},
 {'fphrases': 'VP', 'fpostags': 'VBN', 'fwords': 'Located', 'nsentences': '1'},
 {'fphrases': 'PP', 'fpostags': 'IN', 'fwords': 'For', 'nsentences': '1'},
 {'fphrases': 'NP,NP',
  'fpostags': 'NNP,PRP',
  'fwords': 'XNAMEX,You',
  'nsentences': '2'},
 {'fphrases': 'NP', 'fpostags': 'NNP', 'fwords': 'XNAMEX', 'nsentences': '1'},
 {'fphrases': 'NP,NP,NP,NP',
  'fpostags': 'NNP,PRP,NNS,PRP',
  'fwords': 'XNAMEX,It,Prices,It',
  'nsentences': '4'},
 {'fphrases': 'PP', 'fpostags': 'IN', 'fwords': 'In', 'nsentences': '1'},
 {'fphrases': 'NP,NP',
  'fpostags': 'NNP,PRP',
  'fwords': 'XNAMEX,They',
  'nsentences': '2'},
 {'fphrases': 'PP,NP',
  'fpostags': 'IN,PRP',
  'fwords': 'In,It',
  'nsentences': '2'},
 {'fphrase

In [5]:
# Find the reference with the largest number of sentences.
import operator

# 'nsentences' values are kept as strings here (other cells compare and count
# them as strings), so the ranking below converts to int: a plain string
# comparison would rank '9' above '10'.
sentence_lengths = [x['nsentences'] for x in lex_feature_list]
idx = max(enumerate(map(int, sentence_lengths)), key=operator.itemgetter(1))[0]
value = sentence_lengths[idx]
features = feature_list[idx]
sentence = sentences[idx]

print(idx, value, features, sentence)

1214 7 {'name': 'Aromi', 'eatType': 'coffee shop', 'food': 'French', 'customer rating': 'high', 'area': 'riverside', 'familyFriendly': 'yes'} Aromi, stunning coffee shop thats kid friendly.. Yes,  high customer rating.. Yes,  beautiful view.. Riverside and the very best French food .


In [6]:
from collections import Counter

# Tally how many references fall under each 'nsentences' value.
c = Counter()
c.update(sentence_lengths)
c

Counter({'1': 23738,
         '2': 15126,
         '3': 2652,
         '4': 461,
         '5': 72,
         '6': 11,
         '7': 1})

In [7]:
# Print a sample of references whose sentence count equals SENTENCE_COUNT.
# The counts in `sentence_lengths` are strings, hence the string constant.
SENTENCE_COUNT = '3'
MAX_EXAMPLES = 50

matching_indices = [i for i, count in enumerate(sentence_lengths) if count == SENTENCE_COUNT]
# Original variable name kept as an alias; despite the "5" it has always held
# the indices selected by the filter above.
indices_5 = matching_indices

print(len(matching_indices))
# A dedicated loop variable avoids overwriting the global `idx` set elsewhere.
for example_idx in matching_indices[:MAX_EXAMPLES]:
    print(sentences[example_idx])
    print()

2652
The Golden Curry offers English food at a high price. It is near The Bakers. It should be noted that it is not kids friendly.

Green Man is a French restaurant in the city centre. It is not child friendly and is located near All Bar One. It costs more than thirty pounds.

Blue Spice is a coffee shop along the river near Avalon. The prices are quite high while the customer ratings are quite low. It is not recommended to take children there.

A good coffee shop that received a customer rating of 5 out of 5 is The Eagle. It is in the riverside area near the Burger King. It costs more than 30 pounds and is kid friendly.

Alimentum is Fast food. They are children friendly. The price range is cheap and located near riverside.

Strada offers British food in the low price range. It is family friendly with a 3 out of 5 star rating. You can find it near the Rainbow Vegetarian Café.

The Rice Boat is an average rated place. It is located in the city centre. It provides French food and it isn

In [32]:
# Keep only the first comma-separated entry of each 'fphrases' value
# (presumably the phrase label of the first sentence - confirm against the
# feature extraction step), then tally the labels.
phrase_labels = [fv['fphrases'].partition(',')[0] for fv in lex_feature_list]
Counter(phrase_labels)

Counter({'ADJP': 123,
         'ADVP': 390,
         'FRAG': 4,
         'INTJ': 35,
         'LST': 4,
         'NAC': 2,
         'NP': 34671,
         'NP-TMP': 1,
         'PP': 4063,
         'QP': 4,
         'S': 59,
         'SBAR': 628,
         'SINV': 3,
         'SQ': 4,
         'UCP': 1,
         'VP': 2047,
         'WHADVP': 19,
         'WHNP': 2,
         'X': 1})

In [31]:
# Print a sample of references whose first phrase label equals TARGET_LABEL.
# Each element of `phrase_labels` is a plain string, so the whole label is
# compared; indexing with [0] would test only its first character and never
# match a multi-character label.
# NOTE(review): the variable is named after WHADVP but the filter looks for
# 'PRN' - confirm which label was intended. The label tally shown in this
# notebook has no 'PRN' entry, so this prints nothing for the current data.
TARGET_LABEL = 'PRN'
WHADVP_idx = [i for i, label in enumerate(phrase_labels) if label == TARGET_LABEL]
# A dedicated loop variable avoids overwriting the global `idx` set elsewhere.
for example_idx in WHADVP_idx[:50]:
    print(sentences[example_idx])
    print()

In [36]:
# Take the first comma-separated entry of each 'fpostags' value and list the
# (tag, frequency) pairs that occur more than 100 times.
pos_tags = [fv['fpostags'].partition(',')[0] for fv in lex_feature_list]
list(filter(lambda pair: pair[1] > 100, Counter(pos_tags).items()))

[('NNP', 25454),
 ('RB', 499),
 ('VBN', 1437),
 ('IN', 4680),
 ('DT', 4420),
 ('NN', 387),
 ('EX', 3079),
 ('JJ', 846),
 ('VBG', 248),
 ('NNS', 130),
 ('CD', 156),
 ('PRP', 222),
 ('VB', 367)]