Setup carried over from the [previous analysis](./analysis.ipynb):

In [1]:
import csv
from collections import defaultdict

from sklearn import metrics
from sklearn.model_selection import train_test_split

    
# Fixed seed so the train/validation split below is reproducible.
SEED = 1337

# newline='' is what the csv module asks for when it is handed a file object.
with open('../data/atis/train.tsv', 'rt', newline='') as f:
    reader = csv.reader(f, delimiter='\t')
    # csv.reader yields [] for blank lines; drop them so row[0] below cannot fail.
    data = [row for row in reader if row]

# First column holds the sentence, second column its label.
data_x = [row[0] for row in data]
data_y = [row[1] for row in data]

# Hold out 20% of the rows for validation.
data_x_train, data_x_val, data_y_train, data_y_val = train_test_split(
    data_x, data_y, test_size=0.2, random_state=SEED
)

def val_accuracy(f):
    """Print the accuracy of `f` on the validation split.

    `f` is called once with the whole `data_x_val` collection; whatever it
    returns is scored against `data_y_val` with `metrics.accuracy_score`.
    """
    predicted = f(data_x_val)
    score = metrics.accuracy_score(data_y_val, predicted)
    print(score)

# Sorted vocabulary: every distinct space-separated token of the training sentences.
data_x_words = sorted(set(word for sentence in data_x_train for word in sentence.split(' ')))

def one_hot_features(x, vocabulary=None):
    """Encode the string `x` as a 0/1 vector over a vocabulary.

    Feature at index i equals 1 iff word i of the vocabulary is one of the
    space-separated tokens of `x`.

    Args:
        x: the sentence to encode.
        vocabulary: ordered sequence of words; when omitted, the module-level
            `data_x_words` is used.

    Returns:
        A list of ints (0 or 1), one entry per vocabulary word.
    """
    if vocabulary is None:
        vocabulary = data_x_words
    # A set gives O(1) membership tests instead of rescanning the token list
    # once per vocabulary word.
    present = set(x.split(' '))
    return [int(w in present) for w in vocabulary]

# One feature vector per training sentence, in the same order as data_x_train.
data_x_features = list(map(one_hot_features, data_x_train))

# Training sentences grouped under their label.
by_label = defaultdict(list)
for x, y in zip(data_x_train, data_y_train):
    by_label[y].append(x)

# Distinct labels that occur in the training split.
labels = set(by_label.keys())

In [7]:
import numpy as np

# Column sums of the feature matrix: for each vocabulary word, the number of
# training sentences that contain it.
data_x_wcounts = np.asarray(data_x_features).sum(axis=0)
data_x_wcounts

array([ 165,    1,    4,   37,    6,   77,    1,    1,    1,    8,   45,
          2,   11,    1,    1,    1,    1,    1,    1,    1,    1,    1,
          1,    5,    1,    3,    2,    3,    1,    1,    2,   31,    1,
          1,    1,    1,    1,    3,    1,    1,    1,    1,    1,    1,
          1,    1,    1,    1,    2,    1,    1,    1,   15,    3,    1,
         20,    2,    2,    2,    3,    1,    1,    1,    2,    1,    2,
          2,    2,    1,    4,    1,   14,    1,    1,    1,    1,    1,
          2,    1,    1,    1,    1,    2,   33,    1,    1,    3,    1,
          1,    1,    1,    1,    5,    1,    1,    1,    3,    1,   61,
          1,    1,    1,    2,    1,    1,    3,    4,   43,    1,    5,
          1,    1,   35,    1,    1,    4,    1,    1,    1,    2,    1,
          1,    2,    1,    1,    5,    2,    1,    1,    1,   36,    3,
          3,    1,    1,    1,    1,    3,    4,    1,    1,   21,    1,
          1,    3,    1,  645,    1,    5,    1,   

In [8]:
# Vocabulary size: number of distinct tokens collected from the training sentences.
len(data_x_words)

845

In [10]:
from pandas import DataFrame
# Pair each vocabulary word with its sentence count; row i corresponds to data_x_words[i].
data_x_words_counts = DataFrame(dict(counts=data_x_wcounts, words=data_x_words))
data_x_words_counts

Unnamed: 0,counts,words
0,165,'d
1,1,'hare
2,4,'ll
3,37,'m
4,6,'re
...,...,...
840,101,york
841,164,you
842,15,your
843,1,yyz


In [11]:
# Most frequent words first; the row index still identifies the word's feature column.
data_x_words_counts = data_x_words_counts.sort_values(by='counts', ascending=False)
data_x_words_counts.head(20)

Unnamed: 0,counts,words
760,2850,to
384,2737,from
369,1794,flights
736,1433,the
576,1056,on
820,991,what
506,903,me
368,871,flight
225,776,boston
652,748,san


In [14]:
# Convert to an ndarray so that a single feature column can be sliced out.
data_x_features = np.array(data_x_features)
# Column of the word 'flights'. Look the index up in the vocabulary instead of
# hard-coding 369, which only holds for one particular train/validation split.
flights_feature = data_x_features[:, data_x_words.index('flights')]

In [15]:
# Preview the 0/1 column selected from the feature matrix.
flights_feature

array([0, 1, 1, ..., 0, 0, 1])

In [17]:
# First three training sentences, for comparison with the first entries of flights_feature.
data_x_train[0:3]

['can i see ground transportation from long beach airport to downtown',
 "i 'd like to see flights from baltimore to atlanta that arrive before noon",
 'what flights from salt lake city to new york city arrive next saturday before 6 pm']

In [19]:
# ndarray form allows boolean-mask indexing of the labels.
data_y_train = np.array(data_y_train)
# Labels of the training sentences that contain the word 'flights'.
data_y_train[flights_feature.astype(bool)]

array(['flight', 'flight', 'flight', ..., 'flight', 'flight', 'flight'],
      dtype='<U19')

In [22]:
# Label distribution among the sentences containing 'flights'. The selection is
# spelled out instead of using IPython's `_`, which depends on which cell
# happened to run last and breaks under Restart & Run All.
np.unique(data_y_train[flights_feature != 0], return_counts=True)

(array(['aircraft', 'airfare', 'airfare+flight_time', 'airline', 'airport',
        'flight', 'flight+airfare', 'flight_no', 'flight_time', 'quantity',
        'restriction'], dtype='<U19'),
 array([   1,   45,    1,   24,    1, 1658,   13,    3,   16,   31,    1]))

In [28]:
# ndarray form allows boolean-mask indexing of the sentences.
data_x_train = np.array(data_x_train)
# Sentences that contain 'flights' yet are labelled 'airfare'.
data_x_train[(flights_feature != 0) & (data_y_train == "airfare")]

array(['show me any show me what the first class fares are of the flights from pittsburgh to san francisco on monday',
       'show me all the prices of flights from baltimore to dallas',
       'what are the fares for flights between atlanta and dfw provided by american airlines',
       'please show me all fares for flights from denver to oakland',
       'what are the fares for flights between atlanta and dfw',
       'what is the cost of flights from denver to san francisco',
       'please give me the prices for all flights from philadelphia to denver airport next sunday',
       'show fares and flights from denver to oakland',
       'show me the price of all flights from atlanta to washington dc',
       'show me prices of flights from baltimore to dallas',
       'what is the price of flights from indianapolis to memphis',
       'show me fares less than 400 dollars for flights from dallas to baltimore',
       'what are the round trip fares for flights from denver to philadelp

In [35]:
# Airfare sentences containing 'flights' that mention none of the price words.
{
    s
    for s in data_x_train[(data_y_train == "airfare") & (flights_feature != 0)]
    if not any(cue in s for cue in ('price', 'cost', 'fare'))
}

set()

In [38]:
# All airfare sentences that mention none of the price cues tried so far.
{
    s
    for s in data_x_train[(data_y_train == "airfare")]
    if not any(cue in s for cue in ('price', 'how much', 'cost', 'fare'))
}

{'give me the least expensive first class round trip ticket on us air from cleveland to miami',
 'hi could i get a one way ticket from milwaukee to orlando',
 'hi i want a round trip ticket to dallas',
 "i 'd like a round trip ticket from los angeles to tacoma washington",
 "i 'd like a ticket from denver to atlanta with a stopover in pittsburgh is this possible",
 "i 'd like to buy a coach class ticket from denver to atlanta with a stopover in pittsburgh",
 "i 'd like to have some information on a ticket from denver to pittsburgh and atlanta",
 'i need 2 first class tickets from dallas fort worth to san francisco',
 'i need a first class ticket on united airlines from denver to baltimore scheduled for december seventeenth',
 'i need a ticket from los angeles to charlotte that leaves early in the morning',
 'i need one first class ticket from dallas fort worth to san francisco',
 'i want a one way ticket from dallas to baltimore',
 'i would like a ticket leaving from denver colorado to

In [39]:
def airfare_predictor(s):
    """Count how many airfare cue substrings occur in the sentence `s`.

    Returns an int; 0 means no cue matched, so the result also works as a
    truthy/falsy flag.
    """
    # The spaces inside the cues are significant: they are part of the
    # substring that must be present in `s`.
    cues = (' price', ' cost ', ' fare', ' rate', ' ticket', 'how much')
    return sum(cue in s for cue in cues)


# Backward-compatible alias: other cells refer to the original, misspelled name.
airfare_preditor = airfare_predictor


In [40]:
def f1_score(tp, fp, fn):
    """Return the F1 score 2*tp / (2*tp + fp + fn).

    Args:
        tp: true-positive count.
        fp: false-positive count.
        fn: false-negative count.

    Raises:
        AssertionError: if all three counts are zero (the score is undefined).
    """
    assert tp or fn or fp, "F1 is undefined when tp, fp and fn are all zero"
    return 2 * tp / (2 * tp + fn + fp)


def f1_scores(confusion_matrix):
    """Compute per-label counts and F1 from a nested confusion matrix.

    Args:
        confusion_matrix: mapping ``actual label -> {predicted label -> count}``.

    Returns:
        A list of ``(label, tp, fp, fn, f1)`` tuples sorted by label, covering
        every label that occurs as an actual or as a predicted value.
    """
    tp = defaultdict(float)
    fp = defaultdict(float)
    fn = defaultdict(float)
    all_labels = set(confusion_matrix)

    for actual, row in confusion_matrix.items():
        for predicted, value in row.items():
            all_labels.add(predicted)
            if predicted == actual:
                tp[actual] = value
            else:
                # A wrong prediction is a miss for the actual label and a
                # false alarm for the predicted one.
                fn[actual] += value
                fp[predicted] += value

    return [
        (lb, tp[lb], fp[lb], fn[lb], f1_score(tp[lb], fp[lb], fn[lb]))
        for lb in sorted(all_labels)
    ]

In [46]:
def confusion(model, on_data : tuple[list[str], list[str]]):
    """Tally the predictions of `model` into a nested count table.

    `on_data` is an (inputs, labels) pair; `model` is called once per input.
    The result `cm` satisfies: `cm[actual][predicted]` is the number of
    examples with label `actual` for which `model` returned `predicted`.
    Missing entries read as 0 because both levels are defaultdicts.
    """
    def new_row():
        return defaultdict(int)

    counts = defaultdict(new_row)
    for sample, actual in zip(*on_data):
        predicted = model(sample)
        counts[actual][predicted] += 1
    return counts

In [44]:
# Both formatters are plain `str` for now; they are separate names so that the
# count and the score rendering can be changed independently.
format_integer = str
format_confusion = str


def format_f1_scores(scores: list[tuple[str, int, int, int, float]]):
    """Render score tuples as indented text, one label per line.

    Each tuple is ``(label, *counts, value)`` and becomes
    ``"  label (c1, c2, ...): value"``; the lines are joined with newlines.
    """
    lines = []
    for label, *nums, value in scores:
        counts = ', '.join(format_integer(n) for n in nums)
        lines.append(f"  {label} ({counts}): {format_confusion(value)}")
    return "\n".join(lines)

def print_train_f1_scores(model, on_data=None):
    """Print the per-label counts and F1 scores of `model`.

    Args:
        model: callable mapping one sentence to a predicted label.
        on_data: (inputs, labels) pair to evaluate on. When omitted, the
            current `data_x_train` / `data_y_train` are used.
    """
    if on_data is None:
        # Resolve the default at call time: a default evaluated when the
        # function is defined keeps pointing at the old objects after the
        # data cells are re-run.
        on_data = (data_x_train, data_y_train)
    print(format_f1_scores(f1_scores(confusion(model, on_data))))

In [47]:
# Two-label baseline: 'airfare' when any airfare cue matches, otherwise 'flight'.
print_train_f1_scores(lambda s: 'airfare' if airfare_preditor(s) else 'flight')

  abbreviation (0.0, 0.0, 93.0): 0.0
  aircraft (0.0, 0.0, 58.0): 0.0
  airfare (299, 116.0, 20.0): 0.8147138964577657
  airfare+flight_time (0.0, 0.0, 1.0): 0.0
  airline (0.0, 0.0, 118.0): 0.0
  airline+flight_no (0.0, 0.0, 1.0): 0.0
  airport (0.0, 0.0, 13.0): 0.0
  capacity (0.0, 0.0, 11.0): 0.0
  cheapest (0.0, 0.0, 1.0): 0.0
  city (0.0, 0.0, 15.0): 0.0
  distance (0.0, 0.0, 13.0): 0.0
  flight (2703, 589.0, 52.0): 0.8939970233173474
  flight+airfare (0.0, 0.0, 14.0): 0.0
  flight_no (0.0, 0.0, 9.0): 0.0
  flight_time (0.0, 0.0, 44.0): 0.0
  ground_fare (0.0, 0.0, 14.0): 0.0
  ground_service (0.0, 0.0, 185.0): 0.0
  meal (0.0, 0.0, 2.0): 0.0
  quantity (0.0, 0.0, 37.0): 0.0
  restriction (0.0, 0.0, 4.0): 0.0


In [61]:
def ground_predictor(s):
    """Count how many ground-transport cue substrings occur in the sentence `s`.

    Returns an int; 0 means no cue matched. The spaces inside the cues are part
    of the match, so e.g. ' car ' does not match a sentence that ends in "car".
    """
    cues = ('ground transport', ' car ', ' cars ', ' limo', ' taxi', ' downtown')
    return sum(cue in s for cue in cues)

# Ground-service cues take priority, then airfare cues; everything else is 'flight'.
print_train_f1_scores(lambda s:'ground_service' if ground_predictor(s) else 'airfare' if airfare_preditor(s) else 'flight')


  abbreviation (0.0, 0.0, 93.0): 0.0
  aircraft (0.0, 0.0, 58.0): 0.0
  airfare (296, 103.0, 23.0): 0.8245125348189415
  airfare+flight_time (0.0, 0.0, 1.0): 0.0
  airline (0.0, 0.0, 118.0): 0.0
  airline+flight_no (0.0, 0.0, 1.0): 0.0
  airport (0.0, 0.0, 13.0): 0.0
  capacity (0.0, 0.0, 11.0): 0.0
  cheapest (0.0, 0.0, 1.0): 0.0
  city (0.0, 0.0, 15.0): 0.0
  distance (0.0, 0.0, 13.0): 0.0
  flight (2697, 403.0, 58.0): 0.921263877028181
  flight+airfare (0.0, 0.0, 14.0): 0.0
  flight_no (0.0, 0.0, 9.0): 0.0
  flight_time (0.0, 0.0, 44.0): 0.0
  ground_fare (0.0, 0.0, 14.0): 0.0
  ground_service (179, 29.0, 6.0): 0.910941475826972
  meal (0.0, 0.0, 2.0): 0.0
  quantity (0.0, 0.0, 37.0): 0.0
  restriction (0.0, 0.0, 4.0): 0.0


In [62]:
# Training sentences ('x') next to their labels ('y') for row-wise inspection.
df = DataFrame({'x': data_x_train, 'y': data_y_train})

In [63]:
# Add each predictor's raw score (number of matched cues) as a column.
df['gs_p'] = df['x'].apply(ground_predictor)
df['a_p'] = df['x'].apply(airfare_preditor)
df

Unnamed: 0,x,y,gs_p,a_p
0,can i see ground transportation from long beac...,ground_service,2,0
1,i 'd like to see flights from baltimore to atl...,flight,0,0
2,what flights from salt lake city to new york c...,flight,0,0
3,i need a flight from denver to philadelphia on...,flight,0,0
4,ground transportation in san jose,ground_service,1,0
...,...,...,...,...
3702,i need to go from boston to atlanta in the sam...,flight,0,0
3703,oakland to denver,flight,0,0
3704,what does us stand for,abbreviation,0,0
3705,how much time does it take to go between the a...,ground_service,2,1


In [64]:
# Misses of the ground predictor: labelled ground_service but no cue matched.
df[(df.y == 'ground_service') & (df.gs_p == 0)]

Unnamed: 0,x,y,gs_p,a_p
104,i 'd like to know the ground travel available ...,ground_service,0,0
462,train to newark,ground_service,0,0
1299,in new york i 'll need to rent a car,ground_service,0,0
1430,ground san francisco,ground_service,0,0
1445,what is the transportation time from the airpo...,ground_service,0,0
3280,in pittsburgh i 'd like to rent a car,ground_service,0,0


In [65]:
# Rows where a ground cue matched although the label is not ground_service.
df[(df.y != 'ground_service') & (df.gs_p != 0)]


Unnamed: 0,x,y,gs_p,a_p
76,how do i get from pittsburgh airport to downto...,flight,1,0
146,what is the distance from la guardia to new yo...,distance,1,0
232,how far is oakland airport from downtown,distance,1,0
349,how much does it cost to rent a car in tacoma,ground_fare,1,2
375,what are the costs of car rental in dallas,ground_fare,1,0
401,how much is the ground transportation between ...,ground_fare,2,1
481,what is the cost of limousine service in phila...,ground_fare,1,1
533,what are the rental car rates in dallas,ground_fare,1,1
922,how do i get from the airport to downtown in t...,flight,1,0
950,what is the cost of the air taxi operation at ...,ground_fare,1,1


In [75]:
def costs_predictor(s):
    """Count the price/cost cue substrings that occur in the sentence `s`."""
    cues = (
        ' price ', ' cost ', ' fare ', ' prices ', ' costs', ' fares',
        ' rate ', ' ticket ', ' rates ', ' tickets ', 'how much',
    )
    return sum(cue in s for cue in cues)


def distance_predictor(s):
    """Count the distance cue substrings that occur in the sentence `s`."""
    cues = (' distance ', ' distances ', 'how far ', 'how long ')
    return sum(cue in s for cue in cues)


def flights_predictor(s):
    """Count the flight-listing cue substrings that occur in the sentence `s`."""
    cues = (' flights ', ' list ')
    return sum(cue in s for cue in cues)


def city_predictor(s):
    """Count the city cue substrings that occur in the sentence `s`."""
    cues = ('what city ', 'where is ', ' cities ', 'time zone')
    return sum(cue in s for cue in cues)

In [76]:
def air_or_ground_model(s):
    """Assign a label to the sentence `s` using the cue predictors.

    Rules are tried in priority order: city, distance, ground (split into
    'ground_fare' / 'ground_service' by the cost cues), cost (split into
    'flight+airfare' / 'airfare' by the flight cues), and finally 'flight'.
    """
    if city_predictor(s):
        return 'city'
    if distance_predictor(s):
        return 'distance'

    mentions_cost = bool(costs_predictor(s))
    if ground_predictor(s):
        return 'ground_fare' if mentions_cost else 'ground_service'
    if not mentions_cost:
        return 'flight'
    return 'flight+airfare' if flights_predictor(s) else 'airfare'

# Score the combined rule-based model with the default evaluation data.
print_train_f1_scores(air_or_ground_model)

  abbreviation (0.0, 0.0, 93.0): 0.0
  aircraft (0.0, 0.0, 58.0): 0.0
  airfare (253, 49.0, 66.0): 0.8148148148148148
  airfare+flight_time (0.0, 0.0, 1.0): 0.0
  airline (0.0, 0.0, 118.0): 0.0
  airline+flight_no (0.0, 0.0, 1.0): 0.0
  airport (0.0, 0.0, 13.0): 0.0
  capacity (0.0, 0.0, 11.0): 0.0
  cheapest (0.0, 0.0, 1.0): 0.0
  city (13, 8.0, 2.0): 0.7222222222222222
  distance (13, 1.0, 0.0): 0.9629629629629629
  flight (2707, 376.0, 48.0): 0.9273723878040425
  flight+airfare (12, 74.0, 2.0): 0.24
  flight_no (0.0, 0.0, 9.0): 0.0
  flight_time (0.0, 0.0, 44.0): 0.0
  ground_fare (12, 5.0, 2.0): 0.7741935483870968
  ground_service (176, 8.0, 9.0): 0.9539295392953929
  meal (0.0, 0.0, 2.0): 0.0
  quantity (0.0, 0.0, 37.0): 0.0
  restriction (0.0, 0.0, 4.0): 0.0


In [77]:
# Store the combined model's predicted label for every row.
df['ag_m'] = df['x'].apply(air_or_ground_model)

In [78]:
# Rows predicted as 'airfare' whose true label is something else.
df[(df.ag_m == 'airfare') & (df.y != 'airfare')]

Unnamed: 0,x,y,gs_p,a_p,ag_m
100,what 's the difference between fare code q and...,abbreviation,0,1,airfare
131,show me the cheapest fare in the database,cheapest,0,1,airfare
325,what types of aircraft can i get a first class...,aircraft,0,1,airfare
331,i would like the flight number and the time fo...,flight_no,0,1,airfare
368,how many fare codes belong to economy class,quantity,0,1,airfare
527,how many fares are there one way from tacoma t...,quantity,0,1,airfare
537,what is fare code f,abbreviation,0,1,airfare
539,list lowest cost flight from dallas to baltimore,flight,0,1,airfare
553,please explain fare code f,abbreviation,0,1,airfare
789,what does fare code qw mean,abbreviation,0,1,airfare
