In [3]:
import itertools
from collections import Counter
import pandas
from matplotlib import pyplot as pl
import numpy as np
training_fname = 'data/PrediqtTrainData.Sample.1000000.csv'
airports_fname = 'data/PrediqtAirports.csv'
testing_fname = 'data/PrediqtTestDataSmall.csv'

In [4]:
training_data = pandas.read_csv(training_fname, dtype={
    'PRICE': float, 
})

In [5]:
for key in ('SEARCH_DATEHOUR', 'OUTBOUND_DATE', 'INBOUND_DATE'):
    training_data[key] = training_data[key].astype('datetime64')

In [6]:
training_data = training_data.sort_index(by='PRICE').sample(500000)

AttributeError: 'DataFrame' object has no attribute 'sample'

In [None]:
price_counts = training_data.PRICE.value_counts(sort=False)

In [None]:
num_prices = len(training_data.PRICE)
num_buckets = 150.
buckets = [0.]
running_total = 0
for price, count in price_counts.sort_index().iteritems():
    running_total += count
    if running_total >= num_prices / num_buckets:
        buckets.append(price)
        running_total = 0
buckets.append(price * 1.01)
print buckets
del price_counts

In [None]:
%matplotlib inline
from matplotlib import pyplot as pl
pl.plot(buckets)
pl.show()

In [None]:
pl.hist(training_data.PRICE, num_buckets, color='black')
pl.xlim([0, 2500])
pl.show()

In [None]:
bucket_counts = [0 for __ in xrange(int(num_buckets) + 1)]
bucket_ranges = zip(xrange(int(num_buckets) + 1), buckets, buckets[1:])
print len(training_data.PRICE)
for i_, price in enumerate(training_data.PRICE):
    for i, low, high in bucket_ranges:
        if low <= price <= high:
            bucket_counts[i] += 1
            break
    if bucket_ranges[0][-1] < price:
        bucket_ranges = bucket_ranges[1:]
        print 'adjusting range', bucket_ranges[0]

In [None]:
%matplotlib inline
pl.plot(bucket_counts)
pl.show()

I think bucket_counts looks better than bin_counts.

In [None]:
print buckets

In [None]:
len(buckets)

In [None]:
step = 1 / 142.
print step
bucket_ranges = zip(buckets, buckets[1:])
def convert(num):
    cumsum = 0
    for low, high in bucket_ranges:
        if cumsum <= num < cumsum + step:
            t = (num - cumsum) / step
            return (1 - t) * low + t * high
        cumsum += step
    t = (num - (cumsum - step)) / step
    return (1 - t) * low + t * high

In [None]:
print convert(.992)
print convert(.9925)
print convert(.993)
print convert(.9935)

In [None]:
%matplotlib inline
pl.plot([convert(i / 250.) for i in xrange(250)])
pl.xlim([0, 200])
pl.ylim([0, 1800])
pl.show()

In [None]:
training_data['SEARCH_TO_OUTBOUND'] = training_data.OUTBOUND_DATE - training_data.SEARCH_DATEHOUR

In [None]:
training_data['SEARCH_TO_INBOUND'] = training_data.INBOUND_DATE - training_data.SEARCH_DATEHOUR

In [None]:
training_data['OUTBOUND_TO_INBOUND'] = training_data.INBOUND_DATE - training_data.OUTBOUND_DATE

In [None]:
airports_data = pandas.read_csv(airports_fname)

In [None]:
training_merged = pandas.merge(training_data, airports_data, left_on='ORIGIN', right_on='AIRPORT')
del training_data

In [None]:
del training_merged['AIRPORT']
training_merged.head()

In [None]:
for key in ('CITY', 'COUNTRY', 'CONTINENT', 'LATITUDE', 'LONGITUDE', 'TRAFFIC'):
    training_merged['ORIGIN_' + key] = training_merged[key]
    del training_merged[key]

In [None]:
training_merged = pandas.merge(training_merged, airports_data, left_on='DESTINATION', right_on='AIRPORT')
del training_merged['AIRPORT']
for key in ('CITY', 'COUNTRY', 'CONTINENT', 'LATITUDE', 'LONGITUDE', 'TRAFFIC'):
    training_merged['DESTINATION_' + key] = training_merged[key]
    del training_merged[key]
training_merged.head()

In [None]:
#training_merged = training_merged.sort('PRICE')
del airports_data

In [None]:
#for key in (
#    'ORIGIN', 'DESTINATION', 'ORIGIN_CITY', 'ORIGIN_COUNTRY', 
#    'ORIGIN_CONTINENT', 'DESTINATION_CITY', 'DESTINATION_COUNTRY', 
#    'DESTINATION_CONTINENT',
#):
#    training_merged[key] = training_merged[key].astype('category')
#training_merged.head()

In [None]:
market_counts = Counter(
    market
    for __, marketlist in 
    training_merged.MARKETS.iteritems()
    for market in
    marketlist.split(';')
)
print market_counts.most_common(10)

In [None]:
len(market_counts)

In [None]:
most_common_markets = [market for market, __ in market_counts.most_common(100)]
print most_common_markets

In [None]:
market_onehots = np.zeros((len(training_merged.MARKETS), 100), dtype='bool')
for i, (marketlist, onehot) in enumerate(itertools.izip(training_merged.MARKETS, market_onehots)):
    ms = marketlist.split(';')
    for m in ms:
        try:
            onehot[most_common_markets.index(m)] = True
        except ValueError:
            onehot[-1] = True
    if not i % 1000000:
        print i

In [None]:
training_merged.MARKETS.tail(2)

In [None]:
sum(market_onehots[-1])

In [None]:
sum(market_onehots[-2])

In [None]:
import gc
gc.collect()

In [None]:
training_merged['MARKETS'] = list(market_onehots)
training_merged.MARKETS.loc[0]

In [None]:
gc.collect()

In [None]:
training_merged.MARKETS.head()

In [None]:
training_merged.dtypes

In [None]:
def make_onehot(df, field, size):
    series = df[field]
    counts = Counter(series)
    print len(counts)
    most_common = [item for item, __ in counts.most_common(size)]
    print most_common
    most_common_set = frozenset(most_common)
    excluded = sum(place not in most_common_set for place in series) / float(len(series))
    print excluded
    zeros_size = len(most_common) + (1 if excluded > 0 else 0)
    onehots = np.zeros((len(series), zeros_size), dtype='bool')
    most_common_index = most_common.index
    for i, (item, onehot) in enumerate(itertools.izip(series, onehots)):
        try:
            onehot[most_common_index(item)] = True
        except ValueError:
            onehot[-1] = True
        if not i % 1000000:
            print i
    df[field] = list(onehots)
    return df

In [None]:
pairs = {
    'ORIGIN': 99,
    'ORIGIN_CITY': 99,
    'ORIGIN_COUNTRY': 49,
    'ORIGIN_CONTINENT': 6,
    'DESTINATION': 99,
    'DESTINATION_CITY': 99,
    'DESTINATION_COUNTRY': 49,
    'DESTINATION_CONTINENT': 6,
}
for field, size in pairs.iteritems():
    print field
    gc_collect_1 = gc.collect()
    training_merged = make_onehot(training_merged, field, size)
    print gc.collect() + gc_collect_1

In [None]:
training_merged.to_hdf('my_data/TrainingMerged.h5', 'fixed')
print 'done hdf'

In [None]:
training_merged.to_pickle('my_data/TrainingMerged.pickle')

Doesn't seem like we need to rescale traffics based on distributions

In [None]:
one_day = 1000000000 * 60 * 60 * 24
oti_days = training_merged.OUTBOUND_TO_INBOUND.map(lambda x: x.item() / one_day)

In [None]:
oti_days.hist()

In [None]:
oti_days.sample(1000).hist()

In [None]:
oti_out, oti_bins = pandas.qcut(oti_days, 10, retbins=True)

In [None]:
oti_bins

In [None]:
oti_out

In [None]:
oti_out.sample(500).value_counts(sort=False)

In [None]:
one_day = 1000000000 * 60 * 60 * 24
sto_days = training_merged.SEARCH_TO_OUTBOUND.map(lambda x: x.item() / one_day)
sto_days.hist()

In [None]:
sto_out, sto_bins = pandas.qcut(sto_days, 10, retbins=True)
sto_bins

In [None]:
sto_out.sample(500).value_counts(sort=False)