In [1]:
import sys
#Path to Trane for imports
sys.path.append('/Users/Alexander/Documents/Trane/Trane__HDI_REPO')
import pandas as pd
import trane
import json
import random
import datetime
import pickle

In [2]:
def file_to_table_meta(filepath):
    return trane.TableMeta(json.loads(open(filepath).read()))

In [162]:
path_to_datasets = '../Trane__Local_Misc/Formatted Datasets/'
#taxi_data_df = pd.read_csv(path_to_datasets + 'NYC Taxi/taxi.csv')
#taxi_table_meta = file_to_table_meta(path_to_datasets + 'NYC Taxi/taxi_meta.json')
taxi_data_df = pd.read_csv(path_to_datasets + 'Synthetic Taxi/synthetic_taxi_data.csv')
taxi_table_meta = file_to_table_meta(path_to_datasets + 'Synthetic Taxi/taxi_meta.json')

In [156]:
NUM_PROBLEMS_TO_GENERATE = 100

In [157]:
def generate_probs_and_nl(entity_id_column,
                            label_generating_column,
                            time_column,
                            table_meta,
                            filter_column,
                            is_pick_random_problems = True):
    generator = trane.PredictionProblemGenerator(table_meta, entity_id_column, label_generating_column, time_column, filter_column)
    probs = []

    all_probs = list(generator.generate())

    if is_pick_random_problems:
        random.shuffle(all_probs)

    for idx, prob in enumerate(all_probs):
        probs.append(prob)
        if idx + 1 == NUM_PROBLEMS_TO_GENERATE:
            break
    prediction_problems_json = trane.prediction_problems_to_json_file(
        probs, table_meta, entity_id_column, label_generating_column, time_column, "prediction_problems.json")

    nl_descrips = trane.generate_nl_description(
        probs, table_meta, entity_id_column, label_generating_column, time_column, trane.ConstantIntegerCutoffTimes(0))
    return probs, nl_descrips
def save_obj(obj, name):
    with open('pickled_objects/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
def load_obj(name):
    with open('pickled_objects/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)
def convert(str, format = None):
    return datetime.datetime.strptime(str, format)

# Trane
Below is the code for generating prediction problems using Trane.

In [158]:
entity_id_column = 'taxi_id'
label_generating_column = 'num_passengers'
time_column = 'start_time'
filter_column = 'vendor_id'
table_meta = taxi_table_meta
probs, nl_descrips = generate_probs_and_nl(entity_id_column, label_generating_column,
                     time_column, table_meta, filter_column)
prediction_problems_json = trane.prediction_problems_to_json_file(
    probs, table_meta, entity_id_column, label_generating_column, time_column, "prediction_problems.json")

In [168]:
nl_descrips

['For each taxi_id, predict the lastNone num_passengers is less than 0, with vendor_id equal to 0, after start_time 0.',
 'For each taxi_id, predict whether the {op} num_passengers is less than 0, with vendor_id less than 0, after start_time 0.',
 'For each taxi_id, predict the firstNone num_passengers is not equal to 0, with vendor_id less than 0, after start_time 0.',
 'For each taxi_id, predict whether the {op} records, with vendor_id greater than 0, after start_time 0.',
 'For each taxi_id, predict the sum of the fluctuation of num_passengers, with vendor_id less than 0, after start_time 0.',
 'For each taxi_id, predict whether the {op} num_passengers is not equal to 0, with vendor_id less than 0, after start_time 0.',
 'For each taxi_id, predict the firstNone num_passengers is less than 0, with vendor_id greater than 0, after start_time 0.',
 'For each taxi_id, predict the firstNone the exp of num_passengers, with vendor_id not equal to 0, after start_time 0.',
 'For each taxi_id,

#### Note
Splits dataframe by cutoff time and segments the according to entity.

In [163]:
datetime_constant_cutoff = datetime.date(2000, 1, 1)
taxi_data_df[time_column] = taxi_data_df[time_column].apply(convert, format = "%Y-%m-%d %H:%M:%S.%f")

taxi_data_df_pre_cutoff = taxi_data_df[taxi_data_df[time_column] < datetime_constant_cutoff]
taxi_data_df_post_cutoff = taxi_data_df[taxi_data_df[time_column] >= datetime_constant_cutoff]

taxi_data_df_pre_cutoff[label_generating_column] = pd.to_numeric(taxi_data_df_pre_cutoff[label_generating_column])
taxi_data_df_pre_cutoff[entity_id_column] = pd.to_numeric(taxi_data_df_pre_cutoff[entity_id_column])
entity_to_data_dict = trane.df_group_by_entity_id(taxi_data_df_pre_cutoff, entity_id_column)
entity_to_data_and_cutoff_dict_pre_cutoff = trane.ConstantDatetimeCutoffTime(datetime_constant_cutoff).generate_cutoffs(entity_to_data_dict)

taxi_data_df_post_cutoff[label_generating_column] = pd.to_numeric(taxi_data_df_post_cutoff[label_generating_column])
taxi_data_df_post_cutoff[entity_id_column] = pd.to_numeric(taxi_data_df_post_cutoff[entity_id_column])
entity_to_data_dict = trane.df_group_by_entity_id(taxi_data_df_post_cutoff, entity_id_column)
entity_to_data_and_cutoff_dict_post_cutoff = trane.ConstantDatetimeCutoffTime(datetime_constant_cutoff).generate_cutoffs(entity_to_data_dict)

# save_obj(entity_to_data_and_cutoff_dict, "entity_to_data_and_cutoff_dict")

### Generate Labels
This code takes 10 minutes to execute.

In [165]:
labeler = trane.Labeler()

labels_train = labeler.execute(entity_to_data_and_cutoff_dict_pre_cutoff, "prediction_problems.json").fillna(0)
labels_test = labeler.execute(entity_to_data_and_cutoff_dict_post_cutoff, "prediction_problems.json").fillna(0)

save_obj(labels_train, "labels_train")
save_obj(labels_test, "labels_test")

# Feature Tools
Below is the code for generating features using DFS from featuretools.

In [17]:
import featuretools as ft

### The input dataset:

In [39]:
taxi_data_df

Unnamed: 0,vendor_id,taxi_id,trip_id,distance,duration,fare,num_passengers,start_time,end_time,unique_entry_id
0,0,11,111,6.62,10.51,38.32,2,1999-12-13 15:02:04.057898,1999-12-13 15:12:34.657898,0
1,0,11,112,2.81,11.02,31.11,3,1999-05-29 23:56:16.015491,1999-05-30 00:07:17.215491,1
2,0,11,113,4.06,16.86,45.49,1,2000-05-01 23:37:46.946319,2000-05-01 23:54:38.546319,2
3,0,11,114,6.82,6.85,31.43,2,1999-10-23 04:36:39.851697,1999-10-23 04:43:30.851697,3
4,0,11,115,4.55,11.30,35.43,3,2000-03-27 16:55:30.144056,2000-03-27 17:06:48.144056,4
5,0,11,116,5.00,23.55,60.90,2,1999-01-19 00:24:34.284707,1999-01-19 00:48:07.284707,5
6,0,11,117,5.09,19.78,53.55,1,1999-04-01 03:41:41.557501,1999-04-01 04:01:28.357501,6
7,0,11,118,4.89,21.02,55.60,4,2000-02-21 09:08:56.976828,2000-02-21 09:29:58.176828,7
8,0,11,119,3.53,14.62,39.86,3,1999-11-26 19:21:54.927817,1999-11-26 19:36:32.127817,8
9,0,11,1110,4.37,3.19,18.82,2,1999-03-13 11:00:13.139864,1999-03-13 11:03:24.539864,9


#### Note
The code below is responsible for making two additional dataframes that are part of the integration with feature tools.

In [129]:
taxi_data_df_pre_cutoff_time = taxi_data_df[taxi_data_df['start_time'] < datetime_constant_cutoff]
taxi_data_df_post_cutoff_time = taxi_data_df[taxi_data_df['start_time'] >= datetime_constant_cutoff]

taxi_ids = taxi_data_df_pre_cutoff_time['taxi_id'].unique()
columns = ['taxi_id', 'vendor_id']
rows = []
for taxi_id in taxi_ids:
    vendor_id = int(str(taxi_id)[0]) - 1
    rows.append([taxi_id, vendor_id])

taxi_id_df = pd.DataFrame(rows, columns = columns)
vendor_id_df = pd.DataFrame(taxi_data_df_pre_cutoff_time['vendor_id'].unique(), columns = ['vendor_id'])

In [132]:
entities = {
    "taxis": (taxi_id_df, "taxi_id"),
    "vendors": (vendor_id_df, "vendor_id"),
    "trips": (taxi_data_df_pre_cutoff_time, "unique_entry_id")
}
relationships = [
    ("vendors", "vendor_id", "taxis", "taxi_id"),
    ("taxis", "taxi_id", "trips", "unique_entry_id")
]
feature_matrix, feature_definitions = ft.dfs(entities = entities,
      relationships = relationships,
      target_entity = "trips")
feature_matrix_train = feature_matrix.fillna(0.0)


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  df = self.df.loc[instance_vals]


In [131]:
entities = {
    "taxis": (taxi_id_df, "taxi_id"),
    "vendors": (vendor_id_df, "vendor_id"),
    "trips": (taxi_data_df_post_cutoff_time, "unique_entry_id")
}
relationships = [
    ("vendors", "vendor_id", "taxis", "taxi_id"),
    ("taxis", "taxi_id", "trips", "unique_entry_id")
]
feature_matrix, feature_definitions = ft.dfs(entities = entities,
      relationships = relationships,
      target_entity = "trips")
feature_matrix_test = feature_matrix.fillna(0.0)


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  df = self.df.loc[instance_vals]


In [138]:
features_by_entity_train = feature_matrix_train.groupby('taxi_id').mean()
features_by_entity_test = feature_matrix_test.groupby('taxi_id').mean()

# ATM
Below is the code for automatically tuning the model.

# Model
Below is the code for training the model.

In [110]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [111]:
logistic = linear_model.LogisticRegression()

In [166]:
X_training = features_by_entity_train
Y_training_problems = []
for i in range(1, NUM_PROBLEMS_TO_GENERATE + 1):
    Y_training_problem = labels_train['Problem ' + str(i)]
    Y_training_problems.append(Y_training_problem)

#### Note
Most of the problems labels are not fittable because no two labels differ. In other words, the labels are all the same because the prediction problems are too mundane.

In [167]:
for i in range(NUM_PROBLEMS_TO_GENERATE):
    Y_training = Y_training_problems[i]
    try:
        logistic.fit(X_training, Y_training)
        break
    except:
        print("Problem {} not fittable".format(i))
        continue

Problem 0 not fittable
Problem 1 not fittable
Problem 2 not fittable
Problem 3 not fittable
Problem 4 not fittable
Problem 5 not fittable
Problem 6 not fittable
Problem 7 not fittable
Problem 8 not fittable
Problem 9 not fittable
Problem 10 not fittable
Problem 11 not fittable
Problem 12 not fittable
Problem 13 not fittable
Problem 14 not fittable
Problem 15 not fittable
Problem 16 not fittable
Problem 17 not fittable
Problem 18 not fittable
Problem 19 not fittable
Problem 20 not fittable
Problem 21 not fittable
Problem 22 not fittable
Problem 23 not fittable
Problem 24 not fittable
Problem 25 not fittable
Problem 26 not fittable
Problem 27 not fittable
Problem 28 not fittable
Problem 29 not fittable
Problem 30 not fittable
Problem 31 not fittable
Problem 32 not fittable
Problem 33 not fittable
Problem 34 not fittable
Problem 35 not fittable
Problem 36 not fittable
Problem 37 not fittable
Problem 38 not fittable
Problem 39 not fittable
Problem 40 not fittable
Problem 41 not fittable
Pr

In [None]:
X_test = features_by_entity_test
Y_test_problems = []
for i in range(1, 11):
    Y_test_problem = labels_test['Problem ' + str(i)]
    Y_test_problems.append(Y_test_problem)
    