In [1]:
import sys
#Path to Trane for imports
sys.path.append('/Users/Alexander/Documents/Trane/Trane__HDI_REPO')
import pandas as pd
import trane
import json
import random
import pickle


In [2]:
def file_to_table_meta(filepath):
    return trane.TableMeta(json.loads(open(filepath).read()))

In [3]:
#IMPORT DATA
path_to_datasets = '../Trane__Local_Misc/Formatted Datasets/'
taxi_data_df = pd.read_csv(path_to_datasets + 'NYC Taxi/taxi.csv')
taxi_table_meta = file_to_table_meta(path_to_datasets + 'NYC Taxi/taxi_meta.json')

yelp_review_data_df = pd.read_csv(path_to_datasets + 'Yelp Reviews/yelp_review_sampled.csv')
yelp_table_meta = file_to_table_meta(path_to_datasets + 'Yelp Reviews/yelp_review_meta.json')

github_data_df = pd.read_csv(path_to_datasets + 'Github/github_archive.csv')
github_table_meta = file_to_table_meta(path_to_datasets + 'Github/github_meta.json')

saudi_er_data_df = pd.read_csv(path_to_datasets + 'Saudi ER/ER.csv')
saudi_er_table_meta = file_to_table_meta(path_to_datasets + 'Saudi ER/ER_table_meta.json')

In [4]:
NUM_PROBLEMS_TO_GENERATE = 100

In [5]:
def generate_probs_and_nl(entity_id_column,
                            label_generating_column,
                            time_column,
                            table_meta,
                            filter_column,
                            is_pick_random_problems = True):
    generator = trane.PredictionProblemGenerator(table_meta, entity_id_column, label_generating_column, time_column, filter_column)
    probs = []

    all_probs = list(generator.generate())

    if is_pick_random_problems:
        random.shuffle(all_probs)

    for idx, prob in enumerate(all_probs):
        probs.append(prob)
        if idx + 1 == NUM_PROBLEMS_TO_GENERATE:
            break
    prediction_problems_json = trane.prediction_problems_to_json_file(
        probs, table_meta, entity_id_column, label_generating_column, time_column, "prediction_problems.json")

    nl_descrips = trane.generate_nl_description(
        probs, table_meta, entity_id_column, label_generating_column, time_column, trane.ConstantIntegerCutoffTimes(0))
    return probs, nl_descrips
def save_obj(obj, name):
    with open('pickled_objects/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
def load_obj(name):
    with open('pickled_objects/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

# Taxi Data
Below, is the code to generate prediction problems and natural language descriptions from the taxi dataset.

In [6]:
entity_id_column = 'id'
label_generating_column = 'passenger_count'
time_column = 'pickup_datetime'
filter_column = 'id'
table_meta = taxi_table_meta
probs, nl_descrips = generate_probs_and_nl(entity_id_column, label_generating_column,
                     time_column, table_meta, filter_column)
print(nl_descrips)


['For each id, predict the last minus firstNone passenger_count is not equal to 0, after pickup_datetime 0.', 'For each id, predict whether the {op} passenger_count is greater than 0, after pickup_datetime 0.', 'For each id, predict the number of records, after pickup_datetime 0.', 'For each id, predict the last minus firstNone the exp of passenger_count, after pickup_datetime 0.', 'For each id, predict the last minus firstNone passenger_count is less than 0, after pickup_datetime 0.', 'For each id, predict the last the fluctuation of passenger_count, after pickup_datetime 0.', 'For each id, predict the number of records, after pickup_datetime 0.', 'For each id, predict the first the fluctuation of the exp of passenger_count, after pickup_datetime 0.', 'For each id, predict the lastNone passenger_count, after pickup_datetime 0.', 'For each id, predict the sum ofNone passenger_count is equal to 0, after pickup_datetime 0.', 'For each id, predict whether the {op} records, after pickup_da

# Yelp Data
Below, is the code to generate prediction problems and natural language descriptions from the yelp dataset.

In [7]:
entity_id_column = 'user_id'
label_generating_column = 'stars'
time_column = 'date'
filter_column = 'user_id'
table_meta = yelp_table_meta
probs, nl_descrips = generate_probs_and_nl(entity_id_column, label_generating_column,
                     time_column, table_meta, filter_column)


In [24]:
# print(nl_descrips)
# for idx, prob in enumerate(probs):
#     print(prob)
#     print(idx)
# print(probs[12])
# save_obj(probs[12], "yelp_prediction_problem")
print(load_obj("yelp_prediction_problem"))

AllFilterOp(user_id)->IdentityRowOp(stars)->DiffTransformationOp(stars)->SumAggregationOp(stars)


# Github Data
Below, is the code to generate prediction problems and natural language descriptions from the github dataset.

In [8]:
entity_id_column = 'actor'
label_generating_column = 'repo'
time_column = 'created_at'
filter_column = 'user_id'
table_meta = github_table_meta
probs, nl_descrips = generate_probs_and_nl(entity_id_column, label_generating_column,
                     time_column, table_meta, filter_column)
print(nl_descrips)

KeyError: 'user_id'

# Saudi ER Data
Below, is the code to generate prediction problems and natural language descriptions from the Saudi ER Dataset.

In [10]:
entity_id_column = 'PATIENT_ID'
label_generating_column = 'HOSP_CODE'
time_column = 'TIME_ARRIVED'
filter_column = 'PATIENT_ID'
table_meta = saudi_er_table_meta
probs, nl_descrips = generate_probs_and_nl(entity_id_column, label_generating_column,
                     time_column, table_meta, filter_column)
print(nl_descrips)

['For each PATIENT_ID, predict the firstNone HOSP_CODE is less than 0, after TIME_ARRIVED 0.', 'For each PATIENT_ID, predict the last HOSP_CODE is equal to 0, with PATIENT_ID not equal to 0, after TIME_ARRIVED 0.', 'For each PATIENT_ID, predict the last minus first the exp of HOSP_CODE, with PATIENT_ID equal to 0, after TIME_ARRIVED 0.', 'For each PATIENT_ID, predict the last minus first the fluctuation of HOSP_CODE is less than 0, after TIME_ARRIVED 0.', 'For each PATIENT_ID, predict the firstNone HOSP_CODE, with PATIENT_ID greater than 0, after TIME_ARRIVED 0.', 'For each PATIENT_ID, predict the sum ofNone HOSP_CODE, after TIME_ARRIVED 0.', 'For each PATIENT_ID, predict the first HOSP_CODE is less than 0, with PATIENT_ID greater than 0, after TIME_ARRIVED 0.', 'For each PATIENT_ID, predict the last minus first the fluctuation of HOSP_CODE is less than 0, with PATIENT_ID less than 0, after TIME_ARRIVED 0.', 'For each PATIENT_ID, predict the last HOSP_CODE is not equal to 0, with PATIE