In [2]:
import sys
#Path to Trane for imports
sys.path.append('/Users/Alexander/Documents/Trane/Trane__HDI_REPO')
import pandas as pd
import trane
import json
import random

In [3]:
def file_to_table_meta(filepath):
    """Build a ``trane.TableMeta`` from a JSON file on disk.

    Args:
        filepath: Path to a JSON file. Its parsed content is handed to
            ``trane.TableMeta`` unchanged.

    Returns:
        The object returned by ``trane.TableMeta(parsed_json)``.

    Raises:
        IOError/OSError: If the file cannot be opened.
        ValueError: If the file does not contain valid JSON.
    """
    # A context manager closes the handle deterministically, even when JSON
    # parsing fails; the previous bare open(...).read() leaked it.
    with open(filepath) as meta_file:
        parsed_meta = json.load(meta_file)
    return trane.TableMeta(parsed_meta)

In [4]:
# IMPORT DATA
# Every dataset lives under this directory as a CSV plus a table-meta JSON.
path_to_datasets = '../Trane__Local_Misc/Formatted Datasets/'


def _load_dataset(csv_relpath, meta_relpath):
    """Read one dataset's CSV and its table meta, both relative to path_to_datasets."""
    records_df = pd.read_csv(path_to_datasets + csv_relpath)
    meta = file_to_table_meta(path_to_datasets + meta_relpath)
    return records_df, meta


taxi_data_df, taxi_table_meta = _load_dataset(
    'NYC Taxi/taxi.csv', 'NYC Taxi/taxi_meta.json')

yelp_review_data_df, yelp_table_meta = _load_dataset(
    'Yelp Reviews/yelp_review_sampled.csv', 'Yelp Reviews/yelp_review_meta.json')

github_data_df, github_table_meta = _load_dataset(
    'Github/github_archive.csv', 'Github/github_meta.json')

saudi_er_data_df, saudi_er_table_meta = _load_dataset(
    'Saudi ER/ER.csv', 'Saudi ER/ER_table_meta.json')

In [5]:
# Cap on how many generated prediction problems generate_probs_and_nl keeps per call.
NUM_PROBLEMS_TO_GENERATE = 10

In [6]:
def generate_probs_and_nl(entity_id_column,
                            label_generating_column,
                            time_column,
                            table_meta,
                            filter_column,
                            is_pick_random_problems = True,
                            num_problems = None,
                            json_filename = "prediction_problems.json"):
    """Generate prediction problems and their natural-language descriptions.

    All problems produced by ``trane.PredictionProblemGenerator`` are
    collected, optionally shuffled, and truncated to at most ``num_problems``.
    The kept problems are then passed to
    ``trane.prediction_problems_to_json_file`` and
    ``trane.generate_nl_description``.

    Args:
        entity_id_column: Name of the entity-id column.
        label_generating_column: Name of the column labels are derived from.
        time_column: Name of the time column.
        table_meta: Table meta object describing the dataset.
        filter_column: Name of the column used by the generator for filtering.
        is_pick_random_problems: When true, shuffle the generated problems
            (via the global ``random`` state) before truncating, so a random
            subset is kept; otherwise keep the first ones in generation order.
        num_problems: Maximum number of problems to keep. ``None`` (default)
            falls back to the module-level ``NUM_PROBLEMS_TO_GENERATE``.
        json_filename: File name forwarded to
            ``trane.prediction_problems_to_json_file``. The default matches
            the previously hard-coded name, so repeated calls with the default
            target the same file.

    Returns:
        A ``(probs, nl_descrips)`` tuple: the kept problems and whatever
        ``trane.generate_nl_description`` returns for them.

    Raises:
        ValueError: If the effective problem limit is negative.
    """
    limit = NUM_PROBLEMS_TO_GENERATE if num_problems is None else num_problems
    if limit < 0:
        raise ValueError(
            "number of problems to generate must be >= 0, got {!r}".format(limit))

    generator = trane.PredictionProblemGenerator(
        table_meta, entity_id_column, label_generating_column, time_column, filter_column)
    all_probs = list(generator.generate())

    if is_pick_random_problems:
        random.shuffle(all_probs)

    # Slicing honours a limit of 0; the former enumerate/break loop never hit
    # its stop condition in that case and returned every problem.
    probs = all_probs[:limit]

    # Return value intentionally ignored: it was never used by this function.
    trane.prediction_problems_to_json_file(
        probs, table_meta, entity_id_column, label_generating_column, time_column, json_filename)

    # NOTE(review): cutoff strategy is fixed to ConstantIntegerCutoffTimes(0);
    # confirm this is the intended cutoff for every dataset.
    nl_descrips = trane.generate_nl_description(
        probs, table_meta, entity_id_column, label_generating_column, time_column,
        trane.ConstantIntegerCutoffTimes(0))
    return probs, nl_descrips

# Taxi Data
The code below generates prediction problems and natural-language descriptions from the taxi dataset.

In [7]:
# Taxi table: 'id' is used both as the entity key and as the filter column.
table_meta = taxi_table_meta
entity_id_column = 'id'
filter_column = 'id'
label_generating_column = 'passenger_count'
time_column = 'pickup_datetime'

probs, nl_descrips = generate_probs_and_nl(
    entity_id_column=entity_id_column,
    label_generating_column=label_generating_column,
    time_column=time_column,
    table_meta=table_meta,
    filter_column=filter_column)
print(nl_descrips)

['For each id, predict the firstNone the exp of passenger_count, with id less than 0, after pickup_datetime 0.', 'For each id, predict the last passenger_count is equal to 0, with id greater than 0, after pickup_datetime 0.', 'For each id, predict the first the fluctuation of passenger_count is equal to 0, with id equal to 0, after pickup_datetime 0.', 'For each id, predict the number of records, with id equal to 0, after pickup_datetime 0.', 'For each id, predict the first the fluctuation of passenger_count is greater than 0, with id greater than 0, after pickup_datetime 0.', 'For each id, predict the sum ofNone passenger_count is less than 0, after pickup_datetime 0.', 'For each id, predict the sum of passenger_count is greater than 0, after pickup_datetime 0.', 'For each id, predict the last minus first passenger_count is equal to 0, after pickup_datetime 0.', 'For each id, predict the number of records, with id not equal to 0, after pickup_datetime 0.', 'For each id, predict the fi

# Yelp Data
The code below generates prediction problems and natural-language descriptions from the Yelp dataset.

In [8]:
# Yelp reviews table: 'user_id' is used both as the entity key and as the filter column.
table_meta = yelp_table_meta
entity_id_column = 'user_id'
filter_column = 'user_id'
label_generating_column = 'stars'
time_column = 'date'

probs, nl_descrips = generate_probs_and_nl(
    entity_id_column=entity_id_column,
    label_generating_column=label_generating_column,
    time_column=time_column,
    table_meta=table_meta,
    filter_column=filter_column)
print(nl_descrips)

['For each user_id, predict the number of records, with user_id less than 0, after date 0.', 'For each user_id, predict the first stars, after date 0.', 'For each user_id, predict the sum of stars is less than 0, with user_id not equal to 0, after date 0.', 'For each user_id, predict the sum of the fluctuation of stars is greater than 0, with user_id less than 0, after date 0.', 'For each user_id, predict the first the fluctuation of stars is less than 0, after date 0.', 'For each user_id, predict the last the fluctuation of stars is less than 0, with user_id greater than 0, after date 0.', 'For each user_id, predict the last minus first the fluctuation of stars is greater than 0, with user_id less than 0, after date 0.', 'For each user_id, predict the last the fluctuation of stars is equal to 0, with user_id less than 0, after date 0.', 'For each user_id, predict the last minus first stars, after date 0.', 'For each user_id, predict the number of records, with user_id less than 0, aft

# Github Data
The code below generates prediction problems and natural-language descriptions from the GitHub dataset.

In [9]:
# GitHub archive table: entities are actors, labels are derived from 'repo'.
table_meta = github_table_meta
entity_id_column = 'actor'
label_generating_column = 'repo'
time_column = 'created_at'
# NOTE(review): every other dataset cell filters on its own entity column;
# 'user_id' here may be a copy-paste from the Yelp cell (expected 'actor'?).
# Confirm against github_meta.json before changing.
filter_column = 'user_id'

probs, nl_descrips = generate_probs_and_nl(
    entity_id_column=entity_id_column,
    label_generating_column=label_generating_column,
    time_column=time_column,
    table_meta=table_meta,
    filter_column=filter_column)
print(nl_descrips)

['For each actor, predict the last minus firstNone repo is less than 0, with user_id not equal to 0, after created_at 0.', 'For each actor, predict the sum of the fluctuation of the exp of repo, with user_id less than 0, after created_at 0.', 'For each actor, predict the sum ofNone repo is not equal to 0, with user_id greater than 0, after created_at 0.', 'For each actor, predict the last minus first the fluctuation of repo is equal to 0, with user_id less than 0, after created_at 0.', 'For each actor, predict the number of records, after created_at 0.', 'For each actor, predict the number of records, with user_id equal to 0, after created_at 0.', 'For each actor, predict the last minus first repo, with user_id less than 0, after created_at 0.', 'For each actor, predict the last the exp of repo, with user_id less than 0, after created_at 0.', 'For each actor, predict the number of records, with user_id less than 0, after created_at 0.', 'For each actor, predict the lastNone repo is gre

# Saudi ER Data
The code below generates prediction problems and natural-language descriptions from the Saudi ER dataset.

In [10]:
# Saudi ER table: 'PATIENT_ID' is used both as the entity key and as the filter column.
table_meta = saudi_er_table_meta
entity_id_column = 'PATIENT_ID'
filter_column = 'PATIENT_ID'
label_generating_column = 'HOSP_CODE'
time_column = 'TIME_ARRIVED'

probs, nl_descrips = generate_probs_and_nl(
    entity_id_column=entity_id_column,
    label_generating_column=label_generating_column,
    time_column=time_column,
    table_meta=table_meta,
    filter_column=filter_column)
print(nl_descrips)

['For each PATIENT_ID, predict the firstNone HOSP_CODE is less than 0, after TIME_ARRIVED 0.', 'For each PATIENT_ID, predict the last HOSP_CODE is equal to 0, with PATIENT_ID not equal to 0, after TIME_ARRIVED 0.', 'For each PATIENT_ID, predict the last minus first the exp of HOSP_CODE, with PATIENT_ID equal to 0, after TIME_ARRIVED 0.', 'For each PATIENT_ID, predict the last minus first the fluctuation of HOSP_CODE is less than 0, after TIME_ARRIVED 0.', 'For each PATIENT_ID, predict the firstNone HOSP_CODE, with PATIENT_ID greater than 0, after TIME_ARRIVED 0.', 'For each PATIENT_ID, predict the sum ofNone HOSP_CODE, after TIME_ARRIVED 0.', 'For each PATIENT_ID, predict the first HOSP_CODE is less than 0, with PATIENT_ID greater than 0, after TIME_ARRIVED 0.', 'For each PATIENT_ID, predict the last minus first the fluctuation of HOSP_CODE is less than 0, with PATIENT_ID less than 0, after TIME_ARRIVED 0.', 'For each PATIENT_ID, predict the last HOSP_CODE is not equal to 0, with PATIE