## Imports and Helper Functions

In [1]:
import sys
#Path to Trane for imports
sys.path.append('/Users/Alexander/Documents/Trane/Trane__HDI_REPO')
path_to_datasets = '../../Trane__Local_Misc/Formatted Datasets/Yelp Reviews/'
import pandas as pd
import trane
import json
import random
import datetime
import pickle
import featuretools as ft
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model, decomposition, datasets
from sklearn.metrics import accuracy_score
from sklearn import metrics
import scikitplot as skplt

def save_obj(obj, name):
    with open('../pickled_objects/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
def load_obj(name):
    try:
        with open('../pickled_objects/' + name + '.pkl', 'rb') as f:
            return pickle.load(f)
    except:
        return None
def generate_probs(entity_id_column,
                            label_generating_column,
                            time_column,
                            table_meta,
                            filter_column,
                            is_pick_random_problems = True):
    generator = trane.PredictionProblemGenerator(table_meta, entity_id_column, label_generating_column, time_column, filter_column)
    probs = []

    all_probs = list(generator.generate())

    if is_pick_random_problems:
        random.shuffle(all_probs)

    for idx, prob in enumerate(all_probs):
        probs.append(prob)
        if idx + 1 == NUM_PROBLEMS_TO_GENERATE:
            break
    prediction_problems_json = trane.prediction_problems_to_json_file(
        probs, table_meta, entity_id_column, label_generating_column, time_column, "prediction_problems.json")

    return probs

def convert(str, format = None):
    return datetime.datetime.strptime(str, format)
def file_to_table_meta(filepath):
    return trane.TableMeta(json.loads(open(filepath).read()))

## Load, Format and Sample Data

In [2]:
filename = 'merged_df'
merged_df = None #load_obj(filename)
if merged_df is None:
    
    yelp_review_df = pd.read_csv(path_to_datasets + 'yelp_review.csv')
    yelp_checkin_df = pd.read_csv(path_to_datasets + 'yelp_checkin.csv')
    yelp_business_df = pd.read_csv(path_to_datasets + 'yelp_business.csv')
    yelp_user_df = pd.read_csv(path_to_datasets + 'yelp_user.csv')

    sampled_yelp_review_df = yelp_review_df.head(1000)
    sampled_business_ids = sampled_yelp_review_df['business_id'].unique()
    sampled_user_ids = sampled_yelp_review_df['user_id'].unique()
    sampled_review_ids = sampled_yelp_review_df['review_id'].unique()
    sampled_yelp_checkin_df = yelp_checkin_df[yelp_checkin_df['business_id'].isin(sampled_business_ids)]
    sampled_yelp_business_df = yelp_business_df[yelp_business_df['business_id'].isin(sampled_business_ids)] 
    sampled_yelp_user_df = yelp_user_df[yelp_user_df['user_id'].isin(sampled_user_ids)]

    assert(len(sampled_business_ids) == len(sampled_yelp_business_df))
    assert(len(sampled_user_ids) == len(sampled_yelp_user_df))
    assert(len(sampled_review_ids) == len(sampled_yelp_review_df))

    print("Sampling Reuslts ---")
    print("Number of reviews: {}".format(len(sampled_yelp_review_df)))
    print("Number of businesses: {}".format(len(sampled_business_ids)))
    print("Number of users: {}".format(len(sampled_user_ids)))
    print("Number of checkins: {}".format(len(sampled_yelp_checkin_df)))

    merge_step_1 = pd.merge(sampled_yelp_review_df, sampled_yelp_user_df, how = 'left', on ='user_id')
    merge_step_2 = pd.merge(merge_step_1, sampled_yelp_business_df, how = 'left', on = 'business_id')
    merge_step_3 = pd.merge(merge_step_2, sampled_yelp_checkin_df, how = 'right', on = 'business_id')
    merged_df = merge_step_3
    merged_df['date'] = merged_df['date'].apply(str)
    merged_df['date'] = merged_df['date'].apply(convert, format = '%Y-%m-%d')
    merged_df = merged_df.rename(columns = {'stars_x': 'stars'})

    distinct_business_ids_in_merged_df = merged_df['business_id'].unique()
    #Note merged_df only contains 959 distinct business_ids. Checkins only contains information from 959 businesses.
    #    That's why merged_df only has 959 distinct business_ids, as opposed to the 974 unique business_ids 
    #    contained in the sample_yelp_review_df.
    #Note merged_df contains more than the 78792 unique check-ins because there are multiple reviews for some business_ids so
    #    each unique review_id is matched to a new business_id. There are 1000 review_ids, but only 974 business_ids.
    #
    save_obj(merged_df, filename)

Sampling Reuslts ---
Number of reviews: 1000
Number of businesses: 974
Number of users: 45
Number of checkins: 78792


## Generate Prediction Problems

In [3]:
table_meta = file_to_table_meta(path_to_datasets + "meta.json")
entity_id_column = 'business_id'
label_generating_column = 'stars'
time_column = 'date'
filter_column = 'user_id'
NUM_PROBLEMS_TO_GENERATE = 100

### Note: Code Below Takes 10 Minutes to Execute without Hashing. With Hashing 8 Seconds

In [4]:
prediction_problems_filename = "../JSON Files/yelp_prediction_problems.json"

problem_generator_obj = trane.PredictionProblemGenerator(table_meta, entity_id_column, label_generating_column, time_column, filter_column)
problem_generator = problem_generator_obj.generator(merged_df)
problems = []
problems = list(problem_generator)

# greaterRowOp = trane.ops.GreaterRowOp(label_generating_column)
# greaterRowOp.set_thresholds(table_meta)

# for prob in probs:
#     prob.operations.append(greaterRowOp)

trane.prediction_problems_to_json_file(problems, table_meta, 
                                       entity_id_column, label_generating_column, 
                                       time_column, 
                                       prediction_problems_filename)


In [5]:
labeler = trane.Labeler()
entity_to_data_dict = trane.df_group_by_entity_id(merged_df, entity_id_column)
training_cutoff_time = datetime.date(2012, 1, 1)
label_cutoff_time = datetime.date(2015, 1, 1)
#First date: 2007-06-12
#Last date: 2017-12-10
entity_to_data_and_cutoff_dict = trane.ConstantCutoffTime(training_cutoff_time, label_cutoff_time).generate_cutoffs(entity_to_data_dict)
labels = labeler.execute(entity_to_data_and_cutoff_dict, prediction_problems_filename)

## Deep Feature Synthesis with Feature Tools

In [6]:
entities = {
    "business": (sampled_yelp_business_df, "business_id"),
    "reviews": (sampled_yelp_review_df, "review_id"),
    "users": (sampled_yelp_user_df, "user_id"),
    "checkins": (sampled_yelp_checkin_df, "checkin_id")
}
relationships = [
    ("business", "business_id", "reviews", "business_id"),
    ("users", "user_id", "reviews", "user_id"),
    ("business", "business_id", "checkins", "business_id")
]

#----FEATURES FOR TRAINING----#
training_features_cutoff_times = pd.DataFrame([[_, training_cutoff_time] for _ in sampled_business_ids], columns = ['business_id', 'cutoff_time'])

training_features_matrix, training_feature_definitions = ft.dfs(entities = entities,
    relationships = relationships,
    target_entity = "business",
    cutoff_time = training_features_cutoff_times)
training_features_matrix, training_features = ft.encode_features(training_features_matrix, training_feature_definitions)

#----FEATURES FOR TEST----#
test_features_cutoff_times = pd.DataFrame([[_, label_cutoff_time] for _ in sampled_business_ids], columns = ['business_id', 'cutoff_time'])

test_features_matrix, test_feature_definitions = ft.dfs(entities = entities,
    relationships = relationships,
    target_entity = "business",
    cutoff_time = test_features_cutoff_times)
test_features_matrix, test_features = ft.encode_features(test_features_matrix, test_feature_definitions)

## Logistic Regression Fitting and Prediciton

In [14]:
logistic = linear_model.LogisticRegression()

for idx, label in enumerate(labels):
    print('\n\n\n')
    print('------------------------------------------------------------------------------------------------------------------')
    print("Training logistic regression on prediction problem: {}, idx: {}".format(problems[idx], idx))
    label = label.dropna()
    training_features = training_features_matrix[training_features_matrix.index.isin(label.business_id)]
    training_labels = list(label['problem_label_excluding_data_post_label_cutoff_time'])
    
    test_features = test_features_matrix[test_features_matrix.index.isin(label.business_id)]
    test_labels = list(label['problem_label_all_data'])
    try:
        logistic.fit(training_features, training_labels)
    except Exception as e:
        print("ERROR: {}".format(e))
        print("label: \n{}".format(label))
        continue
    
    predicted_labels = logistic.predict(test_features)
    predicted_probabilities = logistic.predict_proba(test_features)
    
    accuracy = accuracy_score(test_labels, predicted_labels)
    print("Classifier Accuracy: {0:.4f}".format(accuracy))

    accuracy_no_prediction = accuracy_score(training_labels, predicted_labels)
    print("Baseline Accuracy Metric: {0:.4f}, based on using training_labels as the prediction".format(accuracy))

    try:
        skplt.metrics.plot_roc_curve(test_labels, predicted_probabilities)
    except Exception as e:
        print("Test labels: {}".format(test_labels['problem_label_excluding_data_post_label_cutoff_time']))
        print("Predicted probabilities: {}".format(predicted_probabilities))
        print("ERROR: {}".format(e))
        continue
#     plt.show()






------------------------------------------------------------------------------------------------------------------
Training logistic regression on prediction problem: AllFilterOp(user_id)->IdentityRowOp(stars)->IdentityTransformationOp(stars)->FirstAggregationOp(stars), idx: 0
Classifier Accuracy: 0.8013
Baseline Accuracy Metric: 0.8013, based on using training_labels as the prediction




------------------------------------------------------------------------------------------------------------------
Training logistic regression on prediction problem: AllFilterOp(user_id)->GreaterRowOp(stars)->IdentityTransformationOp(stars)->FirstAggregationOp(stars), idx: 1
Classifier Accuracy: 0.9969
Baseline Accuracy Metric: 0.9969, based on using training_labels as the prediction




------------------------------------------------------------------------------------------------------------------
Training logistic regression on prediction problem: AllFilterOp(user_id)->EqRowOp(stars)->Identi

Classifier Accuracy: 0.8013
Baseline Accuracy Metric: 0.8013, based on using training_labels as the prediction




------------------------------------------------------------------------------------------------------------------
Training logistic regression on prediction problem: AllFilterOp(user_id)->GreaterRowOp(stars)->ObjectFrequencyTransformationOp(stars)->FirstAggregationOp(stars), idx: 9
Classifier Accuracy: 0.9969
Baseline Accuracy Metric: 0.9969, based on using training_labels as the prediction




------------------------------------------------------------------------------------------------------------------
Training logistic regression on prediction problem: AllFilterOp(user_id)->EqRowOp(stars)->ObjectFrequencyTransformationOp(stars)->FirstAggregationOp(stars), idx: 10
Classifier Accuracy: 0.7136
Baseline Accuracy Metric: 0.7136, based on using training_labels as the prediction




-------------------------------------------------------------------------------------------

TypeError: list indices must be integers or slices, not str