In [1]:
"""
syx009lab3

@author: Kevin Wilson syx009
"""
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split

COL_NAMES = ['UserCountry', 'NrReviews', 'NrHotelReviews', 'HelpfulVotes', 'Score', 'PeriodOfStay', 'TravelerType',
            'Pool', 'Gym', 'TennisCourt', 'Spa', 'Casino', 'FreeInternet', 'HotelName', 'HotelStars', 'NrRooms', 
             'UserContinent', 'MemberYears', 'ReviewMonth', 'ReviewWeekday']

# loads data from csv returning relevant features, labels, unique labels indexed by id
def load_data(selected_features, label_name):
    """Read ``vegas.csv`` and separate the label column from the features.

    Args:
        selected_features: column names to keep in the returned feature frame.
        label_name: name of the column used as the label; it is removed from
            the frame before the features are selected.

    Returns:
        A tuple ``(features, labels, uniques)``: ``features`` holds only the
        selected columns, ``labels`` is the popped label column, and
        ``uniques`` holds the distinct label values as returned by
        ``pd.factorize`` (position in ``uniques`` is the integer code).
    """
    # header=0 discards the file's own header row in favour of COL_NAMES
    frame = pd.read_csv('vegas.csv', names=COL_NAMES, header=0)

    # pop() removes the label column from the frame in place
    labels = frame.pop(label_name)

    # only the unique values are needed; the integer codes are discarded
    uniques = pd.factorize(labels)[1]

    return frame[selected_features], labels, uniques

# split training and test data in 70/30 split
def split_data(features, labels):
    """Partition features and labels into training and test sets.

    70% of the rows go to the training set; ``random_state=42`` is passed to
    ``train_test_split`` so the shuffling is seeded with a fixed value.

    Returns:
        A tuple ``(train_x, test_x, train_y, test_y)``.
    """
    parts = train_test_split(features, labels, train_size=0.7, random_state=42)
    # train_test_split yields [train_x, test_x, train_y, test_y]
    return tuple(parts)

# returns feature columns to identify column types, used by the estimator
def get_cols(features_unique):
    """Build the TensorFlow feature columns consumed by the estimators.

    Args:
        features_unique: mapping from column name to an array of the values
            occurring in that column (must expose ``.dtype``).

    Returns:
        A list with one feature column per key, in iteration order. Columns
        whose values have ``object`` dtype become an indicator column over a
        vocabulary-list categorical column; all other columns become numeric
        columns.
    """
    def _column_for(name, values):
        # non-object dtypes are treated as plain numbers
        if values.dtype != object:
            return tf.feature_column.numeric_column(name)
        # string column: the unique values serve as the vocabulary
        categorical = tf.feature_column.categorical_column_with_vocabulary_list(name, values)
        # DNN estimators need categorical columns wrapped in an indicator column
        return tf.feature_column.indicator_column(categorical)

    return [_column_for(name, features_unique[name]) for name in features_unique]

  from ._conv import register_converters as _register_converters


In [2]:
def predict_hotel(data, selected_features, num_epoch, question_num):
    """Train a hotel classifier and print the predicted hotel for ``data``.

    The reviews are loaded with ``HotelName`` as the label, split into
    training and test sets, and used to fit a ``DNNClassifier`` with a single
    hidden layer of 13 units. The evaluation metrics on the test set are
    printed, followed by one prediction line per row of ``data``.

    Args:
        data: pandas DataFrame describing the individual(s) to predict for;
            presumably it must provide every column in ``selected_features``.
        selected_features: names of the feature columns to train on.
        num_epoch: number of epochs to run over the training set.
        question_num: question number shown in the printed prediction.

    Returns:
        None; all results are printed.
    """
    # load data, including unique mapped label names, label is HotelName
    features, labels, uniques = load_data(selected_features, 'HotelName')
    label_vocabulary = uniques.tolist()

    # dict with column names as keys mapped to unique values in that column
    features_unique = {col: features[col].unique() for col in features}

    # get feature_columns for DNNClassifier
    cols = get_cols(features_unique)

    # split the data into test and training
    train_x, test_x, train_y, test_y = split_data(features, labels)

    # init DNNClassifier, lower the amount of logging in output during training
    run_config = tf.estimator.RunConfig().replace(log_step_count_steps=1000)
    # hidden_units is 1 hidden layer with 13 nodes (average between number classes and input nodes number)
    # n_classes is derived from the label vocabulary so the two cannot disagree
    clf = tf.estimator.DNNClassifier(hidden_units=[13], config=run_config, feature_columns=cols,
                                     n_classes=len(label_vocabulary),
                                     label_vocabulary=label_vocabulary)

    # pandas_input_fn already returns the input function the estimator expects,
    # so it is passed directly
    # batch_size is the entire training set since there is not a lot of data
    train_input_fn = tf.estimator.inputs.pandas_input_fn(train_x, train_y, num_epochs=num_epoch,
                                                         batch_size=train_x.shape[0], shuffle=True,
                                                         target_column='HotelName')
    clf.train(input_fn=train_input_fn)

    # evaluate accuracy using test set and give quick summary
    eval_input_fn = tf.estimator.inputs.pandas_input_fn(test_x, test_y,
                                                        batch_size=test_x.shape[0],
                                                        shuffle=False,
                                                        target_column='HotelName')
    results = clf.evaluate(input_fn=eval_input_fn)
    for key in sorted(results):
        print('{}: {}'.format(key, results[key]))

    # predict hotel name based on passed in data describing the individual
    predict_input_fn = tf.estimator.inputs.pandas_input_fn(data, shuffle=False)
    template = '\nPrediction best hotel for question {}: "{}" ({:.1f}%)'
    # present prediction of best hotel for the individual
    for pred in clf.predict(input_fn=predict_input_fn):
        class_id = pred['class_ids'][0]
        probability = pred['probabilities'][class_id]

        # class_id indexes the label vocabulary, which was built from uniques
        print(template.format(question_num, uniques[class_id], 100 * probability))

In [3]:
# number of epochs handed to the prediction helpers
num_epoch = 2000

# question 1: group of friends going in May who want a high score and high stars
q1_df = pd.DataFrame(dict(
    Score=[5],
    PeriodOfStay=['Mar-May'],
    TravelerType=['Friends'],
    HotelStars=[5.0],
    ReviewMonth=['May'],
))

# question 2: newlywed couple going in July who want a high score, high stars and a spa
q2_df = pd.DataFrame(dict(
    Score=[5],
    PeriodOfStay=['Jun-Aug'],
    TravelerType=['Couples'],
    HotelStars=[5.0],
    ReviewMonth=['July'],
    Spa=['YES'],
))

In [4]:
# relevant features for question 1
selected_features = ['Score', 'PeriodOfStay', 'TravelerType', 'HotelStars', 'ReviewMonth']
predict_hotel(q1_df, selected_features, num_epoch, 1)

INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpejev1rnp', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 1000, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc8ded60208>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}




INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpejev1rnp/model.ckpt.
INFO:tensorflow:loss = 1221.9989, step = 1
INFO:tensorflow:loss = 833.17725, step = 101 (0.446 sec)
INFO:tensorflow:loss = 737.0071, step = 201 (0.429 sec)
INFO:tensorflow:loss = 682.5219, step = 301 (0.317 sec)
INFO:tensorflow:loss = 646.44763, step = 401 (0.446 sec)
INFO:tensorflow:loss = 614.9945, step = 501 (0.402 sec)
INFO:tensorflow:loss = 577.6859, step = 601 (0.437 sec)
INFO:tensorflow:loss = 597.8354, step = 701 (0.433 sec)
INFO:tensorflow:loss = 575.8179, step = 801 (0.453 sec)
INFO:tensorflow:loss = 552.7625, step = 901 (0.418 sec)
INFO:tensorflow:global_step/sec: 240.309
INFO:tensorflow:loss = 553.1336, step = 1001 (0.383 sec)
INFO:tensorflow:loss = 527.8427, step = 1101 (0.459 sec)
INFO:tensorflow:loss = 535.23914, step = 1201 (0.368 sec)
INFO:tensorflow:loss = 526.8779, step = 1301 (0.400 sec)
INFO:tensorflow:loss = 520.79596, step = 1401 (0.348 sec)
INFO

# Question 1

Prediction for best hotel for friends going in May:  
**Encore at wynn Las Vegas** with 71.7% probability

In [5]:
# relevant features for question 2 same as question 1 with spa also
selected_features.append('Spa')
predict_hotel(q2_df, selected_features, num_epoch, 2)

INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp8yqqbjx0', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 1000, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc86f6840f0>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}




INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmp8yqqbjx0/model.ckpt.
INFO:tensorflow:loss = 1134.1091, step = 1
INFO:tensorflow:loss = 701.4768, step = 101 (0.403 sec)
INFO:tensorflow:loss = 599.07874, step = 201 (0.375 sec)
INFO:tensorflow:loss = 533.04, step = 301 (0.423 sec)
INFO:tensorflow:loss = 490.05188, step = 401 (0.375 sec)
INFO:tensorflow:loss = 480.411, step = 501 (0.342 sec)
INFO:tensorflow:loss = 477.0861, step = 601 (0.342 sec)
INFO:tensorflow:loss = 443.87, step = 701 (0.369 sec)
INFO:tensorflow:loss = 427.4829, step = 801 (0.687 sec)
INFO:tensorflow:loss = 413.75546, step = 901 (0.581 sec)
INFO:tensorflow:global_step/sec: 222.51
INFO:tensorflow:loss = 422.76794, step = 1001 (0.600 sec)
INFO:tensorflow:loss = 434.3399, step = 1101 (0.818 sec)
INFO:tensorflow:loss = 406.42737, step = 1201 (0.721 sec)
INFO:tensorflow:loss = 406.47394, step = 1301 (0.692 sec)
INFO:tensorflow:loss = 403.99985, step = 1401 (0.748 sec)
INFO:te

# Question 2

Prediction for best hotel for newly-wed couple going in July:  
**The Venetian Las Vegas Hotel** with 35.0% probability

In [6]:
def predict_rating(data, selected_features, num_epoch, question_num):
    """Train a score regressor and print the predicted rating for ``data``.

    The reviews are loaded with ``Score`` as the label, split into training
    and test sets, and used to fit a ``DNNRegressor`` with a single hidden
    layer of 5 units. The evaluation metrics on the test set are printed,
    followed by one prediction line per row of ``data``.

    Args:
        data: pandas DataFrame describing the stay(s) to predict for;
            presumably it must provide every column in ``selected_features``.
        selected_features: names of the feature columns to train on.
        num_epoch: number of epochs to run over the training set.
        question_num: question number shown in the printed prediction.

    Returns:
        None; all results are printed.
    """
    # label is Score; the unique label values are not needed for regression
    features, labels, _ = load_data(selected_features, 'Score')

    # dict with column names as keys mapped to unique values in that column
    features_unique = {col: features[col].unique() for col in features}

    cols = get_cols(features_unique)

    train_x, test_x, train_y, test_y = split_data(features, labels)
    # lower the amount of logging in output during training
    run_config = tf.estimator.RunConfig().replace(log_step_count_steps=1000)
    # hidden_units is 1 hidden layer with 5 nodes since about 5 inputs
    clf = tf.estimator.DNNRegressor(hidden_units=[5], config=run_config, feature_columns=cols)

    # pandas_input_fn already returns the input function the estimator expects,
    # so it is passed directly
    train_input_fn = tf.estimator.inputs.pandas_input_fn(train_x, train_y, num_epochs=num_epoch,
                                                         batch_size=train_x.shape[0], shuffle=True,
                                                         target_column='Score')
    clf.train(input_fn=train_input_fn)

    # the evaluation batch is sized from the test set it actually iterates over
    eval_input_fn = tf.estimator.inputs.pandas_input_fn(test_x, test_y,
                                                        batch_size=test_x.shape[0],
                                                        shuffle=False,
                                                        target_column='Score')
    results = clf.evaluate(input_fn=eval_input_fn)
    for key in sorted(results):
        print('{}: {}'.format(key, results[key]))

    predict_input_fn = tf.estimator.inputs.pandas_input_fn(data, shuffle=False)
    template = '\nPrediction rating for question {}: {:.1f}'
    for pred in clf.predict(input_fn=predict_input_fn):
        # the first entry of 'predictions' is used as the predicted score
        rating = pred['predictions'][0]

        print(template.format(question_num, rating))

In [7]:
# number of epochs handed to the prediction helpers
num_epoch = 2000

# question 3: family that stayed at Circus Circus in October, prefers more stars and a pool
q3_df = pd.DataFrame(dict(
    HotelName=['Circus Circus Hotel & Casino Las Vegas'],
    PeriodOfStay=['Sep-Nov'],
    TravelerType=['Families'],
    HotelStars=[5.0],
    Pool=['YES'],
    ReviewMonth=['October'],
))

# question 4: business trip to the Bellagio in January, prefers more stars and free wi-fi
q4_df = pd.DataFrame(dict(
    HotelName=['Bellagio Las Vegas'],
    PeriodOfStay=['Dec-Feb'],
    TravelerType=['Business'],
    HotelStars=[5.0],
    FreeInternet=['YES'],
    ReviewMonth=['January'],
))

In [8]:
# columns used to predict the score for question 3 (includes Pool)
selected_features = [
    'HotelName',
    'PeriodOfStay',
    'TravelerType',
    'HotelStars',
    'Pool',
    'ReviewMonth',
]
predict_rating(data=q3_df, selected_features=selected_features, num_epoch=num_epoch, question_num=3)

INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpykn0zp4n', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 1000, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc8663127f0>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}




INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmpykn0zp4n/model.ckpt.
INFO:tensorflow:loss = 9894.208, step = 1
INFO:tensorflow:loss = 317.17377, step = 101 (0.387 sec)
INFO:tensorflow:loss = 295.01312, step = 201 (0.322 sec)
INFO:tensorflow:loss = 325.27606, step = 301 (0.334 sec)
INFO:tensorflow:loss = 266.43753, step = 401 (0.322 sec)
INFO:tensorflow:loss = 251.16615, step = 501 (0.354 sec)
INFO:tensorflow:loss = 293.21024, step = 601 (0.358 sec)
INFO:tensorflow:loss = 254.65689, step = 701 (0.323 sec)
INFO:tensorflow:loss = 264.87518, step = 801 (0.324 sec)
INFO:tensorflow:loss = 266.03107, step = 901 (0.347 sec)
INFO:tensorflow:global_step/sec: 293.447
INFO:tensorflow:loss = 234.0405, step = 1001 (0.340 sec)
INFO:tensorflow:loss = 244.3476, step = 1101 (0.391 sec)
INFO:tensorflow:loss = 271.47314, step = 1201 (0.351 sec)
INFO:tensorflow:loss = 269.97717, step = 1301 (0.415 sec)
INFO:tensorflow:loss = 270.3614, step = 1401 (0.358 sec

# Question 3
Predicted score for family who stayed at Circus Circus in October:  
**4.0**

In [9]:
# columns used to predict the score for question 4 (includes FreeInternet)
selected_features = [
    'HotelName',
    'PeriodOfStay',
    'TravelerType',
    'HotelStars',
    'FreeInternet',
    'ReviewMonth',
]
predict_rating(data=q4_df, selected_features=selected_features, num_epoch=num_epoch, question_num=4)

INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp5w1ovctr', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 1000, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc86f6bf4e0>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}




INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmp5w1ovctr/model.ckpt.
INFO:tensorflow:loss = 5516.138, step = 1
INFO:tensorflow:loss = 259.78845, step = 101 (0.381 sec)
INFO:tensorflow:loss = 302.70966, step = 201 (0.373 sec)
INFO:tensorflow:loss = 267.4823, step = 301 (0.331 sec)
INFO:tensorflow:loss = 237.71532, step = 401 (0.333 sec)
INFO:tensorflow:loss = 305.45844, step = 501 (0.319 sec)
INFO:tensorflow:loss = 251.96625, step = 601 (0.323 sec)
INFO:tensorflow:loss = 265.0102, step = 701 (0.323 sec)
INFO:tensorflow:loss = 250.5008, step = 801 (0.404 sec)
INFO:tensorflow:loss = 258.0967, step = 901 (0.328 sec)
INFO:tensorflow:global_step/sec: 289.155
INFO:tensorflow:loss = 264.20184, step = 1001 (0.343 sec)
INFO:tensorflow:loss = 264.10226, step = 1101 (0.331 sec)
INFO:tensorflow:loss = 234.43224, step = 1201 (0.329 sec)
INFO:tensorflow:loss = 242.41803, step = 1301 (0.370 sec)
INFO:tensorflow:loss = 275.8522, step = 1401 (0.334 sec)


# Question 4

Predicted score given by person on business trip who stayed at Bellagio in January:  
**4.7**