In [1]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [2]:
def load_movie_lens(data_dir="ml-1m"):
    """Load the MovieLens-1M tables and merge them into one ratings frame.

    Reads ``ratings.dat``, ``users.dat`` and ``movies.dat`` from ``data_dir``
    (``::``-separated, latin-1 encoded, no header row), left-joins the movie
    and user attributes onto every rating, and adds zero-based id columns.

    Parameters
    ----------
    data_dir : str, optional
        Directory that holds the three ``.dat`` files. Defaults to
        ``"ml-1m"``, the location that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per rating with the columns ``adj_movieid``, ``movieid``,
        ``userid``, ``rating``, ``timestamp``, ``title``, ``genre``,
        ``gender``, ``age``, ``occupation``, ``zipcode``, ``age_desc``,
        ``occ_desc`` and ``adj_userid``.

    Raises
    ------
    KeyError
        If a user row carries an age or occupation code that is missing
        from the lookup tables defined below.
    """
    import os  # local import: the notebook's import cell does not provide it

    age_desc = {
        1: "Under 18", 18: "18-24", 25: "25-34", 35: "35-44", 45: "45-49", 50: "50-55", 56: "56+"
    }
    occupation_desc = {
        0: "other or not specified", 1: "academic/educator", 2: "artist", 3: "clerical/admin",
        4: "college/grad student", 5: "customer service", 6: "doctor/health care",
        7: "executive/managerial", 8: "farmer", 9: "homemaker", 10: "K-12 student", 11: "lawyer",
        12: "programmer", 13: "retired", 14: "sales/marketing", 15: "scientist", 16: "self-employed",
        17: "technician/engineer", 18: "tradesman/craftsman", 19: "unemployed", 20: "writer"
    }

    def read_table(file_name, column_names):
        # All three files share one layout; a multi-character separator
        # needs the python parsing engine.
        return pd.read_csv(
            os.path.join(data_dir, file_name),
            sep="::",
            engine="python",
            encoding="latin-1",
            names=column_names,
        )

    rating_data = read_table("ratings.dat", ['userid', 'movieid', 'rating', 'timestamp'])
    user_data = read_table("users.dat", ['userid', 'gender', 'age', 'occupation', 'zipcode'])
    movie_data = read_table("movies.dat", ['movieid', 'title', 'genre'])

    # Human-readable labels for the coded demographic fields; an unknown code
    # raises KeyError instead of silently becoming NaN.
    user_data['age_desc'] = user_data['age'].apply(lambda code: age_desc[code])
    user_data['occ_desc'] = user_data['occupation'].apply(lambda code: occupation_desc[code])

    # Left joins keep every rating even when its movie or user row is absent.
    rated_movies = pd.merge(rating_data, movie_data, how="left", on="movieid")
    dataset = pd.merge(rated_movies, user_data, how="left", on="userid")

    # Dense zero-based movie index: the position of each id among the sorted
    # distinct movie ids that actually occur in the ratings.
    distinct_movie_ids = dataset['movieid'].sort_values().unique()
    adj_df = (
        pd.DataFrame(distinct_movie_ids)
        .reset_index()
        .rename(columns={0: 'movieid', 'index': 'adj_movieid'})
    )
    dataset = pd.merge(adj_df, dataset, how="right", on="movieid")

    # User ids are only shifted down by one, not re-densified.
    dataset['adj_userid'] = dataset['userid'] - 1
    return dataset

def split_dataset(dataset, split_frac=.7, random_state=None):
    """Shuffle ``dataset`` and split it into training and validation parts.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to split; it is not modified.
    split_frac : float, optional
        Fraction of the rows that goes to the training part. The row count
        is truncated toward zero with ``int``.
    random_state : int or numpy.random.RandomState, optional
        Seed forwarded to ``DataFrame.sample`` so that the shuffle, and with
        it the split, can be reproduced. ``None`` (the default) keeps the
        previous unseeded behaviour.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(trainset, validset)``: the first ``int(len(dataset) * split_frac)``
        shuffled rows and the remaining rows.
    """
    # frac=1 without replacement is a full permutation of the rows.
    shuffled = dataset.sample(frac=1, replace=False, random_state=random_state)
    n_split = int(len(shuffled) * split_frac)
    trainset = shuffled.iloc[:n_split]
    validset = shuffled.iloc[n_split:]
    return trainset, validset

# Build the merged ratings frame once, then shuffle-split it using
# split_dataset's default training fraction.
fullset = load_movie_lens()
trainset, validset = split_dataset(dataset=fullset)

In [5]:
# String-valued categorical features.
CAT_STR_COLS = [
    "genre",
    "zipcode",
    "gender",
]

# Integer-coded categorical features.
CAT_INT_COLS = [
    "age",
    "occupation",
]

# Column that holds the prediction target.
LABEL_COL = "rating"

# Every feature column: the string features followed by the integer ones.
DEEP_COLS = list(CAT_STR_COLS) + list(CAT_INT_COLS)

# Groups of feature names that are crossed together.
WIDE_COL_CROSSES = [
    ["age", "genre"],
    ["gender", "genre"],
]

In [19]:
def make_inputs(dataframe):
    """Turn a frame into the ``(features, labels)`` pair returned by an input function.

    Every column named in ``CAT_STR_COLS + CAT_INT_COLS`` becomes a
    ``tf.SparseTensor`` of dense shape ``[n_rows, 1]`` that holds one value
    per row at column 0. The label tensor is the ``LABEL_COL`` column
    shifted down by one.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the feature columns and ``LABEL_COL``.

    Returns
    -------
    tuple
        ``(feature_inputs, label_input)`` where ``feature_inputs`` maps
        column name to ``tf.SparseTensor`` and ``label_input`` is a
        ``tf.constant``.
    """
    n_rows = len(dataframe)
    # One [row, 0] pair per example. It is built once as an (n_rows, 2) int64
    # array instead of a fresh Python list of lists per column, and it keeps
    # rank 2 even when the frame is empty.
    indices = np.column_stack((
        np.arange(n_rows, dtype=np.int64),
        np.zeros(n_rows, dtype=np.int64),
    ))
    dense_shape = [n_rows, 1]
    feature_inputs = {
        col_name: tf.SparseTensor(
            indices=indices,
            values=dataframe[col_name].values,
            dense_shape=dense_shape
        )
        for col_name in CAT_STR_COLS + CAT_INT_COLS
    }
    # Shift labels down by one; presumably ratings run 1..5 so classes become
    # 0..4 -- TODO confirm against the data.
    label_input = tf.constant(dataframe[LABEL_COL].values - 1)
    return (feature_inputs, label_input)

def make_hash_layers():
    """Create one hash-bucket categorical column per string feature.

    Returns
    -------
    dict
        Maps every name in ``CAT_STR_COLS`` to a
        ``categorical_column_with_hash_bucket`` that uses 1000 buckets.
    """
    hashed_by_name = {}
    for feature_name in CAT_STR_COLS:
        hashed_by_name[feature_name] = tf.feature_column.categorical_column_with_hash_bucket(
            feature_name,
            hash_bucket_size=1000,
        )
    return hashed_by_name

def make_int_layers():
    """Create indicator columns for the integer-coded ``age`` and ``occupation`` features.

    Each feature is first declared as a vocabulary-list categorical column
    over its fixed set of codes and then wrapped in an ``indicator_column``.

    Returns
    -------
    list
        ``[age_indicator, occupation_indicator]``, in that order.
    """
    age_codes = [1, 18, 25, 35, 45, 50, 56]
    occupation_codes = list(range(21))  # codes 0..20 inclusive

    vocabulary_column = tf.feature_column.categorical_column_with_vocabulary_list
    indicator = tf.feature_column.indicator_column

    return [
        indicator(vocabulary_column("age", age_codes)),
        indicator(vocabulary_column("occupation", occupation_codes)),
    ]

def make_embedding_layers(hashed_layers, dim=6):
    """Wrap each hashed string column in an embedding column.

    Parameters
    ----------
    hashed_layers : dict
        Categorical columns keyed by feature name; must contain every name
        in ``CAT_STR_COLS``.
    dim : int, optional
        Embedding dimension used for every column.

    Returns
    -------
    list
        One ``embedding_column`` per entry of ``CAT_STR_COLS``, in that order.
    """
    embedded = []
    for feature_name in CAT_STR_COLS:
        categorical = hashed_layers[feature_name]
        embedded.append(
            tf.feature_column.embedding_column(categorical, dimension=dim)
        )
    return embedded

def make_deep_layers(embedding_layers, int_layers):
    """Concatenate the embedding columns and the integer columns.

    Returns the result of ``embedding_layers + int_layers``: the embedding
    columns come first and neither input is modified.
    """
    deep_columns = embedding_layers + int_layers
    return deep_columns

def make_wide_input_layers():
    """Create one crossed column per feature group in ``WIDE_COL_CROSSES``.

    The hash bucket count grows with the number of crossed features:
    ``10 ** (3 + k)`` for a group of ``k`` names.

    Returns
    -------
    list
        The ``crossed_column`` objects, in the order of ``WIDE_COL_CROSSES``.
    """
    crossed = []
    for feature_group in WIDE_COL_CROSSES:
        bucket_count = int(10 ** (3 + len(feature_group)))
        crossed.append(
            tf.feature_column.crossed_column(
                list(feature_group),  # pass a copy, not the shared constant
                hash_bucket_size=bucket_count,
            )
        )
    return crossed

In [21]:
# --- Feature columns -------------------------------------------------------
# Deep part: embedded hash columns for the string features plus indicator
# columns for the integer features. Wide part: crossed columns.
print("create input layers...", end="")
hashed_layers =make_hash_layers()
int_layers = make_int_layers()
embedding_layers = make_embedding_layers(hashed_layers,dim =6)
deep_input_layers = make_deep_layers(embedding_layers,int_layers)
wide_input_layers = make_wide_input_layers()
print("done!")

# --- Model -----------------------------------------------------------------
# Combined linear + DNN classifier: the crossed columns feed the linear part,
# the deep columns feed a DNN with hidden layers of 32 and 16 units.
# n_classes=5 pairs with the labels from make_inputs (rating - 1);
# presumably ratings run 1..5 -- TODO confirm.
print("create model...", end="")
model = tf.contrib.learn.DNNLinearCombinedClassifier(
    n_classes=5,
    linear_feature_columns = wide_input_layers,
    dnn_feature_columns = deep_input_layers,
    dnn_hidden_units = [32, 16],
    fix_global_step_increment_bug=True,
    config = tf.contrib.learn.RunConfig(
        # Keep a single checkpoint and write summaries every 10 steps under ./model/.
        keep_checkpoint_max = 1,
        save_summary_steps = 10,
        model_dir = "./model/"
    )
)
print("done!")

# --- Training --------------------------------------------------------------
# make_inputs builds its tensors from the whole frame, so every one of the
# 1000 steps sees the complete training set rather than a mini-batch.
print("training model...", end="")
model.fit(input_fn = lambda: make_inputs(trainset), steps=1000)
print("done!")

# --- Evaluation ------------------------------------------------------------
# A single evaluation step over the tensors built from the validation set.
print("evaluating model...", end="")
results = model.evaluate(input_fn = lambda: make_inputs(validset), steps=1)
print("done!")

# --- Predictions -----------------------------------------------------------
# NOTE(review): these calls may return lazy iterables rather than arrays, in
# which case "done!" prints before any prediction is computed -- confirm
# against the installed TensorFlow version.
print("calculating predictions...", end="")
predictions = model.predict_classes(input_fn = lambda: make_inputs(validset))
print("done!")
print("calculating probabilites...", end="")
probabilities = model.predict_proba(input_fn = lambda: make_inputs(validset))
print("done!")

create input layers...done!
create model...INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x122268b38>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_tf_random_seed': None, '_save_summary_steps': 10, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 1, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': './model/'}
done!
Instructions for updating:
Please switch to tf.summary.scalar. Note that tf.summary.scalar uses the node name instead of the tag. This means that TensorFlow will automatically de-duplicate summary names based on the scope they are created in. Also, passing a tensor or list of tags to a scalar summary op is no longer suppor

In [22]:
# Print each evaluation metric on its own line as "<name>: <ascii repr of value>".
for metric_name, metric_value in results.items():
    metric_line = "%s: %a" % (metric_name, metric_value)
    print(metric_line)

loss: 1.4285545
accuracy: 0.35313916
global_step: 1000
