In [34]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

import tensorflow as tf

import matplotlib.pyplot as plt

import logging
logging.getLogger().setLevel(logging.INFO)

In [35]:
from google.datalab import Context
import google.datalab.storage as storage
from io import BytesIO
import random, string

In [36]:
project = Context.default().project_id

sample_bucket_name = 'james-sandbox-bucket'
sample_bucket_path = 'gs://' + sample_bucket_name
sample_bucket_object = sample_bucket_path + '/dbsqlite3.csv	'

print('Bucket: ' + sample_bucket_path)
print('Object: ' + sample_bucket_object)

Bucket: gs://james-sandbox-bucket
Object: gs://james-sandbox-bucket/dbsqlite3.csv	


In [37]:
%%gcs list --objects $sample_bucket_path

Name,Type,Size,Updated
db.sqlite3,application/octet-stream,1372160,2019-06-30 10:39:01.287000+00:00
dbsqlite3.csv,text/csv,1301779,2019-06-30 12:45:46.386000+00:00


In [38]:
working_dir = os.getcwd()
working_dir

data_dir = sample_bucket_path

In [39]:
my_bucket = storage.Bucket(sample_bucket_name)
for obj in my_bucket.objects():
  if obj.key.find('/') < 0:
    print(obj.key)

db.sqlite3
dbsqlite3.csv


In [61]:
bucket_object = my_bucket.object('dbsqlite3.csv')

In [62]:
uri = bucket_object.uri
%gcs read --object $uri --variable data_object_bytes

In [63]:
# read data into pandas df
data_df = pd.read_csv(BytesIO(data_object_bytes), sep=',')

In [64]:
data_df.isnull().mean()

# types
data_df.dtypes

# first removal of features
# data_df.filter(regex='judge|rating').columns.to_list
# remove_features = [col for col in data_df.columns if 'judge' in col]
remove_features = data_df.columns[data_df.columns.str.contains('judge|rating|Unnamed')].tolist()

data_df = data_df[data_df.columns.difference(remove_features)]

# type conversions
#print('feature types are {data_df.dtypes}')
#print('data types are {data_df.dtypes.unique()}')
feaeture_types = data_df.dtypes.unique()

# continuuous
feature_cont_names = data_df.select_dtypes(include=['float64','int64']).columns.tolist()
# categorical
feature_cat_names = data_df.select_dtypes(include='O').columns.tolist()

# missings
data_df[feature_cont_names].isnull().mean()
data_df[feature_cat_names].isnull().mean()

feature_cont_missings = data_df[feature_cont_names].columns[data_df[feature_cont_names].isnull().any()]

# replace missings
# continuous
def impute_na(df, variable, median):
    df[variable] = df[variable].fillna(median)

for name in feature_cont_missings:
    impute_na(data_df, name, data_df[name].median())

data_df[feature_cont_names].isnull().mean()

# categorical labels
# drop some variable for speed (date, time) etc
features_temporal_names = ['date', 'time']
feature_cat_names = [name for name in feature_cat_names if name not in features_temporal_names]

# number of categories
data_df[feature_cat_names]

for name in feature_cat_names:
    caridnality = data_df[name].unique().shape
    #print(f'{name} has a caridnality: {caridnality}')

# drop some tag names
drop_features = ['fighter_1', 'fighter_2']
new_cont = ['attendance']
feature_cat_names = [name for name in feature_cat_names if name not in drop_features if name not in new_cont]
feature_cont_names = feature_cont_names + new_cont

target_name = ['result']
feature_cont_names.remove('result')

In [65]:
# categorical
# count/frequency encoding
def frequency_encoding(df, variable):
    x_frequency_map = df[variable].value_counts().to_dict()
    df[variable] = df[variable].map(x_frequency_map)

for name in feature_cat_names:
    frequency_encoding(data_df, name)
    
data_df['attendance'] = data_df['attendance'].replace('','0').str.replace(',','').astype(dtype='float64')

In [66]:
# selection , final checks and to numpy
data_df[target_name].dtypes
data_df[feature_cat_names + feature_cont_names].dtypes
X_df = data_df[feature_cat_names + feature_cont_names]
Y_df = data_df[target_name]

In [67]:
X = X_df.values
Y = Y_df.values

Y = np.array([2 if y == 1 else ( 1 if y == 0 else 0) for y in Y])
#Y = np.array([1 if y == 1 else 0 for y in Y])

# tensorflow neural net (with estimator API)
K = len(set(Y.flat))

In [68]:
features = feature_cat_names + feature_cont_names
feature_dict = dict(zip(features, X.T))

In [69]:
train_input_1 = tf.estimator.inputs.numpy_input_fn(feature_dict, Y, batch_size = 100, num_epochs = 1, shuffle = True)

In [70]:
# define feature columns for input
tf_feature_columns = []
for key in feature_dict.keys():
    tf_feature_columns.append(tf.feature_column.numeric_column(key=key))

In [71]:
tf_feature_columns

[_NumericColumn(key='fighter_1_significant_strikes_landed', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='rounds_scheduled', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='fighter_1_total_strikes', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='division', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='method', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='referee', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='location', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='fighter_1_takedown_attempts', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='fighter_1_total_strikes_landed', shape=(1,), default_value=None, dtype=tf.float32, norma

In [72]:
# instantiate estimator
tensorflow_dnn_model = tf.estimator.DNNClassifier(
    feature_columns = tf_feature_columns,
    hidden_units = [5,5],
    n_classes = K)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': 'worker', '_save_checkpoints_secs': 600, '_model_dir': '/tmp/tmp__mmabhx', '_save_checkpoints_steps': None, '_num_ps_replicas': 0, '_log_step_count_steps': 100, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f55e4830160>, '_session_config': None, '_tf_random_seed': None, '_save_summary_steps': 100, '_train_distribute': None, '_master': '', '_global_id_in_cluster': 0, '_evaluation_master': '', '_task_id': 0, '_keep_checkpoint_max': 5, '_is_chief': True, '_num_worker_replicas': 1, '_service': None, '_keep_checkpoint_every_n_hours': 10000}


In [75]:
# train model
# 1) using the numpy input parameter
#tf.logging.set_verbosity(tf.logging.INFO)
tensorflow_dnn_model.train(
    input_fn = train_input_1
)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmp__mmabhx/model.ckpt.
INFO:tensorflow:loss = 214833.72, step = 1
ERROR:tensorflow:Model diverged with loss = NaN.


NanLossDuringTrainingError: NaN loss during training.

In [76]:
# evaluate model
# 1
tf.logging.set_verbosity(tf.logging.INFO)
results = tensorflow_dnn_model.evaluate(
    input_fn = train_input_1,
    steps=1
)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-06-30-13:33:56
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp__mmabhx/model.ckpt-1
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2019-06-30-13:33:56
INFO:tensorflow:Saving dict for global step 1: accuracy = 0.1, average_loss = nan, global_step = 1, loss = nan


In [77]:
print(f'Accuracy of the ole tensorflow api {results}')

SyntaxError: invalid syntax (<ipython-input-77-767623d5f555>, line 1)