In [2]:
import numpy as np
import pandas as pd
from IPython.display import display, HTML

loss = [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.075, 0.05, 0.025, 1e-8 ]

df = pd.DataFrame({'pred':loss, 'logloss': -np.log(loss)},columns=['pred','logloss'])

display(df)

Unnamed: 0,pred,logloss
0,1.0,-0.0
1,0.9,0.105361
2,0.8,0.223144
3,0.7,0.356675
4,0.6,0.510826
5,0.5,0.693147
6,0.4,0.916291
7,0.3,1.203973
8,0.2,1.609438
9,0.1,2.302585


In [3]:
loss = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.5, 2, 2.5, 3, 3.5, 4 ]

df = pd.DataFrame({'logloss':loss, 'pred': np.exp(np.negative(loss))},
                  columns=['logloss','pred'])

display(df)

Unnamed: 0,logloss,pred
0,0.1,0.904837
1,0.2,0.818731
2,0.3,0.740818
3,0.4,0.67032
4,0.5,0.606531
5,0.6,0.548812
6,0.7,0.496585
7,0.8,0.449329
8,0.9,0.40657
9,1.0,0.367879


In [3]:
# Classification ranking

import os
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow.contrib.learn as learn
import numpy as np
from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
from sklearn import preprocessing
import matplotlib.pyplot as plt
import shutil


tf.logging.set_verbosity(tf.logging.INFO)

path = "./data/"
    
filename = os.path.join(path,"iris.csv")    
df = pd.read_csv(filename,na_values=['NA','?'])

# Encode a numeric column as zscores
def encode_numeric_zscore(df,name,mean=None,sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name]-mean)/sd

# Encode feature vector
encode_numeric_zscore(df,'petal_w')
encode_numeric_zscore(df,'petal_l')
encode_numeric_zscore(df,'sepal_w')
encode_numeric_zscore(df,'sepal_l')

def encode_text_index(df,name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

species = encode_text_index(df,"species")
num_classes = len(species)

# Create the x-side (feature vectors) of the training
def to_xy(df,target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)

    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(target_type, '__iter__') else target_type
    
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        return df.as_matrix(result).astype(np.float32),df.as_matrix([target]).astype(np.int32)
    else:
        # Regression
        return df.as_matrix(result).astype(np.float32),df.as_matrix([target]).astype(np.float32)

x, y = to_xy(df,'species')
    
# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(    
    x, y, test_size=0.25, random_state=45)

def get_model_dir(name,erase):
    base_path = os.path.join(".","dnn")
    model_dir = os.path.join(base_path,name)
    os.makedirs(model_dir,exist_ok=True)
    if erase and len(model_dir)>4 and os.path.isdir(model_dir):
        shutil.rmtree(model_dir,ignore_errors=True) # be careful, this deletes everything below the specified path
    return model_dir


# Get/clear a directory to store the neural network to
model_dir = get_model_dir('iris',True)

# deep neural network with 3 hidden layers of 10, 20, 5
feature_columns = [tf.contrib.layers.real_valued_column("", dimension=x.shape[0])]
classifier = learn.DNNClassifier(
    model_dir= model_dir,
    config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1),
    hidden_units=[10, 20, 5], n_classes=num_classes, feature_columns=feature_columns)


# Early stopping
validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
    x_test,
    y_test,
    every_n_steps=500,
    #metrics=validation_metrics,
    early_stopping_metric="loss",
    early_stopping_metric_minimize=True,
    early_stopping_rounds=50)
    
# Fit/train neural network
classifier.fit(x_train, y_train,monitors=[validation_monitor],steps=10000)


INFO:tensorflow:Using config: {'_evaluation_master': '', 'keep_checkpoint_every_n_hours': 10000, '_task_type': None, '_environment': 'local', '_task_id': 0, 'keep_checkpoint_max': 5, 'save_summary_steps': 100, 'save_checkpoints_steps': None, '_master': '', 'save_checkpoints_secs': 1, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff82a6b29b0>, '_is_chief': True, 'tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, 'tf_random_seed': None, '_num_ps_replicas': 0}
Instructions for updating:
Monitors are deprecated. Please use tf.train.SessionRunHook.
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into

<tensorflow.contrib.learn.python.learn.estimators.dnn.DNNClassifier at 0x7ff82a6b2a58>

In [4]:
from sklearn import metrics
import scipy as sp
import numpy as np
import math
from IPython.display import display, HTML

def mlogloss(y_test, preds):
    epsilon = 1e-15
    sum = 0
    for row in zip(preds,y_test):
        x = row[0][row[1]]
        x = max(epsilon,x)
        x = min(1-epsilon,x)
        sum+=math.log(x)
    return( (-1/len(preds))*sum)

def perturbation_rank(model, x, y, names, regression):
    errors = []

    for i in range(x.shape[1]):
        hold = np.array(x[:, i])
        np.random.shuffle(x[:, i])
        
        if regression:
            # The following code is only needed until Google fixes SKCOMPAT
            # pred = model.predict(x)
            pred = list(model.predict(x_test, as_iterable=True))
            error = metrics.mean_squared_error(y, pred)
        else:
            # The following code is only needed until Google fixes SKCOMPAT
            # pred = model.predict_proba(x)
            pred = list(model.predict_proba(x_test, as_iterable=True))
            error = mlogloss(y, pred)
            
        errors.append(error)
        x[:, i] = hold
        
    max_error = np.max(errors)
    importance = [e/max_error for e in errors]
   
    data = {'name':names,'error':errors,'importance':importance}
    result = pd.DataFrame(data, columns = ['name','error','importance'])
    result.sort_values(by=['importance'], ascending=[0], inplace=True)
    return result


tf.logging.set_verbosity(tf.logging.ERROR)
names = df.columns.values[0:-1] # x column names
rank = perturbation_rank(classifier, x_test, y_test, names, False)
display(rank)

Unnamed: 0,name,error,importance
3,petal_w,6.096499,1.0
2,petal_l,5.776619,0.947531
0,sepal_l,0.978465,0.160496
1,sepal_w,0.228206,0.037432


In [7]:
#Regression Input Perturbation Ranking
import tensorflow as tf
import tensorflow.contrib.learn as learn
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore

path = "./data/"

# Set the desired TensorFlow output level for this example
tf.logging.set_verbosity(tf.logging.INFO)

filename_read = os.path.join(path,"auto-mpg.csv")
df = pd.read_csv(filename_read,na_values=['NA','?'])

def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)
    
# create feature vector
missing_median(df, 'horsepower')
df.drop('name',1,inplace=True)
encode_numeric_zscore(df, 'horsepower')
encode_numeric_zscore(df, 'weight')
encode_numeric_zscore(df, 'cylinders')
encode_numeric_zscore(df, 'displacement')
encode_numeric_zscore(df, 'acceleration')

def encode_text_dummy(df,name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name,x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)
    
encode_text_dummy(df, 'origin')

# Encode to a 2D matrix for training
x,y = to_xy(df,'mpg')

# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42)

# Get/clear a directory to store the neural network to
model_dir = get_model_dir('mpg',True)

# Create a deep neural network with 3 hidden layers of 50, 25, 10
feature_columns = [tf.contrib.layers.real_valued_column("", dimension=x.shape[0])]
regressor = learn.DNNRegressor(
    model_dir= model_dir,
    config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1),
    feature_columns=feature_columns,
    hidden_units=[50, 25, 10])


# Early stopping
validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
    x_test,
    y_test,
    every_n_steps=500,
    early_stopping_metric="loss",
    early_stopping_metric_minimize=True,
    early_stopping_rounds=50)
    
# Fit/train neural network
regressor.fit(x_train, y_train,monitors=[validation_monitor],steps=10000)

INFO:tensorflow:Using config: {'_evaluation_master': '', 'keep_checkpoint_every_n_hours': 10000, '_task_type': None, '_environment': 'local', '_task_id': 0, 'keep_checkpoint_max': 5, 'save_summary_steps': 100, 'save_checkpoints_steps': None, '_master': '', 'save_checkpoints_secs': 1, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7ff8219effd0>, '_is_chief': True, 'tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, 'tf_random_seed': None, '_num_ps_replicas': 0}
Instructions for updating:
Monitors are deprecated. Please use tf.train.SessionRunHook.
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into

DNNRegressor(feature_columns=[_RealValuedColumn(column_name='', dimension=398, default_value=None, dtype=tf.float32, normalizer=None)], hidden_units=[50, 25, 10], dropout=None, optimizer=None)

In [8]:
tf.logging.set_verbosity(tf.logging.ERROR)

# Rank the features
from IPython.display import display, HTML

names = df.columns.values[1:] # x column names
rank = perturbation_rank(regressor, x_test, y_test, names, True)
display(rank)

Unnamed: 0,name,error,importance
2,horsepower,27.136688,1.0
3,weight,19.588627,0.72185
5,year,13.714651,0.505391
6,origin-1,9.258288,0.341172
7,origin-2,8.633492,0.318148
1,displacement,8.087773,0.298038
4,acceleration,8.012904,0.295279
8,origin-3,7.901067,0.291158
0,cylinders,7.098988,0.261601
