In [34]:
#imports
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.contrib.learn as skflow
from scipy.stats import zscore
from sklearn.cross_validation import KFold
from sklearn import metrics
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split

In [35]:
#define common functions
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)    
def encode_text_dummy(df,name):
    """One-hot encode column ``name`` of ``df`` in place.

    Each distinct value found in the column becomes its own indicator column
    named ``"<name>-<value>"``; afterwards the original column is dropped.
    The frame is modified directly and nothing is returned.
    """
    indicator_frame = pd.get_dummies(df[name])
    # Walk the indicator columns as (value label, column data) pairs.
    for value_label, indicator in indicator_frame.items():
        df["{}-{}".format(name, value_label)] = indicator
    df.drop(name, axis=1, inplace=True)
    
# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).    
def encode_text_index(df,name):
    """Replace column ``name`` of ``df`` with integer codes, in place.

    A fresh ``LabelEncoder`` is fitted on the column and its output is written
    back over the column. Returns the encoder's ``classes_`` so callers can
    map a code back to its original value.
    """
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(df[name])
    df[name] = codes
    return encoder.classes_
                
# Encode a numeric column as zscores    
def encode_numeric_zscore(df,name,mean=None,sd=None):
    """Replace column ``name`` of ``df`` with its z-scores, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    name : column label
        Numeric column to standardise.
    mean : float, optional
        Centre to subtract. Defaults to the column's own mean.
    sd : float, optional
        Scale to divide by. Defaults to the column's own standard deviation
        (pandas ``Series.std``).

    Returns
    -------
    None
        The column is overwritten with ``(value - mean) / sd``.
    """
    column = df[name]

    if mean is None:
        mean = column.mean()

    if sd is None:
        sd = column.std()

    # A constant column has zero spread; dividing by it would turn every value
    # into NaN (0/0) or inf. Fall back to unit scale so the column is only
    # centred -- the same convention scikit-learn's StandardScaler uses for
    # zero-variance features.
    if sd == 0:
        sd = 1.0

    df[name] = (column - mean) / sd
    
# Convert all missing values in the specified column to the median
def missing_median(df, name):
    """Fill the missing entries of column ``name`` with that column's median.

    The column is overwritten on ``df`` itself; nothing is returned.
    """
    column = df[name]
    df[name] = column.fillna(column.median())

# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df,target):
    """Split ``df`` into the (x, y) arrays that TensorFlow is fed.

    Parameters
    ----------
    df : pandas.DataFrame
        Fully numeric frame holding both the predictors and the target.
    target : column label
        Name of the column to predict; every other column becomes a predictor.

    Returns
    -------
    (x, y) : tuple of numpy.ndarray
        ``x`` is float32 with one column per predictor. ``y`` has a single
        column: int32 when the target dtype is int64/int32 (classification),
        float32 otherwise (regression).

    The detected target dtype is printed as a side effect.
    """
    feature_columns = [column for column in df.columns if column != target]

    # df[target].dtypes is a single dtype for a normal column. If the label is
    # duplicated, df[target] is a DataFrame and .dtypes is a Series of dtypes;
    # take the first one by position.
    target_type = df[target].dtypes
    if hasattr(target_type, '__iter__'):
        target_type = target_type.iloc[0]
    print(target_type)

    # DataFrame.as_matrix() was removed in pandas 1.0; column selection plus
    # .values yields the same 2-D arrays on both old and current pandas.
    features = df[feature_columns].values.astype(np.float32)
    labels = df[[target]].values

    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        return features, labels.astype(np.int32)
    # Regression
    return features, labels.astype(np.float32)


# setup exponential decay function
def exp_decay(global_step, learning_rate=0.01, decay_steps=100, decay_rate=0.001):
    """Build an exponentially decaying learning-rate schedule.

    Thin wrapper around ``tf.train.exponential_decay``. Calling it with only
    ``global_step`` reproduces the schedule this notebook originally
    hard-coded (0.01 initial rate, 100 decay steps, 0.001 decay rate); the
    keyword parameters let that schedule be tuned without editing the function.

    Parameters
    ----------
    global_step
        Training step counter, passed straight through to TensorFlow.
    learning_rate : float
        Initial learning rate.
    decay_steps : int
        Step interval over which one ``decay_rate`` factor is applied.
    decay_rate : float
        Multiplicative decay factor.
        NOTE(review): per the TensorFlow docs the rate is
        learning_rate * decay_rate ** (global_step / decay_steps), so the
        default 0.001 cuts the rate 1000x every 100 steps -- confirm that
        is intended before using this schedule.

    Returns
    -------
    Whatever ``tf.train.exponential_decay`` returns for these arguments.
    """
    return tf.train.exponential_decay(
        learning_rate=learning_rate, global_step=global_step,
        decay_steps=decay_steps, decay_rate=decay_rate)

In [36]:
#Read Input CSV file
# The data lives in ./data/oronite.csv; the first row is the header and the
# literal text NULL marks a missing value.
# (For a gzip-compressed export, additionally pass compression="gzip".)
path = "./data/"
inputFilePath = os.path.join(path, "oronite.csv")
df = pd.read_csv(
    inputFilePath,
    header=0,
    na_values=['NULL'],
)

In [37]:
#show headers
# Print the column labels exactly as pandas parsed them from the CSV header.
# In the recorded output the first label starts with '\ufeff' (a byte-order
# mark), which is why a later cell refers to that column by the prefixed name.
print(df.columns.values)

['\ufeffOR-F Code' 'D' 'PD' 'B' 'MD' 'F' 'FM' 'ZN' 'DE' 'IN' 'O' 'VI' 'DI'
 'LAB_CODE' 'Result_Code' 'TEST_RESULT_VALUE']


In [38]:
#Shuffle dataset
# Seed NumPy's global RNG so the permutation is the same on every fresh run,
# reorder the rows by a random permutation of the index, then renumber the
# rows 0..n-1 (the old index is discarded).
# NOTE(review): the original comment called this shuffle unnecessary because
# the data is "already sorted by guid" -- confirm whether it is still wanted.
np.random.seed(42)
df = df.reindex(np.random.permutation(df.index)).reset_index(drop=True)

In [39]:
# Remove the "OR-F Code" column before modelling. The header is matched with
# any leading byte-order mark stripped, so this works whether or not the CSV
# reader kept the BOM, and re-running the cell is a harmless no-op instead of
# a KeyError.
or_f_code_columns = [c for c in df.columns if str(c).lstrip('\ufeff') == 'OR-F Code']
df.drop(or_f_code_columns, axis=1, inplace=True)

In [40]:
#one-hot encode lab_code and result_code
# Each text column is replaced by one indicator column per distinct value
# (named "<column>-<value>"), in the same order as before: LAB_CODE first.
for text_column in ('LAB_CODE', 'Result_Code'):
    encode_text_dummy(df, text_column)

In [41]:
# Preview only the first rows rather than rendering the whole frame.
df.head(10)

Unnamed: 0,D,PD,B,MD,F,FM,ZN,DE,IN,O,VI,DI,TEST_RESULT_VALUE,LAB_CODE-AL,LAB_CODE-EG,LAB_CODE-LT,LAB_CODE-SR,Result_Code-DP1--WDN
0,0.000,0.30,80.13,0.0,0.025,0.0,1.377,2.721,1.150,0.00,4.00,9.580,223.2,0.0,0.0,0.0,1.0,1.0
1,0.000,0.10,87.02,0.0,0.023,0.0,1.457,4.087,0.000,0.00,0.00,6.670,445.7,0.0,0.0,1.0,0.0,1.0
2,2.510,0.20,77.31,0.0,0.020,0.0,1.109,3.442,0.214,0.00,12.50,4.068,314.1,0.0,0.0,0.0,1.0,1.0
3,0.000,0.30,82.10,0.0,0.010,0.0,1.227,4.303,0.170,0.00,4.90,5.676,214.3,0.0,0.0,1.0,0.0,1.0
4,0.000,0.20,84.75,0.0,0.025,0.0,1.251,3.072,0.202,0.00,6.20,4.304,215.3,0.0,0.0,1.0,0.0,1.0
5,0.000,0.00,77.13,0.0,0.025,0.0,1.377,3.569,0.700,0.00,12.20,5.000,207.0,0.0,1.0,0.0,0.0,1.0
6,0.000,0.00,74.80,0.0,0.020,0.0,2.094,5.849,0.150,0.00,9.00,6.329,252.6,0.0,0.0,0.0,1.0,1.0
7,0.000,0.20,83.20,0.0,0.025,0.0,1.249,3.088,0.199,0.00,7.12,4.925,210.5,0.0,0.0,1.0,0.0,1.0
8,0.000,0.00,74.80,0.0,0.020,0.0,2.094,5.849,0.150,0.00,9.00,6.329,321.1,0.0,0.0,0.0,1.0,1.0
9,0.000,0.20,84.04,0.0,0.025,0.0,0.697,1.980,0.100,0.00,10.00,2.673,477.6,0.0,1.0,0.0,0.0,1.0


In [42]:
# List the column labels again to confirm the state after the drop and the
# dummy encoding: the text columns are gone and "<column>-<value>" indicator
# columns have been appended.
print(df.columns.values)

['D' 'PD' 'B' 'MD' 'F' 'FM' 'ZN' 'DE' 'IN' 'O' 'VI' 'DI'
 'TEST_RESULT_VALUE' 'LAB_CODE-AL' 'LAB_CODE-EG' 'LAB_CODE-LT'
 'LAB_CODE-SR' 'Result_Code-DP1--WDN']


In [44]:
#encode all numeric values to zscored values
# Standardise each numeric predictor in place, in the original column order.
# ('AW' was commented out in the original cell and is still not encoded.)
for numeric_column in ('D', 'PD', 'B', 'MD', 'F', 'FM', 'ZN', 'DE', 'IN', 'O', 'VI', 'DI'):
    encode_numeric_zscore(df, numeric_column)

In [45]:
#replace any remaining missing values with 0
# DataFrame.fillna returns a NEW frame; the original call discarded that
# result, so the cell did nothing. Rebind df so the fill actually takes effect.
# NOTE(review): the original comment said "discard rows where z-score > 2",
# but no such outlier filtering is performed anywhere in this cell.
df = df.fillna(0)
df.head(10)

Unnamed: 0,D,PD,B,MD,F,FM,ZN,DE,IN,O,VI,DI,TEST_RESULT_VALUE,LAB_CODE-AL,LAB_CODE-EG,LAB_CODE-LT,LAB_CODE-SR,Result_Code-DP1--WDN
0,-0.516318,1.246489,0.166940,-0.078567,-0.239846,-0.110968,-0.030189,-1.151937,1.474092,-0.177906,-0.987497,1.455932,223.2,0.0,0.0,0.0,1.0,1.0
1,-0.516318,-0.287943,1.604487,-0.078567,-0.414869,-0.110968,0.253734,0.305958,-1.152751,-0.177906,-2.077384,0.053231,445.7,0.0,0.0,1.0,0.0,1.0
2,6.530478,0.479273,-0.421432,-0.078567,-0.677403,-0.110968,-0.981330,-0.382433,-0.663930,-0.177906,1.328513,-1.201005,314.1,0.0,0.0,0.0,1.0,1.0
3,-0.516318,1.246489,0.577965,-0.078567,-1.552516,-0.110968,-0.562544,0.536489,-0.764435,-0.177906,-0.742272,-0.425904,214.3,0.0,0.0,1.0,0.0,1.0
4,-0.516318,0.479273,1.130868,-0.078567,-0.239846,-0.110968,-0.477367,-0.777324,-0.691340,-0.177906,-0.388059,-1.087246,215.3,0.0,0.0,1.0,0.0,1.0
5,-0.516318,-1.055158,-0.458988,-0.078567,-0.239846,-0.110968,-0.030189,-0.246889,0.446197,-0.177906,1.246772,-0.751755,207.0,0.0,1.0,0.0,0.0,1.0
6,-0.516318,-1.055158,-0.945125,-0.078567,-0.677403,-0.110968,2.514468,2.186493,-0.810119,-0.177906,0.374862,-0.111140,252.6,0.0,0.0,0.0,1.0,1.0
7,-0.516318,0.479273,0.807472,-0.078567,-0.239846,-0.110968,-0.484465,-0.760248,-0.698193,-0.177906,-0.137385,-0.787907,210.5,0.0,0.0,1.0,0.0,1.0
8,-0.516318,-1.055158,-0.945125,-0.078567,-0.677403,-0.110968,2.514468,2.186493,-0.810119,-0.177906,0.374862,-0.111140,321.1,0.0,0.0,0.0,1.0,1.0
9,-0.516318,0.479273,0.982732,-0.078567,-0.239846,-0.110968,-2.443532,-1.942786,-0.924330,-0.177906,0.647334,-1.873433,477.6,0.0,1.0,0.0,0.0,1.0


In [46]:
# Create x(predictors) and y (expected outcome)
# TEST_RESULT_VALUE is the target; every other remaining column becomes a
# predictor. to_xy prints the target dtype (float64 in the recorded run), so
# y is produced as float32 for regression.
x,y = to_xy(df, "TEST_RESULT_VALUE")

float64


In [47]:
#Split into test/train
# Hold out 25% of the rows as the test set; random_state is fixed so the same
# rows land in each split on every run.
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25, random_state=45)

In [52]:
# Create a deep neural network regressor with 4 hidden layers of 85, 40, 20 and 10 units,
# a learning rate of 0.01 and a budget of 300000 training steps.
regressor = skflow.TensorFlowDNNRegressor(hidden_units=[85, 40, 20, 10], learning_rate=0.01, steps=300000)

# Disabled: a validation monitor on the test split for early stopping.
#early_stop = skflow.monitors.ValidationMonitor(x_test,y_test,early_stopping_rounds=2000, print_steps=50)

In [None]:
#Fit/train neural network
# Trains on the training split only; the regressor was configured with
# steps=300000 and logs the average training loss as it runs (see the
# recorded output below).
regressor.fit(x_train,y_train)

Step #100, epoch #25, avg. train loss: 26694.48438
Step #200, epoch #50, avg. train loss: 4071.89160
Step #300, epoch #75, avg. train loss: 1306.95129
Step #400, epoch #100, avg. train loss: 1132.84021
Step #500, epoch #125, avg. train loss: 1031.47119
Step #600, epoch #150, avg. train loss: 953.54272
Step #700, epoch #175, avg. train loss: 891.61108
Step #800, epoch #200, avg. train loss: 835.11084
Step #900, epoch #225, avg. train loss: 773.07153
Step #1000, epoch #250, avg. train loss: 722.12128
Step #1100, epoch #275, avg. train loss: 681.34680
Step #1200, epoch #300, avg. train loss: 646.50403
Step #1300, epoch #325, avg. train loss: 606.13226
Step #1400, epoch #350, avg. train loss: 576.56317
Step #1500, epoch #375, avg. train loss: 548.33429
Step #1600, epoch #400, avg. train loss: 525.23633
Step #1700, epoch #425, avg. train loss: 506.14227
Step #1800, epoch #450, avg. train loss: 487.63281
Step #1900, epoch #475, avg. train loss: 470.48450
Step #2000, epoch #500, avg. train lo

TensorFlowDNNRegressor(batch_size=32, clip_gradients=5.0, config=None,
            continue_training=False, dropout=None,
            hidden_units=[85, 40, 20, 10], learning_rate=0.01, n_classes=0,
            optimizer='Adagrad', steps=300000, verbose=1)

In [54]:
#Measure error on the held-out test split
# RMSE = square root of the mean squared difference between the actual test
# targets and the model's predictions (same units as the target).
test_predictions = regressor.predict(x_test)
score = np.sqrt(metrics.mean_squared_error(y_test, test_predictions))
print("Final score (RMSE): {}".format(score))

Final score (RMSE): 148.8486328125


In [58]:
#Make predictions
# Put the test-set predictions next to the actual targets, one row per test
# sample, so they can be compared by eye.
pred = regressor.predict(x_test)
predDF = pd.DataFrame(pred).reset_index(drop=True)
dfytest = pd.DataFrame(y_test).reset_index(drop=True)

# Column-wise concat; ignore_index renumbers the columns before they are named.
df2 = pd.concat([predDF, dfytest], axis=1, ignore_index=True)
df2.columns = ['Predicted', 'Actual']
df2


Unnamed: 0,Predicted,Actual
0,254.960403,246.399994
1,231.180801,223.199997
2,229.887314,293.0
3,183.837448,235.899994
4,195.853485,314.899994
5,309.261749,214.300003
6,516.272095,198.800003
7,229.286972,221.899994
8,170.185654,219.100006
9,234.212799,289.200012
