In [8]:
from sklearn import preprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os


# encode the classes(setosa, versicolor, virginica) into [1,0,0], [0,1,0], [0,0,1]
# and store the classes by n(donate the number of the classes) fields
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    
    for x in dummies.columns:
        dummy_name = '{}-{}'.format(str(name), str(x))
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)
    
# encode the classes  into the binary type, 1 if the class is target class, nor 0
def encode_text_single_dummy(df, name, target_vals):
    for val in target_vals:
        is_value = lambda x: 1 if str(x) == str(val) else 0
        val_name = 'dummy-%s'%val
        df[dummy_name] = df[name].apply(is_value)
    
# encode the classes(setosa, versicolor, virginica) into 1, 2, 3
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    encode_name = 'le-%s'%name
    df[encode_name] = le.fit_transform(df[name])
    return le.classes_

# encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    mean = mean or df[name].mean()
    sd = sd or df[name].std()
    
    df[name] = (df[name] - mean)/sd

# convert all missing value in specific column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)

# convert all missing value in specific column to the default value
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)
    
# convert pandas dataFrame to x, y inputs that tensorflow needs
def to_xy(df, target):
    results = []
    for c in df.columns:
        if c != target:
            results.append(c)
    
    target_type = df[target].dtypes[0] if hasattr(df[target].dtypes, '__iter__') else df[target].dtypes
    # encode to int for classification, otherwise float, TensorFlow prefer 32bits
    if target_type in (np.int64, np.int32):
        # classification
        dummies = pd.get_dummies(df[target])
        return df.as_matrix(results).astype(np.float32), dummies.as_matrix().astype(np.float32)
    else:
        # Regression
        return df.as_matrix(results).astype(np.float32), df.as_matrix(target).astype(np.float32)

# plot chart of Regression
def chart_regression(pred, y, sort=True):
    df = pd.DataFrame({'pred': pred, 'y': y.flatten()})
    if sort:
        df.sort_values(by=['y'], inpalce=True)
    plt.plot(df['y'].tolist(), label='expected')
    plt.plot(df['pred'].tolist(), label='predict')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# remove all rows that deviate over the sd 
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd*df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)
    return df

# encode a column to a range between normalize_low and normalized_high
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = df[name].min()
        data_high = df[name].max()
    
    df[name] = (df[name]-data_low)*(normalized_high-normalized_low)/(data_high-data_low) + normalized_low

    return df[name]


In [1]:
# load data and normalize 

import tensorflow.contrib.learn as sklflow
import pandas as pd
import os 
import numpy as np
from sklearn import metrics
from scipy.stats import zscore

path = '../data/'

file = os.path.join(path, 'auto-mpg.csv')


In [9]:
mpg_df = pd.read_csv(file, encoding='utf8', na_values=['NA', '?'])

'''
find special column and row contains NAN
mpg_df.isnull().any()  # return column that contain NaN

mpg_df[mpg_df[colname].isnull()]

'''
missing_median(mpg_df, 'horsepower')
mpg_df.drop('name', axis=1, inplace=True)


# remove outliers in mpg
print("the length of MPG before dropped : %s"%len(mpg_df))

remove_outliers(mpg_df, 'mpg', 2)

print("the length of MPG after dropped : %s"%len(mpg_df))

the length of MPG before dropped : 398
the length of MPG after dropped : 388


In [10]:
mpg_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,1
1,15.0,8,350.0,165.0,3693,11.5,70,1
2,18.0,8,318.0,150.0,3436,11.0,70,1
3,16.0,8,304.0,150.0,3433,12.0,70,1
4,17.0,8,302.0,140.0,3449,10.5,70,1


## Training with a Validation Set and Early Stopping

split dataset into training data and validation dataset, 

In [11]:
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping

In [25]:
# load data 
iris_df = pd.read_csv('../data/iris.csv', na_values=['NA', '?'])
iris_df.head()

Unnamed: 0,sepal_l,sepal_w,petal_l,petal_w,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [26]:
# check NaN column 
iris_df.isnull().any()

sepal_l    False
sepal_w    False
petal_l    False
petal_w    False
species    False
dtype: bool

In [27]:
# encode the species
cl_ = encode_text_index(iris_df, 'species')
iris_df.head()

Unnamed: 0,sepal_l,sepal_w,petal_l,petal_w,species,le-species
0,5.1,3.5,1.4,0.2,Iris-setosa,0
1,4.9,3.0,1.4,0.2,Iris-setosa,0
2,4.7,3.2,1.3,0.2,Iris-setosa,0
3,4.6,3.1,1.5,0.2,Iris-setosa,0
4,5.0,3.6,1.4,0.2,Iris-setosa,0


In [28]:
# generate valid data from dataframe
iris_df.drop('species', axis=1, inplace=True)
x, y = to_xy(iris_df, 'le-species')

In [29]:
x[:3], y[:3]

(array([[ 5.0999999 ,  3.5       ,  1.39999998,  0.2       ],
        [ 4.9000001 ,  3.        ,  1.39999998,  0.2       ],
        [ 4.69999981,  3.20000005,  1.29999995,  0.2       ]], dtype=float32),
 array([[ 1.,  0.,  0.],
        [ 1.,  0.,  0.],
        [ 1.,  0.,  0.]], dtype=float32))

In [30]:
cl_

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [32]:
# train data with sequantial model

# split dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=32)

model = Sequential()
model.add(Dense(10, input_dim=x.shape[1], activation='relu'))  # input layer and hidden 1
model.add(Dense(25, activation='relu')) # hidden 2
model.add(Dense(y.shape[1], activation='softmax'))

# define the loss function and optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam')

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, mode='auto')

model.fit(x, y, validation_data=(x_test, y_test), callbacks=[monitor], verbose=2, epochs=1000)

Train on 150 samples, validate on 38 samples
Epoch 1/1000
0s - loss: 1.3831 - val_loss: 1.2252
Epoch 2/1000
0s - loss: 1.1690 - val_loss: 1.0890
Epoch 3/1000
0s - loss: 1.0651 - val_loss: 1.0404
Epoch 4/1000
0s - loss: 1.0330 - val_loss: 1.0427
Epoch 5/1000
0s - loss: 1.0305 - val_loss: 1.0316
Epoch 6/1000
0s - loss: 1.0101 - val_loss: 1.0043
Epoch 7/1000
0s - loss: 0.9855 - val_loss: 0.9820
Epoch 8/1000
0s - loss: 0.9699 - val_loss: 0.9691
Epoch 9/1000
0s - loss: 0.9589 - val_loss: 0.9575
Epoch 10/1000
0s - loss: 0.9484 - val_loss: 0.9422
Epoch 11/1000
0s - loss: 0.9323 - val_loss: 0.9282
Epoch 12/1000
0s - loss: 0.9199 - val_loss: 0.9139
Epoch 13/1000
0s - loss: 0.9057 - val_loss: 0.8982
Epoch 14/1000
0s - loss: 0.8931 - val_loss: 0.8820
Epoch 15/1000
0s - loss: 0.8799 - val_loss: 0.8696
Epoch 16/1000
0s - loss: 0.8659 - val_loss: 0.8529
Epoch 17/1000
0s - loss: 0.8513 - val_loss: 0.8393
Epoch 18/1000
0s - loss: 0.8366 - val_loss: 0.8232
Epoch 19/1000
0s - loss: 0.8233 - val_loss: 0.

Epoch 175/1000
0s - loss: 0.1038 - val_loss: 0.0818
Epoch 176/1000
0s - loss: 0.1017 - val_loss: 0.0735
Epoch 177/1000
0s - loss: 0.1013 - val_loss: 0.0712
Epoch 178/1000
0s - loss: 0.1009 - val_loss: 0.0714
Epoch 179/1000
0s - loss: 0.1000 - val_loss: 0.0733


<keras.callbacks.History at 0x7f364b73def0>

In [33]:
pred = model.predict(x_test)
pred[:5]

array([[  1.86586077e-03,   8.40946913e-01,   1.57187268e-01],
       [  9.97686744e-01,   2.31325789e-03,   4.80374451e-09],
       [  9.90083575e-01,   9.91635118e-03,   2.95967322e-08],
       [  1.77585264e-03,   9.40439880e-01,   5.77843674e-02],
       [  5.29086901e-06,   3.91865112e-02,   9.60808218e-01]], dtype=float32)

## Early Stopping and the best weight

early stopping: the training will halt once the validation set no longer saw score improvements for a number of steps, the number of steps that early stopping will tolerate is calles patience

store the best weight when training data

In [34]:
from keras.callbacks import ModelCheckpoint

model = Sequential()
model.add(Dense(10, input_dim=x.shape[1], activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(y.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
# save the best weights
checkpoint = ModelCheckpoint(filepath='best_weights.hdf5', verbose=0, save_best_only=True)

model.fit(x, y, validation_data=(x_test, y_test), callbacks=[monitor, checkpoint], verbose=2, epochs=1000)
model.load_weights('best_weights.hdf5')

Train on 150 samples, validate on 38 samples
Epoch 1/1000
0s - loss: 1.2727 - val_loss: 1.1972
Epoch 2/1000
0s - loss: 1.2206 - val_loss: 1.1602
Epoch 3/1000
0s - loss: 1.1800 - val_loss: 1.1319
Epoch 4/1000
0s - loss: 1.1463 - val_loss: 1.1136
Epoch 5/1000
0s - loss: 1.1229 - val_loss: 1.1005
Epoch 6/1000
0s - loss: 1.1046 - val_loss: 1.0909
Epoch 7/1000
0s - loss: 1.0925 - val_loss: 1.0810
Epoch 8/1000
0s - loss: 1.0798 - val_loss: 1.0690
Epoch 9/1000
0s - loss: 1.0659 - val_loss: 1.0548
Epoch 10/1000
0s - loss: 1.0512 - val_loss: 1.0400
Epoch 11/1000
0s - loss: 1.0389 - val_loss: 1.0242
Epoch 12/1000
0s - loss: 1.0218 - val_loss: 1.0071
Epoch 13/1000
0s - loss: 1.0058 - val_loss: 0.9891
Epoch 14/1000
0s - loss: 0.9900 - val_loss: 0.9727
Epoch 15/1000
0s - loss: 0.9760 - val_loss: 0.9581
Epoch 16/1000
0s - loss: 0.9634 - val_loss: 0.9465
Epoch 17/1000
0s - loss: 0.9534 - val_loss: 0.9370
Epoch 18/1000
0s - loss: 0.9440 - val_loss: 0.9268
Epoch 19/1000
0s - loss: 0.9352 - val_loss: 0.

0s - loss: 0.4309 - val_loss: 0.3768
Epoch 164/1000
0s - loss: 0.4305 - val_loss: 0.3759
Epoch 165/1000
0s - loss: 0.4295 - val_loss: 0.3751
Epoch 166/1000
0s - loss: 0.4289 - val_loss: 0.3743
Epoch 167/1000
0s - loss: 0.4282 - val_loss: 0.3737
Epoch 168/1000
0s - loss: 0.4272 - val_loss: 0.3730
Epoch 169/1000
0s - loss: 0.4263 - val_loss: 0.3720
Epoch 170/1000
0s - loss: 0.4257 - val_loss: 0.3710
Epoch 171/1000
0s - loss: 0.4244 - val_loss: 0.3695
Epoch 172/1000
0s - loss: 0.4232 - val_loss: 0.3681
Epoch 173/1000
0s - loss: 0.4218 - val_loss: 0.3664
Epoch 174/1000
0s - loss: 0.4200 - val_loss: 0.3646
Epoch 175/1000
0s - loss: 0.4180 - val_loss: 0.3624
Epoch 176/1000
0s - loss: 0.4153 - val_loss: 0.3596
Epoch 177/1000
0s - loss: 0.4121 - val_loss: 0.3561
Epoch 178/1000
0s - loss: 0.4085 - val_loss: 0.3525
Epoch 179/1000
0s - loss: 0.4051 - val_loss: 0.3484
Epoch 180/1000
0s - loss: 0.4002 - val_loss: 0.3447
Epoch 181/1000
0s - loss: 0.3956 - val_loss: 0.3408
Epoch 182/1000
0s - loss: 0

0s - loss: 0.1063 - val_loss: 0.0754
Epoch 330/1000
0s - loss: 0.1057 - val_loss: 0.0753
Epoch 331/1000
0s - loss: 0.1059 - val_loss: 0.0757
Epoch 332/1000
0s - loss: 0.1046 - val_loss: 0.0730
Epoch 333/1000
0s - loss: 0.1049 - val_loss: 0.0737
Epoch 334/1000
0s - loss: 0.1040 - val_loss: 0.0733
Epoch 335/1000
0s - loss: 0.1041 - val_loss: 0.0747
Epoch 336/1000
0s - loss: 0.1027 - val_loss: 0.0729
Epoch 337/1000
0s - loss: 0.1023 - val_loss: 0.0723
Epoch 338/1000
0s - loss: 0.1024 - val_loss: 0.0708
Epoch 339/1000
0s - loss: 0.1015 - val_loss: 0.0714
Epoch 340/1000
0s - loss: 0.1014 - val_loss: 0.0726
Epoch 341/1000
0s - loss: 0.1006 - val_loss: 0.0719
Epoch 342/1000
0s - loss: 0.1003 - val_loss: 0.0704
Epoch 343/1000
0s - loss: 0.0999 - val_loss: 0.0713
Epoch 344/1000
0s - loss: 0.0997 - val_loss: 0.0714
Epoch 00343: early stopping


In [35]:
pred = model.predict(x_test)
pred[:5]

array([[  6.30629808e-03,   8.88681173e-01,   1.05012543e-01],
       [  9.99187529e-01,   8.12484708e-04,   1.17771098e-10],
       [  9.90120053e-01,   9.87990107e-03,   6.11138695e-09],
       [  7.07436772e-03,   9.50062931e-01,   4.28627096e-02],
       [  5.92995348e-05,   5.64648025e-02,   9.43475842e-01]], dtype=float32)

In [36]:
pre_ = np.argmax(pred, axis=1)
pre_

array([1, 0, 0, 1, 2, 2, 0, 0, 1, 0, 1, 2, 1, 1, 2, 2, 1, 2, 1, 0, 0, 2, 1,
       0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 2, 1, 0, 0, 2])

In [37]:
act_y = np.argmax(y_test, axis=1)
act_y

array([1, 0, 0, 1, 2, 2, 0, 0, 1, 0, 1, 2, 1, 1, 2, 2, 1, 2, 1, 0, 0, 2, 2,
       0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 2, 1, 0, 0, 2])

In [38]:
metrics.accuracy_score(pre_, act_y)

0.97368421052631582

## mean_square_error

$ RMSE = \sqrt{\frac{1}{n}\sum_{i=1}^{n}(\hat{y}_{i} - y_{i})^2} $