In [19]:
# Programming Assignment #2 by Jeff Heaton
# T81-558: Application of Deep Learning
import os
import sklearn
import pandas as pd
import numpy as np
import tensorflow.contrib.learn as skflow
from sklearn.cross_validation import KFold
from scipy.stats import zscore
from sklearn import metrics
from sklearn import preprocessing
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split

path = "./data/"

# These four functions will help you, they were covered in class.
# Encode a text field to dummy variables
def encode_text_dummy(df,name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name,x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

# Encode a text field to a single index value
def encode_text_index(df,name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Encode a numeric field to Z-Scores
def encode_numeric_zscore(df,name,mean=None,sd=None):
    if mean is None:
        mean = df[name].mean()
    if sd is None:
        sd = df[name].std()        
    df[name] = (df[name]-mean)/sd


# Encode a numeric field to fill missing values with the median.
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)

# Convert a dataframe to x/y suitable for training.
def to_xy(df,target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    return df.as_matrix(result),df[target]
    
# Encode the toy dataset
def encode_toy_dataset(name):
    df = pd.read_csv(name,na_values=['NA','?'])
    #    Generate dummy variables for the shape and metal.
    encode_text_dummy(df,'metal')
    encode_text_dummy(df,'shape')
    #     Encode high, width and length as z-scores
    encode_numeric_zscore(df,'height',mean=None,sd=None)
    encode_numeric_zscore(df,'width',mean=None,sd=None)
    encode_numeric_zscore(df,'length',mean=None,sd=None)

    return (df)

def question1():
    print()
    print("***Question 1***")
    
    path = "./data/"
    
    filename_read = os.path.join(path,"toy1.csv")
    filename_write = os.path.join(path,"submit-yutong-prog2q1.csv")
    df = encode_toy_dataset(filename_read) # You just have to implement encode_toy_dataset above
    df.to_csv(filename_write,index=False)

    print("Wrote {} lines.".format(len(df)))

def question2():
    print()
    print("***Question 2***")
    
    path = "./data/"
    
    filename_read = os.path.join(path,"toy1.csv")
    df = encode_toy_dataset(filename_read)
    
    # shuffle 
    df = df.reindex(np.random.permutation(df.index)) 
    df.reset_index(inplace=True, drop=True)
    # Encode to a 2D matrix for training
    x,y = to_xy(df,['weight'])
    # Create a deep neural network with 3 hidden layers of 50, 25, 10
    regressor = skflow.TensorFlowDNNRegressor(hidden_units=[50, 25, 10], steps=10000)
     #fit nutural network
    regressor.fit(x,y) 
    
    # How to make many predictions
    pred = regressor.predict(x)
    score = np.sqrt(metrics.mean_squared_error(pred,y))
    print("Out of sample (RMSE): {}".format(score))

def question3():
    print()
    print("***Question 3***")
    # Z-Score encode these using the mean/sd from the dataset (you got ←-this in question 2)
    testDF = pd.DataFrame([
                {'length':1, 'width':2, 'height': 3},
                {'length':3, 'width':2, 'height': 5},
                {'length':4, 'width':1, 'height': 3}
                ])
    
    path = "./data/"
    filename_read = os.path.join(path,"toy1.csv")
    filename_write = os.path.join(path,"submit-yutong-prog2q3.csv")
    df = pd.read_csv(filename_read,na_values=['NA','?'])
     #     calculate and report the mean and standard deviation for height, width and length
    print('length: ({}, {})'.format(df['length'].mean(), df['length'].std()))
    print('width: ({}, {})'.format(df['width'].mean(), df['width'].std()))
    print('height: ({}, {})'.format(df['height'].mean(), df['height'].std()))
   
    encode_numeric_zscore(testDF,'length',mean=df['length'].mean(),sd=df['length'].std())  
    encode_numeric_zscore(testDF,'width',mean=df['width'].mean(),sd=df['width'].std())  
    encode_numeric_zscore(testDF,'height',mean=df['height'].mean(),sd=df['height'].std())  
    

    testDF.to_csv(filename_write,index = False)
    print(testDF)
    
def question4():
    print()
    print("***Question 4***")

    path = "./data/"
    filename_read = os.path.join(path,"iris.csv")
    filename_write = os.path.join(path,"submit-yutong-prog2q4.csv")
    df = pd.read_csv(filename_read,na_values=['NA','?'])
    
    # create feature vector
    encode_numeric_zscore(df,'sepal_l')
    encode_numeric_zscore(df,'sepal_w')
    encode_numeric_zscore(df,'petal_l')
    encode_text_dummy(df,'species')
    
    # shuffle 
    np.random.seed(42)
    df = df.reindex(np.random.permutation(df.index))
    df.reset_index(inplace =True, drop = True)
    
    # Encode to a 2D matrix for training
    x,y = to_xy(df,'petal_w')#  predict petal-w
    
    # Cross validate
    kf = KFold(len(x),n_folds =5)
    
    oos_y = []
    oos_pred= []
    fold =1 
    for train,test in kf:
        print("Fold #{}".format(fold))
        fold+=1
        
        x_train = x[train]
        y_train = y[train]
        x_test = x[test]
        y_test = y[test]
        
        # Create a deep neural network with 3 hidden layers of 10, 20, 10
        regressor = skflow.TensorFlowDNNRegressor(hidden_units = [10,20,10],steps = 5000)
        
        # Early stopping
        early_stop = skflow.monitors.ValidationMonitor(x_test,y_test,early_stopping_rounds = 200, print_steps=50)
        
        # Fit/train neural network
        regressor.fit(x_train,y_train,monitor=early_stop)
        
        # Add the predictions to the oos prediction list
        pred = regressor.predict(x_test)
        
        oos_y.append(y_test)
        oos_pred.append(pred)
        
        # Measure accuracy
        score=np.sqrt(metrics.mean_squared_error(pred,y_test))
        print ("Fold score (RMSE): {}".format(score))    

    # Build the oos prediction list and calculate the error.    
    oos_y = np.concatenate(oos_y)
    oos_pred = np.concatenate(oos_pred)
    scoreFinal=np.sqrt(metrics.mean_squared_error(oos_pred,oos_y))  
    print("Final, Out of Sample Score (RMSE): {}".format(scoreFinal))
    
    # Write the cross-validated prediction
    oos_y =pd.DataFrame(oos_y)
    oos_pred=pd.DataFrame(oos_pred)
    oosDF = pd.concat([df,oos_y,oos_pred],axis = 1)
    oosDF.to_csv(filename_write, index=False)

def question5():
    print()
    print("***Question 5***")

    path = "./data/"
    filename_read = os.path.join(path,"auto-mpg.csv")
    filename_write = os.path.join(path,"submit-yutong-prog2q5.csv")
    df = pd.read_csv(filename_read,na_values=['NA','?'])
    
    # create feature vector
    missing_median(df,'horsepower')
    nameDF = df['name']
    df.drop('name',1,inplace=True)
    encode_numeric_zscore(df,'horsepower')
    encode_numeric_zscore(df,'weight')
    encode_numeric_zscore(df,'displacement')
    encode_numeric_zscore(df,'mpg')
    encode_numeric_zscore(df,'acceleration')
    encode_numeric_zscore(df,'origin')
    cylinders = encode_text_index(df,'cylinders')
    num_classes= len(cylinders)
    
    # shuffle 
    np.random.seed(42)
    df = df.reindex(np.random.permutation(df.index))
    df.reset_index(inplace =True, drop = True)
    
    # Encode to a 2D matrix for training
    x,y = to_xy(df,'cylinders')#  predict cylinders
    
    # Cross validate
    kf = KFold(len(x),n_folds =5)
    
    oos_y = []
    oos_pred= []
    fold =1 
    for train,test in kf:
        print("Fold #{}".format(fold))
        fold+=1
        
        x_train = x[train]
        y_train = y[train]
        x_test = x[test]
        y_test = y[test]
        
        # Create a deep neural network with 3 hidden layers of 10, 20, 10
        classifier = skflow.TensorFlowDNNClassifier(hidden_units = [10,20,10], n_classes = num_classes, steps = 500)
        
        # Early stopping
        early_stop = skflow.monitors.ValidationMonitor(x_test, y_test, early_stopping_rounds=200, print_steps=50, n_classes = num_classes)
        
        # Fit/train neural network
        classifier.fit(x_train,y_train,early_stop)
        
        # Add the predictions to the oos prediction list
        pred = classifier.predict(x_test)
        
        oos_y.append(y_test)
        oos_pred.append(pred)
        
        # Measure accuracy
        score = np.sqrt(metrics.accuracy_score(y_test, pred))
        print ("Fold score: {}".format(score))    

    # Build the oos prediction list and calculate the error.    
    oos_y = np.concatenate(oos_y)
    oos_pred = np.concatenate(oos_pred)
    score = metrics.accuracy_score(oos_y, oos_pred)
    print("Final, Out of Sample Score: {}".format(score))
    
    oos_y = pd.DataFrame(oos_y)
    oos_pred = pd.DataFrame(oos_pred)
    
    oos_y =pd.DataFrame(oos_y)
    oos_pred=pd.DataFrame(oos_pred)
    df.insert(8,'name',nameDF)  
    df.insert(9,'ideal',oos_y)  
    df.insert(10,'predict',oos_pred) 

    df.to_csv(filename_write, index = False)

question1()
question2()
question3()
question4()
question5()


***Question 1***
Wrote 10000 lines.

***Question 2***
Step #99, avg. train loss: 296211.12500
Step #199, avg. train loss: 42216.94141
Step #299, avg. train loss: 23683.66992
Step #400, epoch #1, avg. train loss: 17236.32812
Step #500, epoch #1, avg. train loss: 12332.32617
Step #600, epoch #1, avg. train loss: 10105.65723
Step #700, epoch #2, avg. train loss: 8618.09961
Step #800, epoch #2, avg. train loss: 7134.90625
Step #900, epoch #2, avg. train loss: 5592.15674
Step #1000, epoch #3, avg. train loss: 5675.46143
Step #1100, epoch #3, avg. train loss: 4687.73047
Step #1200, epoch #3, avg. train loss: 3635.77002
Step #1300, epoch #4, avg. train loss: 3317.61719
Step #1400, epoch #4, avg. train loss: 3177.59448
Step #1500, epoch #4, avg. train loss: 2577.63745
Step #1600, epoch #5, avg. train loss: 2427.60620
Step #1700, epoch #5, avg. train loss: 2418.82910
Step #1800, epoch #5, avg. train loss: 1904.46033
Step #1900, epoch #6, avg. train loss: 1715.80701
Step #2000, epoch #6, avg. t

Stopping. Best step:
 step 97 with loss 0.013468504883348942


Fold score (RMSE): 0.17531210760832266
Fold #2
Step #50, epoch #12, avg. train loss: 0.07019, avg. val loss: 0.04958
Step #100, epoch #25, avg. train loss: 0.01796, avg. val loss: 0.01659
Step #150, epoch #37, avg. train loss: 0.01597, avg. val loss: 0.01305
Step #200, epoch #50, avg. train loss: 0.01384, avg. val loss: 0.01132
Step #250, epoch #62, avg. train loss: 0.01356, avg. val loss: 0.01145
Step #300, epoch #75, avg. train loss: 0.01263, avg. val loss: 0.01069
Step #350, epoch #87, avg. train loss: 0.01236, avg. val loss: 0.01079
Step #400, epoch #100, avg. train loss: 0.01195, avg. val loss: 0.01049
Step #450, epoch #112, avg. train loss: 0.01246, avg. val loss: 0.01069
Step #500, epoch #125, avg. train loss: 0.01185, avg. val loss: 0.01097
Step #550, epoch #137, avg. train loss: 0.01109, avg. val loss: 0.01040
Fold score (RMSE): 0.14272974831031332
Fold #3


Stopping. Best step:
 step 362 with loss 0.009098603390157223


Step #50, epoch #12, avg. train loss: 0.06759, avg. val loss: 0.06043
Step #100, epoch #25, avg. train loss: 0.01754, avg. val loss: 0.02284
Step #150, epoch #37, avg. train loss: 0.01408, avg. val loss: 0.01943
Step #200, epoch #50, avg. train loss: 0.01232, avg. val loss: 0.01788
Step #250, epoch #62, avg. train loss: 0.01224, avg. val loss: 0.01713
Step #300, epoch #75, avg. train loss: 0.01184, avg. val loss: 0.01636
Step #350, epoch #87, avg. train loss: 0.01112, avg. val loss: 0.01614
Step #400, epoch #100, avg. train loss: 0.01094, avg. val loss: 0.01589
Step #450, epoch #112, avg. train loss: 0.01137, avg. val loss: 0.01569
Step #500, epoch #125, avg. train loss: 0.01110, avg. val loss: 0.01536
Step #550, epoch #137, avg. train loss: 0.01038, avg. val loss: 0.01520
Step #600, epoch #150, avg. train loss: 0.01020, avg. val loss: 0.01473
Step #650, epoch #162, avg. train loss: 0.01022, avg. val loss: 0.01494
Step #700, epoch #175, avg. train loss: 0.01001, avg. val loss: 0.01469


Stopping. Best step:
 step 1106 with loss 0.011062948033213615


Step #50, epoch #12, avg. train loss: 0.06714, avg. val loss: 0.07087
Step #100, epoch #25, avg. train loss: 0.01583, avg. val loss: 0.02708
Step #150, epoch #37, avg. train loss: 0.01299, avg. val loss: 0.02386
Step #200, epoch #50, avg. train loss: 0.01127, avg. val loss: 0.02296
Step #250, epoch #62, avg. train loss: 0.01074, avg. val loss: 0.02246
Step #300, epoch #75, avg. train loss: 0.01066, avg. val loss: 0.02217
Step #350, epoch #87, avg. train loss: 0.01011, avg. val loss: 0.02159
Step #400, epoch #100, avg. train loss: 0.01047, avg. val loss: 0.02184
Step #450, epoch #112, avg. train loss: 0.01020, avg. val loss: 0.02160
Step #500, epoch #125, avg. train loss: 0.00995, avg. val loss: 0.02182
Step #550, epoch #137, avg. train loss: 0.00969, avg. val loss: 0.02158
Step #600, epoch #150, avg. train loss: 0.00964, avg. val loss: 0.02152


Stopping. Best step:
 step 438 with loss 0.020158855244517326


Fold score (RMSE): 0.21273141903164636
Fold #5
Step #50, epoch #12, avg. train loss: 0.06887, avg. val loss: 0.05595
Step #100, epoch #25, avg. train loss: 0.01576, avg. val loss: 0.02118
Step #150, epoch #37, avg. train loss: 0.01299, avg. val loss: 0.01812
Step #200, epoch #50, avg. train loss: 0.01231, avg. val loss: 0.01801
Step #250, epoch #62, avg. train loss: 0.01176, avg. val loss: 0.01777
Step #300, epoch #75, avg. train loss: 0.01124, avg. val loss: 0.01732
Step #350, epoch #87, avg. train loss: 0.01088, avg. val loss: 0.01720
Step #400, epoch #100, avg. train loss: 0.01070, avg. val loss: 0.01711
Step #450, epoch #112, avg. train loss: 0.01051, avg. val loss: 0.01707
Step #500, epoch #125, avg. train loss: 0.01010, avg. val loss: 0.01683
Step #550, epoch #137, avg. train loss: 0.00982, avg. val loss: 0.01661
Step #600, epoch #150, avg. train loss: 0.00970, avg. val loss: 0.01654
Step #650, epoch #162, avg. train loss: 0.01009, avg. val loss: 0.01689
Step #700, epoch #175, av

Stopping. Best step:
 step 786 with loss 0.011288278736174107


Fold score (RMSE): 0.18455804191270428
Final, Out of Sample Score (RMSE): 0.17696627225086287

***Question 5***
Fold #1
Step #50, epoch #5, avg. train loss: 1.51485, avg. val loss: 1.23125
Step #100, epoch #10, avg. train loss: 0.69187, avg. val loss: 0.53933
Step #150, epoch #15, avg. train loss: 0.44803, avg. val loss: 0.30869
Step #200, epoch #20, avg. train loss: 0.32595, avg. val loss: 0.22045
Step #250, epoch #25, avg. train loss: 0.28483, avg. val loss: 0.19665
Step #300, epoch #30, avg. train loss: 0.26852, avg. val loss: 0.17732
Step #350, epoch #35, avg. train loss: 0.24599, avg. val loss: 0.15890
Step #400, epoch #40, avg. train loss: 0.23328, avg. val loss: 0.15374
Step #450, epoch #45, avg. train loss: 0.21888, avg. val loss: 0.14830
Step #500, epoch #50, avg. train loss: 0.21994, avg. val loss: 0.15145
Fold score: 0.9810708435174292
Fold #2
Step #50, epoch #5, avg. train loss: 1.50325, avg. val loss: 1.35368
Step #100, epoch #10, avg. train loss: 0.68610, avg. val loss: 0


***Question 5***
Fold #1
Step #50, epoch #5, avg. train loss: 1.51485, avg. val loss: 1.23125
Step #100, epoch #10, avg. train loss: 0.69187, avg. val loss: 0.53933
Step #150, epoch #15, avg. train loss: 0.44803, avg. val loss: 0.30869
Step #200, epoch #20, avg. train loss: 0.32595, avg. val loss: 0.22045
Step #250, epoch #25, avg. train loss: 0.28483, avg. val loss: 0.19665
Step #300, epoch #30, avg. train loss: 0.26852, avg. val loss: 0.17732
Step #350, epoch #35, avg. train loss: 0.24599, avg. val loss: 0.15890
Step #400, epoch #40, avg. train loss: 0.23328, avg. val loss: 0.15374
Step #450, epoch #45, avg. train loss: 0.21888, avg. val loss: 0.14830
Step #500, epoch #50, avg. train loss: 0.21994, avg. val loss: 0.15145
Fold score: 0.9810708435174292
Fold #2
Step #50, epoch #5, avg. train loss: 1.50325, avg. val loss: 1.35368
Step #100, epoch #10, avg. train loss: 0.68610, avg. val loss: 0.79987
Step #150, epoch #15, avg. train loss: 0.44881, avg. val loss: 0.60935
Step #200, epoch