In [97]:
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil
import os
import requests
import base64


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to a single dummy variable.  The new columns (which do not replace the old) will have a 1
# at every location where the original column (name) matches each of the target_values.  One column is added for
# each target value.
def encode_text_single_dummy(df, name, target_values):
    for tv in target_values:
        l = list(df[name].astype(str))
        l = [1 if str(x) == str(tv) else 0 for x in l]
        name2 = "{}-{}".format(name, tv)
        df[name2] = l


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df.as_matrix(result).astype(np.float32), dummies.as_matrix().astype(np.float32)
    else:
        # Regression
        return df.as_matrix(result).astype(np.float32), df.as_matrix([target]).astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low
        
def submit(data,key,no,source_file=None):
    if source_file is None: source_file = __file__
    with open(source_file, "rb") as image_file:
        encoded_python = base64.b64encode(image_file.read()).decode('ascii')
    ext = os.path.splitext(source_file)[-1].lower()
    if ext not in ['.ipynb','.py']: raise Exception("Source file is {} must be .py or .ipynb".format(ext))
    r = requests.post("https://zc565j2hsc.execute-api.us-east-1.amazonaws.com/prod/assignment-submit",
        headers={'x-api-key':key}, json={'csv':base64.b64encode(data.to_csv().encode('ascii')).decode("ascii"),
        'assignment': no, ext[1:]:encoded_python})
    if r.status_code == 200:
        print("Success: {}".format(r.text))
    else: print("Failure: {}".format(r.text))

In [167]:
#Load source files

# You must also identify your source file.  (modify for your local setup)
# file='/resources/t81_558_deep_learning/assignment_yourname_class1.ipynb'  # IBM Data Science Workbench
# file='C:\\Users\\jeffh\\projects\\t81_558_deep_learning\\t81_558_class1_intro_python.ipynb'  # Windows
file='C:\\Users\\kaeli\\Documents\\WUSTL\\Spring 2018\\Deep Neural Networks\\projects\\t81_558_deep_learning\\Kaggle_code.ipynb'  # Mac/Linux

# Begin assignment
path = "C:\\Users\\kaeli\\Documents\\WUSTL\\Spring 2018\\Deep Neural Networks\\projects\\t81_558_deep_learning\\data"

filename_train = os.path.join(path,"train.csv")
filename_test = os.path.join(path,"test.csv")
filename_submit = os.path.join(path,"submit.csv")

df_train = pd.read_csv(filename_train,na_values=['NA','?'])


In [168]:
#Feature engineering

#Parse strings

df_parse=pd.DataFrame(df_train.name.str.split(' ').tolist())

In [169]:
# Save a copy, if you like
df_parse.to_csv('kaggle_df_parse.csv',index=False)

In [170]:
filename_parse = os.path.join(path,"kaggle_df_parse.csv")
df_parse=pd.read_csv(filename_parse,na_values=['NA','?'])

In [171]:
df_parse[0:10]

Unnamed: 0,0,1,2,3,4,5
0,Large,Red,Generic,,,
1,High,Quality,Pink,Small,,
2,Medium,High,Quality,,,
3,Generic,Medium,,,,
4,High,Quality,Small,Black,,
5,,,High,Quality,Green,Large
6,Medium,High,Quality,Red,,
7,,,Small,Generic,White,
8,High,Quality,Green,,,
9,High,Quality,Blue,Tiny,,


In [172]:
#dictionary=pd.DataFrame(
#        {'key': ['Red', 'Pink','Green','White','Black','Brown','Blue','Quality', 'Generic','Large', 'Medium','Small','Tiny']})

#dict={'color':['Red', 'Pink','Green','White','Black','Brown','Blue'],'quality':['Quality', 'Generic'],'size':['Large', 'Medium','Small','Tiny']}

In [173]:
%matplotlib inline
from matplotlib.pyplot import figure, show
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore
import tensorflow as tf

path = "C:\\Users\\kaeli\\Documents\\WUSTL\\Spring 2018\\Deep Neural Networks\\projects\\t81_558_deep_learning\\data"


encode_text_dummy(df_parse, '0')
encode_text_dummy(df_parse, '1')
encode_text_dummy(df_parse, '2')
encode_text_dummy(df_parse, '3')
encode_text_dummy(df_parse, '4')
encode_text_dummy(df_parse, '5')

In [174]:
df_train = df_train.join(df_parse)

In [175]:
df_train.columns[df_train.isnull().any()]

Index([], dtype='object')

In [176]:
df_train=df_train.drop(['name'], axis=1)

In [177]:
encode_text_dummy(df_train, 'manufacturer')


In [223]:
df_train

Unnamed: 0,id,pack,weight,height,width,length,cost,0-Black,0-Blue,0-Brown,...,5-Quality,5-Red,5-Small,5-Tiny,5-White,manufacturer-6% Solution,manufacturer-Deep Office Supplies,manufacturer-Duck Lake,manufacturer-Offices-R-Us,manufacturer-WizBang
0,1,390,2496.0,2.0,1.2,1.6,51.47,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,590,3776.0,2.0,1.2,1.6,213.76,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,3,210,2016.0,3.0,1.8,2.4,131.52,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,4,100,1120.0,3.5,2.1,2.8,117.83,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,670,3216.0,1.5,0.9,1.2,157.70,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5,6,350,3920.0,3.5,2.1,2.8,151.64,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6,7,670,7504.0,3.5,2.1,2.8,417.51,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,8,200,960.0,1.5,0.9,1.2,63.37,0,0,0,...,0,0,0,0,0,0,0,1,0,0
8,9,790,10112.0,4.0,2.4,3.2,521.65,0,0,0,...,0,0,0,0,0,1,0,0,0,0
9,10,830,1328.0,0.5,0.3,0.4,89.03,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [179]:
#Train the neural network
import pandas as pd
import io
import requests
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint


x,y = to_xy(df_train,"cost")

x_train, x_test, y_train, y_test = train_test_split(    
    x, y, test_size=0.25, random_state=45)

model = Sequential()
model.add(Dense(20, input_dim=x.shape[1], activation='relu'))
model.add(Dense(10))
model.add(Dense(10))
model.add(Dense(10))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
#checkpointer = ModelCheckpoint(filepath="best_weights.hdf5", verbose=0, save_best_only=True) 
checkpointer = ModelCheckpoint(filepath="C:\\Users\\kaeli\\Documents\\WUSTL\\Spring 2018\\Deep Neural Networks\\projects\\t81_558_deep_learning\\best_weights.hdf5", verbose=0, save_best_only=True) # save best model

model.fit(x,y,validation_data=(x_test,y_test),callbacks=[monitor,checkpointer],verbose=0,epochs=1000)
#model.load_weights('best_weights.hdf5')
model.load_weights("C:\\Users\\kaeli\\Documents\\WUSTL\\Spring 2018\\Deep Neural Networks\\projects\\t81_558_deep_learning\\best_weights.hdf5") # load weights from best model


pred = model.predict(x_test)
print("Shape: {}".format(pred.shape))
print(pred)

Epoch 00051: early stopping
Shape: (2500, 1)
[[ 491.52825928]
 [  67.37966156]
 [ 260.24267578]
 ..., 
 [ 104.58647156]
 [  18.69026184]
 [  59.27109909]]


In [208]:
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print("Final score (RMSE): {}".format(score))

Final score (RMSE): 55.50434494018555


In [224]:
df_test = pd.read_csv(filename_test,na_values=['NA','?'])

In [225]:
df_testparse=pd.DataFrame(df_test.name.str.split(' ').tolist())

In [194]:
#df_testparse.to_csv('kaggle_df_testparse.csv',index=False)

In [226]:
filename_testparse = os.path.join(path,"kaggle_df_testparse.csv")
df_testparse=pd.read_csv(filename_testparse,na_values=['NA','?'])

In [227]:
encode_text_dummy(df_testparse, '0')
encode_text_dummy(df_testparse, '1')
encode_text_dummy(df_testparse, '2')
encode_text_dummy(df_testparse, '3')
encode_text_dummy(df_testparse, '4')
encode_text_dummy(df_testparse, '5')

In [228]:
df_testparse[0:5]

Unnamed: 0,0-Black,0-Blue,0-Brown,0-Generic,0-Green,0-High,0-Large,0-Medium,0-Pink,0-Red,...,5-Brown,5-Green,5-Large,5-Medium,5-Pink,5-Quality,5-Red,5-Small,5-Tiny,5-White
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [229]:
encode_text_dummy(df_test, 'manufacturer')

In [230]:
df_test=df_test.drop(['name'], axis=1)

In [231]:
ids = df_test['id']



In [232]:
df_test = df_test.join(df_testparse)

In [233]:
df_test[0:10]

Unnamed: 0,id,pack,weight,height,width,length,manufacturer-6% Solution,manufacturer-Deep Office Supplies,manufacturer-Duck Lake,manufacturer-Offices-R-Us,...,5-Brown,5-Green,5-Large,5-Medium,5-Pink,5-Quality,5-Red,5-Small,5-Tiny,5-White
0,10001,320,512.0,0.5,0.3,0.4,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10002,1,11.2,3.5,2.1,2.8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,10003,2,32.0,5.0,3.0,4.0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,10004,5,48.0,3.0,1.8,2.4,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,10005,44,70.4,0.5,0.3,0.4,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,10006,6,67.2,3.5,2.1,2.8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,10007,650,5200.0,2.5,1.5,2.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,10008,3,38.4,4.0,2.4,3.2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,10009,660,3168.0,1.5,0.9,1.2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,10010,600,4800.0,2.5,1.5,2.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [234]:
x = df_test.as_matrix().astype(np.float32)

# Generate predictions
pred_test = model.predict(x)

In [235]:
df_submit = pd.DataFrame(pred)
df_submit.insert(0,'id',ids)
df_submit.columns = ['id','cost']



print(df_submit)

         id        cost
0     10001  491.528259
1     10002   67.379662
2     10003  260.242676
3     10004   96.771111
4     10005  167.511734
5     10006  480.900635
6     10007   29.866041
7     10008   40.014633
8     10009   50.399689
9     10010  189.570862
10    10011  136.970886
11    10012  278.688232
12    10013   94.685822
13    10014   47.465454
14    10015   73.791016
15    10016   67.551849
16    10017   29.902502
17    10018   87.461464
18    10019  229.976822
19    10020  116.225288
20    10021   52.541023
21    10022   86.819000
22    10023  186.801224
23    10024  123.607735
24    10025  103.745865
25    10026  177.848602
26    10027   28.775667
27    10028   67.803970
28    10029  236.368698
29    10030   55.344540
...     ...         ...
2470  12471  244.196518
2471  12472   59.502487
2472  12473   36.981628
2473  12474  153.219955
2474  12475  147.345825
2475  12476  114.602188
2476  12477   41.492668
2477  12478   50.248798
2478  12479  115.008476
2479  12480  312

In [236]:
df_submit.to_csv('submit.csv', index=False)