In [None]:
import pandas as pd
import matplotlib as plt
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as pplt
from matplotlib.font_manager import FontProperties
from sklearn.manifold import TSNE
from scipy import stats
import tensorflow as tf

In [None]:
# Load the shoe-size data set into a DataFrame.
# Data source: https://onlinecourses.science.psu.edu/stat200/node/161
# NOTE(review): the path below is an absolute, machine-specific location;
# adjust it when running the notebook elsewhere.
shoesize_path = "/Users/isabel/Desktop/SandboxProjects/shoesize.xls"
df = pd.read_excel(shoesize_path)

# Print the frame summary. According to the original run, the data set has
# 408 entries, each with an index, a gender string, a shoe size and a height.
df.info()

# Looking at the values, shoe size is given in US sizes and body height in
# inches. Converting to other units is left as a follow-up exercise.
# Show the last rows as this cell's output.
df.tail()

In [None]:
# Plot shoe size versus body height to get a first impression of the data.
fontP = FontProperties()
fontP.set_size('small')
fig = pplt.figure()
fig.suptitle('Body height in inches versus shoe size in US size', fontsize=14, fontweight='bold')
ax = pplt.subplot(111)

# Label the scatter so the legend has an actual entry. Without a labelled
# artist, ax.legend() has no handles to show and matplotlib emits a warning.
ax.plot(df['Size'], df['Height'], "o", label="all data points")
# Axis labels make the figure readable on its own.
ax.set_xlabel('Shoe size (US)')
ax.set_ylabel('Body height (inches)')

# Shrink the axes to 80% of their width to leave room for the legend,
# which is anchored just outside the right edge of the plot area.
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

ax.legend(loc='center left', prop=fontP, bbox_to_anchor=(1, 0.5))
pplt.show()
# the data looks like it's roughly linear, no crazy outliers are visible on this plot

In [None]:
# A few sanity checks on the data
dfshape = df.shape
gender = df['Gender']

# Count each gender once (vectorized) and reuse the counts below.
num_females = (gender == 'F').sum()
num_males = (gender == 'M').sum()

# count number of females in dataset
print('Number of females: ' + str(num_females))
# count number of males in dataset
print('Number of males: ' + str(num_males))
# so here we see that there are more samples of males than females but the proportion is still relatively close

# Consistency check: every row should be either 'F' or 'M', so the two counts
# must add up to the total number of rows.
print('Do the numbers add up: ' + str((num_females + num_males) == dfshape[0]))
# the test says that the number of females and males nicely add up to the total number of rows

# Simple summary statistics of the numeric columns (the first column is just an index and can be ignored)
df.describe()
# the data looks valid: shoe size ranges from 5 to 15 and height from 60 to 81 inches

In [None]:
# Test statistically whether shoe size and height are linearly correlated.
# stats.pearsonr returns the correlation coefficient together with a p-value.
pearson_result = stats.pearsonr(df['Size'], df['Height'])
print(pearson_result)

# The first value (Pearson's r) is close to 0.87, which is relatively close to 1 - this indicates a positive linear correlation
# The second value (the p-value) is very small, so we can reject the null hypothesis that this data is uncorrelated

# The statistical test confirms our intuition that this data is indeed linearly correlated although not perfectly

In [None]:
# Now let's use tensorflow to train a simple model to learn a regression for this task
# ADAPT THIS PART !!!
# Data and model preparation for NN learning

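# prediction of Dalc (NOTE(review): 'Dalc' is not a column of the shoe-size data described above - presumably left over from the notebook this cell was adapted from; confirm)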
def init_weights(shape):
    """Create a ``tf.Variable`` of the given shape with small random values.

    The initial values come from ``tf.random_normal`` with a standard
    deviation of 0.01.

    Args:
        shape: Shape of the weight tensor, e.g. ``[n_in, n_out]``.

    Returns:
        The ``tf.Variable`` wrapping the randomly initialised tensor.
    """
    initial_values = tf.random_normal(shape, stddev=0.01)
    return tf.Variable(initial_values)

# Our model is a standard 1-hidden-layer multi-layer-perceptron with ReLU
# activation. The softmax (which turns arbitrary real-valued outputs into
# probabilities) gets applied in the cost function.
def model(X, w_h, w_o):
    """Forward pass of the one-hidden-layer perceptron.

    Args:
        X: Input tensor, multiplied by ``w_h``.
        w_h: Input-to-hidden weight matrix.
        w_o: Hidden-to-output weight matrix.

    Returns:
        The raw output of the second matrix product; no activation is applied
        to it here.
    """
    hidden_input = tf.matmul(X, w_h)
    hidden_activation = tf.nn.relu(hidden_input)
    output = tf.matmul(hidden_activation, w_o)
    return output
# How many units in the hidden layer + batch size
NUM_HIDDEN = 200
BATCH_SIZE = 128
# Width of the one-hot output layer (number of classes).
# NOTE(review): the cell header announces a regression, but this graph is a
# 5-class softmax classifier copied from a template ("ADAPT THIS PART") -
# confirm the intended target before relying on the results.
NUM_CLASSES = 5

# Graph inputs. The input has width num_feat, and the output has width NUM_CLASSES.
# NOTE(review): num_feat is not defined in any cell visible here; it must be
# set (number of input features) before this cell can run on a fresh kernel.
X = tf.placeholder("float", [None, num_feat])
Y = tf.placeholder("float", [None, NUM_CLASSES])

# Weight matrices: input -> hidden and hidden -> output.
w_h = init_weights([num_feat, NUM_HIDDEN])
w_o = init_weights([NUM_HIDDEN, NUM_CLASSES])

# Predict y given x using the model.
py_x = model(X, w_h, w_o)
# `y` is kept as an alias because later cells reference it; it used to be a
# second, identical copy of the same sub-graph.
y = py_x

# Softmax cross-entropy between the model outputs and the one-hot labels.
# Keyword arguments are used because TensorFlow >= 1.0 rejects positional
# arguments here, and they rule out swapping logits and labels.
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=py_x, labels=Y))
train_op = tf.train.GradientDescentOptimizer(0.05).minimize(cost)
predict_op = tf.argmax(py_x, 1) # at predict time, evaluate the argmax

# Shuffle the samples and split them 80/20 into training and test sets.
# NOTE(review): dfs, Y_data and num_samples are not defined in any cell
# visible here; presumably dfs is the feature array and Y_data the label
# array (both indexable by an integer permutation) - confirm and define them
# before running this cell on a fresh kernel.
p = np.random.permutation(len(dfs))
allX, allY = dfs[p], Y_data[p]
border = int(0.8*num_samples)

# One-hot encode the labels. `.values` replaces `DataFrame.as_matrix()`,
# which is deprecated and removed in pandas 1.0.
hotY = pd.get_dummies(allY)
hoty = hotY.values

# First 80% of the shuffled samples for training, the rest for testing.
trX = allX[:border]
teX = allX[border:]
trY = hoty[:border]
teY = hoty[border:]

In [None]:
# Launch the graph in an interactive session and initialise all variables.
# NOTE(review): newer TensorFlow 1.x releases deprecate
# tf.initialize_all_variables() in favour of tf.global_variables_initializer();
# kept as-is for compatibility with the TF version this notebook targets.
sess = tf.InteractiveSession()
tf.initialize_all_variables().run()

# Build the accuracy op once, outside the loop. Creating it inside the loop
# would add new nodes to the graph every time it is reported.
correct_prediction = tf.equal(tf.cast(tf.argmax(y,1), tf.float32),tf.cast(tf.argmax(Y,1), tf.float32) )
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

for epoch in range(400):
    # Shuffle the data before each training iteration.
    p = np.random.permutation(len(trX))
    trX, trY = trX[p], trY[p]

    # Train in batches
    for start in range(0, len(trX), BATCH_SIZE):
        end = start + BATCH_SIZE
        sess.run(train_op, feed_dict={X: trX[start:end], Y: trY[start:end]})

    # Every 20 epochs, report the accuracy on the whole training set. This used
    # to be measured only on the last mini-batch, via leaked loop variables.
    if epoch % 20==0:
        result = sess.run(accuracy, feed_dict={X: trX, Y: trY})
        print('Run {}, {}'.format(epoch+1, result))

        # Mean of the input-to-hidden weights, as a rough training diagnostic.
        wb = sess.run(w_h)
        print(np.mean(wb))


In [None]:
# Evaluate classification accuracy on the held-out test split.
# A sample counts as correct when the arg-max of the model output matches the
# arg-max of its one-hot label.
predicted_class = tf.cast(tf.argmax(y, 1), tf.float32)
true_class = tf.cast(tf.argmax(Y, 1), tf.float32)
correct_prediction = tf.equal(predicted_class, true_class)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

test_feed = {X: teX[:], Y: teY[:]}
result = sess.run(accuracy, feed_dict=test_feed)
print('Evaluation on test data {}'.format(result))

In [None]:
# Close the interactive session (`sess`, created in the training cell) now
# that training and evaluation are finished.
sess.close()