In [2]:
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

In [21]:
## download data here: http://www.cpsc.gov/en/Research--Statistics/NEISS-Injury-Data/
file_path = '/Users/joshuamalina/Downloads/NEISS-data-2014-updated-12MAY2015.xlsx'

In [48]:
class Injury(object):
        
    def __init__(self, datapath):
        columnsToKeep = ['sex', 'age', 'narrative']
        self.datapath = datapath
        print("loading data from file ...")
        self.raw = pd.read_excel(datapath)
        self.data = self.raw[columnsToKeep]
        print("encoding gender ...")
        self.data['binary_sex'] = self.makeGenderBinary(self.data.sex)
        print("transforming age ...")
        self.data['age_transformed'] = self.transformAges(self.data.age) # returns some null when age is unknown
        self.data = self.data.dropna()
        print("building tfidf matrix ...")
        self.tfidfmatrix = self.vectorizeNarratives(self.data.narratives)
        
    # encode gender as 1 or 0    
    def makeGenderBinary(self, sex):
        genderLegend = {"Male": 0, "Female": 1 }
        return pd.Series([genderLegend["Male"] if x == "Male" else genderLegend["Female"] for x in sex])
    
    # ages are in years, but for children < 1 years, it is funky
    def transformAges(self, ages):
        return pd.Series([self.transformAge(x) for x in ages])
        
    # converts a single age to an age that makes sense, if age is unknown (i.e. age == 0) return None
    def transformAge(self, age):
        if (age == 0):
            return None
        elif (age < 200):
            return age
        else:
            stripped = int(str(age)[1:])
            return stripped / 12.0     
        
    def vectorizeNarratives(self, narratives):
        # genderedWords = ["him", "her", "his", "he", "she", "male", "female", "himself", "herself", "man", "woman", "penis", "scrotum", "vagina", "clitoris"] 
        # v = TfidfVectorizer(norm='l1', stop_words=genderedWords)
        v = TfidfVectorizer(norm='l1', stop_words=[])
        return v.fit_transform(narratives)                    
                                  
        
        

In [49]:
# this should take a bit of time
injury = Injury(file_path)

loading data from file ...
encoding gender ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


transforming age ...
building tfidf matrix ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


NameError: name 'vectorizeNarratives' is not defined

In [103]:
df['ageMonths'] = pd.Series([transformAgeToMonths(x) for x in df.age])
refined = df[['sex', 'narrative', 'ageMonths']] # only the fields we want
# refined['gender'] = add_gender(refined) # drop rows with NAs in gender
refined = refined[pd.isnull(refined.ageMonths) == False] # drop rows where ageMonths is null
# skip first five characters of narrative, reducing a lot of the gender information
refined['narrative_hemmed'] = pd.Series([str(x)[5:] or "null" for x in refined.narrative.values])

In [107]:
refined = refined[pd.isnull(refined.narrative_hemmed) == False]

In [109]:
vectorizer = TfidfVectorizer(norm='l1', stop_words=[]) # instantiate vectorizer
tfidfMatrix = vectorizer.fit_transform(refined['narrative_hemmed']) # vectorize corpus

In [110]:
X_train, X_test, y_train, y_test = train_test_split(tfidfMatrix, refined.ageMonths) # split data

In [112]:
model = LinearRegression()

In [None]:
parameters = {
    'loss': 'log',
    'penalty': 'l2',
    'n_iter': 50,
    'alpha': 0.00001,
    'fit_intercept': True,
}

In [None]:
model = SGDClassifier(**parameters)

In [113]:
model.fit(X_train, y_train) # train classifier

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [114]:
print("train acc: ", model.score(X_train, y_train)) # evaluate classifier on training
print("test acc: ", model.score(X_test, y_test)) # evaluate classifier on test

train acc:  0.199107238474
test acc:  -0.0442358620834


In [126]:
s = vectorizer.transform(["woman"])
print(model.predict(s)[0] / 12.0, " years")

-96.5514749587  years


In [121]:
def init_weights(shape):
    return tf.Variable(tf.random_normal(shape, stddev=0.01))

In [None]:
def model(X, w_h, w_o):
    h = tf.nn.sigmoid(tf.matmul(X, w_h)) # this is a basic mlp, think 2 stacked logistic regressions
    return tf.matmul(h, w_o) # note that we dont take the softmax at the end because our cost fn does that for us

In [None]:
input_layer_size = X_train.shape[1]
input_layer_size

In [None]:
x = tf.placeholder("float", [None, input_layer_size])
y = tf.placeholder("float", [None, 2])

In [None]:
# Network Parameters
n_hidden_1 = 256 # 1st layer num features
n_hidden_2 = 256 # 2nd layer num features
n_input = input_layer_size # MNIST data input (img shape: 28*28)
n_classes = 2 # MNIST total classes (0-9 digits)

In [None]:
# Create model
def multilayer_perceptron(_X, _weights, _biases):
    #Hidden layer with RELU activation
    layer_1 = tf.nn.relu(tf.add(tf.matmul(_X, _weights['h1']), _biases['b1'])) 
    #Hidden layer with RELU activation
    layer_2 = tf.nn.relu(tf.add(tf.matmul(layer_1, _weights['h2']), _biases['b2'])) 
    return tf.matmul(layer_2, weights['out']) + biases['out']

In [None]:
# Store layers weight & bias
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

In [None]:
# Parameters
learning_rate = 0.001
training_epochs = 15
batch_size = X_train.shape[0]
display_step = 1

In [None]:
# Construct model
pred = multilayer_perceptron(x, weights, biases)

In [None]:
# Define loss and optimizer
# Softmax loss
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y)) 
# Adam Optimizer
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost) 

In [None]:
# Initializing the variables
init = tf.initialize_all_variables()

In [None]:
# unpack X_train
unpacked_x_train = X_train.toarray()

In [None]:
# Launch the graph
with tf.Session() as sess:
    sess.run(init)

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = X_train.shape[0]
        # Loop over all batches
        for i in range(total_batch):
            batch_xs, batch_ys = unpacked_x_train, y_train
            # Fit training using batch data
            sess.run(optimizer, feed_dict={x: batch_xs, y: batch_ys})
            # Compute average loss
            avg_cost += sess.run(cost, feed_dict={x: batch_xs, y: batch_ys})/total_batch
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))

    print("Optimization Finished!")

    # Test model
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Accuracy:", accuracy.eval({x: mnist.test.images, y: mnist.test.labels}))

In [51]:
def transformAgeToMonths(age):
    if (age == 0):
        return None
    elif (age < 200):
        return age * 12
    else:
        stripped = int(str(age)[1:])
        return stripped
    

In [130]:
reducer = TruncatedSVD(n_components=50)

In [132]:
X_reduced = reducer.fit_transform(X_train)

In [1]:
X_reduced.shape

NameError: name 'X_reduced' is not defined

In [136]:
t = TSNE()

In [None]:
X_tsne = t.fit_transform(X_reduced)