In [1]:
import numpy as np
import tensorflow as tf
import json
import matplotlib.pyplot as plt
import numpy as np
import pprint
import argparse
import collections
import json
import nltk
import numpy as np
import sys

  from ._conv import register_converters as _register_converters


In [2]:
# Utils
def get_minibatches(data, minibatch_size, shuffle=True):
    """
    Iterates through the provided data one minibatch at a time. You can use this function to
    iterate through data in minibatches as follows:

        for inputs_minibatch in get_minibatches(inputs, minibatch_size):
            ...

    Or with multiple data sources:

        for inputs_minibatch, labels_minibatch in get_minibatches([inputs, labels], minibatch_size):
            ...

    Args:
        data: there are two possible values:
            - a list or numpy array
            - a list where each element is either a list or numpy array
        minibatch_size: the maximum number of items in a minibatch
        shuffle: whether to randomize the order of returned data
    Returns:
        minibatches: the return value depends on data:
            - If data is a list/array it yields the next minibatch of data.
            - If data is a list of lists/arrays it yields the next minibatch of each element in
              the list. This can be used to iterate through multiple data sources
              (e.g., features and labels) at the same time.
            - If data is empty, nothing is yielded.

    """
    # An empty top-level list has no data[0] to inspect; treat it as a single
    # (empty) source so the generator yields nothing instead of raising IndexError.
    list_data = type(data) is list and len(data) > 0 and \
        (type(data[0]) is list or type(data[0]) is np.ndarray)
    # With several sources, the first one decides how many items there are.
    data_size = len(data[0]) if list_data else len(data)
    indices = np.arange(data_size)
    if shuffle:
        np.random.shuffle(indices)
    for minibatch_start in np.arange(0, data_size, minibatch_size):
        # The last slice may be shorter than minibatch_size.
        minibatch_indices = indices[minibatch_start:minibatch_start + minibatch_size]
        yield [minibatch(d, minibatch_indices) for d in data] if list_data \
            else minibatch(data, minibatch_indices)

def minibatch(data, minibatch_idx):
    """Pick the entries of `data` found at the positions in `minibatch_idx`.

    A numpy array is indexed in one shot with the whole index collection and
    the result of that indexing is returned. Any other container is indexed
    one position at a time and the picks are returned as a new list.
    """
    if type(data) is not np.ndarray:
        return [data[position] for position in minibatch_idx]
    return data[minibatch_idx]

def pad(a, i):
    """Bring the list `a` to length `i` and build the matching 0/1 mask.

    Longer inputs are cut down to their first `i` items; shorter ones are
    extended with zeros. The mask holds 1 for every kept original item and 0
    for every padding slot.

    Returns:
        A `(items, mask)` pair of lists.
    """
    length = len(a)
    if length > i:
        return a[:i], ([1] * length)[:i]
    shortfall = i - length
    return a + [0] * shortfall, [1] * length + [0] * shortfall

def loadComments(filename, maxComments, config):
    """Load preprocessed comments from a JSON-lines file into parallel lists.

    Each line of `filename` is parsed as a JSON object. The keys read are
    "body_t", "parent_comment_t" and "num_child_comments", plus
    "response_time_hours", "time_of_day" and "weekday" when the matching
    config switches are on.

    Args:
        filename: path of the JSON-lines file to read.
        maxComments: stop once this many comments have been collected.
        config: dict providing "maxDocLength", "addRT", "addTime" and
            "addLength".
    Returns:
        A list of six parallel lists:
        [comments, masks, commentps, maskps, commentfs, labels].
        comments/commentps are "body_t"/"parent_comment_t" padded or truncated
        to config["maxDocLength"], with masks/maskps as their 0/1 masks.
        commentfs holds the optional extra features for each comment.
        labels is [1, 0] when the comment has no child comments, else [0, 1].
    """
    comments = []
    masks = []
    commentps = []
    maskps = []
    commentfs = []
    labels = []
    with open(filename, "r") as inFile:
        for i, line in enumerate(inFile, 1):
            if len(comments) >= maxComments:
                break
            comment = json.loads(line)

            commentInput, maskInput = pad(comment["body_t"], config["maxDocLength"])
            comments.append(commentInput)
            masks.append(maskInput)

            commentpInput, maskpInput = pad(comment["parent_comment_t"], config["maxDocLength"])
            commentps.append(commentpInput)
            maskps.append(maskpInput)

            # Optional scalar features, appended in a fixed order.
            commentf = []
            if config["addRT"]:
                commentf.append(comment["response_time_hours"])
            if config["addTime"]:
                commentf.append(comment["time_of_day"])
                commentf.append(comment["weekday"])
            if config["addLength"]:
                # Length of the untruncated token list, not the padded one.
                commentf.append(len(comment["body_t"]))
            commentfs.append(commentf)

            if comment["num_child_comments"] == 0:
                labels.append([1, 0])
            else:
                labels.append([0, 1])

            if i % 10000 == 0:
                # Single-argument parenthesised print emits the same text on
                # Python 2 and Python 3.
                print("Processed {} lines".format(i))

    return [comments, masks, commentps, maskps, commentfs, labels]

def printConfig(config):
    """Print every config entry as "key: value", sorted by key.

    The entries are printed as one list, framed by two separator lines.
    Uses the single-argument print(...) form and dict.items() so the function
    runs unchanged on both Python 2 and Python 3.
    """
    print("-----------------------------------------")
    print(["{}: {}".format(k, v) for k, v in sorted(config.items())])
    print("-----------------------------------------")

def plot(losses, trainAccuracies, devAccuracies, outputFile="plot"):
    """Draw the per-epoch training curves and save them as two PNG files.

    The first figure (saved as `outputFile + "1.png"`) shows the loss; the
    second (saved as `outputFile + "2.png"`) shows train and dev accuracy with
    a legend. Both figures are then displayed with `plt.show()`.

    Args:
        losses: one loss value per epoch; its length sets the x axis.
        trainAccuracies: one training accuracy per epoch.
        devAccuracies: one dev accuracy per epoch.
        outputFile: filename prefix for the two saved images.
    """
    epochNumbers = range(1, len(losses) + 1)
    lossPath = outputFile + "1.png"
    accuracyPath = outputFile + "2.png"

    # Figure 1: loss per epoch.
    plt.figure()
    plt.plot(epochNumbers, losses, "r-", label="loss")
    plt.xlabel("epochs")
    plt.ylabel("loss")
    plt.savefig(lossPath)

    # Figure 2: train vs. dev accuracy per epoch.
    plt.figure()
    (trainLine,) = plt.plot(epochNumbers, trainAccuracies, "b-", label="trainAcc")
    (devLine,) = plt.plot(epochNumbers, devAccuracies, "g-", label="devAcc")
    plt.xlabel("epochs")
    plt.legend(handles=[trainLine, devLine])
    plt.savefig(accuracyPath)

    plt.show()

In [3]:
# Preprocessing ("preprep"): tokenization, vocabulary building and output helpers.

# TODO: Uses too much RAM; likely cannot go past ~300,000 comments. Will fix later.
# TODO: Currently only supports a value of 1 for MIN_FREQ.
# Minimum count a word needs to stay in the vocabulary: cleanFrequencies()
# deletes every word whose frequency is below this value.
MIN_FREQ = 1
# Reserved vocabulary keys. cleanFrequencies() assigns TOKEN_PAD index 0 and
# TOKEN_UNK index 1; wordToIndex() falls back to TOKEN_UNK for unknown words.
TOKEN_PAD = "TOKEN_PAD"
TOKEN_UNK = "TOKEN_UNK"

def process_comment(comment, vocab, frequencies):
    """Tokenize `comment`, normalise each token and update the vocabulary.

    Tokens come from `nltk.word_tokenize` and are normalised by
    `process_word`. A token missing from `vocab` receives a random vector the
    same length as the TOKEN_PAD vector and starts with a count of 1; a token
    already in `vocab` has its count in `frequencies` increased by 1. Both
    dicts are modified in place.

    Returns:
        The list of normalised tokens, in order.
    """
    tokens = []
    for rawToken in nltk.word_tokenize(comment):
        token = process_word(rawToken)
        isNew = token not in vocab
        if isNew:
            vocab[token] = np.random.randn(len(vocab[TOKEN_PAD]))
        # First sighting counts as 1; later sightings bump the stored count.
        frequencies[token] = 1 if isNew else frequencies[token] + 1
        tokens.append(token)
    return tokens

def process_word(word):
    """Normalise a single token.

    A token containing 'http', 'ftp' or '@' (checked in that order, case
    sensitively) is replaced by the matching placeholder token. Every other
    token is returned lower-cased.
    """
    placeholders = (
        ('http', 'TOKEN_HTTP_URL'),
        ('ftp', 'TOKEN_FTP_URL'),
        ('@', 'TOKEN_AT_REFERENCE'),
    )
    for marker, placeholder in placeholders:
        if marker in word:
            return placeholder
    return word.lower()

def processComments(filename, numLines, vocab, frequencies):
    """Read raw comments from a JSON-lines file and tokenize their text.

    Each line is parsed as a JSON object. Its "body" and "parent_comment"
    fields are run through process_comment(), which also updates `vocab` and
    `frequencies` in place. The token lists are stored on the comment under
    "body_t" and "parent_comment_t".

    Args:
        filename: path of the JSON-lines file to read.
        numLines: stop once this many comments have been collected.
        vocab: word -> vector dict, extended in place.
        frequencies: word -> count dict, updated in place.
    Returns:
        The list of parsed comment dicts with the two token lists added.
    """
    comments = []
    with open(filename, "r") as inFile:
        for i, line in enumerate(inFile, 1):
            if len(comments) >= numLines:
                break
            comment = json.loads(line)
            comment["body_t"] = process_comment(comment["body"], vocab, frequencies)
            comment["parent_comment_t"] = process_comment(comment["parent_comment"], vocab, frequencies)
            comments.append(comment)

            if i % 100000 == 0:
                # Single-argument parenthesised print emits the same text on
                # Python 2 and Python 3.
                print("Processed {} lines".format(i))

    return comments

def cleanFrequencies(vocab, frequencies):
    """Turn the word -> vector vocab into a word -> index vocab plus a matrix.

    `vocab` is modified in place: TOKEN_PAD becomes index 0, TOKEN_UNK becomes
    index 1, and every word counted at least MIN_FREQ times gets the next free
    index. Words counted fewer than MIN_FREQ times are deleted from `vocab`.

    Args:
        vocab: word -> vector dict holding TOKEN_PAD, TOKEN_UNK and exactly
            the words present in `frequencies` (asserted).
        frequencies: word -> count dict; it is only read.
    Returns:
        (vocab, embed): the same `vocab` object, now word -> row index, and a
        numpy array whose row i is the vector of the word with index i.
    """
    assert len(vocab) - 2 == len(frequencies)

    # Take care of special padding token.
    embed = [vocab[TOKEN_PAD], vocab[TOKEN_UNK]]
    vocab[TOKEN_PAD] = 0
    vocab[TOKEN_UNK] = 1

    # items() exists on both Python 2 and Python 3 dicts (iteritems() is
    # Python 2 only). Only `vocab` is mutated inside the loop, never
    # `frequencies`, so iterating it while deleting from `vocab` is safe.
    for word, count in frequencies.items():
        if count < MIN_FREQ:
            del vocab[word]
            continue
        embed.append(vocab[word])
        # The word's index is the row its vector was just appended to.
        vocab[word] = len(embed) - 1

    return vocab, np.asarray(embed)

def wordToIndex(word, vocab):
    """Look up the index stored for `word` in `vocab`.

    Words that are not a key of `vocab` map to the TOKEN_UNK entry instead.
    """
    return vocab[word] if word in vocab else vocab[TOKEN_UNK]

def outputComments(comments, filename, vocab):
    """Write comments to `filename` as JSON lines with tokens replaced by indices.

    For every comment, the "body_t" and "parent_comment_t" token lists are
    replaced IN PLACE by the indices wordToIndex() returns, and the comment is
    then dumped as one JSON object per line. Because the comment dicts are
    mutated, they hold index lists rather than tokens after this call.

    Args:
        comments: list of comment dicts carrying "body_t" and
            "parent_comment_t" token lists.
        filename: path of the output file (overwritten).
        vocab: word -> index dict passed to wordToIndex().
    """
    with open(filename, "w") as outFile:
        for i, comment in enumerate(comments, 1):
            comment["body_t"] = [wordToIndex(word, vocab) for word in comment["body_t"]]
            comment["parent_comment_t"] = [wordToIndex(word, vocab) for word in comment["parent_comment_t"]]
            outFile.write(json.dumps(comment) + "\n")

            if i % 10000 == 0:
                # Single-argument parenthesised print emits the same text on
                # Python 2 and Python 3.
                print("Outputted {} lines".format(i))

def outputVocab(vocab, filename):
    """Write the vocabulary to `filename`, one word per line, ordered by index.

    Line number i (0-based) holds the word whose index in `vocab` is i; any
    index with no word keeps the TOKEN_UNK placeholder. The file is written
    as UTF-8.

    Args:
        vocab: word -> non-negative integer index dict.
        filename: path of the output file (overwritten).
    """
    import io  # local import: only this function needs it

    # Size the table from `vocab` itself. The previous code read a
    # notebook-global `embed`, so the function only worked when that global
    # happened to exist with a matching length.
    size = (max(vocab.values()) + 1) if vocab else 0
    vocabList = [TOKEN_UNK] * size
    for word, index in vocab.items():
        vocabList[index] = word

    # io.open with an explicit encoding behaves the same on Python 2 and 3.
    # It expects text, so byte-string words are decoded first instead of
    # calling .encode() on them (which fails for non-ASCII byte strings on
    # Python 2 and for the bytes + str concatenation on Python 3).
    with io.open(filename, "w", encoding="utf-8") as outFile:
        for word in vocabList:
            if isinstance(word, bytes):
                word = word.decode("utf-8")
            outFile.write(word + u"\n")

# Builds a vocab.
def loadWordVectors(inFilename):
    """Load pre-trained word vectors from a text file into a vocab dict.

    Each line is split on single spaces: the first field is the word and the
    remaining fields are parsed as floats forming its vector. Every loaded
    word starts with a frequency of 0. After loading, TOKEN_PAD (all-zero
    vector) and TOKEN_UNK (random normal vector) are added to the vocab only,
    so the vocab ends up with two more entries than the frequencies dict.

    Args:
        inFilename: path of the word-vector text file.
    Returns:
        (vocab, frequencies): word -> numpy vector, and word -> 0.
    """
    print("Loading word vectors")
    embedSize = 0
    vocab = {}
    frequencies = {}
    with open(inFilename, 'r') as inFile:
        for i, line in enumerate(inFile, 1):
            row = line.strip().split(' ')
            vocab[row[0]] = np.array([float(num) for num in row[1:]])
            frequencies[row[0]] = 0
            # Overwritten on every line, so the last line read decides the
            # size used for the two special vectors below.
            embedSize = len(row) - 1

            if i % 100000 == 0:
                # Single-argument parenthesised print emits the same text on
                # Python 2 and Python 3.
                print("Processed {} lines".format(i))
    vocab[TOKEN_PAD] = np.zeros(embedSize)
    vocab[TOKEN_UNK] = np.random.randn(embedSize)
    print("Loaded {} words".format(len(vocab)))

    return vocab, frequencies

In [4]:
#LSTM

# IMPORTANT: Contains the configurations for the LSTM
def getConfig():
    """Build and return a fresh dict of model and training settings.

    "learningRates" holds one learning rate per epoch, so "numEpochs" is
    derived from its length. "addCommentf" is derived from the three feature
    switches ("addRT", "addTime", "addLength").
    """
    # (learning rate, number of consecutive epochs) in training order.
    schedule = [
        (0.01, 5),
        (0.005, 5),
        (0.003, 5),
        (0.002, 5),
        (0.001, 5),
        (0.0005, 5),
        (0.0003, 5),
        (0.0001, 5),
    ]
    learningRates = []
    for rate, epochs in schedule:
        learningRates.extend([rate] * epochs)

    # Earlier schedules that were tried:
    # config["learningRates"] = [0.01] * 3 + [0.005] * 2 + [0.003] * 5 + [0.002] * 10 + [0.001] * 5 + [0.0005] * 5
    # learningRates = [0.01] * 10 + [0.005] * 10 + [0.003] * 10 + [0.002] * 10
    # learningRates = [0.01] * 3 + [0.005] * 2 + [0.003] * 5 + [0.002] * 10 + [0.001] * 5 + [0.0005] * 5 + [0.0004] * 5 + [0.0003] * 5 + [0.0002] * 5 + [0.0001] * 5
    # learningRates = [0.01] * 10 + [0.005] * 10

    config = dict(
        maxDocLength=250,  # Max is 2191
        batchSize=256,
        addRT=True,
        addTime=False,
        addLength=True,
        addCommentp=False,
    )
    config["addCommentf"] = config["addRT"] or config["addTime"] or config["addLength"]
    config["learningRates"] = learningRates
    config.update(
        lstmUnits=64,
        attentionUnits=32,
        layer2Units=16,
        numClasses=2,
        dropoutKeepProb=0.9,
        numTrain=1000,
        numDev=200,
    )
    config["numEpochs"] = len(learningRates)
    return config

def attention(inputs, attention_size, time_major=False, return_alphas=False):
    """Reduce RNN outputs over time with a learned attention weighting.

    Args:
        inputs: RNN outputs of shape (B,T,D), or (T,B,D) when `time_major` is
            True. A tuple (e.g. the forward/backward outputs of a Bi-RNN) is
            concatenated along the last axis first.
        attention_size: size A of the hidden attention representation.
        time_major: whether `inputs` is laid out as (T,B,D).
        return_alphas: also return the attention weights.
    Returns:
        The attention-weighted sum over the time axis, shape (B,D); when
        `return_alphas` is True, a pair (output, alphas) with alphas of
        shape (B,T).

    Note: the softmax runs over all T timesteps; no padding mask is applied
    inside this function.
    """
    if isinstance(inputs, tuple):
        # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
        inputs = tf.concat(inputs, 2)

    if time_major:
        # (T,B,D) => (B,T,D)
        # tf.transpose is the public entry point for this op (the same call
        # getLSTMOutputs uses); tf.array_ops is not a public module.
        inputs = tf.transpose(inputs, [1, 0, 2])

    hidden_size = inputs.shape[2].value  # D value - hidden size of the RNN layer

    # Trainable parameters
    w_omega = tf.Variable(tf.random_normal([hidden_size, attention_size], stddev=0.1))
    b_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))
    u_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))

    with tf.name_scope('v'):
        # Applying fully connected layer with non-linear activation to each of the B*T timestamps;
        #  the shape of `v` is (B,T,D)*(D,A)=(B,T,A), where A=attention_size
        v = tf.tanh(tf.tensordot(inputs, w_omega, axes=1) + b_omega)

    # For each of the timestamps its vector of size A from `v` is reduced with `u` vector
    vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # (B,T) shape
    alphas = tf.nn.softmax(vu, name='alphas')         # (B,T) shape

    # Output of (Bi-)RNN is reduced with attention vector; the result has (B,D) shape
    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)

    if not return_alphas:
        return output
    else:
        return output, alphas

def getAttentionLSTMOutputs(embeddings, masks, dropoutKeepProb, scope, config):
    """Encode embedded comments with an LSTM, attention and dropout.

    Sequence lengths are taken as the row sums of `masks`. The LSTM has
    config["lstmUnits"] units with output dropout; its per-step outputs are
    reduced by attention() using config["attentionUnits"], and dropout is
    applied once more to the reduced result.

    Returns:
        The dropped-out attention output tensor.
    """
    with tf.name_scope(scope):
        # Recurrent layer: run only over the unmasked prefix of each row.
        sequenceLengths = tf.reduce_sum(masks, axis=1)
        cell = tf.contrib.rnn.DropoutWrapper(
            cell=tf.contrib.rnn.BasicLSTMCell(config["lstmUnits"]),
            output_keep_prob=dropoutKeepProb)
        stepOutputs, _ = tf.nn.dynamic_rnn(
            cell,
            embeddings,
            sequence_length=sequenceLengths,
            dtype=tf.float32,
            scope=scope)

        # Collapse the time axis with attention, then regularise.
        attended = attention(stepOutputs, config["attentionUnits"])
        return tf.nn.dropout(attended, dropoutKeepProb)

def getLSTMOutputs(embeddings, masks, dropoutKeepProb, scope, config):
    """Encode embedded comments with an LSTM and a masked mean over time.

    The LSTM (config["lstmUnits"] units, output dropout) runs over every
    timestep. Its outputs are then averaged over the positions where `masks`
    is 1.

    Args:
        embeddings: embedded token sequences fed to the LSTM.
        masks: 0/1 float mask, one row per sequence.
        dropoutKeepProb: keep probability for the LSTM output dropout.
        scope: name used for both the name scope and the RNN variable scope.
        config: dict providing "lstmUnits".
    Returns:
        Tensor laid out as (batches, cells) holding the masked mean output.
        A sequence whose mask is all zeros yields zeros.
    """
    with tf.name_scope(scope):
        # LSTM
        lstmCell = tf.contrib.rnn.BasicLSTMCell(config["lstmUnits"])
        lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=dropoutKeepProb)
        cellOutputs, _ = tf.nn.dynamic_rnn(lstmCell, embeddings, dtype=tf.float32, scope=scope)

        # Output to pred
        cellOutputs = tf.transpose(cellOutputs, [2, 0, 1]) # cells, batches, len
        maskedSums = tf.reduce_sum(cellOutputs * masks, axis=2)
        # An empty comment has an all-zero mask. Clamping the token count to 1
        # turns its 0/0 (NaN, which would poison the loss) into 0/1 = 0.
        # Non-empty rows already have a count >= 1 and are unaffected.
        tokenCounts = tf.maximum(tf.reduce_sum(masks, axis=1), 1.0)
        maskedOutputs = maskedSums / tokenCounts
        lstmOutputs = tf.transpose(maskedOutputs, [1, 0]) # batches, cells

    return lstmOutputs

def train(embed, trainData, devData, config, trainableE=False, error_analysis=False):
    """Build the classifier graph, train it, and report per-epoch metrics.

    Args:
        embed: embedding matrix used to initialise the embedding variable.
        trainData: list of six parallel lists as returned by loadComments():
            [comments, masks, commentps, maskps, commentfs, labels].
        devData: same layout as `trainData`, evaluated after every epoch.
        config: dict providing "maxDocLength", "numCommentfs", "numClasses",
            "attentionUnits", "addCommentp", "addCommentf", "numLSTMOutputs",
            "layer2Units", "numEpochs", "batchSize", "learningRates" (one
            entry per epoch) and "dropoutKeepProb".
        trainableE: whether the embedding matrix is updated during training.
        error_analysis: currently unused; kept for interface compatibility.
    Returns:
        (losses, trainAccuracies, devAccuracies): three lists with one entry
        per epoch. Each entry is an average over the examples actually fed
        during that epoch.

    Also prints progress, the dev confusion matrix of the final epoch
    (rows = true class, columns = predicted class) and a best-epoch summary.
    """
    # Create input placeholders
    comments = tf.placeholder(tf.int32, [None, config["maxDocLength"]])
    masks = tf.placeholder(tf.float32, [None, config["maxDocLength"]])
    commentps = tf.placeholder(tf.int32, [None, config["maxDocLength"]])
    maskps = tf.placeholder(tf.float32, [None, config["maxDocLength"]])
    commentfs = tf.placeholder(tf.float32, [None, config["numCommentfs"]])
    labels = tf.placeholder(tf.float32, [None, config["numClasses"]])
    dropoutKeepProb = tf.placeholder(tf.float32)
    learningRate = tf.placeholder(tf.float32)

    # Create embedding transform.
    with tf.variable_scope("embedding", reuse=tf.AUTO_REUSE):
        E = tf.get_variable("E", initializer=embed, trainable=trainableE)
        embeddings = tf.nn.embedding_lookup(E, comments)
        embeddingps = tf.nn.embedding_lookup(E, commentps)

    # LSTM
    lstmOutputs = None
    if config["attentionUnits"]:
        lstmOutputs = [getAttentionLSTMOutputs(embeddings, masks, dropoutKeepProb, "lstm", config)]
    else:
        lstmOutputs = [getLSTMOutputs(embeddings, masks, dropoutKeepProb, "lstm", config)]
    if config["addCommentp"]:
        lstmOutputs.append(getLSTMOutputs(embeddingps, maskps, dropoutKeepProb, "lstmp", config))
    if config["addCommentf"]:
        lstmOutputs.append(commentfs)
    lstmOutputs = tf.concat(lstmOutputs, axis=1)

    # Layer 1 ReLu
    W1 = tf.get_variable(
        "W1",
        shape=[config["numLSTMOutputs"], config["layer2Units"]],
        initializer=tf.initializers.truncated_normal())
    b1 = tf.get_variable(
        "b1",
        shape=[config["layer2Units"]],
        initializer=tf.constant_initializer(0.1))
    layer1Output = tf.nn.relu(tf.matmul(lstmOutputs, W1) + b1)

    # Dropout layer
    layer1Droutput = tf.nn.dropout(layer1Output, dropoutKeepProb)

    # layer 2 softmax
    with tf.name_scope("layer2"):
        W2 = tf.get_variable(
            "W2",
            shape=[config["layer2Units"], config["numClasses"]],
            initializer=tf.initializers.truncated_normal())
        b2 = tf.get_variable(
            "b2",
            shape=[config["numClasses"]],
            initializer=tf.constant_initializer(0.1))
    prediction = tf.matmul(layer1Droutput, W2) + b2

    # Accuracy
    correctPred = tf.equal(tf.argmax(prediction, 1), tf.argmax(labels, 1))
    accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))
    # num_classes pins the matrix shape; without it the shape follows the
    # largest class id seen in a batch, so batch matrices could not be summed.
    confusion = tf.confusion_matrix(
        labels=tf.argmax(labels, 1),
        predictions=tf.argmax(prediction, 1),
        num_classes=config["numClasses"])

    # Loss and optimizer
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
    optimizer = tf.train.AdamOptimizer(learning_rate=learningRate).minimize(loss)

    # Saver (disabled)
    # saver = tf.train.Saver()

    def buildFeedDict(batches, learningRateValue, keepProb):
        """Map one minibatch (loadComments layout) onto the placeholders."""
        feedDict = {
            comments: batches[0],
            masks: batches[1],
            labels: batches[5],
            learningRate: learningRateValue,
            dropoutKeepProb: keepProb
        }
        if config["addCommentp"]:
            feedDict[commentps] = batches[2]
            feedDict[maskps] = batches[3]
        if config["addCommentf"]:
            feedDict[commentfs] = batches[4]
        return feedDict

    # Collect Info.
    losses = []
    trainAccuracies = []
    devAccuracies = []
    devConfusion = None  # dev confusion matrix of the most recent epoch

    with tf.Session() as sess:
        # Variable Initialization.
        # if saveIn:
        #     saver.restore(sess, saveIn)
        # else:
        sess.run(tf.global_variables_initializer())

        for epoch in range(config["numEpochs"]):
            epochLearningRate = config["learningRates"][epoch]

            # Training.
            epochLoss = 0
            epochAccuracy = 0
            numSeen = 0
            for batchNum, batches in enumerate(get_minibatches(trainData, config["batchSize"])):
                feedDict = buildFeedDict(batches, epochLearningRate, config["dropoutKeepProb"])
                batchSize = len(batches[0])
                batchAccuracy, batchLoss, _ = sess.run([accuracy, loss, optimizer], feedDict)
                # Weight by batch size so a short final batch is not over-counted.
                epochLoss += batchLoss * batchSize
                epochAccuracy += batchAccuracy * batchSize
                numSeen += batchSize
                if (batchNum + 1) % 100 == 0:
                    print("Epoch: {}, Batch: {}".format(epoch + 1, batchNum + 1))
            # Divide by the examples actually fed: the data file may hold fewer
            # rows than config["numTrain"], which would understate the averages.
            losses.append(epochLoss / float(max(numSeen, 1)))
            trainAccuracies.append(epochAccuracy / float(max(numSeen, 1)))
            print("Epoch: {}, Loss: {}, Accuracy: {}".format(epoch + 1, losses[-1], trainAccuracies[-1]))

            # Dev.
            epochAccuracy = 0
            numSeen = 0
            devConfusion = np.zeros((config["numClasses"], config["numClasses"]), dtype=np.int64)
            for batches in get_minibatches(devData, config["batchSize"]):
                # Keep probability 1.0 disables dropout during evaluation.
                feedDict = buildFeedDict(batches, epochLearningRate, 1.0)
                batchSize = len(batches[0])
                batchAccuracy, batchConfusion = sess.run([accuracy, confusion], feedDict)
                epochAccuracy += batchAccuracy * batchSize
                devConfusion += batchConfusion
                numSeen += batchSize
            devAccuracies.append(epochAccuracy / float(max(numSeen, 1)))
            print("Dev Accuracy: {}".format(devAccuracies[-1]))

        # The confusion tensor depends on the input placeholders, so it cannot
        # be evaluated without a feed; report the matrix accumulated over the
        # final epoch's dev batches instead.
        if devConfusion is not None:
            print("Confusion Matrix: \n\n{}".format(devConfusion))

            # savePath = saver.save(sess, saveOut)
            # print "Model saved at {}".format(savePath)

    # Print out summary.
    bestDevAccuracy = 0
    bestIndex = 0
    # A distinct loop name avoids shadowing the `accuracy` tensor above.
    for i, devAccuracy in enumerate(devAccuracies):
        if devAccuracy > bestDevAccuracy:
            bestDevAccuracy = devAccuracy
            bestIndex = i

    if devAccuracies:
        print("Best Dev of {} at epoch {}, train acc: {}, train loss: {}".format(
            bestDevAccuracy,
            bestIndex + 1,
            trainAccuracies[bestIndex],
            losses[bestIndex]))

    # Return series.
    return losses, trainAccuracies, devAccuracies

In [None]:
# numLines = float('inf')

# vocab, frequencies = loadWordVectors("glove.6B.300d.txt")

# print "Processing Training Data"
# trainComments = processComments(
#     "Reddit2ndTrainTime",
#     numLines,
#     vocab,
#     frequencies)

# print "Processing Dev Data"
# devComments = processComments(
#     "Reddit2ndDevTime",
#     numLines,
#     vocab,
#     frequencies)

# print "Cleaning frequencies"
# vocab, embed = cleanFrequencies(vocab, frequencies)
# assert len(vocab) == len(embed)
# print "Vocab size: {}".format(len(vocab))

# print "Outputting train comments"
# outputComments(trainComments, "data/ProcessedTrain", vocab)

# print "Outputting dev comments"
# outputComments(devComments, "data/ProcessedDev", vocab)

# print "Outputting embeddings"
# np.savetxt("data/embed.txt", embed)

# print "Outputting vocab"
# outputVocab(vocab, "data/vocab.txt")

In [None]:
# Training driver: load the config, embeddings and preprocessed data, derive
# the size-dependent config entries, train, then plot the curves.
# print(...) with a single argument emits the same text on Python 2 and 3.
print("Loading config")
config = getConfig()

print("Loading embeddings")
embed = np.loadtxt("data/embed.txt", dtype=np.float32)
print(embed.shape)

print("Loading Training Data")
trainData = loadComments("data/ProcessedTrain", config["numTrain"], config)

print("Loading Dev Data")
devData = loadComments("data/ProcessedDev", config["numDev"], config)

# Additional configs
config["vocabSize"] = len(embed)
config["embedDim"] = len(embed[0])
# Number of extra per-comment features, read off the first training example.
config["numCommentfs"] = len(trainData[4][0])
# Width of the concatenated vector fed to the first dense layer in train().
config["numLSTMOutputs"] = config["lstmUnits"] + config["numCommentfs"]
if config["addCommentp"]:
    config["numLSTMOutputs"] += config["lstmUnits"]
printConfig(config)
# config["addTime"] = True
# config["addCommentp"] = True

print("Training")
losses, trainAccuracies, devAccuracies = train(
    embed,
    trainData,
    devData,
    config,
    trainableE=False
    )

print("Plotting")
plot(losses, trainAccuracies, devAccuracies)

Loading config
Loading embeddings


In [7]:
import time
import json

# Split the predictions in lstmSummaryPred.json (one JSON object per line)
# into correctly and incorrectly classified comments. A prediction of 1 is
# correct when the comment has child comments; a prediction of 0 is correct
# when it has none.
corrects = []
incorrects = []
with open("lstmSummaryPred.json", "r") as inFile:
    for i, line in enumerate(inFile, 1):
        comment = json.loads(line)
        predictedReplyAndGotOne = comment["prediction"] == 1 and comment["num_child_comments"] > 0
        predictedNoneAndGotNone = comment["prediction"] == 0 and comment["num_child_comments"] == 0
        correct = 1 if (predictedReplyAndGotOne or predictedNoneAndGotNone) else 0
        bucket = corrects if correct == 1 else incorrects
        bucket.append(comment)
        if i % 10 == 0:
            print("Processed {} lines.".format(i))

Processed 10 lines.
Processed 20 lines.
Processed 30 lines.
Processed 40 lines.
Processed 50 lines.
Processed 60 lines.
Processed 70 lines.
Processed 80 lines.
Processed 90 lines.
Processed 100 lines.
Processed 110 lines.
Processed 120 lines.
Processed 130 lines.
Processed 140 lines.
Processed 150 lines.
Processed 160 lines.
Processed 170 lines.
Processed 180 lines.
Processed 190 lines.
Processed 200 lines.
Processed 210 lines.
Processed 220 lines.
Processed 230 lines.
Processed 240 lines.
Processed 250 lines.
Processed 260 lines.
Processed 270 lines.
Processed 280 lines.
Processed 290 lines.
Processed 300 lines.
Processed 310 lines.
Processed 320 lines.
Processed 330 lines.
Processed 340 lines.
Processed 350 lines.
Processed 360 lines.
Processed 370 lines.
Processed 380 lines.
Processed 390 lines.
Processed 400 lines.
Processed 410 lines.
Processed 420 lines.
Processed 430 lines.
Processed 440 lines.
Processed 450 lines.
Processed 460 lines.
Processed 470 lines.
Processed 480 lines.
P

In [88]:
import numpy as np
from scipy.stats.stats import pearsonr
import matplotlib.pyplot as plt

def check_correlation(corrects, incorrects, feature):
    """Print how `feature` relates to whether a prediction was correct.

    One value is taken from every comment, and the values are paired with 1
    (comment from `corrects`) or 0 (comment from `incorrects`). The function
    prints the Pearson correlation of that pairing, then the mean, median and
    standard deviation of the values in each group.

    Args:
        corrects: comment dicts that were classified correctly.
        incorrects: comment dicts that were classified incorrectly.
        feature: which value to take from each comment:
            - "length": number of whitespace-separated words in "body";
            - "positivechildren": "num_child_comments", restricted to
              comments whose "prediction" is > 0;
            - "negativechildren": "num_child_comments", restricted to
              comments whose "prediction" is < 1;
            - any other string: the comment's value under that key.
    """
    def keepAll(comment):
        return True

    def childCount(comment):
        return comment["num_child_comments"]

    # Choose how to read a value from a comment and which comments to keep.
    if feature == "length":
        extract, keep = (lambda comment: len(comment["body"].split())), keepAll
    elif feature == "positivechildren":
        extract, keep = childCount, (lambda comment: comment['prediction'] > 0)
    elif feature == "negativechildren":
        extract, keep = childCount, (lambda comment: comment['prediction'] < 1)
    else:
        extract, keep = (lambda comment: comment[feature]), keepAll

    # Values per group, with matching 1/0 correctness labels.
    X1 = [extract(comment) for comment in corrects if keep(comment)]
    X0 = [extract(comment) for comment in incorrects if keep(comment)]
    y1 = [1] * len(X1)
    y0 = [0] * len(X0)

    X = np.concatenate((X1, X0))
    y = np.concatenate((y1, y0))
    p = pearsonr(list(X), list(y))
    print("Pearson Correlation Coefficient: {}".format(p))

    summaries = (
        ("Correct Mean: {}, Incorrect Mean: {}", np.mean),
        ("Correct Median: {}, Incorrect Median: {}", np.median),
        ("Correct Std: {}, Incorrect Std: {}", np.std),
    )
    for template, statistic in summaries:
        print(template.format(statistic(X1), statistic(X0)))

In [89]:
check_correlation(corrects, incorrects, "length")

Pearson Correlation Coefficient: (0.027090055827073817, 0.055437844487939175)
Correct Mean: 43.3674475956, Incorrect Mean: 39.9954441913
Correct Median: 25.0, Incorrect Median: 23.0
Correct Std: 60.0773340566, Incorrect Std: 58.1139892138


In [90]:
check_correlation(corrects, incorrects, "weekday")

Pearson Correlation Coefficient: (0.003991702049542469, 0.7778005010572098)
Correct Mean: 2.71085080148, Incorrect Mean: 2.6930523918
Correct Median: 2.0, Incorrect Median: 2.0
Correct Std: 2.13744564268, Incorrect Std: 2.11157392517


In [91]:
check_correlation(corrects, incorrects, "time_of_day")

Pearson Correlation Coefficient: (0.00589288468065817, 0.6769788340939595)
Correct Mean: 2.81257706535, Incorrect Mean: 2.78986332574
Correct Median: 3.0, Incorrect Median: 3.0
Correct Std: 1.83556846993, Incorrect Std: 1.84777455988


In [92]:
check_correlation(corrects, incorrects, "positivechildren")

Pearson Correlation Coefficient: (0.31739853367414167, 4.9383306196631316e-79)
Correct Mean: 5.4781420765, Incorrect Mean: 0.0
Correct Median: 3.0, Incorrect Median: 0.0
Correct Std: 9.57602954796, Incorrect Std: 0.0


In [93]:
check_correlation(corrects, incorrects, "negativechildren")

Pearson Correlation Coefficient: (-0.32617383457390564, 1.7941953668691163e-42)
Correct Mean: 0.0, Incorrect Mean: 3.79934747145
Correct Median: 0.0, Incorrect Median: 2.0
Correct Std: 0.0, Incorrect Std: 8.74641103175
