In [None]:
# Annotate messages with tangram

## Pull in raw data

# Load the raw chat messages. Every message starts out untagged, which is
# recorded as the literal string 'None' in a new `tangramRef` column.
d_msgs_raw = pd.read_csv(
    '../../data/tangrams_unconstrained/message/tangramsMessages.csv',
    escapechar='\\',
)
d_msgs_raw['tangramRef'] = 'None'

# Board table used to look tangrams up by game, round and board location.
d_boards = pd.read_csv('reformattedBoards.csv')

## Tag with super simple, conservative heuristic

The most obvious strategy is to (on a first pass) assume that the tangram the matcher moves in response to a message is the one the message is referring to. The second pass is to skip the ones where we know they got it wrong. We'll probably end up hand-tagging those or using some other strategy depending on how many there are.

# There are a few obvious problems here:

# 1. The director will sometimes send several messages before the matcher moves anything. So we can't just use the closest move in time... 
# 2. Instead, we could use the *first* move action after the message and then rule it out, so that we won't use it again even if it is also the first move after a later message.
# 3. **that**, though, also has a problem. Multiple messages are sent per tangram, and some messages are meta-chatter (e.g. "hello", "thanks", "good job", "this HIT is terrible"). If we assign the drop actions to the first $N$ messages, we'll have a bunch of actual messages about tangrams that aren't tagged and a bunch of messages **not** about tangrams incorrectly tagged.

# So... we'll do a simpler thing. Check for numbers occurring in the text and look them up in the board data...

# Tag director messages that mention exactly one number in 1..12: look up the
# tangram whose `trueLoc` equals that number in the same game and round, and
# store its name in `tangramRef`. Everything else keeps the 'None' placeholder.

# Raw string: '\W' is not a valid escape sequence in an ordinary str literal.
pattern = re.compile(r'[\W_]+')
for index, row in d_msgs_raw.iterrows():
    # Only the director's messages are candidates for tagging.
    if row.sender != 'director':
        continue
    # A missing message body (NaN) cannot be scanned for numbers.
    if pd.isnull(row.contents):
        continue

    # Replace punctuation/underscores with spaces so that tokens such as
    # "#3," are split into a clean "3" before looking for digits.
    stripedStr = pattern.sub(' ', row.contents)
    numbers = [int(s) for s in stripedStr.split() if s.isdigit()]
    if len(numbers) != 1 or not 0 < numbers[0] <= 12:
        continue

    gameid = row.gameid
    roundNum = row.roundNum
    boardRow = d_boards.query('gameid == "{0}" and roundNum == {1} and trueLoc == {2}'
                              .format(gameid, roundNum, numbers[0]))
    # Leave the message untagged instead of raising IndexError when the board
    # table has no entry for this game/round/location.
    if boardRow.empty:
        continue

    # `.at` is the scalar setter that replaces the removed
    # `DataFrame.set_value`.
    d_msgs_raw.at[index, 'tangramRef'] = boardRow.tangramName.iloc[0]

# Check to see how many we tagged...

# Fraction of messages the heuristic tagged: one minus the share of rows whose
# `tangramRef` is still the 'None' placeholder.
numUntagged = Counter(d_msgs_raw['tangramRef'])['None']
1 - numUntagged / float(len(d_msgs_raw))

# not bad for a conservative heuristic! Now we're going to use the tagged data to train a classifier that will make predictions for the other 40%.

## Train classifier

###  Set up training set

# Used `d_msgs_raw` in `d_combined` the first time and subsequently used the updated hand-tagged version

# `train_test_split` lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.cross_validation was removed in 0.20. Fall back for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hand-corrected tags for our own messages.
# NOTE(review): untagged rows are expected to carry the literal string 'None';
# newer pandas releases may parse that string as a missing value on read --
# confirm against the installed pandas version.
d_handtagged = pd.read_csv('handTagged.csv')

# Older, already-tagged data set: drop rows tagged "*", drop the `sender`
# column and rename `tangram` so the columns line up with `d_handtagged`.
d_nicki = (pd.read_csv('../../data/tangrams_unconstrained/old/oldTangrams.csv')
    .query('tangram != "*"')
    .drop('sender', axis=1)
    .rename(columns = {'tangram' : 'tangramRef'}))

# Keep only rows that carry a real tag ('None' = untagged, "*" excluded).
d_tagged = (d_handtagged # d_msgs_raw
  .query('tangramRef != "None"')
  .query('tangramRef != "*"')
  .drop('sender', axis=1))

# `pd.concat` replaces `DataFrame.append`, which newer pandas no longer has.
d_combined = pd.concat([d_tagged, d_nicki], ignore_index=True)

# 80/20 split. No random_state is passed, so the split (and everything
# computed from it below) differs from run to run.
train_msg, test_msg = train_test_split(d_combined, test_size = 0.2)

# Number of rows contributed by the older data set.
len(d_nicki['tangramRef'])

### Build pipeline

# Largely drawn from [here](http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html).

# Import necessary sklearn modules and grid search params

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.grid_search was removed in 0.20. Fall back for old installs.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
# Grid-search space. Keys follow the `<pipeline step>__<parameter>` naming
# convention, so they address the 'vect', 'tfidf' and 'clf' pipeline steps.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2), (1, 3), (2, 3)],
    vect__stop_words=(None, 'english'),
    tfidf__use_idf=(True, False),
    clf__alpha=(1e-2, 1e-3, 1e-4, 1e-5),
)

# Train bag-of-words LR classifier 

# Bag-of-words pipeline: token counts -> tf-idf weighting -> linear model
# trained by SGD with logistic loss and an L2 penalty.
# NOTE(review): the `loss='log'` / `n_iter` spellings are tied to the
# scikit-learn version this notebook was written for -- confirm before
# upgrading.
pipelineSteps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='log', penalty='l2', n_iter=5)),
]
text_clf = Pipeline(pipelineSteps)

# Search the whole `parameters` grid using all available cores (n_jobs=-1).
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

# Assign the return value so the notebook does not echo the fitted estimator.
_ = gs_clf.fit(train_msg['contents'], train_msg['tangramRef'])

### Look at performance on held-out test set

# Look at success!

# Score the tuned pipeline on the held-out split: the share of test messages
# whose predicted tangram equals the tagged one.
predicted = gs_clf.predict(test_msg['contents'])
correct = test_msg['tangramRef'] == predicted
print("test-split accuracy is...")
numCorrect = sum(correct)
print(numCorrect / float(len(correct)))

# Plot ROC curve

# `test_msg` comes straight out of train_test_split and may be a slice of
# `d_combined`; work on an explicit copy so the column assignments below do
# not trigger pandas' chained-assignment warning.
test_msg = test_msg.copy()
test_msg.loc[:, 'predicted'] = predicted
test_msg.loc[:, 'correct'] = test_msg['predicted'] == test_msg['tangramRef']

# Run the classifier once and derive both confidence measures from the same
# probability matrix (one row per message, one column per class).
testProbs = np.asarray(gs_clf.predict_proba(test_msg['contents']))

# Confidence = probability assigned to the most likely class.
test_msg.loc[:, 'maxProb'] = testProbs.max(axis=1)

# We could also measure confidence using the distance between the top two categories, but this
# turns out not to be quite as good a metric
sortedProbs = np.sort(testProbs, axis=1)  # ascending within each row
test_msg.loc[:, 'probDiff'] = sortedProbs[:, -1] - sortedProbs[:, -2]

# Denominators for the rates below. A "positive" here is a test message the
# classifier labelled correctly; a "negative" is one it got wrong.
correctFlags = test_msg['correct']
actualNumPos = float(sum(correctFlags))
actualNumNeg = len(correctFlags) - actualNumPos

# Sweep the confidence cut-off from 0 up to (but excluding) 1 in steps of 0.05
# and record, for each cut-off, how many right / wrong predictions it accepts.
TPRs, FPRs, thresholds = [], [], []
for threshold in np.arange(0, 1, .05):
    thresholds.append(threshold)
    # Predictions the policy would accept at this cut-off.
    predYes = test_msg.query('maxProb > {0}'.format(threshold))['correct']
    numAcceptedCorrect = sum(predYes)
    numAcceptedWrong = len(predYes) - numAcceptedCorrect
    # TPR: accepted *correct* predictions relative to all correct predictions.
    TPRs.append(numAcceptedCorrect / actualNumPos)
    # FPR: accepted *incorrect* predictions relative to all incorrect ones.
    FPRs.append(numAcceptedWrong / actualNumNeg)

# Draw the TPR/FPR sweep on square axes. The diagonal is plotted first as the
# chance-level reference line, then the curve for the maxProb policy.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, aspect='equal')
ax.set(xlabel="False positive rate", ylabel="True positive rate")
diagonal = [0, 1]
ax.plot(diagonal, diagonal)
ax.plot(FPRs, TPRs, label='maxProb')

# Pick the lowest confidence cut-off whose false positive rate is below the
# target. `thresholds` and `FPRs` are parallel lists, so the first match in
# iteration order is the one with the smallest threshold.
maxAcceptableFPR = 0.05
cautiousThreshold = next(
    (threshold for threshold, FPR in zip(thresholds, FPRs)
     if FPR < maxAcceptableFPR),
    None)
if cautiousThreshold is None:
    # Fail with an explanation instead of an opaque IndexError on `[...][0]`.
    raise ValueError(
        "no confidence threshold reaches a false positive rate below {0}"
        .format(maxAcceptableFPR))
print(cautiousThreshold)

# What are best params?

# Report the winning grid-search setting. `best_params_` / `best_score_` are
# read directly from the fitted search object; `grid_scores_` is not available
# in newer scikit-learn releases.
best_parameters = gs_clf.best_params_
score = gs_clf.best_score_
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, best_parameters[param_name]))

from sklearn import metrics
def plot_confusion_matrix(cm, target_names, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as a heat map on the current pyplot figure.

    Parameters
    ----------
    cm : matrix passed straight to ``plt.imshow``; presumably the square
        array returned by ``sklearn.metrics.confusion_matrix`` (rows = true
        labels, columns = predicted labels).
    target_names : sequence of tick labels, used for both axes in order.
    title : text placed above the plot.
    cmap : matplotlib colormap used for the cells.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    # One tick per class, labelled identically on both axes; the x labels are
    # rotated 45 degrees.
    tick_marks = np.arange(len(target_names))
    plt.xticks(tick_marks, target_names, rotation=45)
    plt.yticks(tick_marks, target_names)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so it accounts for the axis labels as well.
    plt.tight_layout()

plt.figure()
# Build the label list from BOTH the true and the predicted tags and pass it
# to confusion_matrix explicitly. This guarantees that the matrix rows/columns
# and the tick labels agree even when the classifier predicts a tangram that
# never occurs among the true labels of the test split.
tangramLabels = sorted(set(test_msg.tangramRef) | set(predicted))
cm = metrics.confusion_matrix(test_msg.tangramRef, predicted, labels=tangramLabels)
plot_confusion_matrix(cm, tangramLabels)

## Tag full dataset using ROC threshold 

# Classify every hand-tagged message and keep the classifier's top-class
# probability as its confidence.
predicted_myData = gs_clf.predict(d_handtagged['contents'])
classProbs = gs_clf.predict_proba(d_handtagged['contents'])
maxProbs = [max(probRow) for probRow in classProbs]
existingTags = d_handtagged['tangramRef']

# Only fill in messages that are still untagged ('None'), and only when the
# classifier's confidence exceeds the cautious threshold; every other message
# keeps its existing tag.
autoTags = []
for existing, maxProb, prediction in zip(existingTags, maxProbs, predicted_myData):
    if existing == 'None' and maxProb > cautiousThreshold:
        autoTags.append(prediction)
    else:
        autoTags.append(existing)

# Number of messages whose tag was changed by the classifier.
print(sum(autoTags != existingTags))
d_handtagged.loc[:, 'autoTags'] = autoTags

# Export with `autoTags` in place of the original `tangramRef` column.
autoTaggedTable = d_handtagged.drop('tangramRef', axis = 1)
autoTaggedTable.to_csv("autoTagged.csv", index = False)