# Amazon Mechanical Turk

The role of this script is to generate the data in a format that we can use to label on Mechanical Turk

1. Generate the next batch
2. Read the batch and write it to MongoDB

In [2]:
from data.dao import DataAccess, LabelGetter
from data.turk import TurkResults2Label

import pandas as pd
import datetime
import re
import pickle

%matplotlib inline

Getting the data from DataAccess is so easy!

In [4]:
%%time

X = DataAccess.get_not_labeled().sort("random_number")
X.head(10)

CPU times: user 2.67 s, sys: 2.71 s, total: 5.38 s
Wall time: 38.8 s


# Preprocessing

In [5]:
from __private import fs

In [6]:
fs.list()

['first_person_label|accuracy:0.52046783625731|f1:0.5009187878453114|type:SVC',
 'first_person_label|accuracy:0.5243664717348928|f1:0.508797109912656|type:RandomForestClassifier',
 'first_person_label|accuracy:0.5321637426900585|f1:0.5265047952844989|type:LogisticRegression',
 'first_person_label|accuracy:0.543859649122807|f1:0.5166687798266746|type:SVC',
 'first_person_label|accuracy:0.5536062378167641|f1:0.5349297830780407|type:SVC',
 'first_person_label|accuracy:0.5555555555555556|f1:0.53459052797563|type:RandomForestClassifier',
 'first_person|accuracy:0.665389527458493|f1:0.7816666666666666|type:RandomForestClassifier',
 'first_person|accuracy:0.6743295019157088|f1:0.7822374039282664|type:SVC',
 'first_person|accuracy:0.6807151979565773|f1:0.7743682310469315|type:RandomForestClassifier',
 'first_person|accuracy:0.6909323116219668|f1:0.7917383820998279|type:SVC',
 'first_person|accuracy:0.6934865900383141|f1:0.7637795275590552|type:LogisticRegression',
 'first_person|accuracy:0.694

In [6]:
best_clf = fs.find_one({"filename":"first_person|accuracy:0.6909323116219668|f1:0.7917383820998279|type:SVC"})

In [10]:
clf_str = best_clf.read()
clf = pickle.loads(clf_str)

In [34]:
XX = X.head(10000)

In [35]:
first_person_labels = clf.predict_proba(XX)[:,1]

In [36]:
XX["predicted_fp"] = first_person_labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [55]:
X = XX[(XX.predicted_fp > 0.3) & (XX.predict > 0.3) & ~(XX.text.str.contains("Drinking a"))]

# Generating...

In [56]:
date = str(datetime.date.today())
candidate = X.sort("random_number").head(500)
key = candidate.random_number.apply(hash).sum()

In [57]:
candidate["text"] = candidate.text.str.encode("utf-8")

In [58]:
candidate[["text", "random_number"]].to_csv(
    "../amt/amt|{}|{}.csv".format(date, key),
)

In [1]:
%%bash

ls ../amt/

Batch_2055156_batch_results.csv
Batch_2064123_batch_results.csv
Batch_2064125_batch_results.csv
Batch_2101169_batch_results.csv
amt_2015-08-13_4952308500878643456.csv
amt_2015-08-17_2546340314496401664.csv
amt_2015-08-20_8806137010034599936.csv
amt_2015-09-10_6072541214091395328.csv
amt_2015-09-24_-7976654553505407232.csv
amt|2015-09-10|6072541214091395328.csv
amt|2015-09-24|-7976654553505407232.csv


# Parsing to Label

#### New Labels

Straight from Amazon Mechanical Turk

In [16]:
df = pd.DataFrame.from_csv("../amt/Batch_2101169_batch_results.csv")

In [19]:
df = df[(df.Answer != "{}") & 
        (df.Answer != "Alcohol Related") &
        (df.Answer != "First Person - Alcohol")]
df.Answer.value_counts()

Not Alcohol Related                               365
Alcohol Related::Discussion                        39
First Person - Alcohol::Reflecting on drinking     25
First Person - Alcohol::Looking to drink           18
First Person - Alcohol::Casual Drinking            14
First Person - Alcohol::Heavy Drinking             13
Alcohol Related::Promotional Content                9
dtype: int64

In [24]:
df = pd.concat([pd.DataFrame.from_csv("../amt/Batch_2064125_batch_results.csv"),
      pd.DataFrame.from_csv("../amt/Batch_2064123_batch_results.csv")]).set_index("_id")
df = df[df.Worker != "A3EBA6G9AG7CO9"]

df.Answer.value_counts() / df.Answer.value_counts().sum()

Not Alcohol Related                               0.390244
First Person - Alcohol::Looking to drink          0.125508
First Person - Alcohol::Casual Drinking           0.116362
Alcohol Related::Discussion                       0.114837
First Person - Alcohol::Reflecting on drinking    0.113821
First Person - Alcohol::Heavy Drinking            0.097561
Alcohol Related::Promotional Content              0.041667
dtype: float64

#### Writing Labels using TurkResults2Label

In [20]:
%%time
DataAccess.write_labels(df.Answer.apply(TurkResults2Label.parse_to_labels))

CPU times: user 413 ms, sys: 166 ms, total: 579 ms
Wall time: 18.4 s
