# Amazon Mechanical Turk

The role of this notebook is to generate data in a format that we can label on Mechanical Turk, and to load the resulting labels back into the database:

1. Generate the next batch
2. Read the batch and write it to MongoDB

In [2]:
from dao import DataAccess

import pandas as pd
import datetime
import re

%matplotlib inline

Getting the data from DataAccess is so easy!

In [4]:
# Load the records to sample from (presumably the ones without a label yet,
# going by the helper's name). The batch-generation cells below read `X`,
# so this cell has to run on a fresh kernel.
X = DataAccess.get_not_labeled()
X.head(10)

# Generating...

In [41]:
# Today's date (ISO format) becomes part of the batch file name.
date = datetime.date.today().isoformat()

# Order by the random_number column and keep the first 1000 rows as the batch.
# NOTE: DataFrame.sort was deprecated in pandas 0.17 in favour of sort_values.
candidate = X.sort("random_number").head(1000)

# Sum of the hashed random numbers, used as an identifier for this batch.
key = candidate["random_number"].apply(hash).sum()

In [53]:
# Encode the text column as UTF-8 before it is written to CSV.
candidate["text"] = candidate["text"].str.encode("utf-8")

In [54]:
# Export only the text and random_number columns; the file name carries the
# batch date and key.
candidate.loc[:, ["text", "random_number"]].to_csv(
    "../amt/amt|{}|{}.csv".format(date, key)
)

In [55]:
%%bash

# List the contents of the batch directory to confirm the CSV was written.
ls ../amt/

amt_2015-08-12_4952308500878643456.csv
amt|2015-08-12|4952308500878643456.csv


# Parsing to Label

#### New Labels

Straight from Amazon Mechanical Turk

In [5]:
# DataFrame.from_csv is deprecated (and removed in pandas 1.0). read_csv with
# index_col=0 and parse_dates=True reproduces from_csv's defaults and works on
# both old and new pandas.
labels = pd.read_csv(
    "../amt/Batch_2055156_batch_results.csv",
    index_col=0,
    parse_dates=True,
)
labels["Answer"].value_counts()

Not Alcohol Related                               691
Alcohol Related::Discussion                        83
Alcohol Related::Promotional Content               75
First Person - Alcohol::Casual Drinking            69
First Person - Alcohol::Looking to drink           54
First Person - Alcohol::Heavy Drinking             14
First Person - Alcohol::Reflecting on drinking     14
dtype: int64

#### Old Legacy Labels

These labels have already been preprocessed and required some additional work.

In [6]:
# DataFrame.from_csv is deprecated (and removed in pandas 1.0). read_csv with
# index_col=0 and parse_dates=True reproduces from_csv's defaults and works on
# both old and new pandas.
labels2 = pd.read_csv(
    "./twitter_labels.csv",
    index_col=0,
    parse_dates=True,
)
labels2["Answer"].value_counts()

Alcohol Consumption                               900
Not Alcohol Related                               898
Alcohol Consumption::First Person                 399
Alcohol Related::Discussion                       294
First Person - Alcohol::Looking to drink          208
First Person - Alcohol::Reflecting on drinking    151
First Person - Alcohol::Casual Drinking            88
Alcohol Consumption::Second Person                 70
First Person - Alcohol::Heavy Drinking             63
Alcohol Related::Promotional Content               60
Alcohol Consumption::Third Person                  25
Alcohol Consumption::Other                         18
First Person - Alcohol                              1
dtype: int64

#### Joining them and applying a Transformation

In [7]:
# Stack the Answer column of the new batch on top of the legacy one; each
# frame's index values are carried over unchanged.
final_labels = pd.concat([labels["Answer"], labels2["Answer"]])
final_labels.head()

30ZKOOGW2WQVC2GQKM7V3LFZIBU1AI                         Not Alcohol Related
34XASH8KLQ6ZVV8H2R8QD8SXF8PMPR     First Person - Alcohol::Casual Drinking
306996CF6W40VCLIQ07RYWZ50TF1BD    First Person - Alcohol::Looking to drink
3GONHBMNHVIT9ZANP06S46UBP7WZMP                         Not Alcohol Related
3RDTX9JRTYLVI5LNHWA480VSFFC978                         Not Alcohol Related
Name: Answer, dtype: object

In [13]:
class TurkResults2Label:
    """Translate Mechanical Turk answer strings into label dictionaries.

    The class only holds lookup tables and a single classmethod; it is never
    instantiated.
    """

    # Both patterns are applied with ``match``, so they only test how the
    # answer string *starts*.
    first = re.compile("First")
    alch = re.compile("Alcohol Consumption")

    # Level assigned to each first-person answer.
    drinking_level = {
        "First Person - Alcohol":0,
        "First Person - Alcohol::Casual Drinking":0,
        "First Person - Alcohol::Looking to drink":1,
        "First Person - Alcohol::Reflecting on drinking":2,
        "First Person - Alcohol::Heavy Drinking":3
    }

    # Code assigned to each alcohol-related answer that is not first person.
    related = {
        "Alcohol Related::Discussion":0,
        "Alcohol Related::Promotional Content":1
    }

    @classmethod
    def parse_to_labels(cls, string_label):
        """Convert one answer string into a dictionary of labels.

        The result always contains:
            "raw":     the answer string exactly as received
            "alcohol": 0 for "Not Alcohol Related", 1 for anything else

        Depending on the answer it may also contain:
            "first_person":       1 if the answer starts with "First", else 0
            "first_person_level": value from ``drinking_level`` (first person only)
            "alcohol_related":    value from ``related`` (non first person only)

        Answers starting with "Alcohol Consumption" (the legacy label family)
        only get "alcohol" and "raw".

        Raises:
            KeyError: if the answer is not covered by ``drinking_level`` or
                ``related`` and is neither "Not Alcohol Related" nor an
                "Alcohol Consumption" label.
        """
        # Record the raw answer up front so that every return path keeps it;
        # previously the early returns dropped it.
        label = {"raw": string_label}

        if string_label == "Not Alcohol Related":
            label["alcohol"] = 0
            return label

        label["alcohol"] = 1

        # Legacy labels carry no finer-grained information we can map.
        # (A string starting with "Alcohol Consumption" can never start with
        # "First", so no additional check is needed here.)
        if cls.alch.match(string_label):
            return label

        if cls.first.match(string_label):
            label["first_person"] = 1
            label["first_person_level"] = cls.drinking_level[string_label]
        else:
            label["first_person"] = 0
            label["alcohol_related"] = cls.related[string_label]
        return label

In [16]:
%%time
# Turn every answer string into a label dict and hand the resulting Series to
# DataAccess.write_labels (presumably the MongoDB write mentioned in the intro;
# the helper is defined outside this notebook).
DataAccess.write_labels(final_labels.apply(TurkResults2Label.parse_to_labels))

CPU times: user 2.39 s, sys: 425 ms, total: 2.81 s
Wall time: 42 s
