In [1]:
import pandas as pd
import numpy as np
# Widen pandas' column display limit to 100 characters so that the long
# query/response strings shown in later cells are truncated less aggressively.
pd.set_option('max_colwidth', 100)

In [2]:
# Load the annotation batch; the path is relative to the notebook's working directory.
# NOTE(review): the columns (HITId, WorkerId, AssignmentId, ...) look like a
# crowdsourcing batch-results export with one row per assignment -- confirm.
raw = pd.read_csv('qy_batch.csv')

In [3]:
# Inspect the available columns; each item's inputs are stored as
# Input.g<N> / Input.q<N> / Input.r<N>.
raw.columns

Index(['HITId', 'HITTypeId', 'Title', 'Description', 'Keywords', 'Reward',
       'CreationTime', 'MaxAssignments', 'RequesterAnnotation',
       'AssignmentDurationInSeconds', 'AutoApprovalDelayInSeconds',
       'Expiration', 'NumberOfSimilarHITs', 'LifetimeInSeconds',
       'AssignmentId', 'WorkerId', 'AssignmentStatus', 'AcceptTime',
       'SubmitTime', 'AutoApprovalTime', 'ApprovalTime', 'RejectionTime',
       'RequesterFeedback', 'WorkTimeInSeconds', 'LifetimeApprovalRate',
       'Last30DaysApprovalRate', 'Last7DaysApprovalRate', 'Input.g1',
       'Input.q1', 'Input.r1', 'Input.g2', 'Input.q2', 'Input.r2', 'Input.g3',
       'Input.q3', 'Input.r3', 'Input.g4', 'Input.q4', 'Input.r4', 'Input.g5',
       'Input.q5', 'Input.r5', 'Input.g6', 'Input.q6', 'Input.r6', 'Input.g7',
       'Input.q7', 'Input.r7', 'Input.g8', 'Input.q8', 'Input.r8', 'Input.g9',
       'Input.q9', 'Input.r9', 'Input.g10', 'Input.q10', 'Input.r10',
       'Input.g11', 'Input.q11', 'Input.r11', 'Input.g12

In [4]:
# Build one record per (assignment row, question slot): the slot's inputs
# (gender, query, response) plus the answer this worker gave for that slot.
data = []
for i, row in raw.iterrows():
    # Slots are numbered 1..12 (Input.g1 ... Input.g12 in the column listing).
    for j in range(1, 13):
        query = row['Input.q%d' % j]
        response = row['Input.r%d' % j]
        gender = row['Input.g%d' % j]

        # Prefer 'Answer.answer<j>'; fall back to 'Answer.answer<j>.label' when
        # it is missing.  Missingness must be tested with pd.isna(): an identity
        # check against np.nan is unreliable because NaNs coming from float
        # columns are boxed into fresh float objects by iterrows(), so
        # `value is not np.nan` would be True for them and the fallback skipped.
        answer = row['Answer.answer%d' % j]
        if pd.isna(answer):
            answer = row['Answer.answer%d.label' % j]

        data.append((row['HITId'], j, row['WorkerId'], gender, query, response, answer))

In [5]:
# Long-form table: one row per (HIT, question slot, worker) judgment.
flat = pd.DataFrame(data)
flat.columns = ['HITId', 'q_idx', 'WorkerId', 'gender', 'query', 'response', 'answer']
# `df` and `flat` name the same object here; `flat` keeps the long form
# available after `df` is rebound further down.
df = flat

In [6]:
# Worker IDs flagged as spammers.
# NOTE(review): this list is not referenced by any of the cells visible here --
# confirm where (or whether) these workers are filtered out.
spammers = [
    'A2X295K4FFKJ8I',
    'A17FG9GVV76H3',
    'A3A0RP6IUR41PP',
    'A9OOV3976AFYF',
]

### Flatten data: one row per item, with each worker's answer side by side

In [7]:
# Collapse the long-form judgments into one record per (HITId, q_idx) item:
# the shared item fields first, then (WorkerId, answer) pairs for at most the
# first three judgments in the group.
data = []
for item_key, judgments in df.groupby(['HITId', 'q_idx']):
    first = judgments.iloc[0]
    record = [first['HITId'], first['gender'], first['query'], first['response']]
    # Groups with fewer than three judgments simply yield a shorter record.
    for pos in range(min(3, len(judgments))):
        judgment = judgments.iloc[pos]
        record.extend([judgment['WorkerId'], judgment['answer']])
    data.append(record)

In [8]:
# Wide-form table: item fields followed by WorkerId<k>/Answer<k> for k = 1..3.
judgment_columns = [
    '%s%d' % (field, slot)
    for slot in (1, 2, 3)
    for field in ('WorkerId', 'Answer')
]
df = pd.DataFrame(data)
df.columns = ['HITId', 'gender', 'query', 'response'] + judgment_columns
df.head(1)

Unnamed: 0,HITId,gender,query,response,WorkerId1,Answer1,WorkerId2,Answer2,WorkerId3,Answer3
0,30Y6N4AHYPXN3KJ9HR3IQO9DARSRDM,W,Is having an opinion an exclusive left wing privilege?,"It is, but only when it benefits them. The left has the upper hand in the media landscape in the...",A17FG9GVV76H3,Definitely yes,APGX2WZ59OWDN,Probably yes,A2AWSN5CWONM8,Definitely yes


In [9]:
def f(x):
    """Map a raw answer string to a fine-grained one-letter label.

    The match is case-insensitive and checked in this order:
    'F' if the text contains 'definitely', 'P' if it contains 'probably',
    otherwise 'N'.

    Note: a later cell redefines ``f`` with a coarser mapping, so this
    definition is only in effect until that cell runs.
    """
    lowered = x.lower()
    for keyword, label in (('definitely', 'F'), ('probably', 'P')):
        if keyword in lowered:
            return label
    return 'N'
    
# Derive a fine-grained label column for each of the three worker answers.
for slot in (1, 2, 3):
    df['Answer%d.label' % slot] = df['Answer%d' % slot].apply(f)

In [10]:
def f(x):
    """Map a raw answer string to a coarse one-letter label.

    Case-insensitively, any answer containing 'definitely' or 'probably'
    becomes 'I'; everything else becomes 'N'.

    Note: this shadows the earlier fine-grained ``f`` defined in a previous cell.
    """
    lowered = x.lower()
    has_judgment = 'definitely' in lowered or 'probably' in lowered
    return 'I' if has_judgment else 'N'
    
# Derive a coarse label column for each of the three worker answers.
for slot in (1, 2, 3):
    df['Answer%d.coarse' % slot] = df['Answer%d' % slot].apply(f)

In [11]:
# Flag items with no agreement at all: the three workers' fine-grained labels
# are pairwise different (three distinct values).
label_triples = zip(df['Answer1.label'], df['Answer2.label'], df['Answer3.label'])
df['non-agreement'] = [len(set(triple)) == 3 for triple in label_triples]

no_agreement = df['non-agreement']
print(len(df[no_agreement]))
df.loc[no_agreement, ['gender', 'query', 'response']]

60


Unnamed: 0,gender,query,response
17,M,Are you really that full of shit and up your own ass to realize that?,"I have no idea what that is, but you're probably right."
26,M,Do you ever see NFL fans complain the Luke Kuechly is overrated and isnt as good as Khalil Mack?,"No, that's just the NFL fans, not everyone."
28,W,Are these to be treated as natural deaths?,If it's natural then I'm fine with that too
35,W,Does anybody have any recommendatons on types of video equipment?,A lot of people recommend Audio Technica ATH X4s.
38,M,Did you know that 18 veterans commit suicide every day?,No they don't.
41,M,"Is having no games because people see the genre as unstable, too expensive to develop and with a...",I would like to see a game like this.
49,M,Is the U.S. a police state?,I think it's called a country where citizens are free to exercise their rights to self defense a...
60,W,do you *have* to have so many stones?,Yes... yes... you are correct... I have a ton of these
83,M,Do you not know what a CEO is?,I don't think he understands what a CEO is.
97,M,Are you seriously trying to quote mine?,I just wanted to show you that I understood your reference.


In [12]:
# Per-item tally of fine-grained labels (F / P / N).  The print is disabled, so
# this loop currently produces no output.
for row_idx, row in df.iterrows():
    labels = [row['Answer%d.label' % slot] for slot in (1, 2, 3)]
    #print('%d\t%d\t%d' % (labels.count('F'), labels.count('P'), labels.count('N')) )

In [13]:
# Count the items on which all three workers agree at the coarse level,
# i.e. the coarse labels are all 'I' or all 'N'.
coarse_columns = ['Answer1.coarse', 'Answer2.coarse', 'Answer3.coarse']
unanimous = (('I', 'I', 'I'), ('N', 'N', 'N'))
count = sum(
    1
    for triple in zip(*(df[col] for col in coarse_columns))
    if triple in unanimous
)
print(count)

423


In [14]:
# Print each worker's answer distribution from the long-form table, with a
# '---' separator line before every worker ID.
for worker_id, judgments in flat.groupby('WorkerId'):
    print('---\n%s' % worker_id)
    print(judgments['answer'].value_counts())

---
A12GER5MS6M0J
No information or doesn't make sense    5
Probably yes                            4
Probably no                             3
Name: answer, dtype: int64
---
A16MXC30ZQ2W82
Probably yes                            10
No information or doesn't make sense     2
Name: answer, dtype: int64
---
A17FG9GVV76H3
Definitely yes                          88
Probably yes                            16
Probably no                              2
No information or doesn't make sense     1
Definitely no                            1
Name: answer, dtype: int64
---
A1IIDR9HLUB1MR
Probably yes                            12
Definitely yes                           6
No information or doesn't make sense     3
Definitely no                            2
Probably no                              1
Name: answer, dtype: int64
---
A1PO4SLO8F4INJ
Definitely yes                          4
Probably yes                            3
Probably no                             3
No information or doesn't make 