### Cohort Analysis

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt


In [None]:
!curl https://dl.dropboxusercontent.com/u/16006464/client_secret.json -o client_secret.json

In [None]:
# Connect to the BigQuery API.
#
# Credentials are loaded from the downloaded JSON file through the public
# GoogleCredentials.from_stream() entry point rather than the private
# client._get_application_default_credential_from_file() helper, which is an
# implementation detail of oauth2client and may change without notice.
from googleapiclient.discovery import build
from oauth2client import client
credentials = client.GoogleCredentials.from_stream('client_secret.json')
# Restrict the credentials to the BigQuery scope before building the service.
credentials = credentials.create_scoped('https://www.googleapis.com/auth/bigquery')
bigquery_service = build('bigquery', 'v2', credentials=credentials)

In [None]:
# Run a SQL query
#
# The string literal below is kept for reference only: it is never sent to
# BigQuery from this notebook. According to the original notes, its result was
# stored in a BigQuery table first, and the notebook reads from that
# materialized table (entities.m_cohort_rewards_per_month) for efficiency.
#
# In the reference query:
#  - subquery S computes the rewards posted by each requester, which are then
#    aggregated by month;
#  - subquery C computes the cohort of each requester, defined as the month in
#    which the requester was first seen in the data;
#  - the string manipulation only formats dates as YYYY-MM.
'''
SELECT
  SUM(S.reward) AS reward, S.year_month AS year_month, S.requesterId AS requesterId, C.cohort AS cohort
FROM (
  SELECT
    -I.rewardDiff/100 AS reward,
    STRING(YEAR(I.timestamp)) + '-' + RIGHT('0' + STRING(MONTH(I.timestamp)), 2) AS year_month,
    G.requesterId AS requesterId
  FROM
    entities.HITinstance I
  INNER JOIN
    entities.HITgroup G
  ON
    G.groupId=I.groupId
  WHERE
    I.rewardDiff<0 AND I.rewardDiff>-1000000) S
INNER JOIN (
  SELECT
    requesterId,
    STRING(YEAR(MIN(firstSeen))) + '-' + RIGHT('0' + STRING(MONTH(MIN(firstSeen))), 2) AS cohort
  FROM
    entities.HITgroup
  GROUP BY
    requesterId) C
ON
  S. requesterId = C.requesterId
GROUP BY
  requesterId, year_month, cohort
ORDER BY
  requesterId, year_month
'''


query_request = bigquery_service.jobs()
query_data = {
    'query': (
        '''
SELECT * FROM entities.m_cohort_rewards_per_month
WHERE year_month!='2016-03' AND cohort!='2014-05'
ORDER BY cohort, requesterId, year_month
        ''')
}

query_response = query_request.query(
            projectId='crowd-power',
            body=query_data).execute()

# jobs.query is synchronous only up to its timeout and returns a single page of
# rows. Follow up with jobs.getQueryResults so that (a) a slow query is waited
# for instead of yielding a response without 'schema'/'rows', and (b) every
# page of the result is collected rather than silently keeping the first one.
job_reference = query_response['jobReference']
results_args = {
    'projectId': job_reference['projectId'],
    'jobId': job_reference['jobId'],
}
if job_reference.get('location'):
    # Jobs outside the default multi-regions must be looked up with their location.
    results_args['location'] = job_reference['location']

# getQueryResults blocks server-side up to its own timeout, so this is not a busy loop.
while not query_response.get('jobComplete', False):
    query_response = query_request.getQueryResults(**results_args).execute()

all_rows = list(query_response.get('rows', []))
page_token = query_response.get('pageToken')
while page_token:
    next_page = query_request.getQueryResults(
        pageToken=page_token, **results_args).execute()
    all_rows.extend(next_page.get('rows', []))
    page_token = next_page.get('pageToken')

# Downstream cells read query_response['schema'] and query_response['rows'];
# expose the complete row set under the same key.
query_response['rows'] = all_rows

In [None]:
# Put the SQL results in a Pandas Dataframe
import pandas as pd
import numpy as np


def _as_float_if_numeric(column):
    """Return `column` converted to float, or unchanged if it holds non-numeric values.

    The BigQuery REST response used here carries every cell under a 'v' key;
    numeric columns are converted explicitly, while text columns (for example
    the YYYY-MM month labels) are left untouched.
    """
    try:
        return pd.to_numeric(column).astype(float)
    except (TypeError, ValueError):
        return column


# Column names come from the response schema; each row is the tuple of its cell values.
columns = [field.get('name') for field in query_response['schema']['fields']]
rows = [tuple(cell['v'] for cell in row['f']) for row in query_response.get('rows', [])]

# Passing dtype=float to the constructor relies on a silent per-column fallback
# for text columns that newer pandas versions no longer perform (they raise
# instead), so the conversion is done column by column here.
df = pd.DataFrame(data=rows, columns=columns).apply(_as_float_if_numeric)
df.to_csv("cohort-analysis-rewards.csv")
df

In [None]:
# Reshape the dataframe into a table with one row per year_month of activity and
# one column per cohort. Each cell contains the number of distinct requesterId
# values seen for that cohort in that month.
pivot = df.pivot_table(
    values='requesterId',
    index=['year_month'],
    columns=['cohort'],
    aggfunc=lambda requester_ids: len(requester_ids.unique()),
)
pivot

In [None]:
# Stacked area chart of the current pivot table (one band per cohort), shown
# inline and then written to a PNG file.
f = plt.figure(edgecolor='k')
ax = f.gca()
pivot.plot(ax=ax, kind='area', stacked=True, grid=True, cmap='Paired',
           legend=True, figsize=(16, 8));

ax.set_title('Amazon Mechanical Turk Cohort Analysis', color='black')
# Place the legend below the plot area, spread over 8 columns.
ax.legend(loc='lower center', ncol=8, bbox_to_anchor=[0.5, -0.25])
ax.set_xlabel("Date")
ax.set_ylabel("Requesters active at least once within the month")
plt.show()

f.savefig('mturk-cohort-analysis.png', bbox_inches='tight')

In [None]:
# Reshape the dataframe into a table with one row per year_month of activity and
# one column per cohort. Each cell contains the sum of the rewards posted by
# that cohort in that month.
pivot = df.pivot_table(
    values='reward',
    index=['year_month'],
    columns=['cohort'],
    aggfunc='sum',
)
pivot

In [None]:
# Stacked area chart of the rewards pivot table (one band per cohort), shown
# inline and then written to a PNG file.
f = plt.figure(edgecolor='k')
ax=f.gca()
pivot.plot(kind='area', stacked=True, legend=True, figsize=(16,8), cmap='Paired', grid=True, ax=ax);

plt.title('Amazon Mechanical Turk Cohort Analysis', color='black')
# Place the legend below the plot area, spread over 8 columns.
plt.legend(loc='lower center', ncol=8, bbox_to_anchor=[0.5, -0.25])
ax.set_ylabel("Rewards Posted")
ax.set_xlabel("Date")
plt.show()

# Use a dedicated filename: the requester-count chart is saved as
# 'mturk-cohort-analysis.png', and reusing that name here would overwrite it.
f.savefig('mturk-cohort-analysis-rewards.png', bbox_inches='tight')