Data Map lives at: https://docs.google.com/spreadsheets/d/1CgqTjdKizat-g7K7-AAuVIazQFKJ3WAAPHR-Qpa49lU/edit#gid=761153638

In [30]:
# Module imports
import copy
import datetime
import os
import pickle

import numpy as np
import pandas as pd

# Version of the merged dataset: written to the 'Version' column of the combined
# frames and embedded in the exported spreadsheet's filename.
VERSION = 1.1

def fix_sample_number(x):
    """Normalise a raw sample number to its canonical upper-case form.

    Canonical IDs are a letter followed by digits, e.g. ``F0012``. IDs of
    five characters starting with A, F, W or B, and IDs of six characters
    starting with D, are accepted. Anything else is reported on stdout but is
    still returned, so the caller decides what to do with it.

    Args:
        x: Raw cell value - a string, a number, ``None`` or NaN.

    Returns:
        The normalised ID string. NaN is returned for missing input (``None``,
        empty or whitespace-only strings); an incoming NaN is passed through.
    """
    if x is None:
        return np.nan
    if isinstance(x, float) and np.isnan(x):
        return x  # leave NaN's alone
    if isinstance(x, str) and not x.strip():
        return np.nan
    try:
        # A bare number is assumed to be an 'F' sample
        sn = 'F{:04d}'.format(int(x))
    except (TypeError, ValueError, OverflowError):
        # Assume string so make sure it's of the right format
        sn = str(x).strip().upper()
    len_sn = len(sn)
    is_valid = (len_sn == 5 and sn[0] in ('A', 'F', 'W', 'B')) or (len_sn == 6 and sn[0] == 'D')
    if not is_valid:
        print("!!! Bad ID \'%s\'" % sn)
    return sn

def now():
    """Return the current local time formatted as ``DD/MM/YY HH:MM:SS``."""
    current = datetime.datetime.now()
    return "{:%d/%m/%y %H:%M:%S}".format(current)

def enumerate_duplicates(row):
    """Make repeated labels unique by suffixing a running counter.

    The first occurrence of a label is kept unchanged; the n-th repeat
    (n >= 1) becomes ``<label>.<n>``.

    Args:
        row: Iterable of (hashable) labels.

    Returns:
        A new list of labels with the same length and order as ``row``.
    """
    separator = '.'
    seen = {}
    labelled = []
    for name in row:
        occurrences = seen.setdefault(name, 0)
        if occurrences == 0:
            labelled.append(name)
        else:
            labelled.append("{}{}{}".format(name, separator, occurrences))
        seen[name] = occurrences + 1
    return labelled

# Need to define in main or we can't pickle the data objects
class DataFrames(object):
    """Plain container for one festival's dataframes.

    Every slot starts as ``None`` and is filled in later: one slot per source
    sheet (catalog, ftir, reagent, mla, hr) plus ``combined`` for the merged
    result.
    """

    def __init__(self):
        # Attribute order is kept stable: catalog, ftir, reagent, mla, hr, combined
        for slot in ('catalog', 'ftir', 'reagent', 'mla', 'hr', 'combined'):
            setattr(self, slot, None)

# Make pandas raise an error (rather than only warn) on chained assignment.
pd.options.mode.chained_assignment = 'raise'

def gsheets_service():
    """Build an authorised, read-only Google Sheets v4 service object.

    Cached credentials are read from ``token.json``. When they are missing or
    invalid, the OAuth flow is run with the client-secret file found in the
    current working folder, passing the same token store to the flow.

    Returns:
        The service object produced by ``googleapiclient.discovery.build``.
    """
    from googleapiclient.discovery import build
    from httplib2 import Http
    from oauth2client import client, file as token_file, tools

    # If modifying these scopes, delete the file token.json.
    scopes = 'https://www.googleapis.com/auth/spreadsheets.readonly'
    # Ensure that the creds file is always taken from the current working folder.
    # This allows two people on different PCs to merge changes more easily.
    secret_path = os.path.join(os.path.realpath('./'), 'JensDataExportJupyter_client_secret.json')

    token_store = token_file.Storage('token.json')
    credentials = token_store.get()
    if not credentials or credentials.invalid:
        import argparse
        cli_flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args([])
        oauth_flow = client.flow_from_clientsecrets(secret_path, scopes)
        credentials = tools.run_flow(oauth_flow, token_store, cli_flags)
    return build('sheets', 'v4', http=credentials.authorize(Http()))

def get_df(spreadsheet_id, ss_range, mla=False):
    """Download a sheet range and return it as a DataFrame.

    The first non-blank row supplies the column names (repeated names are made
    unique with ``enumerate_duplicates``); the remaining non-blank rows are the
    data.

    Args:
        spreadsheet_id: ID of the Google spreadsheet to read.
        ss_range: Range in A1 notation, e.g. ``'Catalog!A:R'``.
        mla: True for the MLA sheet, whose first and third rows are not data
            and whose rows are kept only when the first cell is filled in.

    Returns:
        The DataFrame, or ``None`` when the range is empty or every row is
        blank.

    Raises:
        ValueError: when only a header row is left (``max()`` of an empty
            sequence), and presumably also from the DataFrame constructor when
            a data row is wider than the header. ``get_data`` catches this for
            the Interventions sheet, so it is deliberately left in place.
    """
    # Call the Sheets API
    result = GSHEETS_SERVICE.spreadsheets().values().get(spreadsheetId=spreadsheet_id,
                                                         range=ss_range).execute()
    values = result.get('values', [])
    if not values:
        print('*** No data found ***')
        return None

    if mla:
        # mla has irrelevant stuff in rows 1 and 3 and sample numbers in the first column.
        # Filtering by index (instead of popping) cannot fail on a very short sheet.
        values = [row for index, row in enumerate(values) if index not in (0, 2)]

        def not_blank(row):
            # A completely empty row comes back from the API as [], so check
            # the row has a first cell before looking at it.
            return len(row) > 0 and len(row[0]) > 0
    else:
        def not_blank(row):
            return sum(map(len, row[:6])) > 0

    rows = list(filter(not_blank, values))
    if not rows:
        print('*** No data found after pruning rows! ***')
        return None

    columns = enumerate_duplicates(rows[0])
    ncols = len(rows[0])
    # Trailing empty cells are omitted by the API, so data rows may be shorter
    # than the header; only name as many columns as the widest data row has.
    row_max = max(map(len, rows[1:]))
    width = min(ncols, row_max)
    return pd.DataFrame(rows[1:], columns=columns[:width])

def canonicalise_df(df, source=None):
    """Apply the initial clean-up common to every sheet (modifies ``df`` in place).

    Columns are renamed via ``RENAME_COLUMN_MAP``, a ``Timestamp`` column (when
    present) is parsed as ``%d/%m/%Y %H:%M:%S``, sample numbers are normalised
    with ``fix_sample_number``, and rows without a sample number are dropped.

    Args:
        df: DataFrame to clean; must contain ``SampleNumber`` after renaming.
        source: Optional label printed as a progress message.

    Returns:
        The same DataFrame object.

    Raises:
        RuntimeError: if any column name is blank (``''`` or NaN).
    """
    if source:
        print("Canonicalising %s" % source)

    # Standardise names
    df.rename(columns=RENAME_COLUMN_MAP, inplace=True)

    if 'Timestamp' in df.columns:
        parsed = df['Timestamp'].map(
            lambda raw: pd.to_datetime(str(raw), format='%d/%m/%Y %H:%M:%S'))
        df.loc[:, 'Timestamp'] = parsed

    normalised = df['SampleNumber'].apply(fix_sample_number)
    df.loc[:, 'SampleNumber'] = normalised
    df.dropna(subset=['SampleNumber'], inplace=True)

    # Make sure we don't have any blank columns
    blank_names = set(df.columns.values).intersection({np.nan, ''})
    if blank_names:
        raise RuntimeError("Blank column names in Dataframe")
    return df

def get_data(spreadsheet_id):
    """Download and canonicalise every sheet of one festival's spreadsheet.

    The Catalog, FTIR, Reagent and MLA sheets are read in that order. The
    Interventions (hr) sheet is read last; a ``ValueError`` while reading it
    leaves ``hr`` as ``None``.

    Args:
        spreadsheet_id: ID of the festival's Google spreadsheet.

    Returns:
        A ``DataFrames`` instance with catalog, ftir, reagent, mla and hr set.
    """
    def load(ss_range, source, **options):
        # Fetch one range and run the common clean-up on it
        return canonicalise_df(get_df(spreadsheet_id, ss_range, **options), source=source)

    catalog = load('Catalog!A:R', 'catalog')
    ftir = load('FTIR!A:X', 'ftir')
    reagent = load('Reagent!A:W', 'reagent')
    mla = load('MLA!A:R', 'mla', mla=True)

    try:
        hr = get_df(spreadsheet_id, 'Interventions!A:BJ')
    except ValueError:
        hr = None
    if hr is not None:
        hr = canonicalise_df(hr, source='hr')

    frames = DataFrames()
    frames.catalog = catalog
    frames.ftir = ftir
    frames.reagent = reagent
    frames.mla = mla
    frames.hr = hr
    return frames

def get_rename_columns_map():
    """Fetch the original-name -> canonical-name column map from the Data Map sheet.

    Reads ``ColumnMap!A:B`` and checks that the first row is the expected
    header. Rows with fewer than two cells are skipped.

    Returns:
        dict mapping each original column name to its canonical name.
    """
    request = GSHEETS_SERVICE.spreadsheets().values().get(
        spreadsheetId='1CgqTjdKizat-g7K7-AAuVIazQFKJ3WAAPHR-Qpa49lU',
        range='ColumnMap!A:B')
    rows = request.execute().get('values', [])

    header = rows[0]
    assert header == ['OriginalColumn', 'CanonicalColumn'], header

    column_map = {}
    for row in rows[1:]:
        if len(row) < 2:
            continue
        column_map[row[0]] = row[1]
    return column_map


In [2]:
# Show the folder where the code file is being run from
print("Script running from: %s" % os.path.realpath(os.getcwd()))

# The ID of each festival's spreadsheet.
BOOMTOWN2018_SPREADSHEET_ID = '1RiA-FwG_954Ger2VPsOSA3JLh-7sEoTYr40eVS0mp24'
MADE2018_SPREADSHEET_ID = '1daXdyL6uL8qnMsEsP0RLZE9nDzt6J7Zr1ygQdguvi-E'
BOARDMASTERS2018_SPREADSHEET_ID = '1U1lhUWLazDBN-wb2eZM8YV674f46npVfQK3XUVZjPow'
SW42018_SPREADSHEET_ID = '1agpMmJ9XukeWXS5_mwrDSKeshUaFtYwOzsPiR1DKsPU'
LOSTVILLAGE2018_SPREADSHEET_ID = '1OL0gyXrpZnJ8e7yR7eF6S2OaBYBiPDoVp5xGpdK4wlA'
BESTIVAL2018_SPREADSHEET_ID = '184qudGcw4PB0SMtOo0ZBDtckeGaH0RCLUXbA-u3BiHE'
YNOT2018_SPREADSHEET_ID = '1D01cj-Mra06TuoG_MsKuLq9OdtvKzrvRdiE255po_ag'
TRUCKFEST2018_SPREADSHEET_ID = '1sGG9WJxKyD2CGUjzJAXul3g9hVnRz6HbTiqKV5cUAyA'
LSTD2018_SPREADSHEET_ID = '1R8YqDnrhvuVMwPFShwaaAUIyCXQMeozA230OXsFsDQM'
KENDALCALLING2018_SPREADSHEET_ID = '16-PfwBOaUxwod3X75LGk1VAjBblkNsTJpCsX825aghI'
PARKLIFE2018_SPREADSHEET_ID = '1oO5sHcUhUn_7M1Hap73sOZHNEfWFMcDkQuWDRFf4d-w'

# (progress banner, key in `data`, spreadsheet ID), in processing order
_festival_sheets = (
    ("PROCESSING BOOMTOWN", 'boomtown', BOOMTOWN2018_SPREADSHEET_ID),
    ("PROCESSING BOARDMASTERS", 'boardmasters', BOARDMASTERS2018_SPREADSHEET_ID),
    ("PROCESSING MADE", 'made', MADE2018_SPREADSHEET_ID),
    ("PROCESSING SW4", 'sw4', SW42018_SPREADSHEET_ID),
    ("PROCESSING LOST VILLAGE", 'lostvillage', LOSTVILLAGE2018_SPREADSHEET_ID),
    ("PROCESSING BESTIVAL", 'bestival', BESTIVAL2018_SPREADSHEET_ID),
    ("PROCESSING YNOT", 'ynot', YNOT2018_SPREADSHEET_ID),
    ("PROCESSING TRUCKFEST", 'truckfest', TRUCKFEST2018_SPREADSHEET_ID),
    ("PROCESSING LSTD", 'lstd', LSTD2018_SPREADSHEET_ID),
    ("PROCESSING KENDAL CALLING", 'kc', KENDALCALLING2018_SPREADSHEET_ID),
    ("PROCESSING PARKLIFE", 'parklife', PARKLIFE2018_SPREADSHEET_ID),
)

data = {}
GSHEETS_SERVICE = gsheets_service()
RENAME_COLUMN_MAP = get_rename_columns_map()
for _banner, _key, _sheet_id in _festival_sheets:
    print(_banner)
    data[_key] = get_data(_sheet_id)

# Cache the raw import so the later cells can be re-run without re-downloading
with open('foo_multi.pkl','wb') as w:
    pickle.dump(data, w)

print("Finished importing data at %s" % now())

Script running from: /opt/random
PROCESSING BOOMTOWN
Canonicalising catalog
!!! Bad ID 'TF0579'
!!! Bad ID 'TF1665'
!!! Bad ID 'TF1660'
Canonicalising ftir
!!! Bad ID 'TF1665'
Canonicalising reagent
Canonicalising mla
Canonicalising hr
!!! Bad ID 'FXXX'
!!! Bad ID 'TF0653'
!!! Bad ID 'TF1172'
!!! Bad ID 'TF1762'
PROCESSING BOARDMASTERS
Canonicalising catalog
Canonicalising ftir
Canonicalising reagent
Canonicalising mla
Canonicalising hr
PROCESSING MADE
Canonicalising catalog
Canonicalising ftir
!!! Bad ID 'XF0005'
Canonicalising reagent
Canonicalising mla
Canonicalising hr
PROCESSING SW4
Canonicalising catalog
Canonicalising ftir
Canonicalising reagent
Canonicalising mla
PROCESSING LOST VILLAGE
Canonicalising catalog
Canonicalising ftir
Canonicalising reagent
Canonicalising mla
PROCESSING BESTIVAL
Canonicalising catalog
Canonicalising ftir
Canonicalising reagent
Canonicalising mla
Canonicalising hr
!!! Bad ID 'P1000'
!!! Bad ID 'F20005'
!!! Bad ID 'G9998'
PROCESSING YNOT
Canonicalising

In [3]:
def person_id_from_samplenumber(x):
    """Derive a person ID of the form ``PXXXX`` from a sample number.

    Args:
        x: A sample number string, or a missing value.

    Returns:
        An incoming NaN unchanged; NaN for anything that is not a
        five-character string; otherwise ``'P'`` followed by the zero-padded
        number formed from the last three characters. When those characters
        are not a number the ID is reported on stdout and returned unchanged.
    """
    if isinstance(x, float) and np.isnan(x):
        return x  # leave NaN's alone
    # Also covers None, empty strings and non-string values
    if not isinstance(x, str) or len(x) != 5:
        return np.nan
    try:
        # NOTE(review): only the last three characters are used, so e.g.
        # 'F0868' and 'F1868' give the same person ID - confirm this is intended.
        pn = 'P{:04d}'.format(int(x[-3:]))
    except ValueError:
        # Report the offending input; `pn` is not bound on this path
        print("!!! Bad ID \'%s\'" % x)
        pn = x
    return pn


def rm_bad_samplenumber(x):
    """Blank out values that cannot be a sample number.

    A non-null value is replaced by NaN only when it is both not five
    characters long and does not start with ``'F'``; everything else
    (including null values) is returned unchanged.

    NOTE(review): because the two tests are combined with ``and``, values such
    as ``'G1879'`` (right length, wrong prefix) or ``'F1868 ...'`` (right
    prefix, wrong length) are kept - confirm whether ``or`` was intended.
    """
    if pd.isnull(x):
        return x
    wrong_length = len(x) != 5
    if wrong_length and x[0] != 'F':
        return np.nan
    return x


def add_person_id(hr_df):
    """Create the unique PID column on the interventions dataframe.

    ``Previous_sample`` is normalised and values that cannot be sample numbers
    are blanked. The person ID is then derived from ``Previous_sample`` where
    one is given, and from the row's own ``SampleNumber`` otherwise.

    Args:
        hr_df: Interventions dataframe with ``SampleNumber`` and
            ``Previous_sample`` columns; modified in place.

    Returns:
        The same dataframe with ``Previous_sample`` cleaned and ``PID`` added.
    """
    hr_df['Previous_sample'] = hr_df['Previous_sample'].apply(fix_sample_number)
    hr_df['Previous_sample'] = hr_df['Previous_sample'].apply(rm_bad_samplenumber)
    # Start from this frame's own sample numbers (not a global dataframe)
    hr_df['PID'] = hr_df['SampleNumber']
    # Copy over SampleNumbers from Previous_sample wherever one was recorded
    no_previous = hr_df['Previous_sample'].isnull()
    hr_df['PID'] = hr_df['PID'].where(no_previous, hr_df['Previous_sample'])
    # Clean up values
    hr_df['PID'] = hr_df['PID'].apply(person_id_from_samplenumber)
    return hr_df


def _merge_other_hit(df, target_label, source_label, redundant, renames):
    """Copy *source_label* into *target_label* wherever the target starts with 'Other'."""
    # na=False: rows with no value are not 'Other' and keep what they have
    is_other = df[target_label].str.startswith('Other', na=False)
    # Assign the result back to the frame: an inplace update on the selected
    # column is not guaranteed to reach the frame itself.
    df[target_label] = df[target_label].where(~is_other, df[source_label])
    df.drop(redundant, axis=1, inplace=True)  # Remove now redundant columns
    df.rename(columns=renames, inplace=True)


def merge_ftir_drug_columns(df):
    """Collapse the FTIR hit columns into primary/secondary hit and confidence.

    For both the direct and the subtraction result, a hit starting with
    'Other' is replaced by the matching free-text column; the free-text column
    and its confidence column are then dropped, and the remaining pair is
    renamed to ``Primary_hit``/``Primary_confidence`` and
    ``Secondary_hit``/``Secondary_confidence``.

    Args:
        df: FTIR dataframe; modified in place.

    Returns:
        None.
    """
    # Copy over 'Other' substances into the main column
    _merge_other_hit(
        df,
        target_label='Substance detected',
        source_label='Compound detected',
        redundant=['Compound detected', 'Hit Confidence.1'],
        renames={'Substance detected': 'Primary_hit', 'Hit Confidence': 'Primary_confidence'},
    )
    # Column names appear to be reversed - compound now is substance!!
    _merge_other_hit(
        df,
        target_label='Compound detected (Subtraction)',
        source_label='Substance detected.1',
        redundant=['Substance detected.1', 'Hit Confidence.3'],
        renames={'Compound detected (Subtraction)': 'Secondary_hit',
                 'Hit Confidence.2': 'Secondary_confidence'},
    )

    
def clean_sample_form(df):
    """Normalise the ``SampleForm`` column to a small set of categories.

    String values are stripped and lower-cased, then every known alias is
    replaced by its category (pill, partial pill, powder, liquid, tab). Values
    that are not a known alias are left as they are after lower-casing.

    Args:
        df: DataFrame with a ``SampleForm`` column; modified in place.

    Returns:
        The same DataFrame.
    """
    column = 'SampleForm'
    sample_form_d = {
        'pill': ['Ecstasy Tablet',
                 'ecstasy pill',
                 'ecstacy pill',
                 'Non-pharmaceutical tablet (ecstasy etc)',
                 'other recreational pill',
                 'Whole pill',
                 'Other pill',
                 'Pharmaceutical'],
        'partial pill': ['Partial ecstasy pill',
                         'Partial 2C-B pill',
                         'Crushed tablet'],
        'powder': ['powder/capsule/bomb',
                   'Powder/capsule/bomb/crystal',
                   'Powder or crushed pill',
                   'Crystal, Capsule or Powder'],
        'liquid': ['*Cannabinoid liquid',
                   '*Viscous liquid',
                   'Dissolved in Propylene Glycol',
                   'Oil'],
        'tab': ['blotter', 'LSD Tab'],
    }

    def normalise(cell):
        # Only plain strings are touched; anything else passes through
        return cell.strip().lower() if type(cell) is str else cell

    # Firstly strip and lower-case the column so it matches the lower-cased aliases
    df[column] = df[column].map(normalise, na_action='ignore')

    # Flatten category -> aliases into alias -> category
    alias_to_form = {
        alias.lower(): form
        for form, aliases in sample_form_d.items()
        for alias in aliases
    }

    # Replace values
    df.replace({column: alias_to_form}, inplace=True)
    return df


def rename_values(df, columns, replace_map):
    """Translate the values of the given columns using ``replace_map``.

    String values in each listed column that exists in ``df`` are first
    stripped and lower-cased; the replacement map is then applied to the
    listed columns.

    Args:
        df: DataFrame to update; modified in place.
        columns: Column names to translate (names missing from ``df`` are
            skipped when lower-casing).
        replace_map: dict of ``{old value: new value}``.

    Returns:
        The same DataFrame.
    """
    def normalise(cell):
        return cell.strip().lower() if type(cell) is str else cell

    # Firstly strip and lower-case every listed column that is present
    present = [name for name in columns if name in df.columns]
    for name in present:
        df[name] = df[name].map(normalise, na_action='ignore')

    # One shared replacement map per listed column
    df.replace(dict.fromkeys(columns, replace_map), inplace=True)
    return df

def _fetch_drugs_map(service, sheet_id, ss_range, expected_header=None):
    """Read a two-column translation range into a lower-cased ``{name: translation}`` dict."""
    result = service.spreadsheets().values().get(spreadsheetId=sheet_id,
                                                 range=ss_range).execute()
    values = result.get('values', [])
    if expected_header is not None:
        assert values[0] == expected_header
    # Rows that do not have exactly two cells are skipped
    drugs_map = {dt[0].lower().strip(): dt[1].lower().strip() for dt in values[1:] if len(dt) == 2}
    # Drop entries that translate a name to itself - they change nothing
    return {k: v for k, v in drugs_map.items() if k != v}


def get_drugs_maps():
    """Return the (user, tester) translation tables for drug names.

    The tables are cached in ``user_drugs_map.pkl`` and ``tester_drugs_map.pkl``
    in the current folder. When both files exist they are loaded; otherwise the
    tables are read from the ``UserDrugsMap`` and ``TesterDrugsMap`` sheets and
    the cache files are written.

    Returns:
        Tuple ``(user_drugs_map, tester_drugs_map)`` of dicts mapping a
        lower-cased drug name to its lower-cased translation.
    """
    upkl = 'user_drugs_map.pkl'
    tpkl = 'tester_drugs_map.pkl'
    if os.path.isfile(upkl) and os.path.isfile(tpkl):
        # NOTE: pickle.load can execute code from the file - these caches must
        # only ever be files written by this function.
        with open(upkl, 'rb') as u, open(tpkl, 'rb') as t:
            user_drugs_map = pickle.load(u)
            tester_drugs_map = pickle.load(t)
    else:
        # Reuse the module-level service when there is one. Checking locals()
        # here could never find it, so a new service was always created.
        service = globals().get('GSHEETS_SERVICE')
        if service is None:
            service = gsheets_service()
        sheet_id = '1CgqTjdKizat-g7K7-AAuVIazQFKJ3WAAPHR-Qpa49lU'
        user_drugs_map = _fetch_drugs_map(service, sheet_id, 'UserDrugsMap!A:B',
                                          expected_header=['Drug name', 'Translation'])
        tester_drugs_map = _fetch_drugs_map(service, sheet_id, 'TesterDrugsMap!A:B')
        with open(upkl, 'wb') as u, open(tpkl, 'wb') as t:
            pickle.dump(user_drugs_map, u)
            pickle.dump(tester_drugs_map, t)
    return user_drugs_map, tester_drugs_map

def canonicalise_columns(dfs):
    """Reduce every dataframe in ``dfs`` to the columns we require.

    The catalog, ftir, mla and reagent frames are always replaced by a
    selection of the listed columns (in the listed order); the hr frame is
    only processed when it is not ``None``.

    Args:
        dfs: Object holding the ``catalog``, ``ftir``, ``mla``, ``reagent`` and
            ``hr`` dataframes; its attributes are rebound in place.
    """
    required = {
        'catalog': ['Timestamp', 'SampleNumber', 'Tester', 'SampleSource', 'SoldAs', 'AlreadyTried',
                    'UserSuspicion', 'SampleForm', 'What is the logo?', 'Pill_mass_mg',
                    'What is the shape of the pill?', 'Is a breakline present?', 'What colour is the pill?'],
        'ftir': ['Timestamp', 'SampleNumber', 'Tester', 'SoldAs', 'SampleForm', 'AlreadyTried', 'UserSuspicion',
                 'Substance detected', 'Hit Confidence', 'Compound detected', 'Hit Confidence.1',
                 'Compound detected (Subtraction)', 'Hit Confidence.2', 'Substance detected.1',
                 'Hit Confidence.3', 'Substance(s) detected', 'Powder_strength'],
        'mla': ['SampleNumber', 'Tester', 'SampleForm', 'Logo', 'Colour', 'FTIR_hit1', 'MDMA / tablet (mg)',
                'Percent_MDMA'],
        'reagent': ['Timestamp', 'SampleNumber', 'Tester', 'SoldAs', 'SampleForm', 'Froehde',
                    'Froehde possible substances', 'Liebermann', 'Liebermann possible substances',
                    'Marquis', 'Marquis possible substances', 'Mandelin', 'Mandelin possible substances',
                    'Ehrlich', 'Likely drug or class', 'Analysis required', 'Substance(s) detected',
                    'Powder_strength', 'Pill Strength in mg (if known)', 'Matches_soldas'],
    }
    hr_required = ['Timestamp', 'SampleNumber', 'HR_worker', 'Gender', 'Ethnicity', 'Age',
                   'Number of people present', 'When was the last time you used this service?',
                   'Previous_sample', 'Had_alcohol_today', 'DRINK 1 - Type', 'DRINK 1: Vessel',
                   'DRINK 1: Quantity', 'DRINK 2 - Type', 'DRINK 2: Vessel', 'DRINK 2: Quantity',
                   'DRINK 3 - Type', 'DRINK 3: Vessel', 'DRINK 3: Quantity', 'DRINK 4 - Type',
                   'DRINK 4: Vessel', 'DRINK 4: Quantity', 'Had_other_illegal_drugs', 'Prescribed Medication',
                   'Over The Counter Medication', 'Feeling_concerns', 'Had_amphetamine', 'Had_cannabis',
                   'Had_n2o', 'Had_cocaine', 'Had_ecstascy_pill', 'Had_mdma_crystal', 'Had_ketamine', 'Had_lsd',
                   'Had_2cb', 'Had_magic_mushrooms', 'Had_mephedrone', 'Had_spice', 'Had_codeine',
                   'Had_tramadol', 'Had_heroin_or_opioids', 'Had_valium_or_benzos', 'Had_xanax',
                   'Had_unknown_power', 'Had_other_drugs', 'SoldAs', 'AlreadyTried',
                   'When did you first use this batch?', 'Bad_experience_with_batch',
                   'Was the sample bought, given or found?', 'Where did you obtain this substance from?',
                   'In general terms, who did you get the substance from?', 'Trust_supplier',
                   'Why did you bring this substance to be tested?',
                   'Have you ever accessed a health service for your alcohol or drug use?',
                   'Require further advice', 'Plan to do?',
                   'What other actions will you do? (tick all that apply)',
                   'Interview_abandonded']

    # Same order as before: catalog, ftir, mla, reagent, then hr if present
    for attr in ('catalog', 'ftir', 'mla', 'reagent'):
        setattr(dfs, attr, getattr(dfs, attr)[required[attr]])
    if dfs.hr is not None:
        dfs.hr = dfs.hr[hr_required]

In [4]:
# Clean the cells: reload the raw import, then normalise every festival's frames
import pickle
with open('foo_multi.pkl', 'rb') as f:
    data = pickle.load(f)

user_drugs_map, tester_drugs_map = get_drugs_maps()
user_drug_columns = ["Had_other_drugs", 'SoldAs', 'UserSuspicion']
tester_drug_columns = ['Primary_hit', 'Secondary_hit', 'FTIR_final_result', 'Substance(s) detected']

for festival, dfs in data.items():
    print("FESTIVAL ",festival)

    canonicalise_columns(dfs)

    # Clean up ftir sheet
    merge_ftir_drug_columns(dfs.ftir)

    if dfs.hr is not None:
        # Add person ID to the interventions frame
        dfs.hr = add_person_id(dfs.hr)

    for _name in ('catalog', 'ftir', 'mla'):
        setattr(dfs, _name, clean_sample_form(getattr(dfs, _name)))

    # Translate drug names: user-entered columns first, then tester-entered ones
    _targets = ['catalog', 'ftir', 'mla', 'reagent']
    if dfs.hr is not None:
        _targets.append('hr')
    for _name in _targets:
        setattr(dfs, _name, rename_values(getattr(dfs, _name), user_drug_columns, user_drugs_map))
        setattr(dfs, _name, rename_values(getattr(dfs, _name), tester_drug_columns, tester_drugs_map))

print("Finished cleaning dataframes at %s" % now())

with open('latest.pkl', 'wb') as w:
    pickle.dump(data, w)

FESTIVAL  boomtown
!!! Bad ID 'TESTED SOME OTHER STUFF YESTERDAY THAT WASNT WHAT I THOUGHT'
!!! Bad ID 'KEPT THE TICKET'
!!! Bad ID 'F1868 GJ:THIS HAS TO BE INCORRECT. 1868 WAS GIVEN RESULTS MUCH LATER IN THE DAY'
!!! Bad ID 'G1879'
FESTIVAL  boardmasters
FESTIVAL  made
FESTIVAL  sw4
FESTIVAL  lostvillage
FESTIVAL  bestival
FESTIVAL  ynot
FESTIVAL  truckfest
FESTIVAL  lstd
FESTIVAL  kc
FESTIVAL  parklife
Finished cleaning dataframes at 17/12/18 12:14:03


In [31]:
# Merge of all data
# Reload the cleaned per-festival frames from 'latest.pkl'.
# NOTE: pickle.load can execute code contained in the file, so only load
# pickles that were produced by this notebook.
with open('latest.pkl', 'rb') as f:
    data = pickle.load(f)

def _is_blank(value):
    """Return True for None/NaN and for empty or whitespace-only strings."""
    if isinstance(value, str):
        return not value.strip()
    return bool(pd.isnull(value))


def calculate_final_results(df):
    """Add the ``Final_result_calculated`` and ``As_expected`` columns.

    The final result is the reagent result when that identifies something,
    otherwise the FTIR result (``ftir_Substance(s) detected``, falling back to
    ``ftir_Primary_hit`` when blank). ``As_expected`` is 'Yes'/'No' depending
    on whether the final result equals ``catalog_SoldAs``, and is null when
    the sample was found, when what it was sold as is blank or unknown, or
    when the sample source is not 'Public'.

    Args:
        df: Merged dataframe; modified in place. ``catalog_SoldAs`` is
            lower-cased and stringified as a side effect.

    Returns:
        The same dataframe.
    """
    ftir_col = 'ftir_Substance(s) detected'
    reagent_col = 'reagent_Substance(s) detected'
    no_active = 'no active component identified'

    # Where 'ftir_Substance(s) detected' is blank we use the ftir_Primary_hit
    ftir_blank = df[ftir_col].map(_is_blank).astype(bool)
    df[ftir_col] = df[ftir_col].where(~ftir_blank, df['ftir_Primary_hit'])

    # The reagent result only overrides FTIR when it names something. Compare
    # case-insensitively (values are lower-cased when the drug names are
    # translated) and treat a blank cell like a missing one.
    reagent_unhelpful = df[reagent_col].map(
        lambda v: _is_blank(v) or (isinstance(v, str) and v.strip().lower() == no_active)
    ).astype(bool)
    final_result = df[ftir_col].where(reagent_unhelpful, df[reagent_col])

    # Record blanks BEFORE stringifying: astype(str) turns NaN/None into the
    # strings 'nan'/'None', which a later missing-value test can no longer see.
    sold_as_blank = df['catalog_SoldAs'].map(_is_blank).astype(bool)

    # Need to lowercase for comparison
    df['Final_result_calculated'] = final_result.astype(str).str.lower()
    df['catalog_SoldAs'] = df['catalog_SoldAs'].astype(str).str.lower()

    # Calculate where they do/don't match
    matches = df['Final_result_calculated'] == df['catalog_SoldAs']
    df['As_expected'] = matches.map({True: 'Yes', False: 'No'})

    # Guy 28/10/18: 'As_expected' should be null whenever the sample is found,
    # when the submission "acquired as" data is blank or unknown, or when the sample is from Amnesty
    unknown_origin = sold_as_blank | df['catalog_SoldAs'].isin(['found', 'found or otherwise not known'])
    not_public = df['catalog_SampleSource'] != 'Public'
    # jmht - could check against: 'hr_Was the sample bought, given or found?
    df.loc[unknown_origin | not_public, 'As_expected'] = np.nan
    return df


for festival, dfs in data.items():
    print("FESTIVAL ",festival)

    _sources = ['catalog', 'ftir', 'mla', 'reagent']
    if dfs.hr is not None:
        _sources.append('hr')

    for _source in _sources:
        _frame = getattr(dfs, _source)
        # Rename columns to identify source dataframe (the join key keeps its name)
        _frame.columns = [name if name == 'SampleNumber' else _source + '_' + name
                          for name in _frame.columns]
        # Remove all but the last of any duplicate SampleNumber
        setattr(dfs, _source, _frame[~_frame['SampleNumber'].duplicated(keep='last')])
        del _frame

    # First outer join on catalog/ftir to make sure we collect all possible information - this will result in
    # some rows where there was no catalog data, only ftir data, but this is ok as when we merge with hr we will
    # throw away any row that doesn't have a corresponding sample number in HR. This way, even if catalog data is
    # missing, we still get the FTIR data, which may be enough for our purposes
    df_all = pd.merge(dfs.catalog, dfs.ftir, how='outer', on=['SampleNumber'])
    # Add in mla and reagent data - only for when there are existing sample numbers
    for _extra in (dfs.mla, dfs.reagent):
        df_all = pd.merge(df_all, _extra, how='left', on=['SampleNumber'])
    if dfs.hr is not None:
        # inner join -> merge only where there are matching sample numbers
        df_all = pd.merge(df_all, dfs.hr, how='inner', on=['SampleNumber'])

    # Calculate final result
    dfs.combined = calculate_final_results(df_all)

    # Add unique columns
    dfs.combined.insert(loc=0, column='Festival', value=festival)
    _uid = dfs.combined[['Festival', 'SampleNumber']].apply(lambda x: '_'.join(x), axis=1)
    dfs.combined.insert(loc=1, column='UID', value=_uid)
    dfs.combined['Version'] = VERSION

print("Finished merging first round at %s" % now())

# with open('foo_multi2.pkl','wb') as w:
#     pickle.dump(data, w)

FESTIVAL  boomtown
FESTIVAL  boardmasters
FESTIVAL  made
FESTIVAL  sw4
FESTIVAL  lostvillage
FESTIVAL  bestival
FESTIVAL  ynot
FESTIVAL  truckfest
FESTIVAL  lstd
FESTIVAL  kc
FESTIVAL  parklife
Finished merging first round at 17/12/18 21:14:34


In [32]:
# Pickle file that the final merged dataframe is written to
pkl_file = 'final2018.pkl'
# # import pickle
# with open(pkl_file, 'rb') as f:
#     df_final = pickle.load(f)


def columns_strip_and_to_lower_case(df):
    """Lower-case and strip the ``hr_Gender`` and ``hr_Ethnicity`` columns.

    Args:
        df: DataFrame containing both columns; modified in place.

    Returns:
        The same DataFrame.
    """
    targets = ['hr_Gender', 'hr_Ethnicity']

    def tidy(series):
        # Lower-case first, then trim surrounding whitespace
        return series.str.lower().str.strip()

    df.loc[:, targets] = df[targets].apply(tidy)
    return df


def add_pid(df):
    """Replace ``hr_PID`` with a ``PID`` column that is unique across festivals.

    ``PID`` is ``<Festival>_<hr_PID>`` where both values are strings and NaN
    everywhere else (in particular wherever ``hr_PID`` is null).

    Args:
        df: DataFrame with ``Festival`` and ``hr_PID`` columns; modified in
            place.

    Returns:
        The same DataFrame, with ``PID`` added and ``hr_PID`` dropped.
    """
    # Pair the two columns by name; integer positions on a label-indexed row
    # are no longer supported by pandas.
    df['PID'] = [
        "_".join((festival, person))
        if isinstance(festival, str) and isinstance(person, str)
        else np.nan
        for festival, person in zip(df['Festival'], df['hr_PID'])
    ]
    df.drop('hr_PID', axis=1, inplace=True)
    return df

def fix_ethnicities(df):
    """Collapse free-text ``hr_Ethnicity`` answers into a few broad categories.

    Values listed in the map below are replaced by their category; every other
    value is left unchanged.

    Args:
        df: DataFrame with an ``hr_Ethnicity`` column; modified in place.
    """
    ethnicity_map = { 'asian' : ['arabic', 'asian (including chinese)', 'indian', 'white asian'],
                      'black' : [],
                      'mixed_race' : ['mixed heritage'],
                      'other' : ['citizen of the world', 'glittery', 'prefer not to say'],
                      'white' : ['irish', 'portuguese brazillian', 'white (including european)', 'white other'] }

    # Flatten category -> answers into answer -> category
    replace_d = {
        possible: category
        for category, possibilities in ethnicity_map.items()
        for possible in possibilities
    }
    # Assign the result back to the frame: replace(inplace=True) on the
    # selected column is not guaranteed to update the frame itself.
    df['hr_Ethnicity'] = df['hr_Ethnicity'].replace(replace_d)


# Stack every festival's merged frame into one table and tidy the HR columns
df_final = pd.concat([d.combined for d in data.values()], ignore_index=True, sort=False)
df_final = add_pid(df_final)
df_final = columns_strip_and_to_lower_case(df_final)
fix_ethnicities(df_final)


# Reorder columns so the identifying columns come first
columns = df_final.columns.values.tolist()
upfront = ['Festival', 'UID', 'PID']
for u in upfront:
    columns.remove(u)
columns = upfront + columns
df_final = df_final[columns]

with open(pkl_file,'wb') as w:
    pickle.dump(df_final, w)

# Remove all non-HR data for Fiona
out_dir = '/Users/jmht/Dropbox/TheLoop/Testing/2018_results'
df_hr = df_final[df_final['PID'].notnull()]
filename = 'Loop2018Data_HR_%2.1f.xls' % VERSION
filepath = os.path.join(out_dir, filename)
# The context manager saves and closes the workbook even if writing fails
# part-way; ExcelWriter.save() is not available in current pandas.
# NOTE(review): writing the legacy .xls format needs an engine that recent
# pandas releases may not provide - confirm, or switch the extension to .xlsx.
with pd.ExcelWriter(filepath) as writer:
    df_hr.to_excel(writer, sheet_name='MergedData', index=False)
print(now() + " Wrote version %2.1f to file: %s" % (VERSION, filepath))

17/12/18 21:14:42 Wrote version 1.1 to file: /Users/jmht/Dropbox/TheLoop/Testing/2018_results/Loop2018Data_HR_1.1.xls


In [29]:
# print(len(df_final))
#pd.crosstab(df_final['hr_Age'], df_final['hr_Gender'], margins=True)
# pd.crosstab(df_final['hr_Had_ecstascy_pill'], df_final['hr_Gender'], margins=True)
# Cross-tabulate ethnicity against gender, including the 'All' totals (margins=True)
pd.crosstab(df_final['hr_Ethnicity'], df_final['hr_Gender'], margins=True)


hr_Gender,Unnamed: 1_level_0,female,gender fluid,male,na,queer,All
hr_Ethnicity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,11,8,0,7,0,0,26
asian,0,12,0,28,0,0,40
black,0,6,0,15,0,0,21
mixed_race,1,24,0,42,0,1,68
other,0,2,0,2,0,0,4
white,7,632,1,1291,1,0,1932
All,19,684,1,1385,1,1,2091
