In [3]:
# Module imports
import copy
import datetime
import os
import pickle

import numpy as np
import pandas as pd

def fix_sample_number(x):
    """Make sure all samples numbers are of form: AXXX (where A is one of A, F, W and X is a digit)"""
    if isinstance(x, float) and np.isnan(x):
        return x # leave NaN's alone
    if (isinstance(x, str) or isinstance(x, unicode)) and len(x) == 0:
        return np.nan
    try:
        sn = 'F{:04d}'.format(int(x))
    except ValueError:
        # Assume string so make sure it's of the right format
        sn = str(x).strip().upper()
    len_sn = len(sn)
    if not ((len_sn == 5 and sn[0] in ['A', 'F', 'W', 'B']) or (len_sn == 6 and sn[0] == 'D')):
        print("!!! Bad ID \'%s\'" % sn)
    return sn

def now():
    return datetime.datetime.now().strftime("%d/%m/%y %H:%M:%S")

def enumerate_duplicates(row):
    """Append a counter to duplicate labels"""
    SEPARATOR = '.'
    duplicates = {}
    updated_row = []
    for r in row:
        count = duplicates.get(r, 0)
        if count > 0:
            label = "{}{}{}".format(r, SEPARATOR, count)
        else:
            label = r
        updated_row.append(label)
        duplicates[r] = count + 1
    return updated_row

# Need to define in main or we can't pickle the data objects
class DataFrames(object):
    def __init__(self):
        self.catalog = None
        self.ftir = None
        self.reagent = None
        self.mla = None
        self.hr = None
        self.combined = None

In [4]:
pd.options.mode.chained_assignment = 'raise'

def gsheets_service():
    from googleapiclient.discovery import build
    from httplib2 import Http
    from oauth2client import file, client, tools
    # If modifying these scopes, delete the file token.json.
    #Ensure that the creds file is always taken from the current working folder
        #This allows two people on different PCs to merge changes more easily.
    CREDS_FILE = os.path.join(os.path.realpath('./'),'JensDataExportJupyter_client_secret.json')
    SCOPES = 'https://www.googleapis.com/auth/spreadsheets.readonly'
    store = file.Storage('token.json')
    creds = store.get()
    if not creds or creds.invalid:
        import argparse
        flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args([])
        flow = client.flow_from_clientsecrets(CREDS_FILE, SCOPES)
        creds = tools.run_flow(flow, store, flags)
    service = build('sheets', 'v4', http=creds.authorize(Http()))
    return service

def get_df(service, SPREADSHEET_ID, SS_RANGE, mla=False):
    # Call the Sheets API
    result = service.spreadsheets().values().get(spreadsheetId=SPREADSHEET_ID,
                                                range=SS_RANGE).execute()
    values = result.get('values', [])
    if not values:
        print('*** No data found ***')
        return None

    # mla has irrelevant stuff in columns 1 and 3 and sample numbers in first column
    if mla:
        values.pop(0)
        values.pop(1)
        def not_blank(row):
            return len(row[0]) > 0       
    else:
        def not_blank(row):
            return sum(map(len, row[:6])) > 0

    rows = list(filter(not_blank, values))
    if not rows:
        print('*** No data found after pruning rows! ***')
        return None
    
    columns = enumerate_duplicates(rows[0])
    ncols = len(rows[0])
    row_max = max(map(len, rows[1:]))
    width = min(ncols, row_max)
    return pd.DataFrame(rows[1:], columns=columns[:width])

def canonicalise_df(df, source=None):
    """Initial cleaning of all dataframes"""
    #from pandas._libs.tslib import OutOfBoundsDatetime
    if source:
        print("Canonicalising %s" % source)
    # Standardise names
    d = {
        'Sample Code':'SampleNumber',
        'Sample Number:':'SampleNumber',
        'Sample Number':'SampleNumber',
        'Sample number':'SampleNumber',
        'Sample Num':'SampleNumber',
        'Sample Number i.e F0XXX' : 'SampleNumber',
        
        'Sample Advertised/Acquired/Sold As' : 'SoldAs',
        'Sample Sold As' : 'SoldAs',
        'You submitted a substance for analysis. What were you told it was when you got it?':  'SoldAs',
        
        
        'Sample Source' :'SampleSource',

        'User Suspicion' :'UserSuspicion',

        'Sample Form' :'SampleForm',

        'Has the Service User or a close friend tried this batch?' : 'AlreadyTried',
        'Had you already tried this substance before getting it tested?' : 'AlreadyTried',

        'Your initials' : 'Tester',
        'Your name and first initial' : 'Tester',
        'Your name and surname initial' : 'Tester'
    }
    df.rename(columns=d, inplace=True)
    
    def fix_timestamp(x):
        return pd.to_datetime(str(x), format='%d/%m/%Y %H:%M:%S')
    if 'Timestamp' in df.columns:
        df.loc[:, 'Timestamp'] = df['Timestamp'].map(fix_timestamp)
    df.loc[:, 'SampleNumber'] = df['SampleNumber'].apply(fix_sample_number)
    df.dropna(subset=['SampleNumber'], inplace=True)
    #df.sort_values(['Sample Number'], ascending=True, inplace=True)
    # Make sure we don't have any blank columns
    if set(df.columns.values).intersection(set([np.nan, ''])):
        raise RuntimeError("Blank column names in Dataframe")
    return df

def get_data(service, SPREADSHEET_ID):

    CATALOG_RANGE = 'Catalog!A:R'
    FTIR_RANGE = 'FTIR!A:X'
    REAGENT_RANGE = 'Reagent!A:W'
    MLA_RANGE = 'MLA!A:R'
    HR_RANGE = 'Interventions!A:BJ'

    df_catalog = get_df(service, SPREADSHEET_ID, CATALOG_RANGE)
    df_catalog = canonicalise_df(df_catalog, source='catalog')
    df_ftir = get_df(service, SPREADSHEET_ID, FTIR_RANGE)
    df_ftir = canonicalise_df(df_ftir, source='ftir')
    df_reagent = get_df(service, SPREADSHEET_ID, REAGENT_RANGE)
    df_reagent = canonicalise_df(df_reagent, source='reagent')
    df_mla = get_df(service, SPREADSHEET_ID, MLA_RANGE, mla=True)
    df_mla = canonicalise_df(df_mla, source='mla')
    try:
        df_hr = get_df(service, SPREADSHEET_ID, HR_RANGE)
    except ValueError:
        df_hr = None
    if df_hr is not None:
        pass
        df_hr = canonicalise_df(df_hr, source='hr')

    df = DataFrames()
    df.catalog = df_catalog
    df.ftir = df_ftir
    df.reagent = df_reagent
    df.mla = df_mla
    df.hr = df_hr
    
    return df


In [40]:
#S how the folder where the code file is being run from        
print("Script running from: %s" % os.path.realpath(os.getcwd()))

# The ID and range of a sample spreadsheet.
BOOMTOWN2018_SPREADSHEET_ID = '1RiA-FwG_954Ger2VPsOSA3JLh-7sEoTYr40eVS0mp24'
MADE2018_SPREADSHEET_ID = '1daXdyL6uL8qnMsEsP0RLZE9nDzt6J7Zr1ygQdguvi-E'
BOARDMASTERS2018_SPREADSHEET_ID = '1U1lhUWLazDBN-wb2eZM8YV674f46npVfQK3XUVZjPow'
SW42018_SPREADSHEET_ID = '1agpMmJ9XukeWXS5_mwrDSKeshUaFtYwOzsPiR1DKsPU'
LOSTVILLAGE2018_SPREADSHEET_ID = '1OL0gyXrpZnJ8e7yR7eF6S2OaBYBiPDoVp5xGpdK4wlA'
BESTIVAL2018_SPREADSHEET_ID = '184qudGcw4PB0SMtOo0ZBDtckeGaH0RCLUXbA-u3BiHE'
YNOT2018_SPREADSHEET_ID = '1D01cj-Mra06TuoG_MsKuLq9OdtvKzrvRdiE255po_ag'
TRUCKFEST2018_SPREADSHEET_ID = '1sGG9WJxKyD2CGUjzJAXul3g9hVnRz6HbTiqKV5cUAyA'
LSTD2018_SPREADSHEET_ID = '1R8YqDnrhvuVMwPFShwaaAUIyCXQMeozA230OXsFsDQM'
KENDALCALLING2018_SPREADSHEET_ID = '16-PfwBOaUxwod3X75LGk1VAjBblkNsTJpCsX825aghI'
PARKLIFE2018_SPREADSHEET_ID = '1oO5sHcUhUn_7M1Hap73sOZHNEfWFMcDkQuWDRFf4d-w'


data = {}
service = gsheets_service()
print("PROCESSING BOOMTOWN")
data['boomtown'] = get_data(service, BOOMTOWN2018_SPREADSHEET_ID)
print("PROCESSING BOARDMASTERS")
data['boardmasters'] = get_data(service, BOARDMASTERS2018_SPREADSHEET_ID)
print("PROCESSING MADE")
data['made'] = get_data(service, MADE2018_SPREADSHEET_ID)
print("PROCESSING SW4")
data['sw4'] = get_data(service, SW42018_SPREADSHEET_ID)
print("PROCESSING LOST VILLAGE")
data['lostvillage'] = get_data(service, LOSTVILLAGE2018_SPREADSHEET_ID)
print("PROCESSING BESTIVAL")
data['bestival'] = get_data(service, BESTIVAL2018_SPREADSHEET_ID)
print("PROCESSING YNOT")
data['ynot'] = get_data(service, YNOT2018_SPREADSHEET_ID)
print("PROCESSING TRUCKFEST")
data['truckfest'] = get_data(service, TRUCKFEST2018_SPREADSHEET_ID)
print("PROCESSING LSTD")
data['lstd'] = get_data(service, LSTD2018_SPREADSHEET_ID)
print( "PROCESSING KENDAL CALLING")
data['kc'] = get_data(service, KENDALCALLING2018_SPREADSHEET_ID)
print("PROCESSING PARKLIFE")
data['parklife'] = get_data(service, PARKLIFE2018_SPREADSHEET_ID)

with open('foo_multi.pkl','wb') as w:
    pickle.dump(data, w)

Script running from: /opt/random
PROCESSING BOOMTOWN
Canonicalising catalog
!!! Bad ID 'TF0579'
!!! Bad ID 'TF1665'
!!! Bad ID 'TF1660'
Canonicalising ftir
!!! Bad ID 'TF1665'
Canonicalising reagent
Canonicalising mla
Canonicalising hr
!!! Bad ID 'FXXX'
!!! Bad ID 'TF0653'
!!! Bad ID 'TF1172'
!!! Bad ID 'TF1762'
PROCESSING BOARDMASTERS
Canonicalising catalog
Canonicalising ftir
Canonicalising reagent
Canonicalising mla
Canonicalising hr
PROCESSING MADE
Canonicalising catalog
Canonicalising ftir
!!! Bad ID 'XF0005'
Canonicalising reagent
Canonicalising mla
Canonicalising hr
PROCESSING SW4
Canonicalising catalog
Canonicalising ftir
Canonicalising reagent
Canonicalising mla
PROCESSING LOST VILLAGE
Canonicalising catalog
Canonicalising ftir
Canonicalising reagent
Canonicalising mla
PROCESSING BESTIVAL
Canonicalising catalog
Canonicalising ftir
Canonicalising reagent
Canonicalising mla
Canonicalising hr
!!! Bad ID 'P1000'
!!! Bad ID 'F20005'
!!! Bad ID 'G9998'
PROCESSING YNOT
Canonicalising

In [3]:
import pickle
with open('foo_multi.pkl', 'rb') as f:
    data = pickle.load(f)

In [4]:
# # Merge of all data
# import pickle
# with open('foo_multi.pkl', 'rb') as f:
#     data = pickle.load(f)
for festival, dfs in data.items():
    # Rename columns to identify source dataframe
    dfs.catalog.columns = ['catalog_'+ name if name != 'SampleNumber' else name for name in dfs.catalog.columns]
    dfs.ftir.columns = ['ftir_'+ name if name != 'SampleNumber' else name for name in dfs.ftir.columns]
    dfs.mla.columns = ['mla_'+ name if name != 'SampleNumber' else name for name in dfs.mla.columns]
    if dfs.hr is not None:
        dfs.hr.columns = ['hr_'+ name if name != 'SampleNumber' else name for name in dfs.hr.columns]

    # Remove all but the last of any duplicate SampleNumber
    # want a list of all but the last duplicates
    mask = ~dfs.catalog['SampleNumber'].duplicated(keep=False) | ~dfs.catalog['SampleNumber'].duplicated(keep='last')
    dfs.catalog = dfs.catalog[mask]
    mask = ~dfs.ftir['SampleNumber'].duplicated(keep=False) | ~dfs.ftir['SampleNumber'].duplicated(keep='last')
    dfs.ftir = dfs.ftir[mask]
    mask = ~dfs.mla['SampleNumber'].duplicated(keep=False) | ~dfs.mla['SampleNumber'].duplicated(keep='last')
    dfs.mla = dfs.mla[mask]
    if dfs.hr is not None:
        mask = ~dfs.hr['SampleNumber'].duplicated(keep=False) | ~dfs.hr['SampleNumber'].duplicated(keep='last')
        dfs.hr = dfs.hr[mask]

    # First outer join on catalog/ftir to make sure we collect all possible information - this will result in
    # some rows where there was no catalog data, only ftir data, but this is ok as when we merge with hr we will
    # throw away any row that doesn't have a corresponding sample number in HR. This was even if catalog data is
    # missing, we still get the FTIR data, which may be enough for our purposes
    df_all = pd.merge(dfs.catalog, dfs.ftir, how='outer', on=['SampleNumber'])
    # Add in mla data - only for when there are existing sample numbers
    df_all = pd.merge(df_all, dfs.mla, how='left', on=['SampleNumber'])
    if dfs.hr is not None:
        # inner join -> merge only where there are matching sample numbers
        df_all = pd.merge(df_all, dfs.hr, how='inner', on=['SampleNumber'])
    dfs.combined = df_all
    
    #


In [30]:
# This cell cleans the "sample form" field 
def clean_sample_form(df):
    sample_form_d = { 'pill' : ['Ecstasy Tablet',
                                'ecstasy pill',
                                'ecstacy pill',
                                'Non-pharmaceutical tablet (ecstasy etc)',
                                'other recreational pill',
                                 'Whole pill',
                                'Other pill',
                                'Pharmaceutical'],
                      'partial pill' : ['Partial ecstasy pill',
                                        'Partial 2C-B pill',
                                        'Crushed tablet'],
                      'powder' : ['powder/capsule/bomb',
                                  'Powder/capsule/bomb/crystal',
                                  'Powder or crushed pill',
                                  'Crystal, Capsule or Powder'],
                      'liquid' : ['*Cannabinoid liquid',
                                   '*Viscous liquid',
                                  'Dissolved in Propylene Glycol',
                                  'Oil'],
                       'tab' : ['blotter', 'LSD Tab']
                      }


    # Firstly convert all columns to lower case and remove any spaces
    def lower(value):
        if type(value) is str:
            value = value.strip().lower()
        return value

    for column in ['SampleForm']:
        df[column] = df[column].map(lower, na_action='ignore')
    
    replace_d = {}
    for column in ['SampleForm']:
        replace_d[column] = {}
        for drug, names in sample_form_d.items():
            for name in names:
                replace_d[column][name.lower()] = drug
    
    # Replace values
    df.replace(replace_d, inplace=True)
    return df
    
dfs.catalog = clean_sample_form(dfs.catalog)
dfs.ftir = clean_sample_form(dfs.ftir)
dfs.reagent = clean_sample_form(dfs.reagent)
print("Finished cleaning 'Sample form' field at %s" % now())


Finished cleaning 'Sample form' field at 07/11/18 21:51:18


In [9]:
# Get the drugs map
if 'service' not in locals():
    service = gsheets_service()
sheet_id = '1CgqTjdKizat-g7K7-AAuVIazQFKJ3WAAPHR-Qpa49lU'
ss_range = 'FastUserdrugsMap!A:B'
result = service.spreadsheets().values().get(spreadsheetId=sheet_id,
                                            range=ss_range).execute()
values = result.get('values', [])
assert values[0] == ['Drug name', 'Translation']
user_drugs_map = { dt[0] : dt[1] for dt in values[1:] if len(dt) == 2 }
print(user_drugs_map)

ss_range = 'FastTesterdrugsMap!A:B'
result = service.spreadsheets().values().get(spreadsheetId=sheet_id,
                                            range=ss_range).execute()
values = result.get('values', [])
tester_drugs_map = { dt[0] : dt[1] for dt in values[1:] if len(dt) == 2 }
print(tester_drugs_map)


{'1 alpha alpha alpha trifluoro m tolyl piperazine': 'TFMPP', '2-cb': '2c-b', '2-ce': '2c-e', '250mg mdma': 'MDMA', '25i': '25i-nbome', '25i 3 year': '25i-nbome', '25i-nbome': '25i-nbome', '2c-b': '2c-b', '2c-b (or 2c-b derivative such as 2ci)': '2c-b', '2c-b-fly': '2c-b-fly', '2c-e': '2c-e', '2c/b': '2c-b', '2cb': '2c-b', '2cb pill': '2c-b', '2ce,': '2C-E', '2ci': '2C-I', '3-meo-pcp': '3-meo-pcp', '3meo pcp': '3-meo-pcp', '4-chloroethcathinone': '4-CEC', '4-ho-met,': '4-HO-MET', '4fa': '4-FA', 'aderol': 'amphetamine', 'amphetamine': 'amphetamine', 'asborbic acid (vitamin c)': 'vitamin C', 'ascorbic acid (vitamin c)': 'vitamin c', 'bag that was found': 'found', 'baking powder': 'sodium bicarbonate', 'baking power': 'sodium bicarbonate', 'baking soda (sodium bicarbonate)': 'sodium bicarbonate', 'caffeine': 'caffeine', 'chloroquine': 'chloroquine', 'chloroquine confirm with reagent test': 'chloroquine', 'choroquine': 'chloroquine', 'coacaine': 'cocaine', 'coc': 'cocaine', 'cocaime': 'coc

{'natural yellow 11': 'Binder', '1 alpha alpha alpha trifluoro m tolyl piperazine': 'TFMPP', '2c-b': '2C-B', '3,4-methylenedioxyamphetamine': 'MDA', '3-meo-pcp': '3-meo-pcp', '4-cec': '4-chloroethcathinone', '4-chloroethcathinone': '4-chloroethcathinone', '4-me-dimethylcathinone': '4-me-dimethylcathinone', '4-methyl-n-ethylpentedrone (4-methyl-nep)': '4-methyl-n-ethylpentedrone (4-methyl-nep)', '5-meo-mipt': '5-meo-mipt', '6-apb': '6-apb', 'alprazolam': 'Alprazolam', 'amphetamine': 'Amphetamine', 'ascorbic  acid': 'Ascorbic acid', 'ascorbic acid - vitamin c': 'Ascorbic acid', 'benzocaine': 'Benzocaine', 'bk-dmbdb': 'bk-dmbdb', 'boron trioxide': 'Boric Acid', 'caffeine': 'caffeine', 'caffiene': 'caffeine', 'cathinone (n-ethylamino-hexanophenone)': 'Hex-en', 'chloroquine': 'chloroquine', 'cocaine': 'cocaine', 'coconut oil': 'Coconut oil', 'coryzalia': 'coryzalia', 'creatine': 'creatine', 'dibenzoyl peroxide': 'dibenzoyl peroxide', 'dicyclohexyl phthalate': 'dicyclohexyl phthalate', 'dime

In [9]:
AMPHETAMINE = 'amphetamine'
BENZODIAZEPINE = 'benzodiazepine'
COCAINE = 'cocaine'
FOUND = 'found'
KETAMINE = 'ketamine'
LSD = 'lsd'
MEPHEDRONE = 'mephedrone'
MDMA = 'mdma'
NETHYLPENTYLONE = 'n-ethylpentylone'
PSYCHEDELIC = 'psychedelic'
TWOCB = '2cb'
UNKNOWN = 'unknown'

drugs_map = { 
    AMPHETAMINE : ['speed', 'Speed', 'base/speed', 'adderall'],
    BENZODIAZEPINE : ['chinese valium', ],
    COCAINE : ['coke', 'cut cocaine'],
    FOUND : ['unknow found'],
    KETAMINE : ['?ket', '/ketamie', 'maybe ketamine?', 'katamine', 'vanila ketamine', 
                'vetamine', 'not mdma. ketamine?', 'ketamoine'],
    LSD : ['acid', 'liquid lsd'],
    MEPHEDRONE : ['meow meow', 'mcat'],
    MDMA : ['mdxx', 'mda/mdea/mdma', 'mdma,', 'mandy', 'probaby mdma', 'mdma?', 'mdma with caffeine',
           '3/4 of pill green shooting star', 'ecstacy', 'ecstacy pill', 'ecstasy',
            'ecstasy pill', 'esctacy pill sample', 'estacy pill', 'pill'],
    #NETHYLPENTYLONE : ['n-ethylpentylone'],
    PSYCHEDELIC : [ '4-aco dmt', '4-aco-dmt', '4aco', '4acodmt', '5meomipit', 'dmt_2cb', 'dmt', 'ayahuasca'],
    TWOCB : ['2 cb', '2c-b'],
    UNKNOWN : ['unknown pill', 'unsure', 'unsure - maybe dmt', 'unsure of content', 
               'no effect', 'no idea', 'data missing', ''],   
    }




In [56]:
drugs = set()
for festival, dfs in data.items():
    columns.update(dfs.combined.columns.values)
for c in sorted(columns):
    print(c)

"Strength" of powdered substance
% MDMA content
Actual filename
After hearing today’s test results & advice, what do you plan to do?
After our conversation today, would you like to have any further advice or support from a treatment service for your alcohol or drug use?
Age
AlreadyTried
AlreadyTried_x
AlreadyTried_y
Analysis required
Are you currently taking any "Over the Counter" medication?
Are you currently taking any prescribed medication?
Brief Note
Brief Note.1
Colour
Compound detected
Compound detected (Subtraction)
DRINK 1 - Type
DRINK 1: Quantity
DRINK 1: Vessel (focus on size not drink type)
DRINK 2 - Type
DRINK 2: Quantity
DRINK 2: Vessel (focus on size not drink type)
DRINK 3 - Type
DRINK 3: Quantity
DRINK 3: Vessel (focus on size not drink type)
DRINK 4 - Type
DRINK 4: Quantity
DRINK 4: Vessel (focus on size not drink type)
Do you have any concerns about how you are feeling at the moment?
Do you trust the supplier of this substance (in terms of the substance)?
Does the sub

In [7]:
# Get list of already known drugs
# all_known = set()
# all_known.update(*drugs_map.values())

user_drug_columns = ["Have you ever taken any other drugs I didn't mention?", 'SoldAs', 'UserSuspicion', 'Substance(s) detected']

tester_drug_columns = ['Compound detected', 'Compound detected (Subtraction)', 'Substance detected',
                'Substance detected (subtraction)', 'Substance detected.1' ]



# prefixes = ['catalog_', 'ftir_', 'mla_', 'hr_']
# all_drug_columns = []
# for d in drug_columns:
#     for p in prefixes:
#         all_drug_columns.append(p+d)

drug_names = set()
for festival, dfs in data.items():
    df = dfs.combined
    for cname in user_drug_columns:
        if cname in df.columns.values:
            drug_names.update(df[cname].str.lower().unique())
if np.nan in drug_names:
    drug_names.remove(np.nan)
if None in drug_names:
    drug_names.remove(None)
drug_names = set([s.strip() for s in drug_names])

for d in sorted(drug_names):
    print("'%s'" % d)

''
'1 alpha alpha alpha trifluoro m tolyl piperazine'
'1 pill 2 powder'
'1plsd - years ago'
'2 5 i, 4acod, nmdmt'
'2-cb'
'2-ce'
'25-i 2yrs ago, salvia 2yrs,'
'25-me0 - few years mxe - 2 years ago'
'250mg mdma'
'250mg white mitsubushi'
'25b _ 2 years'
'25i'
'25i 3 year'
'25i-nbombe, 4 aco dmt, n,n-dmt'
'25i-nbome'
'25i-nbome, changa, dmt'
'2c-b'
'2c-b (or 2c-b derivative such as 2ci)'
'2c-b-fly'
'2c-e'
'2c/b'
'2cb'
'2cb pill'
'2ce,'
'2ci'
'2ci, 25i'
'2ci, nicotine'
'3-meo-pcp'
'3-methylcyclohexanol'
'3meo pcp'
'3mmt'
'4 aco dmt - last year'
'4 aco dmt, 2ce, 2ci,'
'4-aco-dmt'
'4-chloroethcathinone'
'4-ho-met,'
'4aco dmt'
'4fa'
'4ho-met (synthesised psilocybin), dmt, 2ci, salvia, mxe'
'4meo - 3 years'
'5-meo-mipt'
'6-apb'
'?4mm'
'a pill'
'a0256'
'acid'
'acne medication'
'aderol'
'al-lad'
'alcohol'
'alcohol nicotine'
'alcohol,  5-meo-mipt (moxy)'
'alcohol, bromo-dragonfly,'
'alcohol, nicotine'
'alcohol, nicotine,'
'alcohol, nicotine, poppers'
'ald-52'
'all tests negative, tests are unable 

In [102]:
# Here we overwrite the values - if necessary we could create separate columns
# Create dict for replace function is form {column : {value_to_replace, replacement_value}}
replace_d = {}
drug_columns = ['sold/acquired/advertised as', 'Client suspicion', 'Final Result', 'SubmittedSubstanceAs', 'other_specify']

# Firstly convert all columns to lower case and remove any spaces
def clean(value):
    if type(value) is str:
        value = value.strip().lower()
    return value

for column in drug_columns:
    df_final[column] = df_final[column].map(clean, na_action='ignore')

for column in drug_columns:
    replace_d[column] = {}
    for drug, names in drugs_map.items():
        for name in names:
            replace_d[column][name] = drug

# Replace values
df_final.replace(replace_d, inplace=True)
            
# NO_ANALYSIS as is treated separtely as only applies to Final Result - also can't include with other dict
# or the replacement values and keys overlap
NO_ANALYSIS = 'analysis_inconclusive'
no_analysis = ['compound not in library', 'inconclusive', 'insufficient quantity for testing', 
               'insufficient sample', 'insufficient sample', 'lost', 'no active component identified', 
               'no match', 'no match', 'none', 'nothing detected', 'result missing', 'unable to test', 'unknown']

# Fix 'Final Result' for NO_ANALYSIS
column = 'Final Result'
replace_d = {column: {}}
for name in no_analysis:
    replace_d[column][name] = NO_ANALYSIS

# Replace values
df_final.replace(replace_d, inplace=True)

# Additional grouping requested by Fiona
column = 'sold/acquired/advertised as'
replace_d = {column: {'found' : 'unknown',
                      "don't know" : 'unknown',
                      'not sure' : 'unknown',
                     }}
df_final.replace(replace_d, inplace=True)

['SampleNumber',
 'catalog_SampleSource',
 'catalog_SampleForm',
 'ftir_SampleForm',
 'mla_SampleForm',
 'hr_What was your first sample number at this event? Did you take a photo or keep the ticket?',
 'hr_Was the sample bought, given or found?']

In [4]:
# The code above runs across all festival data. The code below is for looking at the data for an individual
# festival (and so really should be in a function), but for the time being we just set the variables we
# require here
dfs = data['boomtown']
duplicates = Duplicates(dfs)

### 43 duplicated ftir SampleNumbers ['F0071', 'F0247', 'F0206', 'F0367', 'F0019', 'F0546', 'F0446', 'F0005', 'F0659', 'F1137', 'F0983', 'F0938', 'F0869', 'F0838', 'F0981', 'F0865', 'F0668', 'F0816', 'F0878', 'F0885', 'F0833', 'F0815', 'F1196', 'F1253', 'F1313', 'F1393', 'F1215', 'F1392', 'F1640', 'F1172', 'F1606', 'F1433', 'F1431', 'F1609', 'F1660', 'F1623', 'F1792', 'F0912', 'F1876', 'F1830', 'F1262', 'F1439', 'F1904'] ###
### 1 duplicated hr SampleNumbers ['F9999'] ###


In [32]:
def find_duplicate_matches(duplicates, df1, df2, df1_name='DataFrame1', df2_name='DataFrame2'):
    hr = False
    if df1_name.lower()[:2] == 'hr':
        hr = True
    duplicate_matches = {}
    min_stage_delay = 60 * 1
    max_stage_delay = 60 * 60
    for sample_number in duplicates:
        duplicate_matches[sample_number] = {}
        for df1_idx, df1_row in df1.loc[df1['SampleNumber'] == sample_number].iterrows():
            for df2_idx, df2_row in df2.loc[df2['SampleNumber'] == sample_number].iterrows():
                df1_data = df1_row.loc[['SoldAs', 'AlreadyTried']].values.tolist()
                if not hr:
                    df1_data.append(df1_row.SampleForm)
                df1_time = df1_row.Timestamp
                df2_data = df2_row.loc[['SoldAs', 'AlreadyTried']].values.tolist()
                if not hr:
                    df2_data.append(df2_row.SampleForm)
                df2_time = df2_row.Timestamp
                if df2_time >= df1_time:
                    delta_t = (df2_time - df1_time).seconds
                else:
                    delta_t = (df1_time - df2_time).seconds
                if df1_data == df2_data and min_stage_delay < delta_t <= max_stage_delay:
                    print("Duplicate %s SampleNumber %s (line: %d) MATCHES %s sample (line: %d)" % \
                          (df1_name, sample_number, df1_idx + 1, df2_name, df2_idx + 1))
                    duplicate_matches[sample_number][df1_idx] = True
                else:
                    print("Duplicate %s SampleNumber %s (line: %d) DIFFERENT %s sample (line: %d)\n%s %s\n%s %s" % \
                          (df1_name, sample_number, df1_idx + 1, df2_name, df2_idx + 1,
                           df1_data, df1_time,
                           df2_data, df2_time))
                    duplicate_matches[sample_number][df1_idx] = False
    return duplicate_matches

def match_orphans_to_duplicates(df1_orphans, duplicate_matches, df1, df2):
    for orphan_sample_number in df1_orphans:
        df1_data = df1.loc[df1['SampleNumber'] == orphan_sample_number, ['SampleForm', 'SoldAs', 'AlreadyTried', 'Timestamp']]
        df1_data = df1_data.values.tolist()[0]
        df1_time = df1_data.pop()
        for sample_number, indexd in duplicate_matches.items():
            for k, v in indexd.items():
                if not v:
                    df2_data = dfs.catalog.iloc[k][['SampleForm', 'SoldAs', 'AlreadyTried', 'Timestamp']].values.tolist()
                    df2_time = df2_data.pop()
                    if df2_time >= df1_time:
                        delta_t = (df2_time - df1_time).seconds
                    else:
                        delta_t = (df1_time - df2_time).seconds
                    if df2_data == df1_data and min_stage_delay < delta_t <= max_stage_delay:
                        print("Orphan {} could be match for duplicate {} (line: {})\n{} {}\n{} {}".format(orphan_sample_number,
                                                                                                  sample_number, k+1,
                                                                                                  df2_data, df2_time,
                                                                                                  df1_data, df1_time))


#duplicate_matches = find_duplicate_matches(duplicates.catalog, dfs.catalog, dfs.ftir, df1_name='Catalog', df2_name='FTIR')
duplicate_matches = find_duplicate_matches(duplicates.ftir, dfs.ftir, dfs.catalog, df1_name='FTIR', df2_name='Catalog')
#duplicate_matches = find_duplicate_matches(duplicates.hr, dfs.hr, dfs.catalog, df1_name='HR', df2_name='Catalog')

#match_orphans_to_duplicates(ftir_orphan, duplicate_matches, dfs.ftir, dfs.catalog)
# match_orphans_to_duplicates(catalog_orphan, duplicate_matches, dfs.catalog, dfs.ftir)

Duplicate FTIR SampleNumber B0103 (line: 28) DIFFERENT Catalog sample (line: 36)
['Found or otherwise not known', '', 'powder'] 2018-08-25 15:56:29
['Found or otherwise not known', 'No', 'powder'] 2018-08-25 15:02:33
Duplicate FTIR SampleNumber B0103 (line: 29) DIFFERENT Catalog sample (line: 36)
['Found or otherwise not known', '', 'pill'] 2018-08-25 15:58:35
['Found or otherwise not known', 'No', 'powder'] 2018-08-25 15:02:33
Duplicate FTIR SampleNumber B0131 (line: 50) MATCHES Catalog sample (line: 55)
Duplicate FTIR SampleNumber B0131 (line: 51) MATCHES Catalog sample (line: 55)
Duplicate FTIR SampleNumber B0221 (line: 136) DIFFERENT Catalog sample (line: 132)
['Found or otherwise not known', 'No', 'powder'] 2018-08-26 15:42:33
['', '', 'powder'] 2018-08-26 15:30:58
Duplicate FTIR SampleNumber B0221 (line: 137) DIFFERENT Catalog sample (line: 132)
['Found or otherwise not known', 'No', 'powder'] 2018-08-26 15:58:20
['', '', 'powder'] 2018-08-26 15:30:58
Duplicate FTIR SampleNumber 

In [98]:
# Check orphans against the FTIR sheet using just their numbers
def match_orphans_with_sample_integer(orphans, orphan_df, ref_df):
#     min_stage_delay = 60 * 1
#     max_stage_delay = 60 * 60
    def to_int(sn):
        if type(sn) is str:
            try:
                sn = int(sn[-4:])
            except ValueError:
                print("Bad SampleNumber %s" % sn)
                sn = np.nan
        return sn
    orphan_df['SampleInteger'] = orphan_df['SampleNumber'].apply(to_int)
    ref_df['SampleInteger'] = ref_df['SampleNumber'].apply(to_int)
    orphan_ints = map(to_int, orphans)
    skipform = True
    for orphan_sample_number, oint in zip(orphans, orphan_ints):
        for orphan_idx, orphan_row in orphan_df.loc[orphan_df['SampleNumber'] == orphan_sample_number].iterrows():
            for ref_idx, ref_row in ref_df.loc[ref_df['SampleInteger'] == oint].iterrows():
                orphan_data = orphan_row.loc[['SampleForm', 'SoldAs', 'AlreadyTried']].values.tolist()
                orphan_time = orphan_row.Timestamp
                ref_sample_number = ref_row.SampleNumber
                ref_data = ref_row.loc[['SampleForm', 'SoldAs', 'AlreadyTried']].values.tolist()
                ref_time = ref_row.Timestamp
                delta_t = (ref_time - orphan_time).seconds
#                 if orphan_data == ref_data and min_stage_delay < delta_t <= max_stage_delay:
                if skipform:
                    orphan_data.pop(0)
                    ref_data.pop(0)
                if orphan_data == ref_data:
                    print("HR orphan %s (line: %d) could be match for FTIR SampleNumber %s (line: %d)\n%s %s\n%s %s" % \
                          (orphan_sample_number, orphan_idx + 1, ref_sample_number, ref_idx + 1, orphan_data, orphan_time, ref_data, ref_time))

# match_orphans_with_sample_integer(catalog_orphan, dfs.catalog, dfs.ftir)
match_orphans_with_sample_integer(hr_orphan, dfs.hr, dfs.catalog)

Bad SampleNumber FXXX
Bad SampleNumber FXXX


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


In [99]:
# Check orphans against other orphans just using data
def match_orphans_vs_orphans(orphan1_list, orphan1_df, orphan2_list, orphan2_df, hr=False):
    min_stage_delay = 60 * 1
    max_stage_delay = 60 * 60
    for orphan1 in orphan1_list:
        orphan1_row = orphan1_df.loc[orphan1_df['SampleNumber'] == orphan1].iloc[0]
        for orphan2 in orphan2_list:
            orphan2_row = orphan2_df.loc[orphan2_df['SampleNumber'] == orphan2].iloc[0]
            orphan1_data = orphan1_row.loc[['SampleForm', 'SoldAs', 'AlreadyTried']].values.tolist()
            orphan1_time = orphan1_row.Timestamp
            orphan2_data = orphan2_row.loc[['SampleForm', 'SoldAs', 'AlreadyTried']].values.tolist()
            orphan2_time = orphan2_row.Timestamp
            delta_t = (orphan2_time - orphan1_time).seconds
            if orphan1_data == orphan2_data and min_stage_delay <= delta_t <= max_stage_delay:
                print("orphan1 %s could be match for orphan2 %s\n%s %s\n%s %s" % \
                      (orphan1, orphan2, orphan1_data, orphan1_time, orphan1_data, orphan1_time))

# match_orphans_with_sample_integer(catalog_orphan, dfs.catalog, dfs.ftir)
match_orphans_vs_orphans(catalog_orphan, dfs.catalog, ftir_orphan, dfs.ftir)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  if __name__ == '__main__':


In [25]:
#dfs.catalog[dfs.catalog['SampleNumber'].isin(catalog.duplicates)]
#pd.DataFrame({'SampleNumber' : duplicates.ftir}).to_csv('ftir_sn.csv', index=False)

In [6]:
# Clean up catalog
# Drop all unwanted columns

#  or 'Your initials'
l = set(['Your initials',
         'Your name and first initial',
         'Which device was a photo taken with? Who does it belong to?',
         'Is a breakline present?',
         'Unusual appearance'
        ])

to_drop = set(dfs.catalog.columns).intersection(l)
dfs.catalog.drop(to_drop, axis=1, inplace=True)

d = {
    'Timestamp' : 'Catalog timestamp',
    'Sample Advertised/Acquired/Sold As': 'Catalog_SoldAs',
    'Sample Form' : 'Catalog_Form',
    'Has the Service User or a close friend tried this batch?': 'Catalog_Tried',
    'What is the mass? (mg)': 'FullPillMass',
    'What is the shape of the pill?': 'PillShape',
    'What is the logo?': 'PillLogo',
    'What colour is the pill?': 'PillColour'
}
dfs.catalog.rename(columns=d, inplace=True)

In [7]:
# For FTIR columns need to merge the data from the 'Compound detected', 'Hit Confidence.1' columns into the
# 'Substance detected', 'Hit Confidence' column where the substance detected was 'other'
print("COLS ",dfs.ftir.columns)
print("SS ",dfs.ftir['Substance detected'][:5])
mask = dfs.ftir['Substance detected'] != 'Other'
dfs.ftir['Substance detected'].where(mask, dfs.ftir['Compound detected'], inplace=True) # Copy values from 'Compound detected'
dfs.ftir['Hit Confidence'].where(mask, dfs.ftir['Hit Confidence.1'], inplace=True)
dfs.ftir.drop(['Compound detected', 'Hit Confidence.1', 'Brief Note'], axis=1, inplace=True)

mask = dfs.ftir['Compound detected (Subtraction)'] != 'Other'
dfs.ftir['Compound detected (Subtraction)'].where(mask, df_ftir['Substance detected.1'], inplace=True) # Copy values from 'Compound detected'
dfs.ftir['Hit Confidence.2'].where(mask, dfs.ftir['Hit Confidence.3'], inplace=True)
dfs.ftir.drop(['Substance detected.1', 'Hit Confidence.3', 'Brief Note.1'], axis=1, inplace=True)

# Drop all unwanted columns
l = ['Your name and surname initial',
     'User Suspicion',
     'Is anything detected after subtraction analysis?',
     'Analysis required', 
     'Next action(s)',
     'Send to HR team'
    ]
#'Note for harm reduction worker'
to_drop = set(dfs.ftir.columns).intersection(l)
dfs.ftir.drop(to_drop, axis=1, inplace=True)

# Rename shared columns so that we can check for any errors and remove any columns not of interest to the master df
d = {
    'Timestamp' : 'FTIR timestamp',
    'Sample Sold As': 'FTIR Sold As',
    'Sample Form' : 'FTIR form',
    'Has the Service User or a close friend tried this batch?': 'FTIR tried',
    'Substance(s) detected' : 'FTIR final result',
    'Substance detected' : 'FTIR result1',
    'Hit Confidence' :  'FTIR hit1',
    'Is anything detected after subtraction analysis?' : 'FTIR subtraction positive',
    'Compound detected (Subtraction)' :  'FTIR result2',
    'Hit Confidence.2' :  'FTIR hit2',
    '"Strength" of powdered substance' : 'FTIR Powder Strength',
    'Does the substance detected match the substance that was advertised?' : 'FTIR Matches Sold As',
}
dfs.ftir.rename(columns=d, inplace=True)

('COLS ', Index([                                                           u'Timestamp',
                                                              u'Sample Number',
                                                                     u'Tester',
                                                                    u'Sold As',
                                                                u'Sample Form',
                                                              u'Already Tried',
                                                             u'User Suspicion',
                                                         u'Substance detected',
                                                             u'Hit Confidence',
                                                          u'Compound detected',
                                                           u'Hit Confidence.1',
                                                                 u'Brief Note',
                           u'I

NameError: name 'df_ftir' is not defined

In [None]:
# Clean up HR form

# Drop all unwanted columns
l = ['HR worker name:']
dfs.hr.drop(l, axis=1, inplace=True)

# Rename shared columns so that we can check for any errors and remove any columns not of interest to the master df
d = {
    'Timestamp' : 'HR timestamp',
    'You submitted a substance for analysis. What were you told it was when you got it?': 'HR Sold as',
    'Had you already tried this substance before getting it tested?': 'HR tried',
    'What was your first sample number at this event? Did you take a photo or keep the ticket?': 'Previous Sample Number'
}
dfs.hr.rename(columns=d, inplace=True)

In [None]:
# Fix column orders
prefix = ['Sample Number',
          'Catalog timestamp', 'FTIR timestamp', 'HR timestamp',
          'Catalog Sold As', 'FTIR Sold As','HR Sold as', 
          'Catalog form', 'FTIR form',
          'Catalog tried', 'FTIR tried', 'HR tried']
columns = [c for c in df_all.columns if c not in prefix]
columns = prefix + columns
df_all = df_all[columns]
df_all.to_csv('foo.csv')