In [1]:
# Module imports
import copy
import datetime
import os
import numpy as np
import pandas as pd

def fix_sample_number(x):
    """Make sure all samples numbers are of form: AXXX (where A is one of A, F, W and X is a digit)"""
    if isinstance(x, float) and np.isnan(x):
        return x # leave NaN's alone
    if (isinstance(x, str) or isinstance(x, unicode)) and len(x) == 0:
        return np.nan
    try:
        sn = 'F{:04d}'.format(int(x))
    except ValueError:
        # Assume string so make sure it's of the right format
        sn = str(x).strip().upper()
    if len(sn) != 5 or sn[0] not in ['A', 'F', 'W', 'B']:
        if sn[:2] != 'DF': # Duplicate labels
            print("!!! Bad ID \'%s\'" % sn)
    return sn

def now():
    return datetime.datetime.now().strftime("%d/%m/%y %H:%M:%S")

def enumerate_duplicates(row):
    """Append a counter to duplicate labels"""
    SEPARATOR = '.'
    duplicates = {}
    updated_row = []
    for r in row:
        count = duplicates.get(r, 0)
        if count > 0:
            label = "{}{}{}".format(r, SEPARATOR, count)
        else:
            label = r
        updated_row.append(label)
        duplicates[r] = count + 1
    return updated_row


In [2]:
pd.options.mode.chained_assignment = 'raise'

# Need to define in main or we can't pickle the data objects
class DataFrames(object):
    def __init__(self):
        catalog = None
        ftir = None
        reagent = None
        mla = None
        hr = None

def gsheets_service():
    from googleapiclient.discovery import build
    from httplib2 import Http
    from oauth2client import file, client, tools
    # If modifying these scopes, delete the file token.json.
    #Ensure that the creds file is always taken from the current working folder
        #This allows two people on different PCs to merge changes more easily.
    CREDS_FILE = os.path.join(os.path.realpath('./'),'JensDataExportJupyter_client_secret.json')
    SCOPES = 'https://www.googleapis.com/auth/spreadsheets.readonly'
    store = file.Storage('token.json')
    creds = store.get()
    if not creds or creds.invalid:
        import argparse
        flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args([])
        flow = client.flow_from_clientsecrets(CREDS_FILE, SCOPES)
        creds = tools.run_flow(flow, store, flags)
    service = build('sheets', 'v4', http=creds.authorize(Http()))
    return service

def get_df(service, SPREADSHEET_ID, SS_RANGE, mla=False):
    # Call the Sheets API
    result = service.spreadsheets().values().get(spreadsheetId=SPREADSHEET_ID,
                                                range=SS_RANGE).execute()
    values = result.get('values', [])
    if not values:
        print('*** No data found ***')
        return None

    # mla has irrelevant stuff in columns 1 and 3 and sample numbers in first column
    if mla:
        values.pop(0)
        values.pop(1)
        def not_blank(row):
            return len(row[0]) > 0       
    else:
        def not_blank(row):
            return sum(map(len, row[:6])) > 0

    rows = filter(not_blank, values)
    if not rows:
        print('*** No data found after pruning rows! ***')
        return None
    
    columns = enumerate_duplicates(rows[0])
    ncols = len(rows[0])
    row_max = max(map(len, rows[1:]))
    width = min(ncols, row_max)
    return pd.DataFrame(rows[1:], columns=columns[:width])

def canonicalise_df(df, source=None):
    """Initial cleaning of all dataframes"""
    #from pandas._libs.tslib import OutOfBoundsDatetime
    if source:
        print("Canonicalising %s" % source)
    # Standardise names
    d = {
        'Sample Code':'SampleNumber',
        'Sample Number:':'SampleNumber',
        'Sample Number':'SampleNumber',
        'Sample number':'SampleNumber',
        'Sample Num':'SampleNumber',
        'Sample Number i.e F0XXX' : 'SampleNumber',
        
        'Sample Advertised/Acquired/Sold As' : 'SoldAs',
        'Sample Sold As' : 'SoldAs',
        'You submitted a substance for analysis. What were you told it was when you got it?':  'SoldAs',
        
        
        'Sample Source' :'SampleSource',

        'User Suspicion' :'UserSuspicion',

        'Sample Form' :'SampleForm',

        'Has the Service User or a close friend tried this batch?' : 'AlreadyTried',
        'Had you already tried this substance before getting it tested?' : 'AlreadyTried',

        'Your initials' : 'Tester',
        'Your name and first initial' : 'Tester',
        'Your name and surname initial' : 'Tester'
    }
    df.rename(columns=d, inplace=True)
    
    def fix_timestamp(x):
        return pd.to_datetime(str(x), format='%d/%m/%Y %H:%M:%S')
    if 'Timestamp' in df.columns:
        df.loc[:, 'Timestamp'] = df['Timestamp'].map(fix_timestamp)
    df.loc[:, 'SampleNumber'] = df['SampleNumber'].apply(fix_sample_number)
    df.dropna(subset=['SampleNumber'])
    #df.sort_values(['Sample Number'], ascending=True, inplace=True)
    return df

def get_data(service, SPREADSHEET_ID):

    CATALOG_RANGE = 'Catalog!A:R'
    FTIR_RANGE = 'FTIR!A:X'
    REAGENT_RANGE = 'Reagent!A:W'
    MLA_RANGE = 'MLA!A:R'
    HR_RANGE = 'Interventions!A:BJ'

    df_catalog = get_df(service, SPREADSHEET_ID, CATALOG_RANGE)
    df_catalog = canonicalise_df(df_catalog, source='catalog')
    df_ftir = get_df(service, SPREADSHEET_ID, FTIR_RANGE)
    df_ftir = canonicalise_df(df_ftir, source='ftir')
    df_reagent = get_df(service, SPREADSHEET_ID, REAGENT_RANGE)
    df_reagent = canonicalise_df(df_reagent, source='reagent')
    df_mla = get_df(service, SPREADSHEET_ID, MLA_RANGE, mla=True)
    df_mla = canonicalise_df(df_mla, source='mla')
    try:
        df_hr = get_df(service, SPREADSHEET_ID, HR_RANGE)
    except ValueError:
        df_hr = None
    if df_hr is not None:
        df_hr = canonicalise_df(df_hr, source='hr')

    df = DataFrames()
    df.catalog = df_catalog
    df.ftir = df_ftir
    df.reagent = df_reagent
    df.mla = df_mla
    df.hr = df_hr
    
    return df


In [6]:
#S how the folder where the code file is being run from        
print("Script running from: %s" % os.path.realpath(os.getcwd()))

# The ID and range of a sample spreadsheet.
BOOMTOWN2018_SPREADSHEET_ID = '1RiA-FwG_954Ger2VPsOSA3JLh-7sEoTYr40eVS0mp24'
MADE2018_SPREADSHEET_ID = '1daXdyL6uL8qnMsEsP0RLZE9nDzt6J7Zr1ygQdguvi-E'
BOARDMASTERS2018_SPREADSHEET_ID = '1U1lhUWLazDBN-wb2eZM8YV674f46npVfQK3XUVZjPow'
SW42018_SPREADSHEET_ID = '1agpMmJ9XukeWXS5_mwrDSKeshUaFtYwOzsPiR1DKsPU'
LOSTVILLAGE2018_SPREADSHEET_ID = '1OL0gyXrpZnJ8e7yR7eF6S2OaBYBiPDoVp5xGpdK4wlA'
BESTIVAL2018_SPREADSHEET_ID = '184qudGcw4PB0SMtOo0ZBDtckeGaH0RCLUXbA-u3BiHE'
YNOT2018_SPREADSHEET_ID = '1D01cj-Mra06TuoG_MsKuLq9OdtvKzrvRdiE255po_ag'
TRUCKFEST2018_SPREADSHEET_ID = '1sGG9WJxKyD2CGUjzJAXul3g9hVnRz6HbTiqKV5cUAyA'
LSTD2018_SPREADSHEET_ID = '1R8YqDnrhvuVMwPFShwaaAUIyCXQMeozA230OXsFsDQM'
KENDALCALLING2018_SPREADSHEET_ID = '16-PfwBOaUxwod3X75LGk1VAjBblkNsTJpCsX825aghI'
PARKLIFE2018_SPREADSHEET_ID = '1oO5sHcUhUn_7M1Hap73sOZHNEfWFMcDkQuWDRFf4d-w'


data = {}
service = gsheets_service()
print("PROCESSING BOOMTOWN")
data['boomtown'] = get_data(service, BOOMTOWN2018_SPREADSHEET_ID)
print("PROCESSING BOARDMASTERS")
data['boardmasters'] = get_data(service, BOARDMASTERS2018_SPREADSHEET_ID)
print("PROCESSING MADE")
data['made'] = get_data(service, MADE2018_SPREADSHEET_ID)
print("PROCESSING SW4")
data['sw4'] = get_data(service, SW42018_SPREADSHEET_ID)
print("PROCESSING LOST VILLAGE")
data['lostvillage'] = get_data(service, LOSTVILLAGE2018_SPREADSHEET_ID)
print("PROCESSING BESTIVAL")
data['bestival'] = get_data(service, BESTIVAL2018_SPREADSHEET_ID)
print("PROCESSING YNOT")
data['ynot'] = get_data(service, YNOT2018_SPREADSHEET_ID)
print("PROCESSING TRUCKFEST")
data['truckfest'] = get_data(service, TRUCKFEST2018_SPREADSHEET_ID)
print("PROCESSING LSTD")
data['lstd'] = get_data(service, LSTD2018_SPREADSHEET_ID)
print( "PROCESSING KENDAL CALLING")
data['kc'] = get_data(service, KENDALCALLING2018_SPREADSHEET_ID)
print("PROCESSING PARKLIFE")
data['parklife'] = get_data(service, PARKLIFE2018_SPREADSHEET_ID)

import pickle
with open('foo_multi.pkl','w') as w:
    pickle.dump(data, w)
# dfs = data['boomtown']

Script running from: /opt/random
PROCESSING BOOMTOWN
Canonicalising catalog
!!! Bad ID 'TF0579'
!!! Bad ID 'TF1665'
!!! Bad ID 'TF1660'
Canonicalising ftir
!!! Bad ID 'TF1665'
Canonicalising reagent
Canonicalising mla
Canonicalising hr
!!! Bad ID 'TF0653'
!!! Bad ID 'TF1172'
!!! Bad ID 'TF1762'
PROCESSING BOARDMASTERS
Canonicalising catalog
Canonicalising ftir
Canonicalising reagent
Canonicalising mla
Canonicalising hr
PROCESSING MADE
Canonicalising catalog
Canonicalising ftir
!!! Bad ID 'XF0005'
Canonicalising reagent
Canonicalising mla
Canonicalising hr
PROCESSING SW4
Canonicalising catalog
Canonicalising ftir
Canonicalising reagent
Canonicalising mla
PROCESSING LOST VILLAGE
Canonicalising catalog
!!! Bad ID 'DB0136'
!!! Bad ID 'DB0355'
!!! Bad ID 'DB0354'
!!! Bad ID 'DB0001'
!!! Bad ID 'DB0001'
!!! Bad ID 'DB0053'
!!! Bad ID 'DB0053'
Canonicalising ftir
!!! Bad ID 'DB0001'
!!! Bad ID 'DB0004'
!!! Bad ID 'DB0006'
!!! Bad ID 'DB0010'
!!! Bad ID 'DB0032'
!!! Bad ID 'DB0041'
!!! Bad ID 

In [66]:
# with open('foo_multi.pkl') as f:
#     data = pickle.load(f)
# dfs = data['boomtown']

# Need to define in main or we can't pickle the data objects
class Duplicates(object):
    def __init__(self, dfs):
        self.dfs = dfs
        dtypes = ['catalog', 'ftir', 'reagent', 'mla', 'hr']
        for t in dtypes:
            setattr(self, t, None)
        for t in dtypes:
            self.find_duplicates(t)
        
    def find_duplicates(self, dtype):
        dataframe = getattr(self.dfs, dtype)
        if dataframe is None:
            return
        duplicates = dataframe['SampleNumber'].duplicated()
        if duplicates.any():
            duplicates = list(dataframe.loc[duplicates, 'SampleNumber'].values)
            print("### %d duplicated %s SampleNumbers %s ###" % (len(duplicates), dtype, duplicates))
#             dataframe[datafra,e['SampleNumber'].duplicated(keep=False)].to_csv('{}_duplicates.csv'.format(dtype))
        else:
            duplicates = None
        setattr(self, dtype, duplicates)
        
    def has_hr_duplicates(self):
        if self.hr:
            outs = 'Please fix HR duplicates'
            raise RuntimeError(outs)

for festival, dfs in data.items():
    print("CHECKING ",festival)
    duplicates = Duplicates(dfs)
    duplicates.has_hr_duplicates()

### 3 duplicated catalog SampleNumbers ['B0174', 'B0255', 'B0298'] ###
### 4 duplicated ftir SampleNumbers ['B0103', 'B0131', 'B0221', 'B0229'] ###
### 2 duplicated reagent SampleNumbers ['B0135', 'B0235'] ###
### 28 duplicated mla SampleNumbers ['B0130', 'B0131', 'B0133', 'B0128', 'B0128', 'B0128', 'B0128', 'B0128', 'B0128', 'B0128', 'B0128', 'B0150', 'B0150', 'B0150', 'B0150', 'B0150', 'B0115', 'B0172', 'B0172', 'B0200', 'B0194', 'B0206', 'B0206', 'B0206', 'B0206', 'B0206', 'B0206', 'B0258'] ###


In [46]:
# Check there are no SampleNumbers in any of the other spreadsheets that aren't in the cataolog sheet
catalog_unique = set(dfs.catalog['SampleNumber'].unique())

ftir_unique = set(dfs.ftir['SampleNumber'].unique())
ftir_orphan = ftir_unique.difference(catalog_unique)
if ftir_orphan:
    print("Orphaned FTIR SampleNumbers: %s" % sorted(ftir_orphan))

reagent_unique = set(dfs.reagent['SampleNumber'].unique())
reagent_orphan = reagent_unique.difference(catalog_unique)
if reagent_orphan:
    print("Orphaned Reagent Test SampleNumbers: %s" % sorted(reagent_orphan))

hr_orphan = None
if dfs.hr is not None:
    hr_unique = set(dfs.hr['SampleNumber'].unique())
    # HR need to be both in catalog and ftir
    hr_orphan = hr_unique.difference(ftir_unique.union(catalog_unique))
    if hr_orphan:
        print("Orphaned HR SampleNumbers: %s" % sorted(hr_orphan))
    
mla_unique = set(dfs.mla['SampleNumber'].unique()).difference(catalog_unique)
mla_orphan = mla_unique.difference(catalog_unique)
if mla_orphan:
    print("Orphaned MLA SampleNumbers: %s" % sorted(mla_orphan))
    
# Check for any that are only in the catalog
outside_catalog = set.union(ftir_unique, reagent_unique, hr_unique, mla_unique)
catalog_orphan = catalog_unique.difference(outside_catalog)
if catalog_orphan:
    print("Orphaned catalog SampleNumbers: %s" % sorted(catalog_orphan))
    
# Check for any that aren't in FTIR and don't have anything in reagent test
ftir_missing = catalog_unique.difference(ftir_unique).difference(reagent_unique).difference(catalog_orphan)
if len(ftir_missing):
    print("Samples not in FTIR or Reagent: %s" % sorted(ftir_missing))

all_unique = copy.copy(ftir_unique)
all_unique.update(reagent_unique, hr_unique, mla_unique)
if (all_unique or catalog_only):
    outs = "### Please fix orphaned/catalog only samples ###"
    print(outs)
    #raise RuntimeError(outs)

Orphaned FTIR SampleNumbers: ['A0297', 'A0351', 'A0500', 'A0545', 'A1272', 'A1289', 'A1320', 'A1339', 'A1340', 'A1368', 'A1467', 'A2011', 'A2015', 'A2038', 'A2049', 'A2245', 'A2250', 'A2255', 'A2259', 'A2260', 'A2266', 'A2267', 'DA0268', 'DA0279', 'DA0290', 'DA0309', 'DA0329', 'DA0349', 'DA0366', 'DA0426', 'DA0446', 'DA0492', 'DA0500', 'DA1324', 'DA1331', 'DA1344', 'DA1349', 'DA1364', 'DA1448', 'DA2010', 'DA2015', 'DA2022', 'DA2041', 'DA2047', 'DA2054', 'DA2066', 'DA2120', 'DW1369']
Orphaned Reagent Test SampleNumbers: ['A1467', 'NOT A1451']
Orphaned MLA SampleNumbers: ['A0500', 'A2049']
Orphaned catalog SampleNumbers: ['A0274', 'A0299', 'A0364', 'A0373', 'A0384', 'A0403', 'A0411', 'A0432', 'A0459', 'A0462', 'A0489', 'A0490', 'A1256', 'A1268', 'A1269', 'A1270', 'A1273', 'A1276', 'A1282', 'A1326', 'A1327', 'A1345', 'A1346', 'A1356', 'A1375', 'A2035', 'A2054', 'A2113', 'A2124', 'A2128', 'A2130', 'A2133', 'A2148', 'A2149', 'A2154', 'A2159', 'A2160', 'A2161', 'A2163', 'A2165', 'A2174', 'A2

In [41]:
# This cell cleans the "sample form" field 
def clean_df(df):
    sample_form_d = { 'pill' : ['Ecstasy Tablet',
                                'ecstasy pill',
                                'ecstacy pill',
                                'Non-pharmaceutical tablet (ecstasy etc)',
                                'other recreational pill',
                                 'Whole pill',
                                'Other pill',
                                'Pharmaceutical'],
                      'partial pill' : ['Partial ecstasy pill',
                                        'Partial 2C-B pill',
                                        'Crushed tablet'],
                      'powder' : ['powder/capsule/bomb',
                                  'Powder/capsule/bomb/crystal',
                                  'Powder or crushed pill',
                                  'Crystal, Capsule or Powder'],
                      'liquid' : ['*Cannabinoid liquid',
                                   '*Viscous liquid',
                                  'Dissolved in Propylene Glycol',
                                  'Oil'],
                       'tab' : ['blotter', 'LSD Tab']
                      }


    # Firstly convert all columns to lower case and remove any spaces
    def lower(value):
        if type(value) in [str, unicode]:
            value = value.strip().lower()
        return value

    for column in ['SampleForm']:
        df[column] = df[column].map(lower, na_action='ignore')
    
    replace_d = {}
    for column in ['SampleForm']:
        replace_d[column] = {}
        for drug, names in sample_form_d.items():
            for name in names:
                replace_d[column][name.lower()] = drug
    
    # Replace values
    df.replace(replace_d, inplace=True)
    return df
    
dfs.catalog = clean_df(dfs.catalog)
dfs.ftir = clean_df(dfs.ftir)
dfs.reagent = clean_df(dfs.reagent)
print("Finished cleaning 'Sample form' field at %s" % now())


In [42]:
def find_duplicate_matches(duplicates, df1, df2, df1_name='DataFrame1', df2_name='DataFrame2'):
    hr = False
    if df1_name.lower()[:2] == 'hr':
        hr = True
    duplicate_matches = {}
    min_stage_delay = 60 * 1
    max_stage_delay = 60 * 60
    for sample_number in duplicates:
        duplicate_matches[sample_number] = {}
        for df1_idx, df1_row in df1.loc[df1['SampleNumber'] == sample_number].iterrows():
            for df2_idx, df2_row in df2.loc[df2['SampleNumber'] == sample_number].iterrows():
                df1_data = df1_row.loc[['SoldAs', 'AlreadyTried']].values.tolist()
                if not hr:
                    df1_data.append(df1_row.SampleForm)
                df1_time = df1_row.Timestamp
                df2_data = df2_row.loc[['SoldAs', 'AlreadyTried']].values.tolist()
                if not hr:
                    df2_data.append(df2_row.SampleForm)
                df2_time = df2_row.Timestamp
                delta_t = (df2_time - df1_time).seconds
                if df1_data == df2_data and min_stage_delay < delta_t <= max_stage_delay:
                    print("Duplicate %s SampleNumber %s (line: %d) MATCHES %s sample (line: %d)" % \
                          (df1_name, sample_number, df1_idx + 1, df2_name, df2_idx + 1))
                    duplicate_matches[sample_number][df1_idx] = True
                else:
                    print("Duplicate %s SampleNumber %s (line: %d) DIFFERENT %s sample (line: %d)\n%s %s\n%s %s" % \
                          (df1_name, sample_number, df1_idx + 1, df2_name, df2_idx + 1,
                           df1_data, df1_time,
                           df2_data, df2_time))
                    duplicate_matches[sample_number][df1_idx] = False
    return duplicate_matches

def match_orphans_to_duplicates(df1_orphans, duplicate_matches, df1, df2):
    for orphan_sample_number in df1_orphans:
        df1_data = df1.loc[df1['SampleNumber'] == orphan_sample_number, ['SampleForm', 'SoldAs', 'AlreadyTried', 'Timestamp']]
        df1_data = df1_data.values.tolist()[0]
        df1_time = df1_data.pop()
        for sample_number, indexd in duplicate_matches.items():
            for k, v in indexd.items():
                if not v:
                    df2_data = dfs.catalog.iloc[k][['SampleForm', 'SoldAs', 'AlreadyTried', 'Timestamp']].values.tolist()
                    df2_time = df2_data.pop()
                    delta_t = (df1_time - df2_time).seconds
                    if df2_data == df1_data and min_stage_delay < delta_t <= max_stage_delay:
                        print("Orphan {} could be match for duplicate {} (line: {})\n{} {}\n{} {}".format(orphan_sample_number,
                                                                                                  sample_number, k+1,
                                                                                                  df2_data, df2_time,
                                                                                                  df1_data, df1_time))


duplicate_matches = find_duplicate_matches(catalog_duplicates, dfs.catalog, dfs.ftir, df1_name='Catalog', df2_name='FTIR')
# duplicate_matches = find_duplicate_matches(ftir_duplicates, dfs.ftir, dfs.catalog, df1_name='FTIR', df2_name='Catalog')
# duplicate_matches = find_duplicate_matches(hr_duplicates, dfs.hr, dfs.catalog, df1_name='HR', df2_name='Catalog')

#match_orphans_to_duplicates(ftir_orphan, duplicate_matches, dfs.ftir, dfs.catalog)
# match_orphans_to_duplicates(catalog_orphan, duplicate_matches, dfs.catalog, dfs.ftir)

Duplicate Catalog SampleNumber A0348 (line: 91) DIFFERENT FTIR sample (line: 92)
[u'', u'', 'pill'] 2018-06-09 15:29:16
[u'Found or otherwise not known', u'No', 'pill'] 2018-06-09 15:45:16
Duplicate Catalog SampleNumber A0348 (line: 92) DIFFERENT FTIR sample (line: 92)
[u'', u'', 'pill'] 2018-06-09 15:32:36
[u'Found or otherwise not known', u'No', 'pill'] 2018-06-09 15:45:16
Duplicate Catalog SampleNumber A0372 (line: 117) DIFFERENT FTIR sample (line: 156)
[u'', u'', 'powder'] 2018-06-09 15:49:08
[u'Found or otherwise not known', u'No', 'powder'] 2018-06-09 17:12:27
Duplicate Catalog SampleNumber A0372 (line: 119) DIFFERENT FTIR sample (line: 156)
[u'', u'', 'powder'] 2018-06-09 15:49:48
[u'Found or otherwise not known', u'No', 'powder'] 2018-06-09 17:12:27
Duplicate Catalog SampleNumber A0424 (line: 167) DIFFERENT FTIR sample (line: 323)
[u'', u'', 'powder'] 2018-06-09 16:23:46
[u'Found or otherwise not known', u'No', 'powder'] 2018-06-09 20:47:26
Duplicate Catalog SampleNumber A0424 

In [571]:
# Check orphans against the FTIR sheet using just their numbers
def match_orphans_with_sample_integer(orphans, orphan_df, ref_df):
#     min_stage_delay = 60 * 1
#     max_stage_delay = 60 * 60
    def to_int(sn):
        if type(sn) in [str, unicode]:
            try:
                sn = int(sn[-4:])
            except ValueError:
                print("Bad SampleNumber %s" % sn)
        return sn
    orphan_df['SampleInteger'] = orphan_df['SampleNumber'].apply(to_int)
    ref_df['SampleInteger'] = ref_df['SampleNumber'].apply(to_int)
    orphan_ints = map(to_int, orphans)
    
    skipform = True
    for orphan_sample_number, oint in zip(orphans, orphan_ints):
        for orphan_idx, orphan_row in orphan_df.loc[orphan_df['SampleNumber'] == orphan_sample_number].iterrows():
            for ref_idx, ref_row in ref_df.loc[ref_df['SampleInteger'] == oint].iterrows():
                orphan_data = orphan_row.loc[['SampleForm', 'SoldAs', 'AlreadyTried']].values.tolist()
                orphan_time = orphan_row.Timestamp
                ref_sample_number = ref_row.SampleNumber
                ref_data = ref_row.loc[['SampleForm', 'SoldAs', 'AlreadyTried']].values.tolist()
                ref_time = ref_row.Timestamp
                delta_t = (ref_time - orphan_time).seconds
#                 if orphan_data == ref_data and min_stage_delay < delta_t <= max_stage_delay:
                if skipform:
                    orphan_data.pop(0)
                    ref_data.pop(0)
                if orphan_data == ref_data:
                    print("HR orphan %s (line: %d) could be match for FTIR SampleNumber %s (line: %d)\n%s %s\n%s %s" % \
                          (orphan_sample_number, orphan_idx + 1, ref_sample_number, ref_idx + 1, orphan_data, orphan_time, ref_data, ref_time))

# match_orphans_with_sample_integer(catalog_orphan, dfs.catalog, dfs.ftir)
match_orphans_with_sample_integer(hr_orphan, dfs.hr, dfs.catalog)

HR orphan P0306 (line: 271) could be match for FTIR SampleNumber F0306 (line: 281)
[u'MDMA', u'Yes'] 2018-08-04 18:16:47
[u'MDMA', u'Yes'] 2018-08-04 16:09:16
HR orphan F20387 (line: 305) could be match for FTIR SampleNumber F0387 (line: 343)
[u'MDMA', u'Yes'] 2018-08-05 13:33:36
[u'MDMA', u'Yes'] 2018-08-05 13:02:51
HR orphan F20357 (line: 297) could be match for FTIR SampleNumber F0357 (line: 322)
[u'Cocaine', u'Yes'] 2018-08-05 12:54:45
[u'Cocaine', u'Yes'] 2018-08-04 18:23:24
HR orphan P0315 (line: 286) could be match for FTIR SampleNumber F0315 (line: 302)
[u'MDMA', u'No'] 2018-08-04 18:53:56
[u'MDMA', u'No'] 2018-08-04 17:26:33


In [540]:
# Check orphans against other orphans just using data
def match_orphans_vs_orphans(orphan1_list, orphan1_df, orphan2_list, orphan2_df, hr=False):
    min_stage_delay = 60 * 1
    max_stage_delay = 60 * 60
    for orphan1 in orphan1_list:
        orphan1_row = orphan1_df.loc[orphan1_df['SampleNumber'] == orphan1].iloc[0]
        for orphan2 in orphan2_list:
            orphan2_row = orphan2_df.loc[orphan2_df['SampleNumber'] == orphan2].iloc[0]
            orphan1_data = orphan1_row.loc[['SampleForm', 'SoldAs', 'AlreadyTried']].values.tolist()
            orphan1_time = orphan1_row.Timestamp
            orphan2_data = orphan2_row.loc[['SampleForm', 'SoldAs', 'AlreadyTried']].values.tolist()
            orphan2_time = orphan2_row.Timestamp
            delta_t = (orphan2_time - orphan1_time).seconds
            if orphan1_data == orphan2_data and min_stage_delay <= delta_t <= max_stage_delay:
                print("orphan1 %s could be match for orphan2 %s\n%s %s\n%s %s" % \
                      (orphan1, orphan2, orphan1_data, orphan1_time, orphan1_data, orphan1_time))

# match_orphans_with_sample_integer(catalog_orphan, dfs.catalog, dfs.ftir)
match_orphans_vs_orphans(catalog_orphan, dfs.catalog, ftir_orphan, dfs.ftir)

GOT  [u'n/a', u'No substance (empty baggy) ', u'No'] ['pill', u'MDMA', u'No'] 12253
GOT  ['pill', u'Found or otherwise not known', u'No'] ['pill', u'MDMA', u'No'] 83605
GOT  ['powder', u'Found or otherwise not known', u'No'] ['pill', u'MDMA', u'No'] 84606
GOT  ['powder', u'Found or otherwise not known', u'No'] ['pill', u'MDMA', u'No'] 84648
GOT  ['powder', u'Found or otherwise not known', u'No'] ['pill', u'MDMA', u'No'] 84683
GOT  ['powder', u'Found or otherwise not known', u'No'] ['pill', u'MDMA', u'No'] 84715


In [6]:
# Clean up catalog
# Drop all unwanted columns

#  or 'Your initials'
l = set(['Your initials',
         'Your name and first initial',
         'Which device was a photo taken with? Who does it belong to?',
         'Is a breakline present?',
         'Unusual appearance'
        ])

to_drop = set(dfs.catalog.columns).intersection(l)
dfs.catalog.drop(to_drop, axis=1, inplace=True)

d = {
    'Timestamp' : 'Catalog timestamp',
    'Sample Advertised/Acquired/Sold As': 'Catalog_SoldAs',
    'Sample Form' : 'Catalog_Form',
    'Has the Service User or a close friend tried this batch?': 'Catalog_Tried',
    'What is the mass? (mg)': 'FullPillMass',
    'What is the shape of the pill?': 'PillShape',
    'What is the logo?': 'PillLogo',
    'What colour is the pill?': 'PillColour'
}
dfs.catalog.rename(columns=d, inplace=True)

In [7]:
# For FTIR columns need to merge the data from the 'Compound detected', 'Hit Confidence.1' columns into the
# 'Substance detected', 'Hit Confidence' column where the substance detected was 'other'
print("COLS ",dfs.ftir.columns)
print("SS ",dfs.ftir['Substance detected'][:5])
mask = dfs.ftir['Substance detected'] != 'Other'
dfs.ftir['Substance detected'].where(mask, dfs.ftir['Compound detected'], inplace=True) # Copy values from 'Compound detected'
dfs.ftir['Hit Confidence'].where(mask, dfs.ftir['Hit Confidence.1'], inplace=True)
dfs.ftir.drop(['Compound detected', 'Hit Confidence.1', 'Brief Note'], axis=1, inplace=True)

mask = dfs.ftir['Compound detected (Subtraction)'] != 'Other'
dfs.ftir['Compound detected (Subtraction)'].where(mask, df_ftir['Substance detected.1'], inplace=True) # Copy values from 'Compound detected'
dfs.ftir['Hit Confidence.2'].where(mask, dfs.ftir['Hit Confidence.3'], inplace=True)
dfs.ftir.drop(['Substance detected.1', 'Hit Confidence.3', 'Brief Note.1'], axis=1, inplace=True)

# Drop all unwanted columns
l = ['Your name and surname initial',
     'User Suspicion',
     'Is anything detected after subtraction analysis?',
     'Analysis required', 
     'Next action(s)',
     'Send to HR team'
    ]
#'Note for harm reduction worker'
to_drop = set(dfs.ftir.columns).intersection(l)
dfs.ftir.drop(to_drop, axis=1, inplace=True)

# Rename shared columns so that we can check for any errors and remove any columns not of interest to the master df
d = {
    'Timestamp' : 'FTIR timestamp',
    'Sample Sold As': 'FTIR Sold As',
    'Sample Form' : 'FTIR form',
    'Has the Service User or a close friend tried this batch?': 'FTIR tried',
    'Substance(s) detected' : 'FTIR final result',
    'Substance detected' : 'FTIR result1',
    'Hit Confidence' :  'FTIR hit1',
    'Is anything detected after subtraction analysis?' : 'FTIR subtraction positive',
    'Compound detected (Subtraction)' :  'FTIR result2',
    'Hit Confidence.2' :  'FTIR hit2',
    '"Strength" of powdered substance' : 'FTIR Powder Strength',
    'Does the substance detected match the substance that was advertised?' : 'FTIR Matches Sold As',
}
dfs.ftir.rename(columns=d, inplace=True)

('COLS ', Index([                                                           u'Timestamp',
                                                              u'Sample Number',
                                                                     u'Tester',
                                                                    u'Sold As',
                                                                u'Sample Form',
                                                              u'Already Tried',
                                                             u'User Suspicion',
                                                         u'Substance detected',
                                                             u'Hit Confidence',
                                                          u'Compound detected',
                                                           u'Hit Confidence.1',
                                                                 u'Brief Note',
                           u'I

NameError: name 'df_ftir' is not defined

In [None]:
# Clean up HR form

# Drop all unwanted columns
l = ['HR worker name:']
dfs.hr.drop(l, axis=1, inplace=True)

# Rename shared columns so that we can check for any errors and remove any columns not of interest to the master df
d = {
    'Timestamp' : 'HR timestamp',
    'You submitted a substance for analysis. What were you told it was when you got it?': 'HR Sold as',
    'Had you already tried this substance before getting it tested?': 'HR tried',
    'What was your first sample number at this event? Did you take a photo or keep the ticket?': 'Previous Sample Number'
}
dfs.hr.rename(columns=d, inplace=True)

In [None]:
# Catalog and FTIR data frames
df_all = pd.merge(dfs.catalog, dfs.ftir, how='left', on=['Sample Number'])

In [None]:
# Merge in any reagent test data
df_all = pd.merge(df_all, dfs.reagent[['Sample Number', 'Reagent Result']], how='left', on=['Sample Number'])

In [None]:
# Merge in any pill strength data
df_all = pd.merge(df_all, dfs.mla[['Sample Number', 'MDMA / tablet (mg)', '% MDMA content']], how='left', on=['Sample Number'])

In [None]:
# Merge in HR data
df_all = pd.merge(df_all, dfs.hr, how='left', on=['Sample Number'])

In [None]:
# Fix column orders
prefix = ['Sample Number',
          'Catalog timestamp', 'FTIR timestamp', 'HR timestamp',
          'Catalog Sold As', 'FTIR Sold As','HR Sold as', 
          'Catalog form', 'FTIR form',
          'Catalog tried', 'FTIR tried', 'HR tried']
columns = [c for c in df_all.columns if c not in prefix]
columns = prefix + columns
df_all = df_all[columns]
df_all.to_csv('foo.csv')