The data sources are:

### 1. Liam's SPSS coded data
**File:** The Loop 2017 Final Interventions.xlsx

Exported as Excel from SPSS, keeping the variable names.

This file contains 1325 entries.


27 have null festival or sample numbers so can't be used, leaving 1298


One has sample number 12151, two have sample number 0 - these cannot be merged.


This leaves 1295 - all of which can be merged

### 2. Guy's cleaned up lab data
**File:** Loop 2017 Lab fixed data.xlsm

From: Dropbox/Testing/2017 results processing/Loop 2017 Lab fixed data.xlsm


Data is in the ‘Raw Lab Data’ sheet


This file contains 2544 entries


1900 entries start with F


621 entries begin with A (amnesty) so can't be merged


23 entries begin with W (meaning unclear), so can't be merged


Entry SGP2017 F0465 needs editing as 'Client gender' is FemaleaMalee 

### 3. Boomtown Intervention Questionnaire
**File:** BTReport 2017 - Form responses 3.csv

Exported from: https://docs.google.com/spreadsheets/d/15pdETY0HK-VbBcV-N0swt6ZrRBbeDnZR5RGDzfq95dg

This file contains 194 entries

### 4. 'Straggling' Boomtown Intervention Questionnaire
**File:** Reports V2.6 Branch 2 - Form responses 2.csv

https://docs.google.com/spreadsheets/d/1sZXFdiOaUX6n9HGq9s-t8T_zxNZhKyjxY83aY8mvUIo/edit#gid=1291806732

This file contains 9 entries

### Merging the data

Merging the data on Festival and SampleNumber resulted in 1295 entries



In [None]:
# Module imports
import datetime
import os
import numpy as np
import pandas as pd

def fix_sample_number(x):
    """Normalise a sample number to 'F' followed by four zero-padded digits.

    NaN floats are returned untouched. Anything ``int()`` can convert becomes
    ``'FNNNN'``; any other value is stringified and passed through
    ``str.capitalize()`` on the assumption that it is already a letter-prefixed
    code (one of A, F, W followed by digits).
    """
    is_missing = isinstance(x, float) and np.isnan(x)
    if is_missing:
        return x
    try:
        number = int(x)
    except ValueError:
        # Not numeric: assume it is already a string code and only fix the case
        return str(x).capitalize()
    return 'F{:04d}'.format(number)

def now():
    """Return the current local date and time formatted as 'dd/mm/yy HH:MM:SS'."""
    timestamp = datetime.datetime.now()
    return "{:%d/%m/%y %H:%M:%S}".format(timestamp)

In [None]:
#
# Liam's SPSS data exported as Excel
#
spssdata = 'The Loop 2017 Final Interventions.xlsx'

spss_df = pd.read_excel(spssdata)
print("Read in from SPSS ",len(spss_df))

# Change festival names to the BT2017/KC2017/SGP2017 codes.
# The result is assigned back: replace(inplace=True) on a column selection is a
# chained operation and is not guaranteed to update spss_df itself.
spss_df['Festival'] = spss_df['Festival'].replace(['BoomTown', 'KC', 'SGP'], ['BT2017', 'KC2017', 'SGP2017'])

# Fix/update column labels
d = {'spice_yesterday' : 'spice_legals_yesterday',
     'Cannabis_ever' : 'cannabis_ever',
     'Ethnictiy_other' : 'Ethnicity_other'
    }
spss_df.rename(columns=d, inplace=True)

# Ensure all Sample numbers are consistent
# 1. Delete any rows where SampleNumber or Festival is NA as we can't do anything with it
# There are 45 entries that go, but none of them contain any valid data
spss_df.dropna(subset=['SampleNumber', 'Festival'], inplace=True)

# 2. Make all numeric sample numbers a 4-digit code starting with F
spss_df['SampleNumber'] = spss_df['SampleNumber'].apply(fix_sample_number)

# Combine date and time columns into new single column
spss_df['Date'] = pd.to_datetime(spss_df['Date']) # Convert Date to datetime object
# pd.datetime was only an alias of datetime.datetime and no longer exists in pandas,
# so use the standard-library class directly.
spss_df['Date & Time of intervention'] = spss_df.apply(
    lambda r: datetime.datetime.combine(r['Date'], r['Time']), axis=1)

# Remove Day, Date, Time and SurveyID columns
spss_df.drop(['Day', 'Date', 'Time', 'SurveyID'], axis=1, inplace=True)

# Fix dodgy sample number - Guy confirmed this with Liam
# (.loc rather than .at: .at only accepts a single scalar row label, not a boolean mask)
spss_df.loc[spss_df['SampleNumber'] == 'F12151', 'SampleNumber'] = 'F1215'

# Sort on Festival then SampleNumber
spss_df.sort_values(['Festival','SampleNumber'], ascending=True, inplace=True)

print(now() + " Final SPSS ",len(spss_df)) # shows we are left with 1298 datasets

In [None]:
#
# Read the 'straggling' Boomtown results exported from Google Forms
#
bt_straggling = 'Reports V2.6 Branch 2 - Form responses 2.csv'
date_cols = ['Timestamp']
bt_df2 = pd.read_csv(bt_straggling, engine="python", parse_dates=date_cols)

# Two columns are absent from this export, so create them filled with NaN
for missing_column in ('Which drugs have you used? [Non-prescribed opiods]',
                       'Are you planning to take any of these drugs later?'):
    bt_df2[missing_column] = np.nan

# Samples recorded as '1' and 'F0535' appear to be errors - one taken by Mike Capper,
# the other a 13 year old female - so remove those rows
bad_samples = bt_df2['Sample Number'].isin(['1', 'F0535'])
bt_df2.drop(bt_df2.index[bad_samples], inplace=True)

# Columns that exist only in the straggling results are removed
to_drop = [
    'Which risks are you aware of that exist when using this substance?',
    'Please could you tell me exactly what you have had to drink today: [Wine]',
    'The service user abandoned the intervention before completion.',
    'Please could you tell me exactly what you have had to drink today: [Beer]',
    'Please could you tell me exactly what you have had to drink today: [Spirits]',
    'Please could you tell me exactly what you have had to drink today: [Alcopops]',
    'Do you know any ways to reduce those risks?',
    'Did you understand the disclaimer explaining the limitations that was read to you?',
    'Which of the drugs are you planning to use later today?',
]
bt_df2.drop(columns=to_drop, inplace=True)

In [None]:
#
# The Google Forms version of the Intervention Questionnaire
#
bt_interventions = 'BTReport 2017 - Form responses 3.csv'
date_cols = ['Timestamp']
bt_df = pd.read_csv(bt_interventions, engine="python", parse_dates=date_cols)

# Add the 'straggling' results
bt_df = pd.concat([bt_df, bt_df2], axis='rows', ignore_index=True)

# Map Columns in Google Forms to SPSS
d = {'Sample Number': 'SampleNumber',
 'Timestamp' : 'Date & Time of intervention',
 'Number of friends present with primary respondent': 'FriendsPresent',
 'Gender of primary respondent': 'Gender',
 'Ethnicity': 'Ethnicity',
 'Age': 'Age',
 'Have you had any alcohol to drink today?': 'ConsumedAlcohol',
 'How much spirits have you had today?': 'Spirits',
 'How much wine have you had today?': 'Wine',
 'How many alcopops have you had today?': 'Alcopops',
 'Are you currently taking any prescribed medication?': 'PrescribedDrugs',
 'Are you currently taking any "Over the Counter" medication?': 'OverTheCounter',
 'Do you have any concerns about how you are feeling at the moment?': 'ConcernsWithCurrentFeelings',
 'You submitted a substance of concern for analysis, what do you believe it to be?': 'SubmittedSubstanceAs',
 'Where did you obtain the sample?': 'Obtained',
 'Very roughly, how often do you use this drug?': 'EverHadSubstance',
 'When did you first use this batch?': 'WhereAndWhen',
 'Have you or anyone you know ever had negative experiences taking this substance?': 'NegativeExperieces',
 'How many times have you used this batch?': 'ConsumedFromBatchAlready',
 'Do you have any concerns about using this sample from this batch or any other concerns about the result?': 'PriorConcerns',
 'Have you ever accessed a treatment service for your alcohol or drug use?': 'AccessedSupportBefore',
 'After our conversation today, would you like to have any further advice or support from a treatment service for your alcohol or drug use?': 'WantFurtherAdvice',
 # Need to check this one
 'Have you ever taken any other drugs I didn\'t mention?' : 'other_specify'
}

bt_df.rename(columns=d, inplace=True)

# Delete all columns that are only in Google Forms
to_drop = ['Volunteer Name', 
           'When was the last time you used this service?',
           'What was your first sample number at this event? Did you take a photo or keep the ticket?',
           'Which drugs have you used? [Non-prescribed opiods]',
           'Have you had any other legal or illegal drugs today?',
           'Are you planning to take any of these drugs later?']
bt_df.drop(to_drop, axis=1, inplace=True)

# Add Festival Column
bt_df['Festival'] = 'BT2017'

# Add As_expected Column - is all null in Liam's data
bt_df['As_expected'] = np.nan

# Fix SampleNumber
bt_df['SampleNumber'] = bt_df['SampleNumber'].apply(fix_sample_number)

# Fix the broken sample numbers - have already made sure they're not in the bt_interventions set 
# SNBAD = 'F00119'
# SNNEW = 'F0119'
# print("GF ", bt_df.loc[bt_df['SampleNumber'] == SNBAD, ['Gender', 'Age', 'SubmittedSubstanceAs', 'Date & Time of intervention']])
# print("LAB ", lab_df.loc[(lab_df['SampleNumber'] == SNNEW) & (lab_df['Festival'] == 'BT2017'), ['Client gender', 'Client age', 'Bought as', 'Date & Time of return']]) 
# print("SPSS ", spss_df.loc[(spss_df['SampleNumber'] == SNNEW) & (spss_df['Festival'] == 'BT2017'), ['Gender', 'Date & Time of intervention', 'Age', 'SubmittedSubstanceAs']])

# NOTE: the corrections below use .loc, not .at. .at is a scalar accessor and does not
# accept a boolean mask as the row selector.

# '000-04' - assume 'F0004' as in lab data and date/time of return/intervention are ~ 30 min
bt_df.loc[bt_df['SampleNumber'] == '000-04', 'SampleNumber'] = 'F0004'

# '5f009' - 'F0059' as nothing in SPSS and date/time of return/intervention are ~ 5 min
bt_df.loc[bt_df['SampleNumber'] == '5f009', 'SampleNumber'] = 'F0059'

# 'F00117' - 'F0117' as nothing in SPSS and date/time of return/intervention are ~ 30 min
bt_df.loc[bt_df['SampleNumber'] == 'F00117', 'SampleNumber'] = 'F0117'

# 'F00119' - 'F0119' as nothing in SPSS and date/time of return/intervention are ~ 30 min and samples match
bt_df.loc[bt_df['SampleNumber'] == 'F00119', 'SampleNumber'] = 'F0119'

# Sample 'F308470234987' is rubbish
bt_df.drop(bt_df[bt_df['SampleNumber'] == 'F308470234987'].index, inplace=True)

print("DONE COLUMNS at ", now())

In [None]:
#
# Attempt to disentangle the Form responses into a form they can be merged with Liam's data
#
PERIODS = ['ever', 'year', 'month', 'week', 'yesterday', 'today', 'tonight', 'today_tonight']

def includes_frequency(cell, period):
    """Return True if any response in *cell* is at least as recent as *period*.

    *cell* is a comma-separated string of ticked form responses (or a NaN
    float, which counts as "nothing ticked"). The responses are ordered from
    'Had in my life' to '(Probably) planning later', in step with the first
    seven entries of PERIODS. 'today_tonight' has no response of its own, so
    it always yields False here.
    """
    if isinstance(cell, float) and np.isnan(cell):
        return False

    form_responses = ['Had in my life', 'Had in last year', 'Had in last month', 'Had in last week',
                      'Had yesterday', 'Had today', '(Probably) planning later']

    assert period in PERIODS, "Invalid period: {0}".format(period)

    ticked = {answer.strip() for answer in cell.split(',')}
    first = PERIODS.index(period)
    # Only the responses for this period and every later one are of interest
    candidates = form_responses[first:len(PERIODS) - 1]
    return any(response in ticked for response in candidates)

def get_value(column, period):
    """Return a boolean Series: does each cell of *column* contain a frequency >= *period*?

    The combined period 'today_tonight' is the element-wise OR of the 'today'
    and 'tonight' results.
    """
    if period != 'today_tonight':
        return column.apply(includes_frequency, period=period)

    had_today = column.apply(includes_frequency, period='today')
    planning_tonight = column.apply(includes_frequency, period='tonight')
    return had_today | planning_tonight

def add_columns(bt_df, label_map):
    """Expand each Google Forms drug column into one boolean column per period.

    *label_map* maps the drug label used in the form column title
    ('Which drugs have you used? [<label>]') to the prefix used in Liam's data.
    For every entry a '<prefix>_<period>' column is added for each period in
    PERIODS, and the original form column is then removed from *bt_df* in place.
    """
    for gdrug, sdrug in label_map.items():
        gcolumn_label = 'Which drugs have you used? [{}]'.format(gdrug)
        responses = bt_df[gcolumn_label]
        for period in PERIODS:
            bt_df['{}_{}'.format(sdrug, period)] = get_value(responses, period)

        # The raw form column is no longer needed once it has been expanded
        bt_df.drop(columns=[gcolumn_label], inplace=True)

def freq_summary(drugs, column_prefix):
    """For each period, OR together the per-drug columns of the given drugs.

    Reads the '<drug>_<period>' columns of the module-level ``bt_df`` and
    stores the row-wise "any" in a new column named '<column_prefix>_<period>'.
    """
    for period in PERIODS:
        source_columns = ['{0}_{1}'.format(drug, period) for drug in drugs]
        target_column = '{0}_{1}'.format(column_prefix, period)
        bt_df[target_column] = bt_df[source_columns].any(axis=1)
     
# Form drug label -> column prefix for the drugs that also appear in Liam's data
column_map_core = {
    'Cannabis' : 'cannabis',
    'Cocaine' : 'cocaine',
    'Ecstasy pills' : 'ecstasy',
    'Nitrous (NOS, laughing gas)' : 'nitrous_oxide',
    'MDMA crystal/powder' : 'mdma',
    'Ketamine' : 'ketamine',
    'Magic mushrooms' : 'mushrooms',
    'LSD' : 'lsd',
    'Mephedrone (M-Cat)' : 'Mephedrone',
    'Synthetic cannabinoids ("Spice")' : 'spice_legals',
    'A powder which I had no idea what it was' : 'unknown_powder',
}

# Form drug label -> column prefix for the drugs that aren't present in Liam's data
column_map_extra = {
    '2C-B' : '2cb',
    'Amphetamine (speed)' : 'speed',
    'Codeine' : 'coedine',
    'Valium or other benzodiazepines' : 'valium',
}

# Expand the core drug columns first, then the extra ones
for drug_label_map in (column_map_core, column_map_extra):
    add_columns(bt_df, drug_label_map)


# Now need to add calculated data for the other drug uses:
# Legal (any_legal):    balloons, poppers, spice, other legal highs
# core drugs (core):    cannabis, cocaine, ecstasy, mdma, ketamine, mephedrone, speed, heroin
# polydrug ():    2 + illegal drugs
# polysubstance:    2 + illegal drugs and usual alcohol frequency

all_drugs = list(column_map_core.values()) + list(column_map_extra.values())
# ['cannabis', 'cocaine', 'ecstasy', 'nitrous_oxide', 'mdma', 'ketamine', 'mushrooms', 'lsd', 'Mephedrone', 'spice_legals', 'unknown_powder', '2cb', 'speed', 'coedine', 'valium']
freq_summary(all_drugs, 'any')

# 'any_legal' must be built from the legal subset only. It was previously computed from
# all_drugs, which made it an exact copy of the 'any' columns.
legal_drugs = ['nitrous_oxide', 'spice_legals', 'coedine', 'valium']
freq_summary(legal_drugs, 'any_legal')

core_drugs = ['cannabis', 'cocaine', 'ecstasy', 'mdma', 'ketamine', 'Mephedrone', 'speed']
freq_summary(core_drugs, 'core')

# Poly-drug use: True when at least two of the illegal drugs are flagged for the period
illegal_drugs = ['cannabis', 'cocaine', 'ecstasy', 'mdma', 'ketamine', 'mushrooms', 'lsd', 'Mephedrone', '2cb', 'speed']
column_prefix = 'polydrug'
for period in PERIODS:
    flags_for_period = bt_df[['{}_{}'.format(drug, period) for drug in illegal_drugs]]
    bt_df['{}_{}'.format(column_prefix, period)] = flags_for_period.sum(axis=1) >= 2

# Now poly substance use - NEED TO DECIDE ON ALCOHOL COLUMN
# column_prefix = 'polysubstance'
# for period in PERIODS:
#     clabel = '{}_{}'.format(column_prefix, period)
#     labels = ['polydrug_{}'.format(period), 'polydrug_{}'.format(period)]
#     bt_df[clabel] = bt_df[labels].sum(axis=1) >= 2

# The extra drugs have no counterpart in Liam's data, so remove all their period columns
extra_drug_columns = ["{}_{}".format(drug, period)
                      for drug in column_map_extra.values()
                      for period in PERIODS]
bt_df.drop(extra_drug_columns, axis=1, inplace=True)

# Convert all booleans to Yes/No strings ('polysubstance' is not included)
prefixes = list(column_map_core.values()) + ['any', 'any_legal', 'core', 'polydrug' ]
columns = ["{}_{}".format(prefix, period) for prefix in prefixes for period in PERIODS]
yes_no = {True : 'Yes', False : 'No'}
for column in columns:
    bt_df[column] = bt_df[column].map(yes_no)

#bt_df.to_csv('foo.csv')
print("DONE DRUGS at ", now())

In [None]:
#
# Now disentangle the disposals columns
#
# Map of Google Forms responses to Liam's categories
GMAP = {
    'I will ask the Loop to safely dispose of the rest of the sample in my possession' : 'a',
    'I will throw it away myself' : 'b',
    'I will take a smaller amount of it' : 'c',
    'I will take a larger amount of it' : 'd',
    'I will take the same amount as usual' : 'e',
    'I will take it over a longer time period' : 'f',
    'I will be more careful about mixing it with other substances' : 'g',
    'I will give it away instead of taking it myself' : 'h',
    'I will sell it' : 'i',
    'I will obtain more on site' : 'j',
    'I will warn my friends and acquaintances' : 'k',
    'I will warn others via social media and public websites' : 'l',
    'I will tell my dealer' : 'm',
    'I will return it to my dealer' : 'n',
    'I will ask for a refund from my dealer' : 'o',
    'I will go to another dealer' : 'p',
    'I will keep it to take it elsewhere, after the festival' : 'q',
    'I will do something else' : 'r'
}
CATEGORIES = list(sorted(GMAP.values()))

# Largest number of comma-separated fragments that a single GMAP option can span
_GMAP_MAX_FRAGMENTS = max(option.count(',') for option in GMAP) + 1

def get_disposal(cell):
    """Return a list of booleans, one per entry of CATEGORIES, for the options found in *cell*.

    *cell* is a comma-separated string of form responses; a NaN float is
    treated as "no options selected". Text that matches no GMAP option is
    ignored.

    Some options contain a comma themselves ('I will keep it to take it
    elsewhere, after the festival'), so splitting on ',' alone would never
    recognise them. Adjacent fragments are therefore re-joined, longest run
    first, and the re-joined text is looked up in GMAP.
    """
    result = [False] * len(CATEGORIES)
    if isinstance(cell, float) and np.isnan(cell):
        return result # NaN is considered as not having any value

    fragments = cell.split(',')
    start = 0
    while start < len(fragments):
        longest_end = min(len(fragments), start + _GMAP_MAX_FRAGMENTS)
        for end in range(longest_end, start, -1):
            candidate = ','.join(fragments[start:end]).strip()
            if candidate in GMAP:
                result[CATEGORIES.index(GMAP[candidate])] = True
                start = end
                break
        else:
            # No option starts at this fragment; skip it
            start += 1
    return result

label1 = 'After hearing today’s test results and harm reduction advice from The Loop, what do you plan to do with the sample?'
label2 = 'What other actions will you do?'

# Parse both question columns into frames with one boolean column per category
parsed = [bt_df[label].apply(get_disposal).apply(pd.Series) for label in (label1, label2)]

# A category counts if it was given in answer to either question
df_tmp = parsed[0] | parsed[1]

# Name the columns after the categories so they match Liam's data
df_tmp.columns = CATEGORIES

# Attach the category columns to bt_df
bt_df = pd.concat([bt_df, df_tmp], axis=1)

# Convert True/False to Yes/No
yes_no = {True : 'Yes', False : 'No'}
for column in CATEGORIES:
    bt_df[column] = bt_df[column].map(yes_no)

# The two raw question columns are now redundant
bt_df.drop(columns=[label1, label2], inplace=True)

print("DONE DISPOSALS at ", now())

In [None]:
# Fix remaining non-matching columns

# The form stores a yes/no question and its free-text detail in a single column.
# Split each into a Yes/No column (label1) plus a detail column (label2):
#   'ConcernsWithCurrentFeelings' -> detail in 'WhatConcerns'
#   'PriorConcerns'               -> detail in 'Why'
# Results are assigned back to bt_df rather than using fillna/where(inplace=True) on a
# column selection, which is a chained operation and is not guaranteed to update bt_df.
for label1, label2 in [('ConcernsWithCurrentFeelings', 'WhatConcerns'),
                       ('PriorConcerns', 'Why')]:
    bt_df[label1] = bt_df[label1].fillna('No')
    mask = bt_df[label1] == 'No'
    # Copy every non-'No' answer over to the detail column (NaN elsewhere)
    # NOTE(review): a literal 'Yes' answer is copied across too - confirm that is wanted
    bt_df[label2] = bt_df[label1].where(~mask)
    # Whatever was copied over counts as 'Yes'
    bt_df[label1] = bt_df[label1].where(mask, 'Yes')

# Fix Ethnicity / Ethnicity_other
label1 = 'Ethnicity'
label2 = 'Ethnicity_other'
# Find where the values aren't the 5 core
# NOTE(review): missing (NaN) ethnicities are not in the core list either, so they are
# recoded as 'Other' - confirm this is intended
mask = ~bt_df[label1].isin(['White', 'Black', 'Asian', 'Mixed Race', 'Other'])
bt_df[label2] = bt_df[label1].where(mask) # Copy the non-core values over
bt_df[label1] = bt_df[label1].where(~mask, 'Other') # Set copied over values to 'Other'

# Can't currently process data for other_ or polysubstance_ so just set to nan
columns = []
for prefix in ['other', 'polysubstance']:
    for period in PERIODS:
        columns.append("{}_{}".format(prefix, period))
# Add null columns
bt_df = pd.concat([bt_df, pd.DataFrame(columns=columns)], axis='columns')

# For now we just delete the beer and cider and set BeerCider to nan as we can't merge
bt_df.drop(['How much cider have you had today?', 'How much beer have you had today?'], axis=1, inplace=True)
bt_df['BeerCider'] = np.nan

# Can't calculate UnitsConsumed
bt_df['UnitsConsumed'] = np.nan

print("DONE FIX COLUMNS at ", now())
#bt_df.to_csv('foo.csv')


In [None]:
# Make sure columns match between two dataframes
btdfc = set(bt_df.columns.values)
spssc = set(spss_df.columns.values)
# Report the mismatch in both directions: a column missing from bt_df is as much
# of a problem as an unexpected extra one.
assert btdfc == spssc, "Differing columns: only in Google Forms data %s; only in SPSS data %s" % (
    str(btdfc - spssc), str(spssc - btdfc))

# Reorder bt_df columns to match spss_df
bt_df = bt_df[spss_df.columns.values]

# Join the two dataframes
spss_df = pd.concat([spss_df, bt_df], axis='rows', ignore_index=True)

# Sort
spss_df.sort_values(['Festival','SampleNumber'], ascending=True, inplace=True)

In [None]:
# Read in Guy's lab data
labdata = 'Loop 2017 Lab fixed data.xlsm'
lab_df = pd.read_excel(labdata, sheet_name='Raw LabData')
print("Read in from LAB ",len(lab_df))

# Rename the event name and 'Sample Number' columns so they match the SPSS data
lab_df.rename(columns={'Event  Name': 'Festival', 'Sample Number': 'SampleNumber'}, inplace=True)

# Delete any rows where SampleNumber or Festival is NA as we can't do anything with it
lab_df.dropna(subset=['SampleNumber', 'Festival'], inplace=True) # This just drops one case

# Capitalise all sample numbers (first character upper case, the rest lower case)
labels = ['SampleNumber']
lab_df.loc[:, labels] = lab_df[labels].apply(lambda x: x.str.capitalize())

# Change 'Matches Sold as?' to be yes/no
column = 'Matches Sold as?'
lab_df[column] = lab_df[column].map({1.0 : 'Yes', 0.0 : 'No'})

# Delete redundant columns
# 'Sample Source' is just the first letter of 'Source of Sample'
# 'Source of Sample' is only present for SGP2017 so can't be analysed across all festivals
columns = ['SPSS UID', 'Sample Source', 'Source of Sample'] 
lab_df.drop(columns, axis=1, inplace=True)

# Some sample numbers begin with W or F 
#print(len(lab_df[ ~ (lab_df['SampleNumber'].str.startswith('F') | lab_df['SampleNumber'].str.startswith('A')) ]))

# BT2017-F0334 - Guy's email of the 4th May - spectrum matches 2C-B
# Sample numbers repeat across festivals, so the fix is restricted to BT2017.
# .loc is used because .at is a scalar accessor and does not accept a boolean mask.
lab_df.loc[(lab_df['Festival'] == 'BT2017') & (lab_df['SampleNumber'] == 'F0334'), 'Final Result'] = '2cb'
print(now() + " Final LAB ",len(lab_df)) # shows we are left with 2543 datasets

In [None]:
# This cell is for sorting out the duplicate entries: rows of spss_df that share a
# Festival and SampleNumber. Duplicates are listed alongside the matching lab data
# for manual inspection, then hand-picked rows are either dropped or set aside as
# "orphans" with no usable sample number.

# This is clunky and can probably be done better - possibly with a MultiIndex?
# Reindex twice to create an 'index' column that can be used to identify individual samples:
# the first reset discards the old index, the second copies the fresh 0..n-1 index into a column
spss_df.reset_index(drop=True, inplace=True)
spss_df.reset_index(drop=False, inplace=True)

# Get all spss duplicated entries
df = pd.DataFrame()
for festival in ['BT2017', 'KC2017', 'SGP2017']:
    sample_numbers = spss_df.loc[spss_df['Festival'] == festival, ['SampleNumber']]
    # keep=False flags every member of a duplicated group, not just the repeats
    indexes = sample_numbers[sample_numbers.duplicated(keep=False)].index
    print("Festival {}: {} entries {} duplicates".format(festival, len(sample_numbers), len(indexes)))
    # iloc is valid here because the index was reset above, so labels equal positions
    df = pd.concat([df, spss_df.iloc[indexes,]], axis='rows')

# Merge in the relevant lab data
print("%d duplicates" % len(df))
dfa = pd.merge(df, lab_df, how='left', on=['Festival','SampleNumber'])
print("%d entries were merged" % len(dfa))

# Get list of all columns
columns = dfa.columns.values.tolist()

# Remove the ones we want to look at together
spss_cols = ['Age', 'Gender', 'Date & Time of intervention', 'SubmittedSubstanceAs']
lab_cols = ['Client age', 'Client gender', 'Date & Time of return', 'Bought as', 'Client suspicion', 'Matches Sold as?', 'Final Result']
for c in spss_cols + lab_cols + ['index', 'Festival', 'SampleNumber']:
    columns.remove(c)
# Put the ones we want together at the front, pairing each SPSS column with its lab equivalent
columns =  ['index', 'Festival', 'SampleNumber'] + \
           ['Age', 'Client age',
            'Gender', 'Client gender',
            'Date & Time of intervention', 'Date & Time of return',
            'SubmittedSubstanceAs', 'Bought as', 'Client suspicion', 'Matches Sold as?', 'Final Result'
           ] + columns
# Reorder the columns
dfa = dfa[columns]
dfa.sort_values(['Festival','SampleNumber'], ascending=True, inplace=True)
#dfa.to_csv('duplicates.csv')

# Festival BT2017: 873 entries 30 duplicates
# Festival KC2017: 67 entries 4 duplicates
# Festival SGP2017: 557 entries 36 duplicates
# NOTE(review): the numbers below are values of the 'index' column created above, so they
# are only valid for the exact row order this notebook produces from the current input
# files - presumably chosen by inspecting duplicates.csv; re-check if any input changes.
# The following entries appear to be genuine duplicates and can just be removed
duplicated = [16, 278, 435, 679, 783, 820, 824, 881, 1016, 1349, 1365]
# The below would need to be manually linked to lab data
orphans = [3, 28, 87, 213, 249, 274, 353, 396, 896, 975, 980, 986, 990, 
           991, 1056, 1074, 1076, 1077, 1107, 1158, 1159, 1174, 1290, 1291, 1383, 1426]

print(len(duplicated))
print(len(orphans))
l1 = len(spss_df)

# Save orphans to see if we can find them later
# NOTE(review): this copy is taken before the 'index' column is deleted below, so
# spss_orphan_df keeps its 'index' column - confirm that is wanted downstream
spss_orphan_df = spss_df[spss_df['index'].isin(orphans)].copy()

# Change all orphan sample numbers to FXXX
spss_orphan_df.loc[:,'SampleNumber'] = 'FXXX'

# Drop the unwanted rows (both the genuine duplicates and the orphans)
spss_df = spss_df[~spss_df['index'].isin(duplicated + orphans)]

l2 = len(spss_df)
print("Deleted {} entries based on duplication criteria".format(l1 - l2))

# Now delete the index column
spss_df.drop(['index'], axis=1, inplace=True)

In [None]:
# Merge the lab and spss dataframes where Festival and SampleNumber match
df_final = pd.merge(spss_df, lab_df, how='inner', on=['Festival','SampleNumber'])
print("%d entries were merged" % len(df_final))

# For checking which entries can't be merged - check for right_only
#pd.merge(lab_df, spss_df, how='outer', indicator=True)

# Append the orphan entries
# (pd.concat replaces DataFrame.append, which no longer exists in current pandas)
df_final = pd.concat([df_final, spss_orphan_df], ignore_index=True)

# Rename columns
d = { 'Bought as' : 'sold/acquired/advertised as' }
df_final.rename(columns=d, inplace=True)

# Sort first by Festival, then SampleNumber
df_final.sort_values(['Festival', 'SampleNumber'], ascending=True, inplace=True)

# Here we reorder columns that should be identical to:
# 1. spot data errors
# 2. remove duplicate columns once we're happy data is consistent
prefix_cols = ['Festival', 'SampleNumber',
             'Sample submission time', 'Date & Time of return', 'Date & Time of intervention', 
             'Client age', 'Age', 'Client gender', 'Gender',
              'sold/acquired/advertised as', 'SubmittedSubstanceAs', 'Client suspicion', 'Final Result',
              'As_expected','Matches Sold as?' ]

# Get the list of columns excluding the ones in prefix_cols
cols = [c for c in df_final.columns.tolist() if c not in prefix_cols]
# Prepend prefix_cols to create the new list
cols = prefix_cols + cols
# Reorder columns
df_final = df_final[cols]

# capitalize all genders for consistency
labels = ['Client gender', 'Gender']
df_final.loc[:, labels] = df_final[labels].apply(lambda x: x.str.capitalize())

# Set any MISSING to be nan
df_final.loc[:, labels] = df_final.loc[:, labels].replace({'Missing':np.nan})

# Fix case across all the Yes/No columns (per-drug period columns and disposal categories)
cprefix = ['cannabis', 'cocaine', 'ecstasy', 'mdma', 'ketamine', 'lsd', 'nitrous_oxide', 
         'mushrooms', 'Mephedrone', 'spice_legals','unknown_powder', 'other', 'any', 'any_legal', 'core', 
         'polydrug', 'polysubstance']
columns = []
for prefix in cprefix:
    for period in PERIODS:
        columns.append("{}_{}".format(prefix, period))
columns += list('abcdefghijklmnopqr')

# Apply mapping
df_final.loc[:, columns] = df_final[columns].apply(lambda x: x.str.capitalize())

# Create an initial UID column at beginning: <Festival>-<SampleNumber>-<row index label>
uid = df_final['Festival'] + "-" + df_final['SampleNumber'] + '-' + df_final.index.to_series().astype(str)
df_final.insert(loc=0, column='UID', value=uid)

# for c in df_final.columns.values:
#     print("COLUMN %s: %s" %(c, df_final[c].unique()))

In [None]:
# This cell is for canonicalising the drug names

# sold_as = set(df_final['sold/acquired/advertised as'].unique())
# submitted_as = set(df_final['SubmittedSubstanceAs'].unique())
# client_suspicion = set(df_final['Client suspicion'].unique())
# final_result = set(df_final['Final Result'].unique())
# all_drugs = sold_as.union(submitted_as, final_result, all_drugs)
# print(all_drugs)

# bought_as = unique(lab_df['Bought as'])
# client_suspicion = unique(lab_df['Client suspicion'])
# final_result = unique(lab_df['Final Result'])
# submitted_as = unique(spss_df['SubmittedSubstanceAs'])
# other_specify = unique(spss_df['other_specify'])

# print("Bought as:", bought_as)
# print("Client suspicion:", client_suspicion)
# print("Final Result:", final_result)
# print("SubmittedSubstanceAs:", submitted_as)
# print("other_specify:", other_specify)

# x = bought_as + client_suspicion + final_result + submitted_as + other_specify
# x = sorted(set(x))
# print("X: ",x)

# Canonical drug names. Each constant is the value that the variant spellings listed
# in drugs_map are replaced with.
AMPHETAMINE = 'amphetamine'
BENZODIAZEPINE = 'benzodiazepine'
COCAINE = 'cocaine'
FOUND = 'found'
KETAMINE = 'ketamine'
LSD = 'lsd'
MEPHEDRONE = 'mephedrone'
MDMA = 'mdma'
NETHYLPENTYLONE = 'n-ethylpentylone'
PSYCHEDELIC = 'psychedelic'
TWOCB = '2cb'
UNKNOWN = 'unknown'

# Maps canonical name -> list of variant spellings seen in the data.
# The drug columns are lower-cased and stripped before replacement, so variants must
# be written in lower case to match ('Speed' below can therefore never match).
drugs_map = { 
    AMPHETAMINE : ['speed', 'Speed', 'base/speed', 'adderall'],
    BENZODIAZEPINE : ['chinese valium', ],
    COCAINE : ['coke', 'cut cocaine'],
    FOUND : ['unknow found'],
    KETAMINE : ['?ket', '/ketamie', 'maybe ketamine?', 'katamine', 'vanila ketamine', 
                'vetamine', 'not mdma. ketamine?', 'ketamoine'],
    LSD : ['acid', 'liquid lsd'],
    MEPHEDRONE : ['meow meow', 'mcat'],
    MDMA : ['mdxx', 'mda/mdea/mdma', 'mdma,', 'mandy', 'probaby mdma', 'mdma?', 'mdma with caffeine',
           '3/4 of pill green shooting star', 'ecstacy', 'ecstacy pill', 'ecstasy',
            'ecstasy pill', 'esctacy pill sample', 'estacy pill', 'pill'],
    #NETHYLPENTYLONE : ['n-ethylpentylone'],
    PSYCHEDELIC : [ '4-aco dmt', '4-aco-dmt', '4aco', '4acodmt', '5meomipit', 'dmt_2cb', 'dmt', 'ayahuasca'],
    TWOCB : ['2 cb', '2c-b'],
    UNKNOWN : ['unknown pill', 'unsure', 'unsure - maybe dmt', 'unsure of content', 
               'no effect', 'no idea', 'data missing', ''],   
    }

# Here we overwrite the values - if necessary we could create separate columns
# Create dict for the replace function, of the form {column : {value_to_replace : replacement_value}}
replace_d = {}
# Columns holding free-text drug names that are to be canonicalised
drug_columns = ['sold/acquired/advertised as', 'Client suspicion', 'Final Result', 'SubmittedSubstanceAs', 'other_specify']

# Firstly convert all columns to lower case and remove any spaces
def clean(value):
    """Return *value* lower-cased with surrounding whitespace removed.

    Non-string values are returned unchanged. ``isinstance`` is used so that
    str subclasses (e.g. NumPy string scalars) are cleaned as well.
    """
    if isinstance(value, str):
        return value.strip().lower()
    return value

# Lower-case and strip every drug-name column so the look-ups below are consistent
for column in drug_columns:
    df_final[column] = df_final[column].map(clean, na_action='ignore')

# Build {column : {variant_name : canonical_name}} for every drug column
replace_d = {column: {name: drug
                      for drug, names in drugs_map.items()
                      for name in names}
             for column in drug_columns}

# Replace values
df_final.replace(replace_d, inplace=True)

# NO_ANALYSIS is treated separately as it only applies to Final Result - it also can't be
# included in the other dict or the replacement values and keys overlap
NO_ANALYSIS = 'analysis_inconclusive'
no_analysis = ['compound not in library', 'inconclusive', 'insufficient quantity for testing', 
               'insufficient sample', 'insufficient sample', 'lost', 'no active component identified', 
               'no match', 'no match', 'none', 'nothing detected', 'result missing', 'unable to test', 'unknown']

# Fix 'Final Result' for NO_ANALYSIS
column = 'Final Result'
replace_d = {column: {}}
for name in no_analysis:
    replace_d[column][name] = NO_ANALYSIS

# Replace values
df_final.replace(replace_d, inplace=True)

# Additional grouping requested by Fiona
column = 'sold/acquired/advertised as'
replace_d = {column: {'found' : 'unknown',
                      "don't know" : 'unknown',
                      'not sure' : 'unknown',
                     }}
df_final.replace(replace_d, inplace=True)

# Calculate where they do/don't match
df_final['As_expected'] = (df_final['Final Result'] == df_final['sold/acquired/advertised as']).map({True : 'Yes', False : 'No'})
# Guy 28/10/18: 'As_expected' should be null whenever the sample is found or when the
# submission 'acquired as' data is blank or unknown
mask1 = df_final['Obtained'].isin(['Found elsewhere', 'Found at this event'])
mask2 = df_final['sold/acquired/advertised as'].isin(['unknown', np.nan])
mask = mask1 | mask2
df_final.loc[mask, ['As_expected']] = np.nan

# BT2017 F0641 was submitted as 'ketamine or mdma' so we need to manually set the As_expected result
# (.loc rather than .at: .at is a scalar accessor and does not accept a boolean mask)
df_final.loc[(df_final['SampleNumber'] == 'F0641') & (df_final['Festival'] == 'BT2017'), 'As_expected'] = 'Yes'
#
print("DONE CANONICALISE DRUG NAMES at ", now())

In [None]:
# Additional canonicalisation of columns.
# canon_map has the shape {column : {old_value : new_value}}
obtained_values = {
    'Off site' : 'Bought off site',
    'On site' : 'Bought at this event',
    'Online' : 'Bought online',
    np.nan : 'Missing',
}
canon_map = {'Obtained' : obtained_values}
df_final.replace(canon_map, inplace=True)
print("FINAL CANONICALISATION AT ", now())

In [None]:
#
# Finally, dump everything to excel
#
# NOTE(review): recent pandas releases can no longer write the legacy '.xls' format;
# if this fails with "No engine for filetype", switch the extension to '.xlsx'.
filename = 'JensCleanedData_XX.xls'
# The context manager saves and closes the workbook on exit, replacing the
# explicit writer.save() call, which no longer exists in current pandas.
with pd.ExcelWriter(filename) as writer:
    df_final.to_excel(writer, sheet_name='MergedData', index=False)
print(now() + " Wrote ",filename)

In [None]:
# # Get a list of all the lab data samples that can't be merged - use merge so the method is same as before
# df_check = spss_df.merge(lab_df, how='outer', right_on=['Festival','SampleNumber'], left_on=['Festival','SampleNumber'], indicator=True)
# columns = df_check.loc[df_check['_merge'] == 'right_only', ['Festival', 'SampleNumber']]
# # Remove any that don't start with F
# columns = columns[columns['SampleNumber'].map(lambda x: x.startswith('F'))]
    
# # Create combined coloumns so we can select on them - sure there is better way to do this
# columns['foo'] =  columns['Festival'] + columns['SampleNumber']
# lab_df['foo'] = lab_df['Festival'] + lab_df['SampleNumber']

# lab_df1 = lab_df[lab_df['foo'].isin(columns['foo'])]

# #print(spss_orphans['Date & Time of intervention'])
