The data sources are:

### 1. Liam's SPSS coded data
**File:** The Loop 2017 Final Interventions.xlsx

Exported as Excel from SPSS, keeping the variable names.

This file contains 1325 entries.


27 have null festival or sample numbers so can't be used, leaving 1298


One has sample number 12151, two have sample number 0 - these cannot be merged.


This leaves 1295 - all of which can be merged

### 2. Guy's cleaned up lab data
**File:** Loop 2017 Lab fixed data.csv

Saved from: Dropbox/Testing/2017 results processing/Loop 2017 Lab fixed data.xlsm in the ‘Raw Lab Data’ sheet

This file contains 2544 entries


1900 entries start with F


621 entries begin with A (amnesty) so can't be merged


23 entries begin with W (?) so can't be merged


Entry SGP2017 F0465 needs editing as its 'Client gender' value is 'FemaleaMalee'

### 3. Boomtown Intervention Questionnaire
**File:** BTReport 2017 - Form responses 3.csv

Exported from: https://docs.google.com/spreadsheets/d/15pdETY0HK-VbBcV-N0swt6ZrRBbeDnZR5RGDzfq95dg

This file contains 194 entries

### Merging the data

Merging the data on Festival and SampleNumber resulted in 1295 entries



In [1]:
# Module imports
import os
import numpy as np
import pandas as pd

In [2]:
# Load Liam's SPSS-coded intervention data (exported from SPSS as Excel) and
# normalise the Festival / SampleNumber keys so they can be merged with the
# lab data later on.
spssdata = '/opt/random/The Loop 2017 Final Interventions.xlsx'

spss_df = pd.read_excel(spssdata)

# Change festival names.
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: in-place edits of a selected column are not guaranteed to
# reach the parent frame on recent pandas versions.
spss_df['Festival'] = spss_df['Festival'].replace(
    ['BoomTown', 'KC', 'SGP'],
    ['BT2017', 'KC2017', 'SGP2017'],
)

# Ensure all Sample numbers are consistent
# 1. Delete any rows where SampleNumber or Festival is NA as we can't do anything with it
spss_df.dropna(subset=['SampleNumber', 'Festival'], inplace=True)

# 2. Make all sample numbers a 4-digit code starting with F
spss_df['SampleNumber'] = spss_df['SampleNumber'].apply(lambda x: 'F{:04d}'.format(int(x)))

# Combine date and time columns into new single column
spss_df['Date'] = pd.to_datetime(spss_df['Date'])  # Convert Date to datetime object
# pd.datetime was only an alias of the stdlib datetime class and is no longer
# available in current pandas; Timestamp.combine does the same job.
spss_df['Date & Time of intervention'] = spss_df.apply(
    lambda r: pd.Timestamp.combine(r['Date'], r['Time']),
    axis=1,
)

# Remove Day, Date, Time and SurveyID columns
spss_df.drop(['Day', 'Date', 'Time', 'SurveyID'], axis=1, inplace=True)

# Report how many rows remain after cleaning (1298 for the 2017 export)
print(len(spss_df))

1298


In [3]:
# Load Guy's cleaned-up lab data and bring its key columns in line with the
# SPSS frame (same column names, upper-cased sample numbers).
labdata = '/opt/random/Loop 2017 Lab fixed data.csv'
date_cols = ['Sample submission time', 'Date & Time of return']
lab_df = pd.read_csv(
    labdata,
    encoding="ISO-8859-1",
    engine="python",
    parse_dates=date_cols,
)

# Rename the event-name and sample-number columns so they match the SPSS data
lab_df = lab_df.rename(columns={'Event  Name': 'Festival', 'Sample Number': 'SampleNumber'})

# Rows without a SampleNumber or Festival cannot be matched to anything, so
# discard them (this only drops one case)
lab_df = lab_df.dropna(subset=['SampleNumber', 'Festival'])

# Uppercase all sample numbers
labels = ['SampleNumber']
lab_df.loc[:, labels] = lab_df[labels].apply(lambda col: col.str.upper())

# Count the sample numbers that begin with neither F nor A
#print(len(lab_df[ ~ (lab_df['SampleNumber'].str.startswith('F') | lab_df['SampleNumber'].str.startswith('A')) ]))

In [4]:
# Merge the SPSS intervention data with the lab data on the shared keys, put
# the columns that should agree side by side, tidy the gender columns and
# export the result to Excel.
dft = pd.merge(spss_df, lab_df, how='inner', on=['Festival', 'SampleNumber'])
print("%d entries were merged" % len(dft))

# For checking which entries can't be merged - check for right_only
#pd.merge(lab_df, spss_df, how='outer', indicator=True)

# Sort first by Festival, then SampleNumber
dft.sort_values(['Festival', 'SampleNumber'], ascending=True, inplace=True)

# Here we reorder columns that should be identical to:
# 1. spot data errors
# 2. remove duplicate columns once we're happy data is consistent
prefix_cols = ['Festival', 'SampleNumber',
               'Sample submission time', 'Date & Time of return', 'Date & Time of intervention',
               'Client age', 'Age', 'Client gender', 'Gender', 'Bought as', 'SubmittedSubstanceAs']

# prefix_cols first, then every remaining column in its existing order
cols = prefix_cols + [c for c in dft.columns.tolist() if c not in prefix_cols]
# Reorder columns; copy so the .loc assignments below write to an independent
# frame rather than to a selection of the merged one
dft = dft[cols].copy()

# Uppercase all genders for consistency
labels = ['Client gender', 'Gender']
dft.loc[:, labels] = dft[labels].apply(lambda x: x.str.upper())
# Set any MISSING to be nan
dft.loc[:, labels] = dft.loc[:, labels].replace({'MISSING': np.nan})

# Dump to excel. The context manager writes and closes the workbook even if
# to_excel raises; ExcelWriter.save() is not available on current pandas.
with pd.ExcelWriter('merged.xlsx') as writer:
    dft.to_excel(writer, sheet_name='MergedData', index=False)

1295 entries were merged


In [5]:
# # See which non-na ages don't match
# # 540 entries have valid ages
# print(len(dft))
# df = dft[pd.notnull(dft['Client age']) & pd.notnull(dft['Age'])]
# print(len(df))
# df = df[df['Client age'] != df['Age']]
# # 127 don't match
# print(len(df))
# # df.to_csv('foo.csv')


# Cross tab 'Client gender' and 'Gender'
#print(dft['Client gender'].unique())
#print(dft['Gender'].unique())

# # Look where they don't match
# df = dft[pd.notnull(dft['Client gender']) & pd.notnull(dft['Gender'])]
# df = df[df['Client gender'] != df['Gender']]
# # 127 don't match
# print(len(df))
# df.to_csv('foo.csv')



In [41]:
#
# Attempt to disentangle the Form responses into a form they can be merged with Liam's data
#
# Read the Boomtown intervention questionnaire export, parsing the
# 'Timestamp' column as dates.
bt_interventions = '/opt/random/BTReport 2017 - Form responses 3.csv'
date_cols = ['Timestamp']
bt_df = pd.read_csv(
    bt_interventions,
    parse_dates=date_cols,
    engine="python",
)

"""
Which drugs have you used? [Cocaine]

cocaine_ever	Ever had this drug 
cocaine_year	Ever had this drug in the past year
cocaine_month	Ever had this drug in the past month
cocaine_week	Ever had this drug in the past week
cocaine_yesterday	Ever had this drug yesterday
cocaine_today	Ever had this drug today
cocaine_tonight	Are you having this drug tonight
cocaine_today_tonight	Have they had this drug today or will they have it tonight?

Take column -> return df with names matching Liam's coding

For each cell -> split by commas to get all values
Return boolean of which columns apply

'ever', 
'year',
'month',
'week',
'yesterday',
'today',
'tonight',
'today_tonight',


"""

# Sample raw form answer (comma-separated options). Looks like scratch input
# for trying parse_cell by hand; it is not referenced elsewhere in the visible code.
c = 'Never had, Had today, (Probably) planning later'
def parse_cell(cell):
    """Convert one multi-choice form answer into a list of 'Yes'/'No' flags.

    Args:
        cell: The raw cell value: a comma-separated string of the options
            ticked on the form, or NaN/None when the question was left blank.

    Returns:
        A list of eight strings, each 'Yes' or 'No', in the order
        ever, year, month, week, yesterday, today, tonight, today_tonight.
        'today_tonight' is 'Yes' when either 'today' or 'tonight' is 'Yes'.
        A blank cell gives eight 'No' values, so every path returns the same
        type (previously the blank path returned booleans).
    """
    # Form options, in the same order as the first seven output flags
    form_responses = [
        'Had in my life',
        'Had in last year',
        'Had in last month',
        'Had in last week',
        'Had yesterday',
        'Had today',
        '(Probably) planning later',
    ]
    n_flags = len(form_responses) + 1  # one extra slot for 'today_tonight'

    # Blank answer: nothing was ticked
    if cell is None or (isinstance(cell, float) and np.isnan(cell)):
        return ['No'] * n_flags

    # The form also offers 'Never had'. It matches none of the options above,
    # so it needs no special handling: on its own it yields all 'No', and
    # alongside other options it is simply ignored.
    ticked = {value.strip() for value in cell.split(',')}
    flags = [response in ticked for response in form_responses]

    # Add 'today_tonight': had it today or is planning to have it later
    had_today, planning_later = flags[-2], flags[-1]
    flags.append(had_today or planning_later)

    return ['Yes' if flag else 'No' for flag in flags]

#print(parse_cell(np.nan))

# Expand the Cocaine question into one flag column per time window.
clabel = 'Which drugs have you used? [Cocaine]'
df = pd.DataFrame(bt_df[clabel])

# parse_cell returns one list of eight flags per row; spread each list over
# eight named columns. (Indexing with df['ever', 'year', ...] would address a
# single column keyed by a tuple and store the whole list in it.)
flag_cols = ['ever', 'year', 'month', 'week', 'yesterday', 'today', 'tonight', 'today_tonight']
parsed = df[clabel].apply(parse_cell)
flags_df = pd.DataFrame(parsed.tolist(), index=df.index, columns=flag_cols)
df = df.join(flags_df)

df.to_csv('foo.csv')
# clabel = 'Which drugs have you used? [Cocaine]'
# print(bt_df[clabel].unique())

# df = pd.DataFrame(bt_df[clabel])
# #print(df)
# #df.pivot(index='date', columns='variable', values='value')
# df = df.pivot(columns=clabel, values=clabel)

# # Remove any nan columns and the label we've used
# df.drop(np.nan, axis=1, inplace=True)

# print(df)