# Data Wrangling for [<u>There Goes the Neigborhood</u>](https://github.com/hhuntz/ThereGoesTheNeighborhood)

Data manually compiled [here](https://docs.google.com/spreadsheets/d/1i2ASwYq7ZfX76pll3Sxc0Jw9yYac8svfa6LnSfxOLLY/edit?usp=sharing) from original CO state source [here](https://drive.google.com/drive/folders/0B-ZjnNx-rL_mTHU4dHhiX1dEbU0?resourcekey=0-j0x5DFB5M-7nNRLa-8g2Zw)

## Step 1: Filter data and manage missing values

In [1]:
import pandas as pd
import numpy as np

In [2]:
# load data
df = pd.read_excel('CO Cannabis Shops.xlsx', sheet_name = None)
# combine to one df
df = pd.concat(df.values(), ignore_index=True)
df

FileNotFoundError: [Errno 2] No such file or directory: 'CO Cannabis Shops.xlsx'

In [None]:
# keep only med and rec store licenses -- remove grows, manufacturers, etc.
df_stores = df[df['NUM'].str.contains('402R-|402-', na = False, regex = True)]
#list(df_stores.NUM.unique()) # making sure above line worked
df_stores

In [None]:
# change TYPE vals
# some missing in data; rest unclear
df_stores['TYPE'] = np.where(df_stores['NUM'].str.contains('402R-'), 'Rec', 'Med')
df_stores

In [None]:
# fill missing DBA names with LLC names
df_stores.DBA = df_stores.DBA.fillna(df.NAME)
# remove 'LLC' from DBA names
df_stores.DBA = df_stores.DBA.str.replace(' LLC', '')
df_stores

## Step 2: Transform to individual dispensary records

In [19]:
# groupby name and zip -- some (112) names have multiple zips 
df_stores = df_stores.groupby([df_stores['DBA'], df_stores['ZIP']])

# grab first and last months and years aggregated at dispensary level
aggregation_functions = {'TYPE': lambda x: ', '.join(x.unique()), 'CITY': 'first', 
                         'YEAR': ['first', 'last'], 'MONTH': ['first', 'last']}
df_new = df_stores.groupby([df_stores['DBA'], df_stores['ZIP']]).aggregate(aggregation_functions)

# rename columns and take a look
df_new.columns = ['TYPE', 'CITY', 'YEAR_FIRST', 'YEAR_LAST', 'MONTH_FIRST', 'MONTH_LAST']
df_new

Unnamed: 0_level_0,Unnamed: 1_level_0,TYPE,CITY,YEAR,YEAR,MONTH,MONTH
Unnamed: 0_level_1,Unnamed: 1_level_1,<lambda>,first,first,last,first,last
DBA,ZIP,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
,80216.0,Med,Denver,2020,2022,9,9
,80909.0,Med,Colorado Springs,2022,2022,5,12
HAPPY BUDDHA WELLNESS CENTER,80909.0,Med,Colorado Springs,2017,2017,9,9
MOUNTAIN CANNABIS,81432.0,Rec,Ridgway,2019,2019,8,10
10185 RIDGE,80033.0,Med,Wheat Ridge,2018,2018,4,3
...,...,...,...,...,...,...,...
Zipz,80907.0,Med,Colorado Springs,2022,2022,5,5
bdMindful,80909.0,Med,Colorado Springs,2014,2014,12,12
beMINDFUL,80909.0,Med,Colorado Springs,2018,2018,8,7
one:eleven,81143.0,"Rec, Med",Moffat,2022,2022,9,12


In [23]:
# write data
df_new.to_csv('co_cannabis_stores.csv')