/
categorization_productnames.Rmd
429 lines (363 loc) · 23.2 KB
/
categorization_productnames.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
---
title: 'Classifying Extracts: Product Name Text'
author: "Lauren Renaud & Krista Kinnard"
date: "May 15, 2017"
output:
html_document:
toc: yes
toc_depth: 4
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE, warning = FALSE)
library(dplyr)
library(readr)
library(knitr)
library(tidyr)
options(scipen = 4)
# locatons
locations <- read.csv("../data/Dec2016/cleaned/locations/all_locations.csv", sep=",", header=T)
locations <- select(locations, location_id, name, city, loc_type = typesimp)
```
```{r setup2, include=FALSE, cache=FALSE}
# pull in & clean dfs ------
# inventory
inventory <- readr::read_csv("../data/Dec2016/biotrackthc_inventory.csv")
inventory$inv_date <- as.Date(as.POSIXct(inventory$sessiontime,
origin = "1970-01-01", tz="America/Los_Angeles"))
inventory$sample_id <- as.numeric(inventory$sample_id)
inventory$parentid <- as.numeric(inventory$parentid)
# join to inventory types
inventory.types <- read.csv("../data/Dec2016/cleaned/inventory_type.csv", sep=",", header=T)
inventory <- inventory %>%
# get inventory types
left_join(inventory.types, by="inventorytype") %>%
# get location types
left_join(locations, by=c("location" = "location_id")) %>%
# drop other location variables
select(-(name), -(city))
rm(inventory.types)
# selected variables
inv.select <- select(inventory, inventoryid = id, inv_strain = strain, parentid,
inv_type_name, inventoryparentid, productname, sample_id,
inv_location = location, inv_loctype = loc_type)
# dispensing
dispensing <- readr::read_csv("../data/Dec2016/biotrackthc_dispensing.csv")
dispensing <- filter(dispensing, inventorytype==24)
dispensing$saletime <- as.Date(as.POSIXct(dispensing$sessiontime,
origin = "1970-01-01", tz="America/Los_Angeles"))
dispens.select <- select(dispensing, dispensingid = id, retail_invid = inventoryid, retail_loc = location,
sale_price = price, saletime)
dispens.select <- dispens.select %>%
left_join(locations, by=c("retail_loc" = "location_id")) %>%
# drop other location variables
select(-(name), -(city))
## testing going back one more step then looking at product types
inhalants.joined <- dispens.select %>%
# first get to retailer's inventory to get `parentid`
# as well as retail productname, retail strain
dplyr::left_join(inv.select, by=c("retail_invid" = "inventoryid")) %>%
# rename to inventory variables to clarify they are retail variables
rename(retail_strain = inv_strain, retail_parentid = parentid, retail_invtype = inv_type_name,
retail_invparentid = inventoryparentid, retail_prodname = productname, retail_sampId = sample_id,
retail_location = inv_location) %>%
# then use retailer's parentid to get to processor's inventoryid
left_join(inv.select, by=c("retail_parentid" = "inventoryid")) %>%
# then rename processor inventory variables
rename(process_strain = inv_strain, process_parentid = parentid,
process_invparentid = inventoryparentid, process_invtype = inv_type_name,
process_prodname = productname, process_sampleid = sample_id, process_loc = inv_location) %>%
# then use processors's parentid to get to one step further back inventoryid
# these variables will now be called "inv_variablename"
left_join(inv.select, by=c("process_invparentid" = "inventoryid"))
# 5% of product names for inhalants are missing, which is ~ the same as in full inventory df
# sum(is.na(inventory$productname[inventory$inventorytype==24])) / sum(inventory$inventorytype==24)
# # 33% are missing for processors' product name
# sum(is.na(inhalants.joined$inv_prodname)) / nrow(inhalants.joined)
# # inventory type
# # missing for 70 of joined inhalants (out of 6 million)
# sum(is.na(inhalants.joined$retail_invtype)) / nrow(inhalants.joined)
#
# dispeningtypes <- unique(dispensing$inventorytype[!is.na(dispensing$inventorytype)])
```
```{r more_setup}
# first get list/df of inhalantnames
inhalantnames <- as.data.frame(unique(inhalants.joined$retail_prodname))
# rename column so we can call it
colnames(inhalantnames) <- "retail_prodname"
# get a df that is for each unique inhalant name, the boolean values for categorizing
inhalantnames <- inhalantnames %>%
dplyr::rowwise() %>%
# boolean for cartridge
dplyr::mutate(cartridge = grepl("cart|vap|vc|pen|refill", retail_prodname, ignore.case = T),
# categorical
type = ifelse(
grepl("BHO|butane", retail_prodname, ignore.case = T), "BHO",
ifelse(grepl("CO2", retail_prodname, ignore.case = T), "CO2",
ifelse(grepl("hash", retail_prodname, ignore.case = T), "hash",
ifelse(grepl("hash", retail_prodname, ignore.case = T), "kief",
ifelse(grepl("bubble", retail_prodname, ignore.case = T), "bubble",
ifelse(grepl("shatter", retail_prodname, ignore.case = T), "shatter",
ifelse(grepl("dab", retail_prodname, ignore.case = T), "dab",
ifelse(grepl("resin", retail_prodname, ignore.case = T), "resin",
ifelse(grepl("wax", retail_prodname, ignore.case = T), "wax",
"other"))))))))
),
# now booleans for each
oil = grepl("oil", retail_prodname, ignore.case = T),
CO2 = grepl("CO2", retail_prodname, ignore.case=T),
BHO = grepl("BHO|butane", retail_prodname, ignore.case = T),
hash = grepl("hash", retail_prodname, ignore.case = T),
kief = grepl("kief|keif", retail_prodname, ignore.case = T),
bubble = grepl("bubble", retail_prodname, ignore.case = T),
shatter = grepl("shatter", retail_prodname, ignore.case = T),
dab = grepl("dab", retail_prodname, ignore.case = T),
resin = grepl("resin", retail_prodname, ignore.case = T),
wax = grepl("wax", retail_prodname, ignore.case = T),
uncategorized = (!cartridge & !CO2 & !oil & !BHO & !kief & !bubble & !shatter & !dab & !resin & !wax),
uncat_notcart = (!CO2 & !oil & !BHO & !kief & !bubble & !shatter & !dab & !resin & !wax)
)
# alternate method that creates a boolean for cartridge or not cartridge
# and then another column for 'type' that's more specific
# inhalantnames <- inhalantnames %>%
# dplyr::rowwise() %>%
# dplyr::mutate(cartridge = grepl("cart|vap|vc|pen|refill", productname, ignore.case = T),
# type = ifelse(
# grepl("BHO|butane", productname, ignore.case = T), "BHO",
# ifelse(grepl("CO2", productname, ignore.case = T), "CO2",
# ifelse(grepl("hash", productname, ignore.case = T), "hash",
# ifelse(grepl("hash", productname, ignore.case = T), "kief",
# ifelse(grepl("bubble", productname, ignore.case = T), "bubble",
# ifelse(grepl("shatter", productname, ignore.case = T), "shatter",
# ifelse(grepl("dab", productname, ignore.case = T), "dab",
# ifelse(grepl("resin", productname, ignore.case = T), "resin",
# ifelse(grepl("wax", productname, ignore.case = T), "wax",
# "other"))))))))
# )
# )
# join classified inhalantnames back to dispening df
inhalants.joined$retail_prodname <- as.factor(inhalants.joined$retail_prodname)
inhalants <- left_join(inhalants.joined, inhalantnames, by="retail_prodname")
prelimCat <- inhalants %>%
dplyr::select(dispensingid, cartridge:uncategorized) %>%
tidyr::gather(keyword, status, cartridge:uncategorized) %>%
dplyr::group_by(keyword) %>%
dplyr::summarise(perc = round(sum(status==TRUE) / nrow(inhalants)*100,1),
count = sum(status==TRUE)) %>%
dplyr::arrange(desc(perc))
# trying to get count of how many products hit more than category
# doing just based on names, not actual number of products
# because full df was long to run, but could check again later
num.keywords <- inhalantnames %>%
tidyr::gather(keyword, status, cartridge:uncategorized) %>%
dplyr::group_by(retail_prodname) %>%
dplyr::summarise(numCat = sum(status==TRUE)) %>%
dplyr::ungroup() %>%
dplyr::group_by(numCat) %>%
dplyr::summarise(distro_numkeywords = n(),
perc_numkeywords = n() / nrow(inhalantnames))
```
# Key findings
- Able to perform some form of **classification for about `r round(sum(!inhalants$uncategorized) / nrow(inhalants)*100,0)`%** of the products for inhalation in the dispensing file (counted by number of items sold in the dispensing file).
- Have reviewed these preliminary categories from a domain standpoint, and to consider purposes of classification.
- Need stricter categorization for about `r round(sum(num.keywords$distro_numkeywords[num.keywords$numCat>1]) / nrow(inhalantnames)*100,0)`% of products sold that match more than one keyword to be incorporated in later iterations of this method.
# Exploring Keyword Search
Our first pass at categorizing inhalants is by searching product names for what would indicate product types. This method is able to classify `r round(sum(!inhalants$uncategorized) / nrow(inhalants)*100,1)`% of "Marijuana Extract for Inhalation" products that were sold in the dispensing file, which is `r round(sum(!inhalantnames$uncategorized) / nrow(inhalantnames)*100,1)`% of inhalant product names.
This table shows how many products fit each classification as well as the strings used for classificiation. (For "hash", it also picks up "hashish", but hashish was not searched for separately.) These do not sum to 100% because `r round(sum(num.keywords$distro_numkeywords[num.keywords$numCat>1]) / nrow(inhalantnames)*100,1)`% of product names contain more than one string. `Count` here is counting number of products appearing in the dispensing (retail) file, not a count of product names.
While we could do a more thorough analysis of all the overlapping keywords, for this preliminary pass below we can look at "cartridges" and "oil" to see what the overlap for these keywords look like.
```{r prelimcat_table, warning=FALSE}
# add a column of strings used for categorization
category.strings <- read.csv("../data/Dec2016/cleaned/testing/categoryStrings.csv", sep=",", header=T)
prelimCat_labels <- prelimCat %>%
dplyr::left_join(category.strings, by="keyword")
# reorder the new columsns
refcols <- c("string", "keyword", "perc", "count")
prelimCat_labels <- prelimCat_labels[, c(refcols, setdiff(names(prelimCat_labels), refcols))]
# output table
knitr::kable(prelimCat_labels, col.names = c("Search String", "Keyword", "Percent", "Count"),
# make all columns right aligned so it's easier to see keyword near percent value
align=c("r", "r", "r", "r"))
```
### Overlap: Cartridges
```{r prelimcat_cartridges, warning=FALSE}
# take inhalants dataframe
overlap.cart <- inhalants %>%
# limit to only cartridges
filter(cartridge) %>%
# get a total count of number of cartridges
dplyr::mutate(cart_count = n()) %>%
# select that count, row identifier, and then catetories other than cartridge
dplyr::select(cart_count, dispensingid, oil:wax) %>%
# use gather to get dispensingid + one colum for keyword + one colum for true or false
tidyr::gather(keyword, status, oil:wax) %>%
# group on keyword to get statistics for each keyword
dplyr::group_by(keyword) %>%
dplyr::summarise(
# percent of all inhalant products that match that classification
perc.inhalants = round(sum(status==TRUE) / nrow(inhalants)*100,2),
# percent of cartridges that match that classification
# using cart_count[1] because there are many cart_count's, even though they all have
# the same value, need to tell it to just take the first one
perc.cart = round(sum(status==TRUE) / cart_count[1] * 100,2),
# count of that classification within cartridges
count = sum(status==TRUE)) %>%
dplyr::arrange(desc(perc.cart))
```
In a cursory look at the product names, it looked like things that are categorized as "cartridges" might be picking up other keywords too. This table looks only at things that are cartridges, and then what other, additional keywords they also pick up, as well as their pecent of all inhalants in the market, and percentage of cartridges in the market.
Overall `r sum(overlap.cart$perc.cart)`% of cartridges are picking up other keywords, with the highest being "`r overlap.cart[1,1]`" at `r overlap.cart[1,3]`% followed by "`r overlap.cart[2,1]`" at `r sum(overlap.cart$perc.cart[overlap.cart$keyword=="BHO"])`% of the cartridges. We will have to consider whether to categorize as "cartridge" first, or as oil or BHO, or as something like "BHO cartridge" as compared to "General cartridge", for example. Similarly we need to make the same judgements with other subcategories of cartridges, though there are fewer products in the market with these other combinations of keywords.
```{r, prelimcat_cartridges2, warning=FALSE}
knitr::kable(overlap.cart, col.names = c("Keyword", "% of Inhalants", "% of cartridges", "Count"),
align=c("r", "r", "r", "r"))
```
### Overlap: Oil
```{r prelimcat_oil, warning=FALSE}
overlap.oil <- inhalants %>%
# keep only oils
dplyr::filter(oil) %>%
# get count of number of oils in df
dplyr::mutate(oil_count = n()) %>%
# select only that count of rows in this filtered dataframe,
# row identifier, and classifications other than oil
dplyr::select(oil_count, dispensingid, cartridge, BHO:wax) %>%
# gather to get row identified + column of keywords + column of T/F values
tidyr::gather(keyword, status, cartridge, BHO:wax) %>%
# group by keyword to get statistics for each keyword
dplyr::group_by(keyword) %>%
dplyr::summarise(
# percent of all inhalants that are this keyword
perc.inhalants = round(sum(status==TRUE) / nrow(inhalants)*100,2),
# percent of oil category that are also this keyword
# using oil_count[1] because there are many oil_count's, even though they all have
# the same value, need to tell it to just take the first one
perc.oil = round(sum(status==TRUE) / oil_count[1] * 100,2),
# count of this category within oils
count = sum(status==TRUE)) %>%
dplyr::arrange(desc(perc.oil))
```
It also looks like things that are categorized as "oil" are picking up other keywords too. This table looks only at products that have "oil" in the name, and then what other, additional keywords they also pick up, along with the percentage of inhalant market and percentage of "oil" products that also contain that keyword.
It appears that `r sum(overlap.oil$perc.oil[overlap.oil$keyword=="cartridge"])`% of the oils are also listed as cartridges, which is not surprising. It might be best to categorize as "cartridge" first, and "oil" if not a cartridge. Also `r sum(overlap.oil$perc.oil[!overlap.oil$keyword=="cartridge"])`% of "oils" are also picking up other keywords, which is something we could explore further to determine how to categorize. For example, `r overlap.oil[2,3]`% of products that contain the string "oil" also contain the string "`r overlap.oil[2,1]`", and `r overlap.oil[3,3]`% of oil product names also contain the string "`r overlap.oil[3,1]`".
```{r prelimcat_oil2, warning=FALSE}
knitr::kable(overlap.oil, col.names = c("Keyword", "% of Inhalants", "% of Oils", "Count"),
align=c("r", "r", "r", "r"))
```
# Implementing Text Rules
Because we do not have absolute ground truth for these classifications, we want to err on the side of caution in implementign these rules. To do this, for the most part if a product picks up two keywords, we will move it to "Uncategorized", and then try other methods of classification. The main exception to this is that we will categorize "Hash Oil" as oil. Also, generally speaking we're using "dab" as a search string to pick up product that are used for dabbing, but if some name contains both "wax" and "dab", it is classified as wax.
![Visual explanation of search string hierarchy](../graphics/text_sequence.png)
Using this stricter method, we get the following product distribution and are able to classify `r round(sum(inhalants$inhalant_gen!="Uncategorized") / nrow(inhalants) * 100, 1)`% of extracts.
```{r updated_counts}
# first get list/df of inhalantnames
inhalantnames <- as.data.frame(unique(inhalants.joined$retail_prodname))
# rename column so we can call it
colnames(inhalantnames) <- "retail_prodname"
# bringing in classification function
source("categorization_function.R")
inhalantnames <- inhalantnames %>%
#dplyr::filter(!is.na(retail_prodname)) %>%
dplyr::rowwise() %>%
dplyr::mutate(inhalant_type = categorizeNames(retail_prodname),
inhalant_gen = groupProductTypes(inhalant_type))
# join classified inhalantnames back to dispening df
inhalants.joined$retail_prodname <- as.factor(inhalants.joined$retail_prodname)
inhalants <- left_join(inhalants.joined, inhalantnames, by="retail_prodname")
inhalants %>%
dplyr::mutate(total = nrow(inhalants),
inhalant_gen = factor(inhalant_gen, levels = c("Cartridge/Oil", "Wax/Shatter/Resin",
"Hash/Kief", "Uncategorized"))) %>%
dplyr::group_by(inhalant_gen) %>%
dplyr::summarise(percent_type = round(n() / total[1] * 100, 1)) %>%
# dplyr::summarise(
# `Cartridge/Oil` = round((sum(inhalant_gen == "Cartridge/Oil", na.rm=T) / total * 100),1),
# `Wax, Shatter, Resin` = round((sum(inhalant_gen == "Hash/Kief", na.rm=T) / total * 100),1),
# `Hash, Kief` = round((sum(inhalant_gen == "Wax/Shatter/Resin", na.rm=T) / total * 100),1),
# `Uncategorized` = round(((sum(inhalant_gen == "Uncategorized", na.rm=T)) / total * 100),1)
# ) %>%
kable(col.names = c("Category", "% of Products"))
```
### Functionalizing
Because the objective of this categorization is to inform other research, this sequence has been formalized into a function that can be called for other analyses. There are two functions below -- one to break down into the 9 distinct product categories, and one to get generalized categories, `Oil/Cartridge`, `Wax/Shatter/Resin`, `Hash/Kief`, and `Uncategorized`. (There is also another function, not included here, that breaks oils and cartridges out separately.)
Here it is also possible to see the specific strings that are used. These have been determined through iteratively looking at product names and phrases, consulting with retail website and domain researchers, and then going back again to re-examine the uncategorized products. By creating this function it also creates some flexibility -- the strings can be easily modified as needed.
```{r function_example, echo = TRUE}
# references at bottom
#create search terms for each product type
cartridge.strings <- "cart|vap|vc|pen|refill|juju|joint|atomizer"
oil.strings <- "oil|rso|eso"
hash.strings <- "hash"
kief.strings <- "kief|keif"
wax.strings <- "wax|crumble|budder"
shatter.strings <- "shatter|snap"
dab.strings <-"dab"
resin.strings <- "resin|rosin"
categorizeNames <- function(productName){
#' Takes product name and categorizes it into a
#' product category type
#' @param productName A string of inhalant product names
#' @return A categorized usage of the productname as a string.
# first check for cartridges. allow for oil to be in product name so that
# for example "oil cartridge" will be classified as a cartridge
if(grepl( cartridge.strings, productName, ignore.case = T) == TRUE &
# allows for dab and oil strings
grepl(paste(hash.strings, kief.strings, wax.strings,shatter.strings,
resin.strings, sep = "|"), productName, ignore.case = T) == FALSE) {
return("Cartridge")
}
# now check for oil products. allow for hash to also be in product name so that "hash oil" is classified as oil
else if(grepl(oil.strings, productName, ignore.case = T) == TRUE &
# allows for dab strings and hash strings
grepl(paste(hash.strings, kief.strings, wax.strings,shatter.strings, resin.strings, sep = "|"),
productName, ignore.case = T) == FALSE) {
return("Oil")
}
# check for hash products. Allow for no overlap with other products except "dab"
else if(grepl( hash.strings, productName, ignore.case = T) == TRUE &
grepl(paste(cartridge.strings, kief.strings,oil.strings, wax.strings,shatter.strings,
resin.strings, sep = "|"), productName, ignore.case = T) == FALSE) {
return("Hash")
}
# check for kief products. Allow for no overlap with other products except "dab"
else if(grepl( kief.strings, productName, ignore.case = T) == TRUE &
grepl(paste(cartridge.strings, hash.strings,oil.strings, wax.strings,shatter.strings,
resin.strings, sep = "|"), productName, ignore.case = T) == FALSE) {
return("Kief")
}
# check for wax products. Allow for no overlap with other products except "dab"
else if (grepl( wax.strings, productName, ignore.case = T) == TRUE &
grepl(paste(cartridge.strings, kief.strings,oil.strings, hash.strings,shatter.strings, dab.strings,
resin.strings, sep = "|"), productName, ignore.case = T) == FALSE) {
return("Wax")
}
#check for shatter products. Allow for no overlap with other products except "dab"
else if (grepl( shatter.strings, productName, ignore.case = T) == TRUE &
grepl(paste(cartridge.strings, kief.strings,oil.strings, wax.strings, hash.strings,
resin.strings, sep = "|"), productName, ignore.case = T) == FALSE) {
return("Shatter")
}
#check for resin products. Allow for no overlap with other products except "dab"
else if (grepl( resin.strings, productName, ignore.case = T) == TRUE &
grepl(paste(cartridge.strings, kief.strings,oil.strings, wax.strings,shatter.strings,
hash.strings, sep = "|"), productName, ignore.case = T) == FALSE) {
return("Resin")
}
#check for dab products. Allow for no overlap with other products
else if (grepl( dab.strings, productName, ignore.case = T) == TRUE &
grepl(paste(cartridge.strings, kief.strings,oil.strings, wax.strings,shatter.strings, hash.strings,
resin.strings, sep = "|"), productName, ignore.case = T) == FALSE) {
return("Dab")
}
else return("Uncategorized")
}
groupProductTypes <- function(productType){
#' Takes product type and categorizes it into a
#' product category grouping
#' @param productType A string of inhalant product type
#' @return A grouped usage of the product type as a string.
if(productType=="Cartridge" | productType=="Oil") {
return("Cartridge/Oil")
}
else if(productType=="Hash" | productType=="Kief") {
return("Hash/Kief")
}
else if(productType=="Wax" | productType=="Shatter" | productType=="Dab" | productType=="Resin") {
return("Wax/Shatter/Resin")
}
else return("Uncategorized")
}
```