categorization_productnames.Rmd

---
title: 'Classifying Extracts: Product Name Text'
author: "Lauren Renaud & Krista Kinnard"
date: "May 15, 2017"
output:
  html_document:
    toc: yes
    toc_depth: 4
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE, warning = FALSE)

library(dplyr)
library(readr)
library(knitr)
library(tidyr)
options(scipen = 4)

# locatons
locations <- read.csv("../data/Dec2016/cleaned/locations/all_locations.csv", sep=",", header=T)
locations <- select(locations, location_id, name, city, loc_type = typesimp)
```

```{r setup2, include=FALSE, cache=FALSE}

# pull in & clean dfs ------
# inventory
inventory <- readr::read_csv("../data/Dec2016/biotrackthc_inventory.csv")
inventory$inv_date <- as.Date(as.POSIXct(inventory$sessiontime, 
                                            origin = "1970-01-01", tz="America/Los_Angeles"))
inventory$sample_id <- as.numeric(inventory$sample_id)
inventory$parentid <- as.numeric(inventory$parentid)
# join to inventory types
inventory.types <- read.csv("../data/Dec2016/cleaned/inventory_type.csv", sep=",", header=T)
inventory <- inventory %>%
  # get inventory types
  left_join(inventory.types, by="inventorytype") %>%
  # get location types
  left_join(locations, by=c("location" = "location_id")) %>%
  # drop other location variables
  select(-(name), -(city))
rm(inventory.types)

# selected variables
inv.select <- select(inventory, inventoryid = id, inv_strain = strain, parentid,
                     inv_type_name, inventoryparentid, productname, sample_id,
                     inv_location = location, inv_loctype = loc_type)

# dispensing
dispensing <- readr::read_csv("../data/Dec2016/biotrackthc_dispensing.csv")
dispensing <- filter(dispensing, inventorytype==24)
dispensing$saletime <- as.Date(as.POSIXct(dispensing$sessiontime, 
                                           origin = "1970-01-01", tz="America/Los_Angeles"))
dispens.select <- select(dispensing, dispensingid = id, retail_invid = inventoryid, retail_loc = location,
                         sale_price = price, saletime)
dispens.select <- dispens.select %>%
  left_join(locations, by=c("retail_loc" = "location_id")) %>%
  # drop other location variables
  select(-(name), -(city))


## testing going back one more step then looking at product types
inhalants.joined <- dispens.select %>%
  # first get to retailer's inventory to get `parentid`
  # as well as retail productname, retail strain
  dplyr::left_join(inv.select, by=c("retail_invid" = "inventoryid")) %>%
  # rename to inventory variables to clarify they are retail variables
  rename(retail_strain = inv_strain, retail_parentid = parentid, retail_invtype = inv_type_name,
         retail_invparentid = inventoryparentid, retail_prodname = productname, retail_sampId = sample_id,
         retail_location = inv_location) %>%
  # then use retailer's parentid to get to processor's inventoryid
  left_join(inv.select, by=c("retail_parentid" = "inventoryid")) %>%
  # then rename processor inventory variables
  rename(process_strain = inv_strain, process_parentid = parentid,
         process_invparentid = inventoryparentid, process_invtype = inv_type_name,
         process_prodname = productname, process_sampleid = sample_id, process_loc = inv_location) %>%
  # then use processors's parentid to get to one step further back inventoryid
  # these variables will now be called "inv_variablename"
  left_join(inv.select, by=c("process_invparentid" = "inventoryid"))


# 5% of product names for inhalants are missing, which is ~ the same as in full inventory df
# sum(is.na(inventory$productname[inventory$inventorytype==24])) / sum(inventory$inventorytype==24)
# # 33% are missing for processors' product name
# sum(is.na(inhalants.joined$inv_prodname)) / nrow(inhalants.joined)
# # inventory type
# # missing for 70 of joined inhalants (out of 6 million)
# sum(is.na(inhalants.joined$retail_invtype)) / nrow(inhalants.joined)
# 
# dispeningtypes <- unique(dispensing$inventorytype[!is.na(dispensing$inventorytype)])
```

```{r more_setup}

# first get list/df of inhalantnames
inhalantnames <- as.data.frame(unique(inhalants.joined$retail_prodname))
# rename column so we can call it
colnames(inhalantnames) <- "retail_prodname"

# get a df that is for each unique inhalant name, the boolean values for categorizing
inhalantnames <- inhalantnames %>%
  dplyr::rowwise() %>%
                 # boolean for cartridge
  dplyr::mutate(cartridge = grepl("cart|vap|vc|pen|refill", retail_prodname, ignore.case = T),
                # categorical
                type = ifelse(
                  grepl("BHO|butane", retail_prodname, ignore.case = T), "BHO",
                  ifelse(grepl("CO2", retail_prodname, ignore.case = T), "CO2",
                         ifelse(grepl("hash", retail_prodname, ignore.case = T), "hash",
                                ifelse(grepl("hash", retail_prodname, ignore.case = T), "kief",
                                       ifelse(grepl("bubble", retail_prodname, ignore.case = T), "bubble",
                                              ifelse(grepl("shatter", retail_prodname, ignore.case = T), "shatter",
                                                     ifelse(grepl("dab", retail_prodname, ignore.case = T), "dab",
                                                            ifelse(grepl("resin", retail_prodname, ignore.case = T), "resin",
                                                                   ifelse(grepl("wax", retail_prodname, ignore.case = T), "wax",
                                                                          "other"))))))))
                ),
                # now booleans for each
                oil = grepl("oil", retail_prodname, ignore.case = T),
                CO2 = grepl("CO2", retail_prodname, ignore.case=T),
                BHO = grepl("BHO|butane", retail_prodname, ignore.case = T),
                hash = grepl("hash", retail_prodname, ignore.case = T),
                kief = grepl("kief|keif", retail_prodname, ignore.case = T),
                bubble = grepl("bubble", retail_prodname, ignore.case = T),
                shatter = grepl("shatter", retail_prodname, ignore.case = T),
                dab = grepl("dab", retail_prodname, ignore.case = T),
                resin = grepl("resin", retail_prodname, ignore.case = T),
                wax = grepl("wax", retail_prodname, ignore.case = T),
                uncategorized = (!cartridge & !CO2 & !oil & !BHO & !kief & !bubble & !shatter & !dab & !resin & !wax),
                uncat_notcart = (!CO2 & !oil & !BHO & !kief & !bubble & !shatter & !dab & !resin & !wax)
  )

# alternate method that creates a boolean for cartridge or not cartridge
# and then another column for 'type' that's more specific
# inhalantnames <- inhalantnames %>%
#   dplyr::rowwise() %>%
#   dplyr::mutate(cartridge = grepl("cart|vap|vc|pen|refill", productname, ignore.case = T),
#                 type = ifelse(
#                   grepl("BHO|butane", productname, ignore.case = T), "BHO",
#                   ifelse(grepl("CO2", productname, ignore.case = T), "CO2",
#                          ifelse(grepl("hash", productname, ignore.case = T), "hash",
#                                 ifelse(grepl("hash", productname, ignore.case = T), "kief",
#                                        ifelse(grepl("bubble", productname, ignore.case = T), "bubble",
#                                               ifelse(grepl("shatter", productname, ignore.case = T), "shatter",
#                                                      ifelse(grepl("dab", productname, ignore.case = T), "dab",
#                                                             ifelse(grepl("resin", productname, ignore.case = T), "resin",
#                                                                    ifelse(grepl("wax", productname, ignore.case = T), "wax",
#                                                                           "other"))))))))
#                 )
#   )

# join classified inhalantnames back to dispening df
inhalants.joined$retail_prodname <- as.factor(inhalants.joined$retail_prodname)
inhalants <- left_join(inhalants.joined, inhalantnames, by="retail_prodname")

prelimCat <- inhalants %>%
  dplyr::select(dispensingid, cartridge:uncategorized) %>%
  tidyr::gather(keyword, status, cartridge:uncategorized) %>%
  dplyr::group_by(keyword) %>%
  dplyr::summarise(perc = round(sum(status==TRUE) / nrow(inhalants)*100,1),
                   count = sum(status==TRUE)) %>%
  dplyr::arrange(desc(perc))

# trying to get count of how many products hit more than category
# doing just based on names, not actual number of products
# because full df was long to run, but could check again later
num.keywords <- inhalantnames %>%
  tidyr::gather(keyword, status, cartridge:uncategorized) %>%
  dplyr::group_by(retail_prodname) %>%
  dplyr::summarise(numCat = sum(status==TRUE)) %>%
  dplyr::ungroup() %>%
  dplyr::group_by(numCat) %>%
  dplyr::summarise(distro_numkeywords = n(),
                   perc_numkeywords = n() / nrow(inhalantnames))

```


# Key findings

 - Able to perform some form of **classification for about `r round(sum(!inhalants$uncategorized) / nrow(inhalants)*100,0)`%** of the products for inhalation in the dispensing file (counted by number of items sold in the dispensing file).
 - Have reviewed these preliminary categories from a domain standpoint, and to consider purposes of classification.
 - Need stricter categorization for about `r round(sum(num.keywords$distro_numkeywords[num.keywords$numCat>1]) / nrow(inhalantnames)*100,0)`% of products sold that match more than one keyword to be incorporated in later iterations of this method.

# Exploring Keyword Search

Our first pass at categorizing inhalants is by searching product names for what would indicate product types. This method is able to classify `r round(sum(!inhalants$uncategorized) / nrow(inhalants)*100,1)`% of "Marijuana Extract for Inhalation" products that were sold in the dispensing file, which is `r round(sum(!inhalantnames$uncategorized) / nrow(inhalantnames)*100,1)`% of inhalant product names.

This table shows how many products fit each classification as well as the strings used for classificiation. (For "hash", it also picks up "hashish", but hashish was not searched for separately.) These do not sum to 100% because `r round(sum(num.keywords$distro_numkeywords[num.keywords$numCat>1]) / nrow(inhalantnames)*100,1)`% of product names contain more than one string. `Count` here is counting number of products appearing in the dispensing (retail) file, not a count of product names.

While we could do a more thorough analysis of all the overlapping keywords, for this preliminary pass below we can look at "cartridges" and "oil" to see what the overlap for these keywords look like.

```{r prelimcat_table, warning=FALSE}

# add a column of strings used for categorization
category.strings <- read.csv("../data/Dec2016/cleaned/testing/categoryStrings.csv", sep=",", header=T)
prelimCat_labels <- prelimCat %>%
  dplyr::left_join(category.strings, by="keyword")

# reorder the new columsns
refcols <- c("string", "keyword", "perc", "count")
prelimCat_labels <- prelimCat_labels[, c(refcols, setdiff(names(prelimCat_labels), refcols))]

# output table
knitr::kable(prelimCat_labels, col.names = c("Search String", "Keyword", "Percent", "Count"),
             # make all columns right aligned so it's easier to see keyword near percent value
               align=c("r", "r", "r", "r"))

```


### Overlap: Cartridges

```{r prelimcat_cartridges, warning=FALSE}
# take inhalants dataframe
overlap.cart <- inhalants %>%
  # limit to only cartridges
  filter(cartridge) %>%
  # get a total count of number of cartridges
  dplyr::mutate(cart_count = n()) %>%
  # select that count, row identifier, and then catetories other than cartridge
  dplyr::select(cart_count, dispensingid, oil:wax) %>%
  # use gather to get dispensingid + one colum for keyword + one colum for true or false
  tidyr::gather(keyword, status, oil:wax) %>%
  # group on keyword to get statistics for each keyword
  dplyr::group_by(keyword) %>%
  dplyr::summarise(
    # percent of all inhalant products that match that classification
    perc.inhalants = round(sum(status==TRUE) / nrow(inhalants)*100,2),
    # percent of cartridges that match that classification
    # using cart_count[1] because there are many cart_count's, even though they all have
    # the same value, need to tell it to just take the first one
    perc.cart = round(sum(status==TRUE) / cart_count[1] * 100,2),
    # count of that classification within cartridges
    count = sum(status==TRUE)) %>%
  dplyr::arrange(desc(perc.cart))

```

In a cursory look at the product names, it looked like things that are categorized as "cartridges" might be picking up other keywords too. This table looks only at things that are cartridges, and then what other, additional keywords they also pick up, as well as their pecent of all inhalants in the market, and percentage of cartridges in the market.

Overall `r sum(overlap.cart$perc.cart)`% of cartridges are picking up other keywords, with the highest being "`r overlap.cart[1,1]`" at `r overlap.cart[1,3]`% followed by "`r overlap.cart[2,1]`" at `r sum(overlap.cart$perc.cart[overlap.cart$keyword=="BHO"])`% of the cartridges. We will have to consider whether to categorize as "cartridge" first, or as oil or BHO, or as something like "BHO cartridge" as compared to "General cartridge", for example. Similarly we need to make the same judgements with other subcategories of cartridges, though there are fewer products in the market with these other combinations of keywords.

```{r, prelimcat_cartridges2, warning=FALSE}
knitr::kable(overlap.cart, col.names = c("Keyword", "% of Inhalants", "% of cartridges", "Count"),
               align=c("r", "r", "r", "r"))

```

### Overlap: Oil

```{r prelimcat_oil, warning=FALSE}

overlap.oil <- inhalants %>%
  # keep only oils
  dplyr::filter(oil) %>%
  # get count of number of oils in df
  dplyr::mutate(oil_count = n()) %>%
  # select only that count of rows in this filtered dataframe,
  # row identifier, and classifications other than oil
  dplyr::select(oil_count, dispensingid, cartridge, BHO:wax) %>%
  # gather to get row identified + column of keywords + column of T/F values
  tidyr::gather(keyword, status, cartridge, BHO:wax) %>%
  # group by keyword to get statistics for each keyword
  dplyr::group_by(keyword) %>%
  dplyr::summarise(
    # percent of all inhalants that are this keyword
    perc.inhalants = round(sum(status==TRUE) / nrow(inhalants)*100,2),
    # percent of oil category that are also this keyword
    # using oil_count[1] because there are many oil_count's, even though they all have
    # the same value, need to tell it to just take the first one
    perc.oil = round(sum(status==TRUE) / oil_count[1] * 100,2),
    # count of this category within oils
    count = sum(status==TRUE)) %>%
  dplyr::arrange(desc(perc.oil))

```

It also looks like things that are categorized as "oil" are picking up other keywords too. This table looks only at products that have "oil" in the name, and then what other, additional keywords they also pick up, along with the percentage of inhalant market and percentage of "oil" products that also contain that keyword.

It appears that `r sum(overlap.oil$perc.oil[overlap.oil$keyword=="cartridge"])`% of the oils are also listed as cartridges, which is not surprising. It might be best to categorize as "cartridge" first, and "oil" if not a cartridge. Also `r sum(overlap.oil$perc.oil[!overlap.oil$keyword=="cartridge"])`% of "oils" are also picking up other keywords, which is something we could explore further to determine how to categorize. For example, `r overlap.oil[2,3]`% of products that contain the string "oil" also contain the string "`r overlap.oil[2,1]`", and `r overlap.oil[3,3]`% of oil product names also contain the string "`r overlap.oil[3,1]`".

```{r prelimcat_oil2, warning=FALSE}
knitr::kable(overlap.oil, col.names = c("Keyword", "% of Inhalants", "% of Oils", "Count"),
               align=c("r", "r", "r", "r"))
```

# Implementing Text Rules

Because we do not have absolute ground truth for these classifications, we want to err on the side of caution in implementign these rules. To do this, for the most part if a product picks up two keywords, we will move it to "Uncategorized", and then try other methods of classification. The main exception to this is that we will categorize "Hash Oil" as oil. Also, generally speaking we're using "dab" as a search string to pick up product that are used for dabbing, but if some name contains both "wax" and "dab", it is classified as wax.

![Visual explanation of search string hierarchy](../graphics/text_sequence.png)

Using this stricter method, we get the following product distribution and are able to classify `r round(sum(inhalants$inhalant_gen!="Uncategorized") / nrow(inhalants) * 100, 1)`% of extracts.

```{r updated_counts}
# first get list/df of inhalantnames
inhalantnames <- as.data.frame(unique(inhalants.joined$retail_prodname))

# rename column so we can call it
colnames(inhalantnames) <- "retail_prodname"

# bringing in classification function
source("categorization_function.R")
inhalantnames <- inhalantnames %>%
  #dplyr::filter(!is.na(retail_prodname)) %>%
  dplyr::rowwise() %>%
  dplyr::mutate(inhalant_type = categorizeNames(retail_prodname),
         inhalant_gen = groupProductTypes(inhalant_type))

# join classified inhalantnames back to dispening df
inhalants.joined$retail_prodname <- as.factor(inhalants.joined$retail_prodname)
inhalants <- left_join(inhalants.joined, inhalantnames, by="retail_prodname")

inhalants %>%
  dplyr::mutate(total = nrow(inhalants),
                inhalant_gen = factor(inhalant_gen, levels = c("Cartridge/Oil", "Wax/Shatter/Resin",
                                                               "Hash/Kief", "Uncategorized"))) %>%
  dplyr::group_by(inhalant_gen) %>%
  dplyr::summarise(percent_type = round(n() / total[1] * 100, 1)) %>%
  # dplyr::summarise(
  #   `Cartridge/Oil` = round((sum(inhalant_gen == "Cartridge/Oil", na.rm=T) / total * 100),1),
  #   `Wax, Shatter, Resin` = round((sum(inhalant_gen == "Hash/Kief", na.rm=T) / total * 100),1),
  #   `Hash, Kief` = round((sum(inhalant_gen == "Wax/Shatter/Resin", na.rm=T) / total * 100),1),
  #   `Uncategorized` = round(((sum(inhalant_gen == "Uncategorized", na.rm=T)) / total * 100),1)
  # ) %>%
  kable(col.names = c("Category", "% of Products"))

```

### Functionalizing

Because the objective of this categorization is to inform other research, this sequence has been formalized into a function that can be called for other analyses. There are two functions below -- one to break down into the 9 distinct product categories, and one to get generalized categories, `Oil/Cartridge`, `Wax/Shatter/Resin`, `Hash/Kief`, and `Uncategorized`. (There is also another function, not included here, that breaks oils and cartridges out separately.)

Here it is also possible to see the specific strings that are used. These have been determined through iteratively looking at product names and phrases, consulting with retail website and domain researchers, and then going back again to re-examine the uncategorized products. By creating this function it also creates some flexibility -- the strings can be easily modified as needed.

```{r function_example, echo = TRUE}
# references at bottom

#create search terms for each product type
cartridge.strings <- "cart|vap|vc|pen|refill|juju|joint|atomizer"
oil.strings <- "oil|rso|eso"
hash.strings <- "hash"
kief.strings <- "kief|keif"
wax.strings <- "wax|crumble|budder"
shatter.strings <- "shatter|snap"
dab.strings <-"dab"
resin.strings <- "resin|rosin"

categorizeNames <- function(productName){
  #' Takes product name and categorizes it into a 
  #' product category type
  #' @param productName  A string of inhalant product names
  #' @return A categorized usage of the productname as a string.
  
  # first check for cartridges. allow for oil to be in product name so that 
  # for example "oil cartridge" will be classified as a cartridge
  if(grepl( cartridge.strings, productName, ignore.case = T) == TRUE & 
     # allows for dab and oil strings
     grepl(paste(hash.strings, kief.strings, wax.strings,shatter.strings, 
                 resin.strings, sep = "|"), productName, ignore.case = T) == FALSE) {
    return("Cartridge")
  }
  # now check for oil products. allow for hash to also be in product name so that "hash oil" is classified as oil
  else if(grepl(oil.strings, productName, ignore.case = T) == TRUE & 
          # allows for dab strings and hash strings
     grepl(paste(hash.strings, kief.strings, wax.strings,shatter.strings, resin.strings, sep = "|"), 
           productName, ignore.case = T) == FALSE) {
    return("Oil")
  }
  # check for hash products. Allow for no overlap with other products except "dab"
  else if(grepl( hash.strings, productName, ignore.case = T) == TRUE & 
          grepl(paste(cartridge.strings, kief.strings,oil.strings, wax.strings,shatter.strings, 
                      resin.strings, sep = "|"), productName, ignore.case = T) == FALSE) {
    return("Hash")
  }
  # check for kief products. Allow for no overlap with other products except "dab"
  else if(grepl( kief.strings, productName, ignore.case = T) == TRUE & 
          grepl(paste(cartridge.strings, hash.strings,oil.strings, wax.strings,shatter.strings, 
                      resin.strings, sep = "|"), productName, ignore.case = T) == FALSE) {
    return("Kief")
  }
  # check for wax products. Allow for no overlap with other products except "dab"
  else if (grepl( wax.strings, productName, ignore.case = T) == TRUE & 
           grepl(paste(cartridge.strings, kief.strings,oil.strings, hash.strings,shatter.strings, dab.strings, 
                       resin.strings, sep = "|"), productName, ignore.case = T) == FALSE) {
    return("Wax")
  }
  #check for shatter products. Allow for no overlap with other products except "dab"
  else if (grepl( shatter.strings, productName, ignore.case = T) == TRUE & 
           grepl(paste(cartridge.strings, kief.strings,oil.strings, wax.strings, hash.strings, 
                       resin.strings, sep = "|"), productName, ignore.case = T) == FALSE) {
    return("Shatter")
  }
  #check for resin products. Allow for no overlap with other products except "dab"
  else if (grepl( resin.strings, productName, ignore.case = T) == TRUE & 
           grepl(paste(cartridge.strings, kief.strings,oil.strings, wax.strings,shatter.strings, 
                       hash.strings, sep = "|"), productName, ignore.case = T) == FALSE) {
    return("Resin")
  }
  #check for dab products. Allow for no overlap with other products
  else if (grepl( dab.strings, productName, ignore.case = T) == TRUE & 
           grepl(paste(cartridge.strings, kief.strings,oil.strings, wax.strings,shatter.strings, hash.strings, 
                       resin.strings, sep = "|"), productName, ignore.case = T) == FALSE) {
    return("Dab")
  }
  else return("Uncategorized")
}

groupProductTypes <- function(productType){
  #' Takes product type and categorizes it into a 
  #' product category grouping
  #' @param productType  A string of inhalant product type
  #' @return A grouped usage of the product type as a string.
  
  if(productType=="Cartridge" | productType=="Oil") {
    return("Cartridge/Oil")
  }
  else if(productType=="Hash" | productType=="Kief") {
    return("Hash/Kief")
  }
  else if(productType=="Wax" | productType=="Shatter" | productType=="Dab" | productType=="Resin") {
    return("Wax/Shatter/Resin")
  }
  else return("Uncategorized")
}
```