# Subpopulation Analysis using Frequent Item Sets
Compare item and itemset frequencies between a subpopulation and the full population.

In [None]:
library(data.table)
library(arules)

# Directory that holds the input CSV files. File names are appended to this
# string directly, so it must end with a path separator.
DATA_DIR_NAME <- "/Users/karenblakemore/koverse/data/"

# A negative "warn" level makes R ignore all warnings for the rest of the
# session.
options(warn = -1)

## Data Preparation

In [None]:
# Turn a data frame into an arules "transactions" object.
#
# Arguments:
#   df                 Data frame with one row per observation.
#   continuous_columns Names (list or character vector) of the columns to be
#                      discretised into equal-width intervals.
#   drop_columns       Names (list or character vector) of the columns to
#                      remove before mining; names not present are ignored.
#   n_bins             Number of equal-width intervals per continuous column.
#
# Returns the result of coercing the prepared data frame to "transactions".
#
# NOTE(review): interval boundaries are derived from the data passed in, so a
# subpopulation and the full population get different bin labels for the same
# column -- confirm this is intended when itemsets are compared across groups.
prepare_data <- function(df, continuous_columns, drop_columns, n_bins = 10) {
    continuous_columns <- unlist(continuous_columns)
    drop_columns <- unlist(drop_columns)

    # drop = FALSE keeps a data frame even when a single column survives
    df <- df[, !(names(df) %in% drop_columns), drop = FALSE]

    missing_columns <- setdiff(continuous_columns, names(df))
    if (length(missing_columns) > 0) {
        stop("continuous column(s) not found after dropping: ",
             paste(missing_columns, collapse = ", "), call. = FALSE)
    }

    for (col in continuous_columns) {
        values <- df[[col]]
        # as.numeric() on a factor yields the internal level codes rather than
        # the values, so factors must go through character first
        if (is.factor(values)) {
            values <- as.character(values)
        }
        df[[col]] <- cut(as.numeric(values), n_bins)  # bin into n_bins intervals
    }

    df[] <- lapply(df, factor)  # all variables must be of type factor
    as(df, "transactions")      # coerce to transactions
}

## Frequent Itemsets

In [None]:
# Mine frequent itemsets from a transactions object with ECLAT.
#
# Arguments:
#   trans   Transactions to mine.
#   supp    Minimum support an itemset needs to be reported.
#   minlen  Minimum number of items per itemset.
#   maxlen  Maximum number of items per itemset.
#
# Returns the itemsets found by eclat(). Progress output is switched off via
# the miner's own `verbose` control instead of capturing standard output.
frequent_itemsets <- function(trans, supp = .1, minlen = 2, maxlen = 10) {
    eclat(
        trans,
        parameter = list(supp = supp, minlen = minlen, maxlen = maxlen),
        control = list(verbose = FALSE)
    )
}

## Compare Itemsets

In [None]:
# Compare the most frequent itemsets of `grp2` with their support in `grp1`
# and display the resulting table.
#
# Arguments:
#   grp2_type    "subpopulation" or "population": which group `grp2` holds.
#                `grp1` is then the other group.
#   grp1         Itemsets of the reference group; must be coercible to a data
#                frame with `items` and `support` columns.
#   grp2         Itemsets whose `top_n` most frequent entries are reported.
#   top_n        Maximum number of itemsets to report.
#   min_support  Support assumed for the population when a subpopulation
#                itemset is not found there. It stands in for the unknown true
#                support, so it should equal the threshold used for mining.
#
# The table gets a `lift` column (subpopulation support divided by population
# support). With grp2_type = "subpopulation" rows are sorted by decreasing
# lift; otherwise by increasing lift, and itemsets absent from the
# subpopulation get support 0 and lift 0.
#
# Returns the displayed data frame invisibly.
compare_itemsets <- function(grp2_type, grp1, grp2, top_n = 20, min_support = .1) {
    grp2_type <- match.arg(grp2_type, c("subpopulation", "population"))
    grp2_is_subpopulation <- grp2_type == "subpopulation"

    grp1_df <- as(grp1, "data.frame")
    grp2_df <- as(grp2, "data.frame")

    # head() keeps at most top_n rows and, unlike [1:top_n, ], never pads the
    # table with all-NA rows when fewer itemsets are available
    grp2_df <- head(grp2_df[order(-grp2_df$support), , drop = FALSE], top_n)

    if (!grp2_is_subpopulation) {
        colnames(grp2_df)[colnames(grp2_df) == "support"] <- "population support"
        colnames(grp2_df)[colnames(grp2_df) == "count"] <- "population count"
    }

    # Look up every grp2 itemset in grp1 by its label; NA marks "not found"
    hit <- match(as.character(grp2_df$items), as.character(grp1_df$items))
    grp1_support <- grp1_df$support[hit]
    not_found <- is.na(grp1_support)

    if (grp2_is_subpopulation) {
        # Comparing most frequent itemsets of subpopulation with population
        grp2_df[["population support"]] <- replace(grp1_support, not_found, 0)
        grp2_df[["lift"]] <-
            grp2_df$support / replace(grp1_support, not_found, min_support)
        ranking <- order(-grp2_df$lift)
    } else {
        # Comparing most frequent itemsets of population with subpopulation
        grp2_df[["support"]] <- replace(grp1_support, not_found, 0)
        grp2_df[["lift"]] <- grp2_df[["support"]] / grp2_df[["population support"]]
        ranking <- order(grp2_df$lift)
    }

    grp2_df <- grp2_df[ranking, , drop = FALSE]
    row.names(grp2_df) <- NULL
    IRdisplay::display(grp2_df)
    invisible(grp2_df)
}

## Experiment - NY subpopulation of Hospital Readmissions Data Set
Compare top-ranking items and itemsets by frequency of occurrence.

In [None]:
# Columns that are binned into intervals before mining
continuous_columns <- list("Excess Readmission Ratio",
                           "Expected Readmission Rate",
                           "Predicted Readmission Rate",
                           "Number of Discharges",
                           "Number of Readmissions"
                           )

# Columns that are removed before mining
drop_columns <- list("Address",
                     "City",
                     "County Name",
                     "Effectiveness of care national comparison footnote",
                     "Efficient use of medical imaging national comparison footnote",
                     "Emergency Services",
                     "End Date",
                     "Footnote",
                     "Hospital Name",
                     "Hospital overall rating footnote",
                     "Hospital Type",
                     "Measure Name",
                     "Meets criteria for meaningful use of EHRs",
                     "Mortality national comparison footnote",
                     "Patient experience national comparison footnote",
                     "Phone Number",
                     "Provider ID",
                     "Provider Number",
                     "Readmission national comparison footnote",
                     "Safety of care national comparison footnote",
                     "Start Date",
                     "State",
                     "Timeliness of care national comparison footnote",
                     "ZIP Code",
                     "ï»¿Hospital Name"
                    )

# "UTF-8-BOM" strips a leading byte order mark if the file has one, so the
# first header is read as plain "Hospital Name". The BOM-prefixed entry in
# drop_columns is kept for files that were read without this setting.
df <- read.csv(paste0(DATA_DIR_NAME, "hospital_readmissions.csv"),
               na.strings = "", check.names = FALSE,
               fileEncoding = "UTF-8-BOM")

# Population Frequent Itemsets
trans <- prepare_data(df, continuous_columns, drop_columns)
itemsets <- frequent_itemsets(trans)

# Subpopulation Frequent Itemsets
# which() discards rows whose State is NA; plain logical indexing would turn
# each of them into an all-NA row of the subpopulation
ny_df <- df[which(df$State == "NY"), ]
ny_trans <- prepare_data(ny_df, continuous_columns, drop_columns)
ny_itemsets <- frequent_itemsets(ny_trans)

# Item Set Frequency - Subpopulation vs Population
IRdisplay::display("Comparing Most Frequent Itemsets - Subpopulation vs Population")
compare_itemsets("subpopulation", itemsets, ny_itemsets)

# Item Set Frequency - Population vs Subpopulation
IRdisplay::display("Comparing Most Frequent Itemsets - Population vs Subpopulation")
compare_itemsets("population", ny_itemsets, itemsets)

# Relative Item Frequency - Bars Subpopulation, Line Population
itemFrequencyPlot(ny_trans, topN = 20, cex.names = .8, population = trans, horiz = TRUE)

# Relative Item Frequency - Bars Population, Line Subpopulation
itemFrequencyPlot(trans, topN = 20, cex.names = .8, population = ny_trans, horiz = TRUE)

## Experiment - Survived subpopulation of Titanic Data Set

In [None]:
DATA_SET_NAME <- "titanic"
CLUSTER_COUNT <- 10

# Columns that are binned into intervals before mining
continuous_columns <- list("Age",
                           "Fare"
                          )

# Columns that are removed before mining; "Survived" defines the
# subpopulation here, so it is excluded from the items
drop_columns <- list("Cabin",
                     "Name",
                     "PassengerId",
                     "Ticket",
                     "Survived"
                     )

df <- read.csv(paste0(DATA_DIR_NAME, DATA_SET_NAME, ".csv"),
               na.strings = "", check.names = FALSE)

# Population Frequent Itemsets
trans <- prepare_data(df, continuous_columns, drop_columns)
itemsets <- frequent_itemsets(trans)

# Subpopulation Frequent Itemsets
# NOTE: the ny_* variables hold the "Survived == 1" subpopulation in this
# cell; the names are kept unchanged in case other cells read them.
# which() discards rows whose Survived value is NA instead of turning them
# into all-NA rows.
ny_df <- df[which(df$Survived == 1), ]
ny_trans <- prepare_data(ny_df, continuous_columns, drop_columns)
ny_itemsets <- frequent_itemsets(ny_trans)

# Item Set Frequency - Subpopulation vs Population
IRdisplay::display("Comparing Most Frequent Itemsets - Subpopulation vs Population")
compare_itemsets("subpopulation", itemsets, ny_itemsets)

# Item Set Frequency - Population vs Subpopulation
IRdisplay::display("Comparing Most Frequent Itemsets - Population vs Subpopulation")
compare_itemsets("population", ny_itemsets, itemsets)

# Relative Item Frequency - Bars Subpopulation, Line Population
itemFrequencyPlot(ny_trans, topN = 20, cex.names = .8, population = trans, horiz = TRUE)

# Relative Item Frequency - Bars Population, Line Subpopulation
itemFrequencyPlot(trans, topN = 20, cex.names = .8, population = ny_trans, horiz = TRUE)


## Experiment - Transaction Cluster Subpopulations of Titanic Data Set

In [None]:
DATA_SET_NAME <- "titanic"
CLUSTER_COUNT <- 10

# Columns that are binned into intervals before mining
continuous_columns <- list("Age",
                           "Fare"
                          )

# Columns that are removed before mining
drop_columns <- list("Cabin",
                     "Name",
                     "PassengerId",
                     "Ticket"
                     )

df <- read.csv(paste0(DATA_DIR_NAME, DATA_SET_NAME, ".csv"),
               na.strings = "", check.names = FALSE)
clusters_df <- read.csv(paste0(DATA_DIR_NAME, DATA_SET_NAME, "_clusters.csv"),
                        na.strings = "", check.names = FALSE)

# Population Frequent Itemsets
trans <- prepare_data(df, continuous_columns, drop_columns)
itemsets <- frequent_itemsets(trans)

# Compare Frequent Itemsets between population and each cluster
for (i in seq_len(CLUSTER_COUNT)) {
    # Subpopulation Frequent Itemsets
    # which() discards rows with a missing cluster label instead of turning
    # them into all-NA rows
    subpop_df <- clusters_df[which(clusters_df[["__CLUSTER__"]] == i), ]
    subpop_df[["__CLUSTER__"]] <- NULL

    if (nrow(subpop_df) == 0) {
        # Nothing to bin or mine for a cluster without members
        message("Cluster ", i, " has no rows; skipping")
        next
    }

    subpop_trans <- prepare_data(subpop_df, continuous_columns, drop_columns)
    subpop_itemsets <- frequent_itemsets(subpop_trans)

    # Item Set Frequency - Subpopulation vs Population
    IRdisplay::display(paste0("Comparing Most Frequent Itemsets - Cluster ", i, " vs Population"))
    compare_itemsets("subpopulation", itemsets, subpop_itemsets)

    # Item Set Frequency - Population vs Subpopulation
    IRdisplay::display(paste0("Comparing Most Frequent Itemsets - Population vs Cluster ", i))
    compare_itemsets("population", subpop_itemsets, itemsets)
}