<a href="https://colab.research.google.com/github/Ananya-AJ/CMPE255-SafeDose/blob/main/Data_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



---


**This colab provides a step-by-step data preparation pipeline for the project 
'Safe Dose'. The data preparation steps are functionalized and can be called sequentially to obtain the final processed dataset.**


---

In [None]:
# Install libraries
# category_encoders supplies the HashingEncoder used by the encoding functions below;
# prince supplies the MCA implementation used by mcaCasetype.
!pip install category_encoders
!pip install --user prince

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting featurewiz
  Using cached featurewiz-0.2.3-py3-none-any.whl (111 kB)
Collecting lightgbm>=3.2.1
  Using cached lightgbm-3.3.3-py3-none-manylinux1_x86_64.whl (2.0 MB)
Collecting Pillow>=9.0.0
  Downloading Pillow-9.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[K     |████████████████████████████████| 3.2 MB 13.0 MB/s 
Collecting xgboost>=1.5.1
  Downloading xgboost-1.6.2-py3-none-manylinux2014_x86_64.whl (255.9 MB)
[K     |████████████████████████████████| 255.9 MB 58 kB/s 
Collecting xlrd>=2.0.0
  Downloading xlrd-2.0.1-py2.py3-none-any.whl (96 kB)
[K     |████████████████████████████████| 96 kB 5.5 MB/s 
Collecting pyarrow~=7.0.0
  Downloading pyarrow-7.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[K     |████████████████████████████████| 26.7 MB 1.2 MB/s 
[?25hCollecting jupyter
  Using cached jupyter-1.0.0-py2.py3-non

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Imports

import pickle

import category_encoders as ce
import numpy as np
import pandas as pd
import prince
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

from google.colab import drive

In [None]:
# Mount Google Drive so that the '/content/drive/Shareddrives/CMPE255/...' paths used by the functions below are reachable
drive.mount('/content/drive')

Mounted at /content/drive


Import raw data and segregate data into demographic information, case related information and drug related information for easier processing.

In [None]:
def getRawData():
  """Load the raw DAWN 2011 extract and split it into three column groups.

  Returns:
    A tuple ``(demographic_info_df, case_info_df, drug_info_df)``. Every frame
    keeps the CASEID column so that the pieces can be joined back together.
  """

  # Read the tab-separated raw file; the first row holds the column names
  raw = pd.read_csv('/content/drive/Shareddrives/CMPE255/data/DAWN-2011-DS0001-data-excel.tsv', sep = '\t', header = 0)

  demographic_fields = ['CASEID', 'METRO', 'AGECAT', 'SEX', 'RACE']
  case_fields = ['CASEID', 'CASEWGT', 'YEAR', 'QUARTER', 'DAYPART', 'NUMSUBS', 'CASETYPE', 'DISPOSITION', 'ALLABUSE']

  # Each of the 22 drug slots contributes the same 12 columns, suffixed with the slot number
  per_drug_prefixes = ('DRUGID_', 'CATID_1_', 'CATID_2_', 'CATID_3_', 'ROUTE_', 'TOXTEST_',
                       'sdled_1_', 'sdled_2_', 'sdled_3_', 'sdled_4_', 'sdled_5_', 'sdled_6_')
  drug_fields = ['CASEID']
  drug_fields += [prefix + str(slot) for slot in range(1, 23) for prefix in per_drug_prefixes]

  # Case-level substance flags close the drug related column list
  drug_fields += ['ALCOHOL', 'NONALCILL', 'PHARMA', 'NONMEDPHARMA']

  return raw[demographic_fields], raw[case_fields], raw[drug_fields]

During the EDA phase it was found that NUMSUBS, i.e. the number of drugs reported per case, contains outliers. Records with NUMSUBS > 3 lie above the upper fence (Q3 + 1.5*IQR) and are therefore removed from the dataset. In addition, the sdled_5 and sdled_6 columns of every drug are removed, since they are not applicable for more than 95% of the records.

In [None]:
def removeOutliers(case_info_df, drug_info_df, max_drugs = 3):
  """Drop NUMSUBS outliers and compact each case's drugs into the first slots.

  Cases whose NUMSUBS lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are removed.
  For the remaining cases, the drug slots whose DRUGID is not -7 are packed,
  in slot order, into slots 1..max_drugs. Unused slots are filled with -7.
  The sdled_5 / sdled_6 columns are not carried over.

  Args:
    case_info_df: frame with at least the CASEID and NUMSUBS columns.
    drug_info_df: frame with CASEID, the per-drug columns for slots 1..22 and
      the ALCOHOL, NONALCILL, PHARMA and NONMEDPHARMA columns.
    max_drugs: number of drug slots kept per case (default 3). If a case has
      more populated slots than this, only the first ``max_drugs`` are kept.

  Returns:
    A tuple ``(case_info_df, reduced_drug_info_df)``: the filtered case frame
    and a new frame with one row per remaining case.
  """

  # Remove outliers from numsubs column using IQR method
  q25, q75 = np.percentile(case_info_df['NUMSUBS'], [25, 75])
  cut_off = (q75 - q25) * 1.5
  lower, upper = q25 - cut_off, q75 + cut_off
  case_info_df = case_info_df[(case_info_df.NUMSUBS >= lower) & (case_info_df.NUMSUBS <= upper)]

  # Filter rows from drug_info_df as per the CASEID in case_info_df after outliers are removed
  drug_info_df_temp = drug_info_df[drug_info_df.CASEID.isin(case_info_df.CASEID)]

  per_drug_prefixes = ['DRUGID_', 'CATID_1_', 'CATID_2_', 'CATID_3_', 'ROUTE_', 'TOXTEST_',
                       'sdled_1_', 'sdled_2_', 'sdled_3_', 'sdled_4_']
  summary_cols = ['ALCOHOL', 'NONALCILL', 'PHARMA', 'NONMEDPHARMA']
  cols = ['CASEID']
  cols += [prefix + str(slot) for slot in range(1, max_drugs + 1) for prefix in per_drug_prefixes]
  cols += summary_cols

  # Collect plain row lists and build the frame once at the end; appending to a
  # DataFrame inside the loop is quadratic and DataFrame.append no longer exists in pandas 2.x
  rows = []
  for _, r in drug_info_df_temp.iterrows():
    row = [r['CASEID']]
    filled = 0
    for i in range(1, 23):
      if filled == max_drugs:
        # All output slots are taken; stop so the row cannot outgrow `cols`
        break
      if r['DRUGID_' + str(i)] != -7:
        # Copy from the slot that was just tested (i), not from the output position
        row.extend(r[prefix + str(i)] for prefix in per_drug_prefixes)
        filled += 1

    # For records with fewer drugs than output slots, pad the remaining slots with -7
    row.extend([-7] * ((max_drugs - filled) * len(per_drug_prefixes)))
    row.extend(r[c] for c in summary_cols)
    rows.append(row)

  reduced_drug_info_df = pd.DataFrame(rows, columns = cols)

  return case_info_df, reduced_drug_info_df

The dataset contains negative values that represent the following:
*   -7 : Not applicable
*   -8 : Not documented 
*   -9 : Missing

All of these are replaced by 0, since these values cannot be imputed or estimated by interpolation. Substituting 0 tells the classification model to treat them as a single category of their own.


In [None]:
def cleanData(reduced_drug_info_df, case_info_df, demographic_info_df):
  """Join the three frames on CASEID, recode the negative sentinels and save the result.

  Args:
    reduced_drug_info_df: drug related frame; it is the left side of both joins.
    case_info_df: case related frame, left-joined on CASEID.
    demographic_info_df: demographic frame, left-joined on CASEID.

  Returns:
    The merged frame with -7, -8 and -9 replaced by 0 and the YEAR, QUARTER,
    DAYPART, NUMSUBS and DISPOSITION columns removed. The same frame is also
    written to final_drug_data.csv on the shared drive.
  """

  # Left joins keep exactly the cases present in the drug frame
  merged = reduced_drug_info_df.merge(case_info_df, how = 'left', on = ['CASEID'])
  merged = merged.merge(demographic_info_df, how = 'left', on = ['CASEID'])

  # -7 (not applicable), -8 (not documented) and -9 (missing) all collapse to 0
  sentinel_codes = {-7:0, -8:0, -9:0}

  # Columns that are not needed downstream
  unused_columns = ['YEAR', 'QUARTER', 'DAYPART', 'NUMSUBS', 'DISPOSITION']

  final_df = merged.replace(sentinel_codes).drop(columns = unused_columns)

  # Persist the frame used for further processing and model training
  final_df.to_csv('/content/drive/Shareddrives/CMPE255/data/final_drug_data.csv')

  return final_df

In [None]:
def createMappingsDf(final_df, cols = None):
  """Save the DRUGID -> CATID / sdled lookup table for a fixed list of drugs.

  The table is taken from the first drug slot (DRUGID_1, CATID_*_1, sdled_*_1)
  of ``final_df``, restricted to the drug ids in ``prominent_drugs``,
  de-duplicated, and written to cat_sdled_mapping.csv on the shared drive.

  Args:
    final_df: cleaned frame containing the first-slot drug columns.
    cols: not used by this function. It now defaults to None so that the
      function can be called with ``final_df`` alone.
  """

  # Mappings dataframe for drug input from user on dashboard to get catids and sdleds for inputted drug
  columns = ['DRUGID', 'CATID_1', 'CATID_2', 'CATID_3', 'sdled_1', 'sdled_2', 'sdled_3', 'sdled_4']
  first_slot_cols = ['DRUGID_1', 'CATID_1_1', 'CATID_2_1', 'CATID_3_1', 'sdled_1_1', 'sdled_2_1', 'sdled_3_1', 'sdled_4_1']
  first_slot = final_df[first_slot_cols]

  # Drug ids for which a mapping row is exported
  prominent_drugs = [1255, 1254, 1253, 865, 2420, 21, 2427, 2343, 1016, 505, 85, 152]

  cat_sdled_df = first_slot[first_slot.DRUGID_1.isin(prominent_drugs)].drop_duplicates().reset_index(drop = True)
  # Strip the slot suffix from the column names
  cat_sdled_df.columns = columns

  # Save dataframe
  cat_sdled_df.to_csv('/content/drive/Shareddrives/CMPE255/data/cat_sdled_mapping.csv')

The datasets for predicting the casetype and abuse type contain only categorical features. Since the cardinality of some of them is very high, we perform hash encoding on the high cardinality columns and one hot encoding on the low cardinality columns. This is done so that categorical columns are treated as categories by the classification models.

*   Hashencoding -  Hashencoding is a process of converting categorical features with very high cardinality into numerical features. Hash encoders hash every value in the feature column and the hash value determines the bucket that the value falls into. By taking 7 buckets, we encode all drug related columns such that every feature is expanded into 7 columns with binary data. 
*   For demographic and some drug related columns, onehot encoding is followed as the number of categories is small. One hot encoding ensures no information is lost, unlike hash encoding where information loss occurs due to hashing collisions. However, hash encoding offers a compressed encoding that is computationally efficient.


In [None]:
def encodingCasetype(df):
  """Encode the cleaned data for the casetype task and save train / test frames.

  Rows whose CASETYPE is not 8 form the train set; rows whose CASETYPE is 8
  form the test set. The columns in ``demo_cols`` (which include CASETYPE
  itself) are one-hot encoded; the columns in ``drug_cols`` are hash encoded,
  one encoder per column, fitted on the train rows only. Each fitted encoder
  is pickled to the shared drive.

  Args:
    df: cleaned frame containing every column in ``demo_cols`` and ``drug_cols``.

  Returns:
    A tuple ``(final_case_train, final_case_test)``. Both frames are also
    written to the 'encoded data' folder on the shared drive.
  """

  # Getting casetypes that are not others(8) in train and those that are 8 in test
  case_train = df[df['CASETYPE'] != 8]
  case_test = df[df['CASETYPE'] == 8]

  # Getting data for one hot encoding
  demo_cols = ['METRO', 'AGECAT', 'SEX', 'RACE', 'PHARMA', 'CASETYPE', 'ROUTE_1', 'TOXTEST_1', 'ROUTE_2', 'TOXTEST_2', 'ROUTE_3', 'TOXTEST_3']
  case_train_one_hot = case_train[demo_cols]
  case_test_one_hot = case_test[demo_cols]

  # Getting drug data for category hash encoding
  drug_cols = ['DRUGID_1', 'CATID_1_1', 'CATID_2_1', 'CATID_3_1', 'sdled_1_1', 'sdled_2_1', 'sdled_3_1', 'sdled_4_1',
        'DRUGID_2', 'CATID_1_2', 'CATID_2_2', 'CATID_3_2', 'sdled_1_2', 'sdled_2_2', 'sdled_3_2', 'sdled_4_2',
        'DRUGID_3', 'CATID_1_3', 'CATID_2_3', 'CATID_3_3', 'sdled_1_3', 'sdled_2_3', 'sdled_3_3', 'sdled_4_3']
  case_train_drug = case_train[drug_cols]
  case_test_drug = case_test[drug_cols]


  def oneHotEncode(df):
    """One-hot encode every column of ``df``; dummy columns are named '<column>_<value>'."""
    # NOTE(review): train and test are encoded separately, so a value that occurs in
    # only one of them produces a column the other frame lacks.
    return pd.concat([pd.get_dummies(df[feature], prefix = feature) for feature in df.columns], axis = 1)


  def convertToCategory(drug_df):
    """Return a copy of ``drug_df`` with every column cast to the 'category' dtype."""
    # astype returns a new frame, so the slice of the caller's frame is not written to
    return drug_df.astype('category')


  def hashEncode(category_case_train_drug, category_case_test_drug):
    """Hash encode each drug column with its own encoder fitted on the train column."""
    train_parts = []
    test_parts = []

    # hash encode feature wise
    for feature in category_case_train_drug.columns:
      # Hashing encoder, created with the library's default number of buckets.
      # NOTE(review): the notebook text mentions 7 buckets; confirm whether
      # n_components should be set explicitly.
      encoder = ce.HashingEncoder(cols = [feature])
      # Fit and transform on the same single-column frame so the input shapes agree
      encoder.fit(category_case_train_drug[[feature]])

      # Save encoding object; the with-block closes the file handle.
      # NOTE(review): there is no separator between 'pickles' and the feature name,
      # so files are written as '.../CMPE255/picklesDRUGID_1.pkl'. The path is left
      # unchanged in case a consumer loads from exactly that location - confirm.
      with open('/content/drive/Shareddrives/CMPE255/pickles'+feature+'.pkl', 'wb') as encoder_file:
        pickle.dump(encoder, encoder_file)

      # Transform train and test set with hashing encoder object. The feature name is
      # prefixed so the bucket columns of different features stay distinguishable.
      case_train_drug_hash = encoder.transform(category_case_train_drug[[feature]]).add_prefix(feature + '_')
      case_test_drug_hash = encoder.transform(category_case_test_drug[[feature]]).add_prefix(feature + '_')

      train_parts.append(case_train_drug_hash.reset_index(drop = True))
      test_parts.append(case_test_drug_hash.reset_index(drop = True))

    # Combine; the most recently encoded feature comes first, as in the original concat order
    drug_trainhashencoding = pd.concat(train_parts[::-1], axis = 1)
    drug_testhashencoding = pd.concat(test_parts[::-1], axis = 1)

    return drug_trainhashencoding, drug_testhashencoding

  # Onehot encode demographic information
  case_train_onehot = oneHotEncode(case_train_one_hot).reset_index(drop = True)
  case_test_onehot = oneHotEncode(case_test_one_hot).reset_index(drop = True)

  # Convert datatype to category for hash encoding
  category_case_train_drug = convertToCategory(case_train_drug)
  category_case_test_drug = convertToCategory(case_test_drug)

  # Hashencode drug information related columns
  hashencode_case_train, hashencode_case_test = hashEncode(category_case_train_drug, category_case_test_drug)
  hashencode_case_train = hashencode_case_train.reset_index(drop = True)
  hashencode_case_test = hashencode_case_test.reset_index(drop = True)

  # Concat the one-hot and hash encodings; the test frame uses the TEST hash encodings
  final_case_train = pd.concat([case_train_onehot, hashencode_case_train], axis = 1)
  final_case_test = pd.concat([case_test_onehot, hashencode_case_test], axis = 1)

  # Save dataframe
  final_case_train.to_csv('/content/drive/Shareddrives/CMPE255/data/encoded data/encodedCasetypeData_train.csv')
  final_case_test.to_csv('/content/drive/Shareddrives/CMPE255/data/encoded data/encodedCasetypeData_test.csv')

  return final_case_train, final_case_test

In [None]:
def encodingAbuse(df):
  """Encode the cleaned data for the abuse-type task and save train / test frames.

  The five label columns (ALLABUSE, NONALCILL, ALCOHOL, NONMEDPHARMA, PHARMA)
  are separated from the features, the rows are split 85/15 with a fixed
  seed, the columns in ``demo_cols`` are one-hot encoded and the columns in
  ``drug_cols`` are hash encoded with one encoder per column fitted on the
  train rows. CASEWGT and the labels are appended unencoded.

  Args:
    df: cleaned frame containing ``demo_cols``, ``drug_cols``, CASEWGT and the
      five label columns.

  Returns:
    A tuple ``(final_abuse_train, final_abuse_test)``. Both frames are also
    written to the 'encoded data' folder on the shared drive.
  """

  label_cols = ['ALLABUSE','NONALCILL','ALCOHOL','NONMEDPHARMA', 'PHARMA']

  # Train test split on cleaned df
  X = df.drop(label_cols, axis = 1)
  # A list (double brackets) is required to select several columns
  y = df[label_cols]
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 42)

  # Getting data for one hot encoding. PHARMA is a label here and has been removed
  # from X above, so it is not part of the one-hot encoded feature columns.
  demo_cols = ['METRO', 'AGECAT', 'SEX', 'RACE', 'CASETYPE', 'ROUTE_1', 'TOXTEST_1', 'ROUTE_2', 'TOXTEST_2', 'ROUTE_3', 'TOXTEST_3']
  case_train_one_hot = X_train[demo_cols]
  case_test_one_hot = X_test[demo_cols]

  # Getting drug data for category hash encoding
  drug_cols = ['DRUGID_1', 'CATID_1_1', 'CATID_2_1', 'CATID_3_1', 'sdled_1_1', 'sdled_2_1', 'sdled_3_1', 'sdled_4_1',
        'DRUGID_2', 'CATID_1_2', 'CATID_2_2', 'CATID_3_2', 'sdled_1_2', 'sdled_2_2', 'sdled_3_2', 'sdled_4_2',
        'DRUGID_3', 'CATID_1_3', 'CATID_2_3', 'CATID_3_3', 'sdled_1_3', 'sdled_2_3', 'sdled_3_3', 'sdled_4_3']
  case_train_drug = X_train[drug_cols]
  case_test_drug = X_test[drug_cols]


  def oneHotEncode(df):
    """One-hot encode every column of ``df``; dummy columns are named '<column>_<value>'."""
    # NOTE(review): train and test are encoded separately, so a value that occurs in
    # only one of them produces a column the other frame lacks.
    return pd.concat([pd.get_dummies(df[feature], prefix = feature) for feature in df.columns], axis = 1)


  def convertToCategory(drug_df):
    """Return a copy of ``drug_df`` with every column cast to the 'category' dtype."""
    # astype returns a new frame, so the slice of the caller's frame is not written to
    return drug_df.astype('category')


  def hashEncode(category_case_train_drug, category_case_test_drug):
    """Hash encode each drug column with its own encoder fitted on the train column."""
    train_parts = []
    test_parts = []

    # hash encode feature wise
    for feature in category_case_train_drug.columns:
      # Hashing encoder, created with the library's default number of buckets
      encoder = ce.HashingEncoder(cols = [feature])
      # Fit and transform on the same single-column frame so the input shapes agree
      encoder.fit(category_case_train_drug[[feature]])

      # Transform train and test set with hashing encoder object. The feature name is
      # prefixed so the bucket columns of different features stay distinguishable.
      case_train_drug_hash = encoder.transform(category_case_train_drug[[feature]]).add_prefix(feature + '_')
      case_test_drug_hash = encoder.transform(category_case_test_drug[[feature]]).add_prefix(feature + '_')

      train_parts.append(case_train_drug_hash.reset_index(drop = True))
      test_parts.append(case_test_drug_hash.reset_index(drop = True))

    # Combine; the most recently encoded feature comes first, as in the original concat order
    drug_trainhashencoding = pd.concat(train_parts[::-1], axis = 1)
    drug_testhashencoding = pd.concat(test_parts[::-1], axis = 1)

    return drug_trainhashencoding, drug_testhashencoding


  # Columns that won't be encoded. Their index is reset like that of the encoded
  # frames, otherwise the column-wise concat below would align on the shuffled
  # row labels and produce NaN-padded rows.
  data_to_concatenate_last_train = y_train[label_cols].reset_index(drop = True)
  data_to_concatenate_last_test = y_test[label_cols].reset_index(drop = True)

  # Taking out casewgt
  casewt_toappend_train = X_train[['CASEWGT']].reset_index(drop = True)
  casewt_toappend_test = X_test[['CASEWGT']].reset_index(drop = True)

  # One hot encode demographic columns
  case_train_onehot = oneHotEncode(case_train_one_hot).reset_index(drop = True)
  case_test_onehot = oneHotEncode(case_test_one_hot).reset_index(drop = True)

  # Convert datatype to category
  category_case_train_drug = convertToCategory(case_train_drug)
  category_case_test_drug = convertToCategory(case_test_drug)

  # Hashencode drug related columns
  hashencode_case_train, hashencode_case_test = hashEncode(category_case_train_drug, category_case_test_drug)
  hashencode_case_train = hashencode_case_train.reset_index(drop = True)
  hashencode_case_test = hashencode_case_test.reset_index(drop = True)

  # Merge all encodings; the test frame is assembled from the TEST pieces only
  final_abuse_train = pd.concat([case_train_onehot, hashencode_case_train, casewt_toappend_train, data_to_concatenate_last_train], axis = 1)
  final_abuse_test = pd.concat([case_test_onehot, hashencode_case_test, casewt_toappend_test, data_to_concatenate_last_test], axis = 1)

  # Save dataframe
  final_abuse_train.to_csv('/content/drive/Shareddrives/CMPE255/data/encoded data/encodedAbuseData_train.csv')
  final_abuse_test.to_csv('/content/drive/Shareddrives/CMPE255/data/encoded data/encodedAbuseData_test.csv')

  return final_abuse_train, final_abuse_test

After the encoding, the number of feature columns increases drastically. Thus, reducing the dimensionality becomes an inevitable step in the processing pipeline. The dimensionality of the datasets is reduced using PCA and MCA.
*   For dataset A, the encodings are then combined and provided to Principal Component Analysis. PCA finds the principal components that explain 80% variance in the data. The resultant dataset has reduced dimensions with minimal information loss which is then input to the model for multilabel classification of the type of abuse. 
*   For dataset C, the encoded data is passed through MCA (Multiple Correspondence Analysis), which is similar to PCA but is specific to categorical data. This too reduces the dimensionality of the dataset.


In [None]:
def pcaAbuse(train, test):
  """Reduce the encoded abuse features with PCA and save the reduced train / test sets.

  The PCA is fitted on the train features only, keeping as many components as
  needed to explain 80% of the variance, and is then applied to both sets.
  The five label columns are re-attached unchanged after the components.

  Args:
    train: encoded train frame containing the five label columns in ``cols``.
    test: encoded test frame with the same columns.

  Side effects:
    Writes 'pca_abuse_obj.pkl', 'X_train_abuse.csv' and 'X_test_abuse.csv' to
    the current working directory. Nothing is returned.
  """

  cols = ['ALLABUSE', 'NONALCILL', 'ALCOHOL', 'NONMEDPHARMA', 'PHARMA']

  # Separate X and y. reset_index returns new frames, so the caller's frames are
  # not modified and the labels line up positionally with the PCA output below.
  X_train = train.drop(cols, axis = 1)
  y_train = train[cols].reset_index(drop = True)
  X_test = test.drop(cols, axis = 1)
  y_test = test[cols].reset_index(drop = True)

  # PCA
  # NOTE(review): every non-label column, including CASEWGT if present, goes into
  # the PCA unscaled - confirm that this is intended.
  pca = PCA(n_components = 0.8).fit(X_train)

  # Concatenate X and y columns for both train and test set
  X_train_pca_df = pd.DataFrame(data = pca.transform(X_train))
  X_train_pca = pd.concat([X_train_pca_df, y_train], axis = 1)

  X_test_pca_df = pd.DataFrame(data = pca.transform(X_test))
  X_test_pca = pd.concat([X_test_pca_df, y_test], axis = 1)

  # Dump pca object; the with-block closes the file handle
  with open('pca_abuse_obj.pkl', 'wb') as pca_file:
    pickle.dump(pca, pca_file)

  # Save df
  X_train_pca.to_csv('X_train_abuse.csv')
  X_test_pca.to_csv('X_test_abuse.csv')

In [None]:
def mcaCasetype(train, test):
  """Reduce the encoded casetype features with MCA and save the reduced train / test sets.

  The one-hot CASETYPE columns are treated as labels (CASETYPE_1..CASETYPE_7
  in the train set, CASETYPE_8 in the test set). The remaining 0/1 columns are
  mapped to the strings 'False' / 'True', an MCA is fitted on the train
  features and applied to both sets, and the label columns are re-attached.

  Args:
    train: encoded train frame containing CASETYPE_1..CASETYPE_7.
    test: encoded test frame containing CASETYPE_8.

  Side effects:
    Writes X_train_mca_casetype.csv and X_test_mca_casetype.csv to the 'pca'
    folder on the shared drive. Nothing is returned and the input frames are
    not modified.
  """

  # Drop the CSV index column and the case weight when they are present. errors='ignore'
  # covers frames passed straight from encodingCasetype, which carry neither column.
  train = train.drop(['Unnamed: 0', 'CASEWGT'], axis = 1, errors = 'ignore')
  test = test.drop(['Unnamed: 0', 'CASEWGT'], axis = 1, errors = 'ignore')

  train_cols = ['CASETYPE_1', 'CASETYPE_2', 'CASETYPE_3', 'CASETYPE_4', 'CASETYPE_5', 'CASETYPE_6', 'CASETYPE_7']
  test_cols = ['CASETYPE_8']

  # Separate X and y
  X_train = train.drop(train_cols, axis = 1)
  y_train = train[train_cols].reset_index(drop = True)
  X_test = test.drop(test_cols, axis = 1)
  y_test = test[test_cols].reset_index(drop = True)

  # Map value for MCA
  X_train = X_train.replace({0: 'False', 1: 'True'})
  X_test = X_test.replace({0: 'False', 1: 'True'})

  # MCA
  mca = prince.MCA().fit(X_train)
  X_train_mca = mca.transform(X_train)
  X_test_mca = mca.transform(X_test)

  # Concatenate X and y columns for both train and test set
  X_train_mca_df = pd.DataFrame(data = X_train_mca).reset_index(drop = True)
  X_train_mca = pd.concat([X_train_mca_df, y_train], axis = 1)

  X_test_mca_df = pd.DataFrame(data = X_test_mca).reset_index(drop = True)
  X_test_mca = pd.concat([X_test_mca_df, y_test], axis = 1)

  # Save dataframes
  X_train_mca.to_csv('/content/drive/Shareddrives/CMPE255/data/pca/X_train_mca_casetype.csv')
  X_test_mca.to_csv('/content/drive/Shareddrives/CMPE255/data/pca/X_test_mca_casetype.csv')

In [None]:
# Function calls for data preparation
# Each step consumes the output of the previous one, so the calls must run in this order.

demographic_info_df, case_info_df, drug_info_df = getRawData()
case_info_df, reduced_drug_info_df = removeOutliers(case_info_df, drug_info_df)
final_df = cleanData(reduced_drug_info_df, case_info_df, demographic_info_df)
# The second argument (cols) is not used by createMappingsDf
createMappingsDf(final_df, None)

# Bind each encoder's output to the matching name: casetype data goes to MCA, abuse data to PCA
encoded_casetype_train_df, encoded_casetype_test_df = encodingCasetype(final_df)
encoded_abuse_train_df, encoded_abuse_test_df = encodingAbuse(final_df)

pcaAbuse(encoded_abuse_train_df, encoded_abuse_test_df)

mcaCasetype(encoded_casetype_train_df, encoded_casetype_test_df)