In [None]:
#Working with categorical data - Example adapted from: https://pbpython.com/pandas_dtypes_cat.html
# import required modules
from io import BytesIO
from io import StringIO
from zipfile import ZipFile

import numpy as np
import pandas as pd
import requests
from pandas.api.types import CategoricalDtype

In [None]:
# Location of the zipped dataset used by this notebook.
filepath = "~/datasets/ist652/Categories/medical.zip"

In [None]:
# Read the CSV directly from the zip archive.
df = pd.read_csv(filepath, compression="zip")

In [None]:
# Preview the first few rows of the loaded frame.
df.head()

In [None]:
# Summary of the frame as loaded: column dtypes, non-null counts and memory usage.
df.info()

Let's see which columns may be good candidates for a categorical data type by counting how many unique values each column contains. The same counts also help us spot columns that are not relevant to the analysis.

In [None]:
# One (column name, number of distinct values) record per column of df.
nunique_records = [(name, df[name].nunique()) for name in df.columns]
nunique_table = pd.DataFrame.from_records(nunique_records,
                                          columns=['Column_Name', 'Num_Unique'])
# Order the columns from fewest to most distinct values.
unique_counts = nunique_table.sort_values(by=['Num_Unique'])

In [None]:
# Display the per-column distinct-value counts.
unique_counts

In [None]:
# Drop columns that don't bring any new information.
# The result is reassigned (rather than using inplace=True) and errors="ignore"
# is passed so that this cell can be re-run safely: a second execution no
# longer raises KeyError for columns that have already been removed.
uninformative_cols = ['Payment_Publication_Date',
                      'Delay_in_Publication_Indicator',
                      'Program_Year']
df = df.drop(columns=uninformative_cols, errors="ignore")

In [None]:
# Re-inspect dtypes, non-null counts and memory usage after the column drop.
df.info()

There is a big jump in the number of unique values when we get to 670. We will use that as the threshold (rounded up to 700): every column with fewer than 700 unique values is converted to a categorical column, except for columns that hold date/time information.

In [None]:
# Date/time columns stay as they are even when they fall under the threshold.
cols_to_exclude = ['Date_of_Payment']

# First pick every non-excluded column with fewer than 700 distinct values ...
low_cardinality_cols = [
    name for name in df.columns
    if name not in cols_to_exclude and df[name].nunique() < 700
]

# ... then convert each of them to the category dtype.
for name in low_cardinality_cols:
    df[name] = df[name].astype('category')

In [None]:
# Inspect dtypes and memory usage again after the categorical conversion.
df.info()

Please note that by using categorical types, we have reduced the memory use of the dataframe substantially (compare the `memory usage` line above with the one from the earlier `df.info()` output).

In [None]:
# Summary of total payments made by covered recipient type.
# `observed` is passed explicitly because the grouping column may be categorical
# here: recent pandas releases emit a FutureWarning when it is omitted and are
# changing the default. observed=False keeps one row per declared category.
(
    df.groupby('Covered_Recipient_Type', observed=False)['Total_Amount_of_Payment_USDollars']
    .sum()
    .to_frame()
)

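To change the order of the *Covered_Recipient_Type* categories, we define the desired order and build an ordered *CategoricalDtype* from it; the order itself is then applied to the column with `reorder_categories`.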

In [None]:
# Desired order of the recipient types, and the matching ordered categorical dtype.
# (CategoricalDtype is imported in the notebook's top import cell.)
cats_in_order = [
    "Non-covered Recipient Entity",
    "Covered Recipient Teaching Hospital",
    "Covered Recipient Physician",
    "Non-covered Recipient Individual",
]
covered_type = CategoricalDtype(categories=cats_in_order, ordered=True)

In [None]:
# Display the dtype's repr (its categories and ordered flag).
covered_type

In [None]:
# Re-sequence the column's existing categories into the custom order and mark
# the categorical as ordered.
recipient_type = df['Covered_Recipient_Type']
df['Covered_Recipient_Type'] = recipient_type.cat.reorder_categories(
    cats_in_order,
    ordered=True,
)

In [None]:
# Total payments per recipient type, listed in the column's category order.
# `observed` is passed explicitly because the grouping column is categorical:
# recent pandas releases emit a FutureWarning when it is omitted and are
# changing the default. observed=False keeps one row per declared category.
(
    df.groupby('Covered_Recipient_Type', observed=False)['Total_Amount_of_Payment_USDollars']
    .sum()
    .to_frame()
)