# Loading the dataset

In [None]:
# Mount Google Drive at /content/drive; the following cell reads the CSV
# from a path underneath this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np

# Folder that holds the raw CSV (relative to the notebook's working directory).
path = "drive/MyDrive/Colab Notebooks/InsideOut Final Project/Dataset 3/"

# Load the file, title-case every column header, and turn cells whose whole
# value is '.' into NaN (presumably the source's missing-value marker).
dataset3 = (
    pd.read_csv(path + "Dataset.csv")
    .rename(columns=str.title)
    .replace('.', np.nan, regex=False)
)

In [None]:
# Show the loaded frame; the notebook renders a truncated view of rows and columns.
dataset3

Unnamed: 0,County,Hospital,Links To Comment Letters,Esophageal Resection (Risk-Adjusted Mortality Rate),Esophageal Resection (# Of Deaths),Esophageal Resection (# Of Cases),Esophageal Resection (Outlier Ratings),Pancreatic Resection (Risk-Adjusted Mortality Rate),Pancreatic Resection (# Of Deaths),Pancreatic Resection (# Of Cases),...,Pneumonia (# Of Cases),Pneumonia (Outlier Ratings),Pci (Risk-Adjusted Mortality Rate),Pci (# Of Deaths),Pci (# Of Cases),Pci (Outlier Ratings),Carotid Endarterectomy (Risk-Adjusted Mortality Rate),Carotid Endarterectomy (# Of Deaths),Carotid Endarterectomy (# Of Cases),Carotid Endarterectomy (Outlier Ratings)
0,Alameda,Alameda County Medical Center - Highland Campus,,,,,,,,,...,212,,,,,,0,0,3,
1,Alameda,Alameda Hospital,,,,,,,,,...,150,,,,,,0,0,3,
2,Alameda,Alta Bates Summit Medical Center - Alta Bates ...,,,,,,,,,...,245,,2.6,5,95,,6.9,1,13,
3,Alameda,Alta Bates Summit Medical Center - Summit Camp...,,0,0,3,,0,0,3,...,371,,2.9,19,792,,7.2,1,21,
4,Alameda,Eden Medical Center,,,,,,,,,...,195,,,,,,0,0,7,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326,Ventura,Thousand Oaks Surgical Hospital,,,,,,,,,...,,,,,,,,,,
327,Ventura,Ventura County Medical Center,,,,,,,,,...,146,,,,,,,,,
328,Yolo,Sutter Davis Hospital,,,,,,,,,...,97,,,,,,,,,
329,Yolo,Woodland Memorial Hospital,,,,,,,,,...,108,,,,,,0,0,18,


In [None]:
# List every column with its non-null count and dtype.
dataset3.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 331 entries, 0 to 330
Data columns (total 51 columns):
 #   Column                                                      Non-Null Count  Dtype  
---  ------                                                      --------------  -----  
 0   County                                                      331 non-null    object 
 1   Hospital                                                    331 non-null    object 
 2   Links To Comment Letters                                    11 non-null     object 
 3   Esophageal Resection (Risk-Adjusted Mortality Rate)         40 non-null     object 
 4   Esophageal Resection (# Of Deaths)                          40 non-null     object 
 5   Esophageal Resection (# Of Cases)                           40 non-null     object 
 6   Esophageal Resection (Outlier Ratings)                      0 non-null      float64
 7   Pancreatic Resection (Risk-Adjusted Mortality Rate)         64 non-null     object 
 8   

# Preprocessing

## Handling Boolean Columns

There are no true boolean columns: every column with exactly two distinct values is an outlier rating whose categories are `Better` and `Worse`.

In [None]:
# One row per column: its name and how many distinct non-null values it holds.
nunique_values = (
    dataset3.nunique()
    .rename_axis('Column')
    .reset_index(name='UniqueValues')
)

# Print the distinct values of every column that has exactly two of them.
two_valued = nunique_values.loc[nunique_values['UniqueValues'] == 2, 'Column']
for column in two_valued:
    print(dataset3[column].unique())

[nan 'Better' 'Worse']
[nan 'Worse' 'Better']
[nan 'Better' 'Worse']
[nan 'Better' 'Worse']
[nan 'Better' 'Worse']
[nan 'Worse' 'Better']
[nan 'Worse' 'Better']
[nan 'Worse' 'Better']


# Define the data types of `dataset3`

In [None]:
# Profile each column: its distinct values (as a plain list, which displays
# more cleanly than a numpy array) and the count of distinct non-null values.
column_profiles = []
for column in dataset3.columns:
    series = dataset3[column]
    column_profiles.append({
        'Column': column,
        'Unique Values': series.unique().tolist(),
        'Number of Unique Values': series.nunique(),
    })

# Lowest-cardinality columns first, to spot categorical candidates.
pd.DataFrame(column_profiles).sort_values(by="Number of Unique Values")

Unnamed: 0,Column,Unique Values,Number of Unique Values
6,Esophageal Resection (Outlier Ratings),[nan],0
50,Carotid Endarterectomy (Outlier Ratings),"[nan, Worse]",1
14,Aaa Repair (Outlier Ratings),"[nan, Worse]",1
10,Pancreatic Resection (Outlier Ratings),"[nan, Worse]",1
22,Acute Myocardial Infarction (Outlier Ratings),"[nan, Worse, Better]",2
30,Acute Stroke (Outlier Ratings),"[nan, Better, Worse]",2
42,Pneumonia (Outlier Ratings),"[nan, Worse, Better]",2
26,Heart Failure (Outlier Ratings),"[nan, Better, Worse]",2
46,Pci (Outlier Ratings),"[nan, Worse, Better]",2
38,Hip Fracture (Outlier Ratings),"[nan, Worse, Better]",2


In [None]:
# Columns stored as pandas categoricals: every "(Outlier Ratings)" column
# plus County. The trailing comment on each entry lists the values observed
# in the data.
#
# 'Esophageal Resection (Outlier Ratings)' has no recorded ratings at all, so
# pandas reads it as an all-NaN float column. It is listed here explicitly so
# that it is typed like the other rating columns instead of being picked up
# as a numeric column further down.
categorical_columns = [
    'Esophageal Resection (Outlier Ratings)',         # [nan] - no rating recorded
    'Carotid Endarterectomy (Outlier Ratings)',       # [nan, Worse]
    'Aaa Repair (Outlier Ratings)',                   # [nan, Worse]
    'Pancreatic Resection (Outlier Ratings)',         # [nan, Worse]
    'Acute Myocardial Infarction (Outlier Ratings)',  # [nan, Worse, Better]
    'Acute Stroke (Outlier Ratings)',                 # [nan, Better, Worse]
    'Pneumonia (Outlier Ratings)',                    # [nan, Worse, Better]
    'Heart Failure (Outlier Ratings)',                # [nan, Better, Worse]
    'Pci (Outlier Ratings)',                          # [nan, Worse, Better]
    'Hip Fracture (Outlier Ratings)',                 # [nan, Worse, Better]
    'Gi Hemorrhage (Outlier Ratings)',                # [nan, Better, Worse]
    'Craniotomy (Outlier Ratings)',                   # [nan, Better, Worse]
    'County'
]

# Cast all listed columns in one call; a missing column name raises KeyError.
dataset3 = dataset3.astype({column: 'category' for column in categorical_columns})

In [None]:
# Columns already typed as category are settled; inspect only the rest.
dataset3_dtypes = dataset3.dtypes.apply(str).to_frame().rename(columns={0: 'Data Type'})
dataset3_dtypes_without_category = dataset3_dtypes[dataset3_dtypes['Data Type'] != 'category']
columns_to_check = dataset3_dtypes_without_category.index

numeric_or_nan_columns = []

for col in columns_to_check:
    series = dataset3[col]
    if pd.api.types.is_numeric_dtype(series):
        # Already numeric (this includes all-NaN float columns).
        numeric_or_nan_columns.append(col)
    elif series.dtype == object:
        coerced = pd.to_numeric(series, errors='coerce')
        # Treat an object column as numeric only when it holds at least one
        # number AND coercion loses no non-null value. Accepting a column
        # because *some* value parses would make the float cast below fail
        # on the values that do not.
        if coerced.notna().any() and coerced.notna().sum() == series.notna().sum():
            numeric_or_nan_columns.append(col)

# Convert with the same parser that was used for detection, so every selected
# column is guaranteed to cast cleanly.
for column in numeric_or_nan_columns:
    dataset3[column] = pd.to_numeric(dataset3[column]).astype('float64')

In [None]:
# Free-text columns use pandas' dedicated 'string' dtype.
dataset3 = dataset3.astype({
    "Links To Comment Letters": 'string',
    "Hospital": 'string',
})

In [None]:
# Tabulate each column's dtype name, sorted so equal dtypes sit together.
(
    dataset3.dtypes
    .map(str)
    .to_frame(name='Data Type')
    .sort_values(by="Data Type")
)

Unnamed: 0,Data Type
County,category
Pci (Outlier Ratings),category
Pneumonia (Outlier Ratings),category
Hip Fracture (Outlier Ratings),category
Gi Hemorrhage (Outlier Ratings),category
Acute Stroke (Outlier Ratings),category
Heart Failure (Outlier Ratings),category
Acute Myocardial Infarction (Outlier Ratings),category
Craniotomy (Outlier Ratings),category
Aaa Repair (Outlier Ratings),category


# Handling the Table Description for dataset3

## Categories
* Number of Different Values
* Minimal Incidence
* Maximal Incidence
* Average Incidence

In [None]:
# Per categorical column: how many categories it has and the smallest,
# largest and mean number of rows per category.
categoric_stat_names = [
    'Number of Different Values',
    'Minimal Incidence',
    'Maximal Incidence',
    'Average Incidence',
]
categoric_analysis = pd.DataFrame(columns=categoric_stat_names)

for column in categorical_columns:
    incidence = dataset3[column].value_counts()  # rows per category, NaN excluded
    categoric_analysis.loc[column] = [
        incidence.count(),
        incidence.min(),
        incidence.max(),
        incidence.mean(),
    ]

categoric_analysis

Unnamed: 0,Number of Different Values,Minimal Incidence,Maximal Incidence,Average Incidence
Carotid Endarterectomy (Outlier Ratings),1.0,1.0,1.0,1.0
Aaa Repair (Outlier Ratings),1.0,2.0,2.0,2.0
Pancreatic Resection (Outlier Ratings),1.0,1.0,1.0,1.0
Acute Myocardial Infarction (Outlier Ratings),2.0,8.0,18.0,13.0
Acute Stroke (Outlier Ratings),2.0,20.0,27.0,23.5
Pneumonia (Outlier Ratings),2.0,22.0,24.0,23.0
Heart Failure (Outlier Ratings),2.0,19.0,20.0,19.5
Pci (Outlier Ratings),2.0,3.0,5.0,4.0
Hip Fracture (Outlier Ratings),2.0,2.0,7.0,4.5
Gi Hemorrhage (Outlier Ratings),2.0,4.0,12.0,8.0


## Numerical
* Discrete / Continuous
* Mean
* Median
* Minimum value
* Maximum value

In [None]:
def discrete_or_continuous(col):
    """Label a `dataset3` column as "Discrete" or "Continuous".

    Missing values are ignored. The column is "Discrete" when every remaining
    value is unchanged by casting to int, otherwise "Continuous". A column
    with no non-missing values is reported as "Discrete".
    """
    observed = dataset3[col].dropna()
    all_whole = observed.eq(observed.astype(int)).all()
    return "Discrete" if all_whole else "Continuous"

# Per numeric column: discrete/continuous label plus mean, median, min and max.
numeric_stat_names = ['Discrete/Continuous', 'Mean', 'Median', 'Minimum Value', 'Maximum Value']
numeric_analysis = pd.DataFrame(columns=numeric_stat_names)

for column in numeric_or_nan_columns:
    values = dataset3[column]
    numeric_analysis.loc[column] = [
        discrete_or_continuous(column),
        values.mean(),
        values.median(),
        values.min(),
        values.max(),
    ]

numeric_analysis

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Unnamed: 0,Discrete/Continuous,Mean,Median,Minimum Value,Maximum Value
Esophageal Resection (Risk-Adjusted Mortality Rate),Continuous,3.025,0.0,0.0,35.2
Esophageal Resection (# Of Deaths),Discrete,0.25,0.0,0.0,2.0
Esophageal Resection (# Of Cases),Discrete,8.325,5.0,3.0,49.0
Esophageal Resection (Outlier Ratings),Discrete,,,,
Pancreatic Resection (Risk-Adjusted Mortality Rate),Continuous,5.678125,0.0,0.0,100.0
Pancreatic Resection (# Of Deaths),Discrete,0.375,0.0,0.0,2.0
Pancreatic Resection (# Of Cases),Discrete,10.3125,6.0,3.0,55.0
Aaa Repair (Risk-Adjusted Mortality Rate),Continuous,2.190278,0.0,0.0,100.0
Aaa Repair (# Of Deaths),Discrete,0.236111,0.0,0.0,2.0
Aaa Repair (# Of Cases),Discrete,17.277778,15.0,3.0,73.0


In [None]:
string_columns = ["Links To Comment Letters", "Hospital"]

# NOTE: despite its name, `boolean_analysis` holds the statistics of the
# string columns; the export cell writes it to the 'Strings Analysis' sheet.
boolean_analysis = pd.DataFrame(columns=[
    'Number of Unique Values',
    'Most Common Value',
    'Frequency of Most Common',
    'Average Length',
    'Maximum Length',
])

for col in string_columns:
    text = dataset3[col]
    modes = text.mode()
    lengths = text.dropna().apply(len)  # character count of each non-missing entry

    if modes.empty:
        # Column without any non-missing value: use placeholder entries.
        most_common_value = 'None'
        frequency_most_common = 0
    else:
        most_common_value = modes[0]
        frequency_most_common = text.value_counts().iloc[0]

    boolean_analysis.loc[col] = [
        text.nunique(),
        most_common_value,
        frequency_most_common,
        lengths.mean(),
        lengths.max(),
    ]

boolean_analysis

Unnamed: 0,Number of Unique Values,Most Common Value,Frequency of Most Common,Average Length,Maximum Length
Links To Comment Letters,11,https://oshpd.ca.gov/HID/Products/PatDischarge...,1,110.909091,125
Hospital,331,AHMC Anaheim Regional Medical Center,1,31.909366,69


# Export

In [None]:
!pip install xlsxwriter

Collecting xlsxwriter
  Downloading XlsxWriter-3.1.9-py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.8/154.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.1.9


In [None]:
# Sheet name -> summary table, in the order the sheets are written.
analysis_sheets = {
    'Categorical Analysis': categoric_analysis,
    'Numeric Analysis': numeric_analysis,
    'Strings Analysis': boolean_analysis,
}

# Write all summaries into one workbook; the context manager saves it on exit.
with pd.ExcelWriter("Dataset3 Analysis.xlsx", engine='xlsxwriter') as writer:
    for sheet_name, table in analysis_sheets.items():
        table.to_excel(writer, sheet_name=sheet_name)