In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

***Introduction***

This is a beginner project to practice basic data cleaning, manipulation, analysis and visualization. I have documented my analysis step by step.

The objectives of this project are bolded and written in markdown cells for readability.

Comments in the project are used for explanation and better understanding. 

Tools used in this project:
- Python (Pandas, Matplotlib, Seaborn) for data manipulation and visualization.
- Kaggle notebooks, essentially Jupyter notebooks, for interactive development and documentation.

**Data Inspection**

Display basic information about the dataset, such as:

- The first few rows
- Data Types
- Summary Statistics

In [None]:
# Load the healthcare CSV from the Kaggle input directory and preview the first 10 rows.

DATASET_CSV = "/kaggle/input/healthcare-dataset/healthcare_dataset.csv"
hd = pd.read_csv(DATASET_CSV)
hd.head(n=10)



In [None]:
# Inspect the dataframe structure: non-null count and dtype of each column.

hd.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.

hd.describe()

**Data Cleaning**

- Check for any missing values and return a count of null values
- Rename columns with spaces
- Create new column named Stay_Length
- Remove any outliers

In [None]:
# Count the missing entries in every column of the raw dataframe.

null_values = hd.isna().sum()

# In the author's run every column reported zero nulls (NaN).
print("Null Count in HealthCare dataframe :", null_values, sep="\n")

In [None]:
# Give every space-separated column an underscore name so it can be accessed
# as an attribute (e.g. hdr.Blood_Type) in later cells.
# Note that 'Date of Admission' becomes 'Admission_Date', not a plain
# space-to-underscore swap.

renamed_columns = {
    'Blood Type': 'Blood_Type',
    'Medical Condition': 'Medical_Condition',
    'Date of Admission': 'Admission_Date',
    'Insurance Provider': 'Insurance_Provider',
    'Billing Amount': 'Billing_Amount',
    'Room Number': 'Room_Number',
    'Admission Type': 'Admission_Type',
    'Discharge Date': 'Discharge_Date',
    'Test Results': 'Test_Results',
}
hdr = hd.rename(columns=renamed_columns)
hdr

In [None]:
# Parse the admission and discharge columns into pandas datetimes.

for date_col in ('Admission_Date', 'Discharge_Date'):
    hdr[date_col] = pd.to_datetime(hdr[date_col])

hdr['Admission_Date'].dtype  # confirm the conversion took effect
# Both date columns can now be subtracted to build the Stay_Length column.


In [None]:
# Derive each patient's length of stay (discharge minus admission) and store it
# on the dataframe as Stay_Length.

hdr['Stay_Length'] = hdr['Discharge_Date'].sub(hdr['Admission_Date'])
hdr

In [None]:
# Trim the upper tail of Billing_Amount: keep only the rows whose billing
# amount is strictly below the chosen quantile of the column.

BILLING_UPPER_QUANTILE = 0.95  # rows at or above this quantile are dropped
billing_cutoff = hdr['Billing_Amount'].quantile(BILLING_UPPER_QUANTILE)

hdr_condensed = hdr[hdr['Billing_Amount'] < billing_cutoff]
hdr_condensed

# Roughly 95% of the rows remain. The author's run reported 9,500 records left,
# which implies about 10,000 rows before trimming -- confirm with hdr.shape.
# NOTE(review): the plotting and summary cells that follow read `hdr`, not
# `hdr_condensed`, so this trimmed frame is not what gets visualized.
# Confirm which dataframe the analysis is meant to use.


***Create Visualizations using Matplotlib or Seaborn***

Explore Distribution of:
- Patient Ages
- Billing Amounts
- Gender Distribution
- Medical Conditions
- Blood Type 
- Test Results
- Medications Administered
- Insurance Provider
- Admission Type

In [None]:
# Histogram of patient ages (28 bins) with a KDE curve overlaid.

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(hdr['Age'], bins=28, kde=True, ax=ax)
ax.set_title('Distribution of Patient Ages')
ax.set_xlabel('Age')
plt.show()


In [None]:
# Deeper look into the patients' ages.
# A notebook cell renders only its last expression automatically, so the first
# result is passed to display() (available by default in IPython kernels) to
# make both outputs visible.

display(hdr.Age.value_counts())  # author's run: the age with the highest count was 59
hdr.Age.describe()  # author's run: average age is 51, rounded to the nearest whole number

In [None]:
# Histogram of billing amounts (20 bins) with a KDE curve overlaid.

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(hdr['Billing_Amount'], bins=20, kde=True, ax=ax)
ax.set_title('Distribution of Billing Amount')
ax.set_xlabel('Billing Amount')
plt.show()

In [None]:
# Deeper look into billing amounts.
# A notebook cell renders only its last expression automatically, so the first
# result is passed to display() (available by default in IPython kernels) to
# make both outputs visible.
display(hdr.Billing_Amount.describe())  # author's run: average billing amount = $25,516.80, rounded to the nearest cent
hdr.Billing_Amount.value_counts()  # author's run: no billing amount repeats; every value appears exactly once

In [None]:
# Bar chart of the number of patients per gender.
# Gender holds labels rather than numbers, so a count plot (one bar per
# distinct value) is used; histogram bin counts have no meaning for labels.

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x=hdr['Gender'], ax=ax)
ax.set(title='Distribution of Gender', xlabel='Gender', ylabel='Count')
plt.show()

In [None]:
# Deeper look into patients' gender: describe() summary of the Gender column.
hdr.Gender.describe()

In [None]:
# Bar chart of the number of patients per medical condition.
# Medical_Condition holds labels rather than numbers, so a count plot (one bar
# per distinct value) is used; histogram bin counts have no meaning for labels.

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x=hdr['Medical_Condition'], ax=ax)
ax.set(title='Distribution of Medical Conditions', xlabel='Medical Conditions', ylabel='Count')
plt.show()

In [None]:
# Deeper look into medical conditions.
# A notebook cell renders only its last expression automatically, so the first
# result is passed to display() (available by default in IPython kernels) to
# make both outputs visible.
display(hdr.Medical_Condition.describe())
hdr.Medical_Condition.value_counts()

In [None]:
# Bar chart of the number of patients per blood type.
# Blood_Type holds labels rather than numbers, so a count plot (one bar per
# distinct value) is used; histogram bin counts have no meaning for labels.
# The x-axis label is written as 'Blood Type' (not the raw column name) to
# match the human-readable labels on the other charts.

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x=hdr['Blood_Type'], ax=ax)
ax.set(title='Distribution of Blood Type', xlabel='Blood Type', ylabel='Count')
plt.show()

In [None]:
# Deeper look into blood types.
# A notebook cell renders only its last expression automatically, so the first
# result is passed to display() (available by default in IPython kernels) to
# make both outputs visible.
display(hdr.Blood_Type.describe())
hdr.Blood_Type.value_counts()

In [None]:
# Bar chart of the number of patients per test result.
# Test_Results holds labels rather than numbers, so a count plot (one bar per
# distinct value) is used; histogram bin counts have no meaning for labels.

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x=hdr['Test_Results'], ax=ax)
ax.set(title='Distribution of Test Results', xlabel='Test Results', ylabel='Count')
plt.show()

In [None]:
# Deeper look into test results.
# A notebook cell renders only its last expression automatically, so the first
# result is passed to display() (available by default in IPython kernels) to
# make both outputs visible.
display(hdr.Test_Results.describe())
hdr.Test_Results.value_counts()

In [None]:
# Bar chart of the number of patients per medication administered.
# Medication holds labels rather than numbers, so a count plot (one bar per
# distinct value) is used; histogram bin counts have no meaning for labels.

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x=hdr['Medication'], ax=ax)
ax.set(
    title='Distribution of Medications Administered',
    xlabel='Medications Administered During Patient Stay',
    ylabel='Count',
)
plt.show()

In [None]:
# Deeper look into medications administered.
# A notebook cell renders only its last expression automatically, so the first
# result is passed to display() (available by default in IPython kernels) to
# make both outputs visible.
display(hdr.Medication.describe())
hdr.Medication.value_counts()

In [None]:
# Bar chart of the number of patients per insurance provider.
# Insurance_Provider holds labels rather than numbers, so a count plot (one bar
# per distinct value) is used; histogram bin counts have no meaning for labels.

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x=hdr['Insurance_Provider'], ax=ax)
ax.set(title='Distribution of Insurance Providers', xlabel='Insurance Providers', ylabel='Count')
plt.show()

In [None]:
# Deeper look into insurance providers.
# A notebook cell renders only its last expression automatically, so the first
# result is passed to display() (available by default in IPython kernels) to
# make both outputs visible.

display(hdr.Insurance_Provider.describe())
hdr.Insurance_Provider.value_counts()

In [None]:
# Bar chart of the number of patients per admission type.
# Admission_Type holds labels rather than numbers, so a count plot (one bar per
# distinct value) is used; histogram bin counts have no meaning for labels.

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x=hdr['Admission_Type'], ax=ax)
ax.set(title='Distribution of Admission Types', xlabel='Admission Types', ylabel='Count')
plt.show()

In [None]:
# Deeper look into admission types.
# A notebook cell renders only its last expression automatically, so the first
# result is passed to display() (available by default in IPython kernels) to
# make both outputs visible.
display(hdr.Admission_Type.describe())
hdr.Admission_Type.value_counts()

***Correlation Analysis***

Explore correlations between numerical variables. Visualize correlation using a heatmap.


In [None]:
# Select columns by dtype name, compute their pairwise correlations and draw
# the matrix as an annotated heatmap. '<M8[ns]' is numpy's code for
# datetime64[ns], so the two date columns are part of the selection.
# NOTE(review): 'Float64' (capital F) is the name of pandas' nullable float
# dtype -- confirm it also matches plain float64 columns such as Billing_Amount
# in the pandas version in use.
# NOTE(review): Stay_Length is the difference of two datetime columns, so it is
# presumably a timedelta and not matched by these dtype names -- confirm that
# leaving it out of the heatmap is intended.
numeric_columns = hdr.select_dtypes(include=['Float64','int64','<M8[ns]'])
correlation_matrix = numeric_columns.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

***Summary and insights drawn from data analysis***
- Dataset was not missing any values; columns were renamed for easier analysis.
- Data was pretty evenly distributed (as shown in visualization). 
- Slightly more female patients than male patients, but the difference is not significant.
- Most common: 
    - Test result = "Abnormal"
    - Medical Condition = "Asthma"
    - Medication administered = "Penicillin"
    - Age = 59 (Average age was 51 rounded to nearest whole number)
    - Blood Type = AB-
    - Insurance Provider = "Cigna" 
    - Admission Type = "Urgent"
- Average Billing Amount = $25,516.80 (rounded to nearest cent.)
- There is low correlation between numeric values, signifying no notable relationships between age, room number, billing amount, admission and discharge date. 


I have gained hands-on experience with data cleaning, manipulation, visualization, and basic analysis using Python.