In [151]:
import pandas as pd
import csv

In [152]:
# Location of the raw dataset. Defined once so the DataFrame load and the
# header check below are guaranteed to read the same file.
DATA_PATH = "C:/Users/61433/Desktop/Project-3-Group-2/healthcare_dataset.csv"

# Load the CSV file into a DataFrame
df = pd.read_csv(DATA_PATH)

# Open the CSV file and list the headers using CSV library.
# encoding='utf-8' matches the decoding pd.read_csv uses by default (open()
# would otherwise fall back to the platform locale), and newline='' is the
# mode the csv module expects so quoted fields with line breaks parse correctly.
with open(DATA_PATH, mode='r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file)
    headers = next(reader)  # Get the first row as headers

print(headers)

['Name', 'Age', 'Gender', 'Blood Type', 'Medical Condition', 'Date of Admission', 'Doctor', 'Hospital', 'Insurance Provider', 'Billing Amount', 'Room Number', 'Admission Type', 'Discharge Date', 'Medication', 'Test Results']


In [153]:
# Get up to 1000 random rows. Capping the request at len(df) keeps this cell
# working on a dataset smaller than 1000 rows, where df.sample would otherwise
# raise ValueError. random_state=1 makes the draw reproducible.
sample_size = min(1000, len(df))
random_rows = df.sample(n=sample_size, random_state=1)

# Save these random rows to a new CSV file
random_rows.to_csv('random_rows.csv', index=False)

In [154]:
# Drop rows with missing values first: the integer casts further down raise
# on NaN, so incomplete rows have to be removed before any dtype conversion.
df = df.dropna()

# Standardize capitalisation
# Title Case for people and organisations
for column in ['Name', 'Doctor', 'Hospital']:
    df[column] = df[column].str.title()

# Sentence case for categorical text
for column in ['Medical Condition', 'Gender', 'Test Results', 'Medication',
               'Admission Type']:
    df[column] = df[column].str.capitalize()

# Blood types are written in upper case (e.g. AB+)
df['Blood Type'] = df['Blood Type'].str.upper()

# Data type conversion
df = df.astype({'Age': int, 'Billing Amount': float, 'Room Number': int})

# Removing titles from names.
# The trailing \b makes the title a whole word; without it the pattern also
# matched the first letters of real names ("Drew" -> "ew", "Drake" -> "ake").
# strip() removes any whitespace left at the ends once a title is gone.
df['Name'] = (
    df['Name']
    .str.replace(r'\b(?:Dr|Mrs|Ms|Mr)\b\.?\s*', '', regex=True)
    .str.strip()
)

# Remove duplicates (done after standardising so rows that differed only in
# capitalisation or title collapse together)
df = df.drop_duplicates()

first_10_records = df.head(10)
print(first_10_records)


                 Name  Age  Gender Blood Type Medical Condition  \
0       Bobby Jackson   30    Male         B-            Cancer   
1        Leslie Terry   62    Male         A+           Obesity   
2         Danny Smith   76  Female         A-           Obesity   
3        Andrew Watts   28  Female         O+          Diabetes   
4       Adrienne Bell   43  Female        AB+            Cancer   
5       Emily Johnson   36    Male         A+            Asthma   
6      Edward Edwards   21  Female        AB-          Diabetes   
7  Christina Martinez   20  Female         A+            Cancer   
8     Jasmine Aguilar   82    Male        AB+            Asthma   
9    Christopher Berg   58  Female        AB-            Cancer   

  Date of Admission            Doctor                     Hospital  \
0        2024-01-31     Matthew Smith              Sons And Miller   
1        2019-08-20   Samantha Davies                      Kim Inc   
2        2022-09-22  Tiffany Mitchell               

In [155]:
# Print the DataFrame's column labels
print(df.columns)

Index(['Name', 'Age', 'Gender', 'Blood Type', 'Medical Condition',
       'Date of Admission', 'Doctor', 'Hospital', 'Insurance Provider',
       'Billing Amount', 'Room Number', 'Admission Type', 'Discharge Date',
       'Medication', 'Test Results'],
      dtype='object')


In [156]:
# Ensure all values in 'Name' are strings
# (split_name below calls .split() on each value, which needs str input)
df['Name'] = df['Name'].astype(str)

# Function to split names
def split_name(name):
    """Split a full name into a first name and the rest of the name.

    Parameters
    ----------
    name : str
        Full name, e.g. "Bobby Jackson".

    Returns
    -------
    pd.Series
        Two elements: the first whitespace-delimited word, then everything
        after it (None when the name has only one word). An empty or
        whitespace-only name yields '' and None.
    """
    # strip() + split(None, 1) tolerate leading/trailing and repeated
    # whitespace; splitting on a literal ' ' would turn " Bobby Jackson"
    # into an empty first name.
    parts = name.strip().split(None, 1)
    first_name = parts[0] if parts else ''
    last_name = parts[1] if len(parts) > 1 else None
    return pd.Series([first_name, last_name])

# Run split_name over every 'Name' value, then store the two resulting
# columns as 'First_name' and 'Last_name'
name_parts = df['Name'].apply(split_name)
df[['First_name', 'Last_name']] = name_parts

# Preview the first rows to check the result
print(df.head())

            Name  Age  Gender Blood Type Medical Condition Date of Admission  \
0  Bobby Jackson   30    Male         B-            Cancer        2024-01-31   
1   Leslie Terry   62    Male         A+           Obesity        2019-08-20   
2    Danny Smith   76  Female         A-           Obesity        2022-09-22   
3   Andrew Watts   28  Female         O+          Diabetes        2020-11-18   
4  Adrienne Bell   43  Female        AB+            Cancer        2022-09-19   

             Doctor                    Hospital Insurance Provider  \
0     Matthew Smith             Sons And Miller         Blue Cross   
1   Samantha Davies                     Kim Inc           Medicare   
2  Tiffany Mitchell                    Cook Plc              Aetna   
3       Kevin Wells  Hernandez Rogers And Vang,           Medicare   
4    Kathleen Hanna                 White-White              Aetna   

   Billing Amount  Room Number Admission Type Discharge Date   Medication  \
0    18856.281306    

In [157]:
# Drop column 'Name'.
# Reassigning instead of inplace=True, together with errors='ignore', keeps
# this cell safe to re-run: a second run is a no-op rather than a KeyError
# for a column that is already gone.
df = df.drop(columns=['Name'], errors='ignore')

print(df.head())

   Age  Gender Blood Type Medical Condition Date of Admission  \
0   30    Male         B-            Cancer        2024-01-31   
1   62    Male         A+           Obesity        2019-08-20   
2   76  Female         A-           Obesity        2022-09-22   
3   28  Female         O+          Diabetes        2020-11-18   
4   43  Female        AB+            Cancer        2022-09-19   

             Doctor                    Hospital Insurance Provider  \
0     Matthew Smith             Sons And Miller         Blue Cross   
1   Samantha Davies                     Kim Inc           Medicare   
2  Tiffany Mitchell                    Cook Plc              Aetna   
3       Kevin Wells  Hernandez Rogers And Vang,           Medicare   
4    Kathleen Hanna                 White-White              Aetna   

   Billing Amount  Room Number Admission Type Discharge Date   Medication  \
0    18856.281306          328         Urgent     2024-02-02  Paracetamol   
1    33643.327287          265    

In [158]:
# Print the DataFrame's column labels
print(df.columns)


Index(['Age', 'Gender', 'Blood Type', 'Medical Condition', 'Date of Admission',
       'Doctor', 'Hospital', 'Insurance Provider', 'Billing Amount',
       'Room Number', 'Admission Type', 'Discharge Date', 'Medication',
       'Test Results', 'First_name', 'Last_name'],
      dtype='object')


In [159]:
# Desired column order: the two name columns first, followed by the
# remaining fields
name_columns = ['First_name', 'Last_name']
detail_columns = [
    'Age', 'Gender', 'Blood Type', 'Medical Condition',
    'Date of Admission', 'Doctor', 'Hospital', 'Insurance Provider',
    'Billing Amount', 'Room Number', 'Admission Type', 'Discharge Date',
    'Medication', 'Test Results',
]
new_order = name_columns + detail_columns

# Select the columns in that order
df = df[new_order]

print(df.head())

  First_name Last_name  Age  Gender Blood Type Medical Condition  \
0      Bobby   Jackson   30    Male         B-            Cancer   
1     Leslie     Terry   62    Male         A+           Obesity   
2      Danny     Smith   76  Female         A-           Obesity   
3     Andrew     Watts   28  Female         O+          Diabetes   
4   Adrienne      Bell   43  Female        AB+            Cancer   

  Date of Admission            Doctor                    Hospital  \
0        2024-01-31     Matthew Smith             Sons And Miller   
1        2019-08-20   Samantha Davies                     Kim Inc   
2        2022-09-22  Tiffany Mitchell                    Cook Plc   
3        2020-11-18       Kevin Wells  Hernandez Rogers And Vang,   
4        2022-09-19    Kathleen Hanna                 White-White   

  Insurance Provider  Billing Amount  Room Number Admission Type  \
0         Blue Cross    18856.281306          328         Urgent   
1           Medicare    33643.327287    

In [160]:
# Rewrite both date columns as ISO 8601 text (YYYY-MM-DD): parse each column
# to datetimes, then format it back to strings
for date_column in ('Date of Admission', 'Discharge Date'):
    parsed_dates = pd.to_datetime(df[date_column])
    df[date_column] = parsed_dates.dt.strftime('%Y-%m-%d')

# Preview the first rows to check the result
print(df.head())

  First_name Last_name  Age  Gender Blood Type Medical Condition  \
0      Bobby   Jackson   30    Male         B-            Cancer   
1     Leslie     Terry   62    Male         A+           Obesity   
2      Danny     Smith   76  Female         A-           Obesity   
3     Andrew     Watts   28  Female         O+          Diabetes   
4   Adrienne      Bell   43  Female        AB+            Cancer   

  Date of Admission            Doctor                    Hospital  \
0        2024-01-31     Matthew Smith             Sons And Miller   
1        2019-08-20   Samantha Davies                     Kim Inc   
2        2022-09-22  Tiffany Mitchell                    Cook Plc   
3        2020-11-18       Kevin Wells  Hernandez Rogers And Vang,   
4        2022-09-19    Kathleen Hanna                 White-White   

  Insurance Provider  Billing Amount  Room Number Admission Type  \
0         Blue Cross    18856.281306          328         Urgent   
1           Medicare    33643.327287    

In [161]:
# Write the updated DataFrame to a new CSV file, leaving out the index column
output_path = 'cleaned_healthcare_dataset.csv'
df.to_csv(output_path, index=False)