# Cleaning the Data

In [89]:
import pandas as pd
from pathlib import Path
import json
import pprint as pp
import os
import us
import pymongo

(to find the full names of the states import "us". To do this, type "pip install us" on a random cell or terminal, once installed, delete new cell (if made oneand then restart kernel to use)

In [90]:
# Store filepath in a variable
file = Path("Resources/us_hospital_locations.csv")
hospitals_df = pd.read_csv(file)

# Change state abbreviations to full names
# Map state abbreviations to full state names using us package
def get_full_state_name(abbrev):
    try:
        return us.states.lookup(abbrev).name
    except AttributeError:
        return abbrev

hospitals_df["STATE"] = hospitals_df["STATE"].apply(get_full_state_name)

# Proper case the columns for uniformity
hospitals_df.columns = [col.title() if col.islower() or col.isupper() else col for col in hospitals_df]

# Proper case the rows for uniformity
hospitals_df = hospitals_df.map(lambda x: x.title() if isinstance(x, str) else x)

hospitals_df

Unnamed: 0,X,Y,Fid,Id,Name,Address,City,State,Zip,Zip4,...,Val_Date,Website,State_Id,Alt_Name,St_Fips,Owner,Ttl_Staff,Beds,Trauma,Helipad
0,-1.331889e+07,4.346975e+06,1,5793230,Central Valley General Hospital,1025 North Douty Street,Hanford,California,93230,Not Available,...,2014/02/10 00:00:00,Http://Www.Hanfordhealth.Com,Not Available,Not Available,6,Proprietary,-999,49,Not Available,N
1,-1.322651e+07,4.049626e+06,2,53391362,Los Robles Hospital & Medical Center - East Ca...,150 Via Merida,Westlake Vilage,California,91362,Not Available,...,2014/02/10 00:00:00,Http://Www.Losrobleshospital.Com,Not Available,Not Available,6,Proprietary,-999,62,Not Available,N
2,-1.315620e+07,4.031978e+06,3,11190023,East Los Angeles Doctors Hospital,4060 Whittier Boulevard,Los Angeles,California,90023,Not Available,...,2014/02/10 00:00:00,Http://Www.Elalax.Com,Not Available,Not Available,6,Proprietary,-999,127,Not Available,N
3,-1.317190e+07,4.041752e+06,4,17090028,Southern California Hospital At Hollywood,6245 De Longpre Avenue,Hollywood,California,90028,Not Available,...,2014/02/10 00:00:00,Http://Sch-Hollywood.Com/,Not Available,Hollywood Community Hospital Of Hollywood,6,Proprietary,-999,100,Not Available,N
4,-1.313208e+07,4.037270e+06,5,23691706,Kindred Hospital Baldwin Park,14148 Francisquito Avenue,Baldwin Park,California,91706,Not Available,...,2014/02/10 00:00:00,Http://Www.Khbaldwinpark.Com,Not Available,Not Available,6,Proprietary,-999,95,Not Available,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7591,-8.566905e+06,4.728815e+06,7607,183520904,Adventist Healthcare White Oak Medical Center,11890 Healing Way,Silver Spring,Maryland,20904,Not Available,...,2020/05/05 00:00:00,Https://Www.Adventisthealthcare.Com/Locations/...,Not Available,Not Available,24,Non-Profit,-999,178,Not Available,N
7592,-1.219624e+07,3.676931e+06,7608,192185607,Copper Queen Douglas Emergency Department,100 E. 5Th Street,Douglas,Arizona,85607,Not Available,...,2020/05/05 00:00:00,Https://Cqch.Org/Cqch-Clinics/Copper-Queen-Dou...,Not Available,Not Available,4,Non-Profit,-999,-999,Level Iv,N
7593,-8.084677e+06,5.094112e+06,7609,196706457,Whiting Forensic Hospital,70 Obrien Dr,Middletown,Connecticut,6457,3945,...,2020/05/08 00:00:00,Https://Portal.Ct.Gov/Dmhas/Wfh/Whiting-Forens...,Not Available,Not Available,9,Government - State,-999,229,Not Available,N
7594,-8.522423e+06,4.718512e+06,7610,191021401,J Kent Mcnew Family Medical Center,175 Harry S Truman Parkway,Annapolis,Maryland,21401,Not Available,...,2020/05/05 00:00:00,Https://Aahs.Org/Locations/J--Kent-Mcnew-Famil...,Not Available,Not Available,24,Non-Profit,-999,-999,Not Available,N


In [91]:
# Identifying the columns to select which to keep
hospitals_df.columns

Index(['X', 'Y', 'Fid', 'Id', 'Name', 'Address', 'City', 'State', 'Zip',
       'Zip4', 'Telephone', 'Type', 'Status', 'Population', 'County',
       'Countyfips', 'Country', 'Latitude', 'Longitude', 'Naics_Code',
       'Naics_Desc', 'Source', 'Sourcedate', 'Val_Method', 'Val_Date',
       'Website', 'State_Id', 'Alt_Name', 'St_Fips', 'Owner', 'Ttl_Staff',
       'Beds', 'Trauma', 'Helipad'],
      dtype='object')

In [92]:
# Selecting columns
hospitals_clean_df = pd.DataFrame(hospitals_df[['Id','Name','Address', 'City','County','State','Status','Latitude', 'Longitude','Beds', 'Trauma', 'Helipad']])
hospitals_clean_df

Unnamed: 0,Id,Name,Address,City,County,State,Status,Latitude,Longitude,Beds,Trauma,Helipad
0,5793230,Central Valley General Hospital,1025 North Douty Street,Hanford,Kings,California,Closed,36.336159,-119.645667,49,Not Available,N
1,53391362,Los Robles Hospital & Medical Center - East Ca...,150 Via Merida,Westlake Vilage,Ventura,California,Open,34.154939,-118.815736,62,Not Available,N
2,11190023,East Los Angeles Doctors Hospital,4060 Whittier Boulevard,Los Angeles,Los Angeles,California,Open,34.023647,-118.184165,127,Not Available,N
3,17090028,Southern California Hospital At Hollywood,6245 De Longpre Avenue,Hollywood,Los Angeles,California,Open,34.096391,-118.325235,100,Not Available,N
4,23691706,Kindred Hospital Baldwin Park,14148 Francisquito Avenue,Baldwin Park,Los Angeles,California,Open,34.063039,-117.967438,95,Not Available,N
...,...,...,...,...,...,...,...,...,...,...,...,...
7591,183520904,Adventist Healthcare White Oak Medical Center,11890 Healing Way,Silver Spring,Montgomery,Maryland,Open,39.049854,-76.957821,178,Not Available,N
7592,192185607,Copper Queen Douglas Emergency Department,100 E. 5Th Street,Douglas,Cochise,Arizona,Open,31.339594,-109.560682,-999,Level Iv,N
7593,196706457,Whiting Forensic Hospital,70 Obrien Dr,Middletown,Middlesex,Connecticut,Open,41.552040,-72.625890,229,Not Available,N
7594,191021401,J Kent Mcnew Family Medical Center,175 Harry S Truman Parkway,Annapolis,Anne Arundel,Maryland,Open,38.977938,-76.558229,-999,Not Available,N


In [93]:
# Removing Hospitals with a STATUS of "CLOSED".
open_hospitals_df = hospitals_clean_df[hospitals_clean_df['Status'] != 'Closed']
open_hospitals_df

Unnamed: 0,Id,Name,Address,City,County,State,Status,Latitude,Longitude,Beds,Trauma,Helipad
1,53391362,Los Robles Hospital & Medical Center - East Ca...,150 Via Merida,Westlake Vilage,Ventura,California,Open,34.154939,-118.815736,62,Not Available,N
2,11190023,East Los Angeles Doctors Hospital,4060 Whittier Boulevard,Los Angeles,Los Angeles,California,Open,34.023647,-118.184165,127,Not Available,N
3,17090028,Southern California Hospital At Hollywood,6245 De Longpre Avenue,Hollywood,Los Angeles,California,Open,34.096391,-118.325235,100,Not Available,N
4,23691706,Kindred Hospital Baldwin Park,14148 Francisquito Avenue,Baldwin Park,Los Angeles,California,Open,34.063039,-117.967438,95,Not Available,N
5,25190712,Lakewood Regional Medical Center,3700 East South Street,Lakewood,Los Angeles,California,Open,33.859707,-118.148403,172,Not Available,N
...,...,...,...,...,...,...,...,...,...,...,...,...
7591,183520904,Adventist Healthcare White Oak Medical Center,11890 Healing Way,Silver Spring,Montgomery,Maryland,Open,39.049854,-76.957821,178,Not Available,N
7592,192185607,Copper Queen Douglas Emergency Department,100 E. 5Th Street,Douglas,Cochise,Arizona,Open,31.339594,-109.560682,-999,Level Iv,N
7593,196706457,Whiting Forensic Hospital,70 Obrien Dr,Middletown,Middlesex,Connecticut,Open,41.552040,-72.625890,229,Not Available,N
7594,191021401,J Kent Mcnew Family Medical Center,175 Harry S Truman Parkway,Annapolis,Anne Arundel,Maryland,Open,38.977938,-76.558229,-999,Not Available,N


In [94]:
# Filter the DataFrame to exclude rows where 'BEDS' is less than 0 and remove Guam and Pw
filtered_hospitals_df = open_hospitals_df[
    (open_hospitals_df['Beds'] >= 0) & 
    (~open_hospitals_df['State'].isin(['Guam', 'Pw','Puerto Rico','Virgin Islands']))
]

In [95]:
# For reference, looking at the count of hidden rows
hidden_beds_count = open_hospitals_df[open_hospitals_df['Beds'] < 0].shape[0]
hidden_beds_count

243

In [96]:
# To check if removed US Territories aren't there
filtered_hospitals_df['State'].unique()

array(['California', 'Louisiana', 'Texas', 'Illinois', 'Georgia',
       'Wisconsin', 'Rhode Island', 'Arkansas', 'Dc', 'Florida',
       'Massachusetts', 'Connecticut', 'Kentucky', 'Iowa', 'Kansas',
       'Arizona', 'Maine', 'Michigan', 'Minnesota', 'Missouri',
       'North Dakota', 'Indiana', 'Nevada', 'Pennsylvania', 'Montana',
       'South Carolina', 'Virginia', 'Ohio', 'Washington', 'Oregon',
       'New York', 'New Hampshire', 'Maryland', 'Oklahoma',
       'West Virginia', 'South Dakota', 'Tennessee', 'Alaska', 'Wyoming',
       'New Mexico', 'Idaho', 'Hawaii', 'Alabama', 'North Carolina',
       'Mississippi', 'Nebraska', 'New Jersey', 'Utah', 'Vermont',
       'Colorado', 'Delaware'], dtype=object)

In [97]:
# Displaying newly filtered and clean df
filtered_hospitals_df

Unnamed: 0,Id,Name,Address,City,County,State,Status,Latitude,Longitude,Beds,Trauma,Helipad
1,53391362,Los Robles Hospital & Medical Center - East Ca...,150 Via Merida,Westlake Vilage,Ventura,California,Open,34.154939,-118.815736,62,Not Available,N
2,11190023,East Los Angeles Doctors Hospital,4060 Whittier Boulevard,Los Angeles,Los Angeles,California,Open,34.023647,-118.184165,127,Not Available,N
3,17090028,Southern California Hospital At Hollywood,6245 De Longpre Avenue,Hollywood,Los Angeles,California,Open,34.096391,-118.325235,100,Not Available,N
4,23691706,Kindred Hospital Baldwin Park,14148 Francisquito Avenue,Baldwin Park,Los Angeles,California,Open,34.063039,-117.967438,95,Not Available,N
5,25190712,Lakewood Regional Medical Center,3700 East South Street,Lakewood,Los Angeles,California,Open,33.859707,-118.148403,172,Not Available,N
...,...,...,...,...,...,...,...,...,...,...,...,...
7588,193711040,Cohen Children’S Medical Center,269-01 76Th Avenue,New Hyde Park,Queens,New York,Open,40.753005,-73.708457,202,Level I Pediatric,N
7589,193339744,North Ms Medical Center - Eupora,70 Medical Plaza,Eupora,Webster,Mississippi,Open,33.533926,-89.266357,38,Level Iv,Y
7590,185838852,North Ms Medical Center - Iuka,177 Curtis Drive,Iuka,Tishomingo,Mississippi,Open,34.801856,-88.208355,48,Level Iv,Y
7591,183520904,Adventist Healthcare White Oak Medical Center,11890 Healing Way,Silver Spring,Montgomery,Maryland,Open,39.049854,-76.957821,178,Not Available,N


In [98]:
# Convert dataframe into a csv
file_path = Path('./Resources/cleaned_hospitals.csv')
filtered_hospitals_df.to_csv(file_path, index=False)
print(f"csv data has been written to {file_path}")

csv data has been written to Resources/cleaned_hospitals.csv


# Jsonify
Creating a dictionary for the data and Jsonifying it for mongo use

In [99]:
# Initialize an empty list to store dictionaries
hospital_list = []

# Iterate over the DataFrame rows
for index, row in filtered_hospitals_df.iterrows():
    # Create a dictionary with the hospital name and other information
    hospital_dict = {
        'Hospital Name': row['Name'],
        'Info': row.drop(labels=['Name']).to_dict()
    }
    # Append the dictionary to the list
    hospital_list.append(hospital_dict)

# Convert the list of dictionaries to a JSON-formatted string
hospital_list_json = json.dumps(hospital_list, indent=4)

# Print the JSON-formatted string
print(hospital_list_json)

[
    {
        "Hospital Name": "Los Robles Hospital & Medical Center - East Campus",
        "Info": {
            "Id": 53391362,
            "Address": "150 Via Merida",
            "City": "Westlake Vilage",
            "County": "Ventura",
            "State": "California",
            "Status": "Open",
            "Latitude": 34.1549388720001,
            "Longitude": -118.815736391,
            "Beds": 62,
            "Trauma": "Not Available",
            "Helipad": "N"
        }
    },
    {
        "Hospital Name": "East Los Angeles Doctors Hospital",
        "Info": {
            "Id": 11190023,
            "Address": "4060 Whittier Boulevard",
            "City": "Los Angeles",
            "County": "Los Angeles",
            "State": "California",
            "Status": "Open",
            "Latitude": 34.023647302,
            "Longitude": -118.184164805,
            "Beds": 127,
            "Trauma": "Not Available",
            "Helipad": "N"
        }
    },
    {
     

In [100]:
# Path to the resources folder
file_path = os.path.join('Resources', 'cleaned_hospitals.json')

# Write the list of dictionaries to a JSON file in the resources folder
with open(file_path, 'w') as json_file:
    json.dump(hospital_list, json_file, indent=4)

print(f"JSON data has been written to {file_path}")

JSON data has been written to Resources/cleaned_hospitals.json


# Creating Hospital Database with Mongo

In [101]:
# Connect to MongoDB
mongo = pymongo.MongoClient("mongodb://localhost:27017/")
# Check if 'hospitals_db' exists in the list of databases
if 'hospitals_db' in mongo.list_database_names():
    # Drop the database if it exists
    mongo.drop_database('hospitals_db')

db = mongo["hospitals_db"]
collection = db["hospitals"]

# Insert the list of JSON documents
collection.insert_many(hospital_list)

# Verify that database is there
mongo.list_database_names()

['admin',
 'autosaurus',
 'classDB',
 'config',
 'epa',
 'fruits_db',
 'hospitals_db',
 'local',
 'met',
 'mongo_class',
 'travel_db',
 'uk_food']