# Cleaning the Data

In [11]:
import pandas as pd
from pathlib import Path
import json
import pprint as pp
import os
import pymongo

In [12]:
# Location of the raw hospital locations CSV, relative to the notebook
file = Path("Resources/us_hospital_locations.csv")
# Read the CSV into a DataFrame
hospitals_df = pd.read_csv(filepath_or_buffer=file)
# Preview the first five rows
hospitals_df.head(n=5)

Unnamed: 0,X,Y,FID,ID,NAME,ADDRESS,CITY,STATE,ZIP,ZIP4,...,VAL_DATE,WEBSITE,STATE_ID,ALT_NAME,ST_FIPS,OWNER,TTL_STAFF,BEDS,TRAUMA,HELIPAD
0,-13318890.0,4346975.0,1,5793230,CENTRAL VALLEY GENERAL HOSPITAL,1025 NORTH DOUTY STREET,HANFORD,CA,93230,NOT AVAILABLE,...,2014/02/10 00:00:00,http://www.hanfordhealth.com,NOT AVAILABLE,NOT AVAILABLE,6,PROPRIETARY,-999,49,NOT AVAILABLE,N
1,-13226510.0,4049626.0,2,53391362,LOS ROBLES HOSPITAL & MEDICAL CENTER - EAST CA...,150 VIA MERIDA,WESTLAKE VILAGE,CA,91362,NOT AVAILABLE,...,2014/02/10 00:00:00,http://www.losrobleshospital.com,NOT AVAILABLE,NOT AVAILABLE,6,PROPRIETARY,-999,62,NOT AVAILABLE,N
2,-13156200.0,4031978.0,3,11190023,EAST LOS ANGELES DOCTORS HOSPITAL,4060 WHITTIER BOULEVARD,LOS ANGELES,CA,90023,NOT AVAILABLE,...,2014/02/10 00:00:00,http://www.elalax.com,NOT AVAILABLE,NOT AVAILABLE,6,PROPRIETARY,-999,127,NOT AVAILABLE,N
3,-13171900.0,4041752.0,4,17090028,SOUTHERN CALIFORNIA HOSPITAL AT HOLLYWOOD,6245 DE LONGPRE AVENUE,HOLLYWOOD,CA,90028,NOT AVAILABLE,...,2014/02/10 00:00:00,http://sch-hollywood.com/,NOT AVAILABLE,HOLLYWOOD COMMUNITY HOSPITAL OF HOLLYWOOD,6,PROPRIETARY,-999,100,NOT AVAILABLE,N
4,-13132080.0,4037270.0,5,23691706,KINDRED HOSPITAL BALDWIN PARK,14148 FRANCISQUITO AVENUE,BALDWIN PARK,CA,91706,NOT AVAILABLE,...,2014/02/10 00:00:00,http://www.khbaldwinpark.com,NOT AVAILABLE,NOT AVAILABLE,6,PROPRIETARY,-999,95,NOT AVAILABLE,N


In [13]:
# List every column name in the raw DataFrame so we can decide which ones to keep
hospitals_df.columns

Index(['X', 'Y', 'FID', 'ID', 'NAME', 'ADDRESS', 'CITY', 'STATE', 'ZIP',
       'ZIP4', 'TELEPHONE', 'TYPE', 'STATUS', 'POPULATION', 'COUNTY',
       'COUNTYFIPS', 'COUNTRY', 'LATITUDE', 'LONGITUDE', 'NAICS_CODE',
       'NAICS_DESC', 'SOURCE', 'SOURCEDATE', 'VAL_METHOD', 'VAL_DATE',
       'WEBSITE', 'STATE_ID', 'ALT_NAME', 'ST_FIPS', 'OWNER', 'TTL_STAFF',
       'BEDS', 'TRAUMA', 'HELIPAD'],
      dtype='object')

In [14]:
# Keep only the identifying, location, status and capacity columns.
# Selecting with a list of labels already yields a DataFrame, so wrapping it in
# pd.DataFrame(...) is unnecessary; .copy() makes it explicit that the cleaned
# frame is independent of the raw one.
hospitals_clean_df = hospitals_df[
    [
        'ID', 'NAME', 'ADDRESS', 'CITY', 'COUNTY', 'STATE',
        'STATUS', 'LATITUDE', 'LONGITUDE', 'BEDS', 'TRAUMA', 'HELIPAD',
    ]
].copy()
hospitals_clean_df

Unnamed: 0,ID,NAME,ADDRESS,CITY,COUNTY,STATE,STATUS,LATITUDE,LONGITUDE,BEDS,TRAUMA,HELIPAD
0,5793230,CENTRAL VALLEY GENERAL HOSPITAL,1025 NORTH DOUTY STREET,HANFORD,KINGS,CA,CLOSED,36.336159,-119.645667,49,NOT AVAILABLE,N
1,53391362,LOS ROBLES HOSPITAL & MEDICAL CENTER - EAST CA...,150 VIA MERIDA,WESTLAKE VILAGE,VENTURA,CA,OPEN,34.154939,-118.815736,62,NOT AVAILABLE,N
2,11190023,EAST LOS ANGELES DOCTORS HOSPITAL,4060 WHITTIER BOULEVARD,LOS ANGELES,LOS ANGELES,CA,OPEN,34.023647,-118.184165,127,NOT AVAILABLE,N
3,17090028,SOUTHERN CALIFORNIA HOSPITAL AT HOLLYWOOD,6245 DE LONGPRE AVENUE,HOLLYWOOD,LOS ANGELES,CA,OPEN,34.096391,-118.325235,100,NOT AVAILABLE,N
4,23691706,KINDRED HOSPITAL BALDWIN PARK,14148 FRANCISQUITO AVENUE,BALDWIN PARK,LOS ANGELES,CA,OPEN,34.063039,-117.967438,95,NOT AVAILABLE,N
...,...,...,...,...,...,...,...,...,...,...,...,...
7591,183520904,ADVENTIST HEALTHCARE WHITE OAK MEDICAL CENTER,11890 HEALING WAY,SILVER SPRING,MONTGOMERY,MD,OPEN,39.049854,-76.957821,178,NOT AVAILABLE,N
7592,192185607,COPPER QUEEN DOUGLAS EMERGENCY DEPARTMENT,100 E. 5TH STREET,DOUGLAS,COCHISE,AZ,OPEN,31.339594,-109.560682,-999,LEVEL IV,N
7593,196706457,WHITING FORENSIC HOSPITAL,70 OBRIEN DR,MIDDLETOWN,MIDDLESEX,CT,OPEN,41.552040,-72.625890,229,NOT AVAILABLE,N
7594,191021401,J KENT MCNEW FAMILY MEDICAL CENTER,175 HARRY S TRUMAN PARKWAY,ANNAPOLIS,ANNE ARUNDEL,MD,OPEN,38.977938,-76.558229,-999,NOT AVAILABLE,N


In [15]:
# Keep every hospital whose STATUS is anything other than "CLOSED"
open_hospitals_df = hospitals_clean_df.loc[
    hospitals_clean_df['STATUS'].ne('CLOSED')
]
open_hospitals_df

Unnamed: 0,ID,NAME,ADDRESS,CITY,COUNTY,STATE,STATUS,LATITUDE,LONGITUDE,BEDS,TRAUMA,HELIPAD
1,53391362,LOS ROBLES HOSPITAL & MEDICAL CENTER - EAST CA...,150 VIA MERIDA,WESTLAKE VILAGE,VENTURA,CA,OPEN,34.154939,-118.815736,62,NOT AVAILABLE,N
2,11190023,EAST LOS ANGELES DOCTORS HOSPITAL,4060 WHITTIER BOULEVARD,LOS ANGELES,LOS ANGELES,CA,OPEN,34.023647,-118.184165,127,NOT AVAILABLE,N
3,17090028,SOUTHERN CALIFORNIA HOSPITAL AT HOLLYWOOD,6245 DE LONGPRE AVENUE,HOLLYWOOD,LOS ANGELES,CA,OPEN,34.096391,-118.325235,100,NOT AVAILABLE,N
4,23691706,KINDRED HOSPITAL BALDWIN PARK,14148 FRANCISQUITO AVENUE,BALDWIN PARK,LOS ANGELES,CA,OPEN,34.063039,-117.967438,95,NOT AVAILABLE,N
5,25190712,LAKEWOOD REGIONAL MEDICAL CENTER,3700 EAST SOUTH STREET,LAKEWOOD,LOS ANGELES,CA,OPEN,33.859707,-118.148403,172,NOT AVAILABLE,N
...,...,...,...,...,...,...,...,...,...,...,...,...
7591,183520904,ADVENTIST HEALTHCARE WHITE OAK MEDICAL CENTER,11890 HEALING WAY,SILVER SPRING,MONTGOMERY,MD,OPEN,39.049854,-76.957821,178,NOT AVAILABLE,N
7592,192185607,COPPER QUEEN DOUGLAS EMERGENCY DEPARTMENT,100 E. 5TH STREET,DOUGLAS,COCHISE,AZ,OPEN,31.339594,-109.560682,-999,LEVEL IV,N
7593,196706457,WHITING FORENSIC HOSPITAL,70 OBRIEN DR,MIDDLETOWN,MIDDLESEX,CT,OPEN,41.552040,-72.625890,229,NOT AVAILABLE,N
7594,191021401,J KENT MCNEW FAMILY MEDICAL CENTER,175 HARRY S TRUMAN PARKWAY,ANNAPOLIS,ANNE ARUNDEL,MD,OPEN,38.977938,-76.558229,-999,NOT AVAILABLE,N


In [16]:
# Keep only the hospitals whose 'BEDS' value is zero or greater
filtered_hospitals_df = open_hospitals_df.loc[open_hospitals_df['BEDS'].ge(0)]

# For reference, count the rows excluded because 'BEDS' is negative
hidden_beds_count = int(open_hospitals_df['BEDS'].lt(0).sum())
hidden_beds_count

243

In [17]:
# Display the cleaned DataFrame: hospitals not marked "CLOSED" whose 'BEDS' value is non-negative
filtered_hospitals_df

Unnamed: 0,ID,NAME,ADDRESS,CITY,COUNTY,STATE,STATUS,LATITUDE,LONGITUDE,BEDS,TRAUMA,HELIPAD
1,53391362,LOS ROBLES HOSPITAL & MEDICAL CENTER - EAST CA...,150 VIA MERIDA,WESTLAKE VILAGE,VENTURA,CA,OPEN,34.154939,-118.815736,62,NOT AVAILABLE,N
2,11190023,EAST LOS ANGELES DOCTORS HOSPITAL,4060 WHITTIER BOULEVARD,LOS ANGELES,LOS ANGELES,CA,OPEN,34.023647,-118.184165,127,NOT AVAILABLE,N
3,17090028,SOUTHERN CALIFORNIA HOSPITAL AT HOLLYWOOD,6245 DE LONGPRE AVENUE,HOLLYWOOD,LOS ANGELES,CA,OPEN,34.096391,-118.325235,100,NOT AVAILABLE,N
4,23691706,KINDRED HOSPITAL BALDWIN PARK,14148 FRANCISQUITO AVENUE,BALDWIN PARK,LOS ANGELES,CA,OPEN,34.063039,-117.967438,95,NOT AVAILABLE,N
5,25190712,LAKEWOOD REGIONAL MEDICAL CENTER,3700 EAST SOUTH STREET,LAKEWOOD,LOS ANGELES,CA,OPEN,33.859707,-118.148403,172,NOT AVAILABLE,N
...,...,...,...,...,...,...,...,...,...,...,...,...
7588,193711040,COHEN CHILDREN’S MEDICAL CENTER,269-01 76TH AVENUE,NEW HYDE PARK,QUEENS,NY,OPEN,40.753005,-73.708457,202,LEVEL I PEDIATRIC,N
7589,193339744,NORTH MS MEDICAL CENTER - EUPORA,70 MEDICAL PLAZA,EUPORA,WEBSTER,MS,OPEN,33.533926,-89.266357,38,LEVEL IV,Y
7590,185838852,NORTH MS MEDICAL CENTER - IUKA,177 CURTIS DRIVE,IUKA,TISHOMINGO,MS,OPEN,34.801856,-88.208355,48,LEVEL IV,Y
7591,183520904,ADVENTIST HEALTHCARE WHITE OAK MEDICAL CENTER,11890 HEALING WAY,SILVER SPRING,MONTGOMERY,MD,OPEN,39.049854,-76.957821,178,NOT AVAILABLE,N


# Jsonify
Creating a list of dictionaries from the cleaned data and serializing it to JSON for use with MongoDB.

In [18]:
# Convert every row to a plain dict in a single call instead of looping with
# iterrows(), which builds an intermediate Series for each row.
hospital_records = filtered_hospitals_df.to_dict(orient='records')

# Reshape each record into one document per hospital:
#   {'hospital name': <NAME>, 'info': {<every other column>: <value>}}
hospital_list = [
    {
        'hospital name': record['NAME'],
        'info': {field: value for field, value in record.items() if field != 'NAME'},
    }
    for record in hospital_records
]

# Convert the list of dictionaries to a JSON-formatted string
# NOTE(review): json.dumps writes missing values as the non-standard token NaN —
# confirm the selected columns contain no nulls.
hospital_list_json = json.dumps(hospital_list, indent=4)

# Print only the first few documents; printing every document floods the
# notebook output.
PREVIEW_COUNT = 3
print(json.dumps(hospital_list[:PREVIEW_COUNT], indent=4))

[
    {
        "hospital name": "LOS ROBLES HOSPITAL & MEDICAL CENTER - EAST CAMPUS",
        "info": {
            "ID": 53391362,
            "ADDRESS": "150 VIA MERIDA",
            "CITY": "WESTLAKE VILAGE",
            "COUNTY": "VENTURA",
            "STATE": "CA",
            "STATUS": "OPEN",
            "LATITUDE": 34.1549388720001,
            "LONGITUDE": -118.815736391,
            "BEDS": 62,
            "TRAUMA": "NOT AVAILABLE",
            "HELIPAD": "N"
        }
    },
    {
        "hospital name": "EAST LOS ANGELES DOCTORS HOSPITAL",
        "info": {
            "ID": 11190023,
            "ADDRESS": "4060 WHITTIER BOULEVARD",
            "CITY": "LOS ANGELES",
            "COUNTY": "LOS ANGELES",
            "STATE": "CA",
            "STATUS": "OPEN",
            "LATITUDE": 34.023647302,
            "LONGITUDE": -118.184164805,
            "BEDS": 127,
            "TRAUMA": "NOT AVAILABLE",
            "HELIPAD": "N"
        }
    },
    {
        "hospital nam

In [19]:
# Destination of the JSON export inside the resources folder
file_path = os.path.join('Resources', 'cleaned_hospitals.json')

# Serialise the list of dictionaries straight into that file
with open(file_path, mode='w') as json_file:
    json.dump(hospital_list, fp=json_file, indent=4)

print(f"JSON data has been written to {file_path}")

JSON data has been written to Resources/cleaned_hospitals.json


# Creating Hospital Database with Mongo

In [20]:
# Connect to the MongoDB server running on localhost
mongo = pymongo.MongoClient("mongodb://localhost:27017/")

# Drop any previous copy of 'hospitals_db' so that re-running this cell
# rebuilds the database instead of inserting duplicate documents
if 'hospitals_db' in mongo.list_database_names():
    mongo.drop_database('hospitals_db')

db = mongo["hospitals_db"]
collection = db["hospitals"]

# insert_many() adds an '_id' ObjectId to every dict it receives. Insert
# shallow copies so hospital_list itself is left unchanged and can still be
# passed to json.dump/json.dumps if the earlier cells are re-run.
insert_result = collection.insert_many([dict(document) for document in hospital_list])

# Every prepared document should have been stored
assert len(insert_result.inserted_ids) == len(hospital_list), "not all hospital documents were inserted"

# Verify that database is there
mongo.list_database_names()

['admin',
 'autosaurus',
 'classDB',
 'config',
 'epa',
 'fruits_db',
 'hospitals_db',
 'local',
 'met',
 'mongo_class',
 'travel_db',
 'uk_food']