### Load in and clean up data

In [7]:
import pandas as pd
from sodapy import Socrata
import statsmodels.formula.api as smf
import pgeocode

In [8]:
client = Socrata("health.data.ny.gov", None)

# First 50,000 results, returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get("tg3i-cinn", limit=50000)

# Convert to pandas DataFrame
nyd2019_50k = pd.DataFrame.from_records(results)
nyd2019_50k.shape
nyd2019_50k.head(3)



Unnamed: 0,hospital_service_area,hospital_county,operating_certificate_number,permanent_facility_id,facility_name,age_group,zip_code_3_digits,gender,race,ethnicity,...,apr_severity_of_illness,apr_risk_of_mortality,apr_medical_surgical,payment_typology_1,payment_typology_2,emergency_department_indicator,total_charges,total_costs,birth_weight,payment_typology_3
0,New York City,Bronx,7000006,1169,Montefiore Medical Center - Henry & Lucy Moses...,70 or Older,104,M,Other Race,Spanish/Hispanic,...,Major,Extreme,Medical,Medicare,Medicaid,Y,320922.43,60241.34,,
1,New York City,Bronx,7000006,1169,Montefiore Medical Center - Henry & Lucy Moses...,50 to 69,104,F,White,Not Span/Hispanic,...,Moderate,Minor,Medical,Private Health Insurance,,Y,61665.22,9180.69,,
2,New York City,Bronx,7000006,1168,Montefiore Medical Center-Wakefield Hospital,18 to 29,104,F,Other Race,Spanish/Hispanic,...,Minor,Minor,Surgical,Medicaid,,N,42705.34,11366.5,,


In [9]:
#go from county to latitiude longitude
nomi = pgeocode.Nominatim('us')

nomi.query_location("Bronx", top_k=3)
## might need to go from inputted lat/long in the webapp to county name

Unnamed: 0,country_code,postal_code,place_name,state_name,state_code,county_name,county_code,community_name,community_code,latitude,longitude,accuracy
25168,US,10451,Bronx,New York,NY,Bronx,5.0,,,40.8222,-73.9217,4.0
25169,US,10452,Bronx,New York,NY,Bronx,5.0,,,40.8376,-73.9216,4.0
25170,US,10453,Bronx,New York,NY,Bronx,5.0,,,40.852,-73.9129,4.0


### Create regression models

In [10]:
# predict total charges and length of stay from demographic variables

## length of stay

#remove rows where length of stay >120
nyd2019_los = nyd2019_50k[~(nyd2019_50k['length_of_stay'] == '120 +')]
#make length of stay variable numeric
nyd2019_los['length_of_stay'] = pd.to_numeric(nyd2019_los['length_of_stay'])

los_model = smf.ols(formula='length_of_stay ~ C(hospital_county) + C(age_group) + C(gender) + C(race) + C(ethnicity) + C(type_of_admission) + C(payment_typology_1)', data=nyd2019_los).fit()
print(los_model.summary())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nyd2019_los['length_of_stay'] = pd.to_numeric(nyd2019_los['length_of_stay'])


                            OLS Regression Results                            
Dep. Variable:         length_of_stay   R-squared:                       0.044
Model:                            OLS   Adj. R-squared:                  0.043
Method:                 Least Squares   F-statistic:                     54.58
Date:                Fri, 12 May 2023   Prob (F-statistic):               0.00
Time:                        19:14:54   Log-Likelihood:            -1.7448e+05
No. Observations:               49668   AIC:                         3.490e+05
Df Residuals:                   49625   BIC:                         3.494e+05
Df Model:                          42                                         
Covariance Type:            nonrobust                                         
                                                         coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------

In [45]:
##total charges

#converting total charges column to numeric values
nyd2019_50k['total_charges'] = pd.to_numeric(nyd2019_50k['total_charges'])
#dropping rows from the dataframe where total charges is greater than $400000
nyd2019_charges = nyd2019_50k[nyd2019_50k['total_charges'] <= 400000]

charges_model = smf.ols(formula='total_charges ~ C(hospital_county) + C(age_group) + C(gender) + C(race) + C(ethnicity) + C(type_of_admission) + C(payment_typology_1)', data=nyd2019_charges).fit()
print(charges_model.summary())

                            OLS Regression Results                            
Dep. Variable:          total_charges   R-squared:                       0.190
Model:                            OLS   Adj. R-squared:                  0.189
Method:                 Least Squares   F-statistic:                     252.2
Date:                Fri, 12 May 2023   Prob (F-statistic):               0.00
Time:                        19:45:43   Log-Likelihood:            -5.3574e+05
No. Observations:               45231   AIC:                         1.072e+06
Df Residuals:                   45188   BIC:                         1.072e+06
Df Model:                          42                                         
Covariance Type:            nonrobust                                         
                                                         coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------

### Create streamlit app