# Import fire and pm2.5 data and generate fake medical data

In [1]:
#import relevant tools
import numpy as np
import pandas as pd
from faker import Faker
import os
from datetime import datetime
#single Faker instance used by every cell that generates names and visit dates
fake=Faker()
#seed both random sources so Restart & Run All regenerates the same fake patients
SEED=42
Faker.seed(SEED)
np.random.seed(SEED)
#if Faker is missing, install it into the kernel's environment with: %pip install Faker

# Import fire and PM2.5 data

In [2]:
#load the fire / PM2.5 table; instrument2.csv is read from the notebook's working directory
fd = pd.read_csv(filepath_or_buffer='instrument2.csv')

In [3]:
#list every entry (files and folders) in the current working directory ('.')
os.listdir('.')


['.ipynb_checkpoints',
 'Fake Data.ipynb',
 'fake_med_data.ipynb',
 'fake_med_data_03072023.ipynb',
 'fake_med_data_03072023_zipcode.ipynb',
 'fires_clean_with_zipcodes.csv',
 'instrument2.csv']

In [4]:
#Take a look at the first rows of the raw fire / PM2.5 table
fd.head()


Unnamed: 0.1,Unnamed: 0,lat,lon,ZCTA,u,v,wdir,wspd,year_month,GIS_ACRES,DURATION,FIRE_AREA_KM2,pm25,treatment
0,0,37.465,-117.936,89010,0.504258,-0.719008,305.042938,0.878208,199101,,,,12.450976,False
1,1,35.396,-116.322,89019,-0.172753,-0.94694,259.661102,0.962568,199101,,,,10.846541,False
2,2,36.161,-116.139,89060,-0.435964,-0.812957,241.796738,0.922477,199101,,,,12.385,False
3,3,35.957,-115.897,89061,-0.560538,-1.176384,244.522552,1.303105,199101,,,,10.15,False
4,4,39.52,-120.032,89439,0.042253,0.205661,78.390099,0.209957,199101,,,,8.389565,False


In [5]:
#keep only the zipcode, month, PM2.5, treatment flag and fire size/duration columns
fd1 = fd[[
    "ZCTA",
    "year_month",
    "pm25",
    "treatment",
    "DURATION",
    "GIS_ACRES",
]]

In [6]:
#Peek at the first rows of the column subset selected above
fd1.head()

Unnamed: 0,ZCTA,year_month,pm25,treatment,DURATION,GIS_ACRES
0,89010,199101,12.450976,False,,
1,89019,199101,10.846541,False,,
2,89060,199101,12.385,False,,
3,89061,199101,10.15,False,,
4,89439,199101,8.389565,False,,


In [7]:
#check for null zipcodes: print the number of non-null ZCTA values, then the table shape;
#if the first number equals the row count, no zipcode is missing
print(fd1["ZCTA"].notna().sum(), fd1.shape, sep="\n")

628507
(628507, 6)


In [10]:
#ZCTA column of the fire / PM2.5 table; make_patients samples patient zipcodes from it
zipcodes = fd1.loc[:, "ZCTA"]

# Creation of fake medical data
Call the function with `num` set to the number of patients.
For California, `num` should be about 30,000,000 (about 1.1M patients per year).

In [13]:

#create information on individual patients
#define a function to create fake medical conditions for individuals, each assigned to a zipcode
def make_patients(num):
    """Generate ``num`` synthetic patient-visit records.

    Each record is a dict with the keys ``'Patient Name'``, ``'Visit_date'``,
    ``'Gender'``, ``'Race'``, ``'Zipcode'`` and ``'Patient_disease'`` (in that
    order).  Names and visit dates come from the notebook-level ``fake`` Faker
    instance, and zipcodes are sampled from the notebook-level ``zipcodes``
    collection, so both must be defined before this function is called.

    Parameters
    ----------
    num : int
        Number of records to create.  Zero or a negative value yields an
        empty list.

    Returns
    -------
    list of dict
        One dict per synthetic visit, ready to be passed to ``pd.DataFrame``.
    """
    #nothing to generate; np.random.choice would reject a negative size
    if num <= 0:
        return []

    #categories to randomly assign to patients
    disease=["PTSD","Depression","Anxiety"]
    race=["White", "Black/African American", "American Indian/Alaska Native","Asian","Native Hawaiian/Other Pacific Islander"]
    gender=["M","F"]

    #visit dates are drawn between these two bounds
    first_visit=datetime(1991,1,1)
    last_visit=datetime(2020,12,1)

    #draw each categorical column with one vectorised call instead of one call per
    #patient; this avoids re-validating the probabilities and re-reading zipcodes
    #on every iteration, which matters when num is in the millions
    genders=np.random.choice(gender,size=num,p=[0.5,0.5])
    races=np.random.choice(race,size=num,p=[0.2,0.2,0.2,0.2,0.2])
    #zipcodes are sampled uniformly, with replacement, from the rows of zipcodes
    patient_zipcodes=np.random.choice(zipcodes,size=num)
    diseases=np.random.choice(disease,size=num,p=[0.3,0.3,0.4])

    #names and dates still come from Faker one record at a time
    fake_patients=[{'Patient Name':fake.name(),
                  'Visit_date':fake.date_between(start_date=first_visit,end_date=last_visit),
                  'Gender':genders[i],
                  'Race':races[i],
                  'Zipcode':patient_zipcodes[i],
                  'Patient_disease':diseases[i]} for i in range(num)]

    return fake_patients



In [14]:
#generate 1000 synthetic visit records, load them into a DataFrame and display it
patient_df=pd.DataFrame(data=make_patients(num=1000))
patient_df

Unnamed: 0,Patient Name,Visit_date,Gender,Race,Zipcode,Patient_disease
0,Justin Gay,1995-03-19,F,American Indian/Alaska Native,92282,PTSD
1,Amy Herrera,2019-10-26,F,Native Hawaiian/Other Pacific Islander,95246,Anxiety
2,Amy Wilson,2016-07-28,M,Asian,95008,Depression
3,Logan Lopez,2000-05-06,F,White,90003,PTSD
4,Timothy Little,1991-03-25,M,American Indian/Alaska Native,95501,Depression
...,...,...,...,...,...,...
995,Stephanie Atkins,2009-07-04,M,American Indian/Alaska Native,93638,PTSD
996,Ronald Watts,2004-08-05,F,Asian,90089,Anxiety
997,Miss Paula Oconnor,1998-12-26,M,Asian,93614,PTSD
998,Hannah Grant,1997-02-10,F,American Indian/Alaska Native,95636,PTSD


In [None]:
#text copy of the visit date, so the year and month can be cut out with string methods
patient_df["Visit_date1"] = patient_df["Visit_date"].astype(dtype="string")

In [None]:
#preview the patient table with the new Visit_date1 text column
patient_df.head()

In [None]:
#keep the first seven characters of the date text (the year, a dash and the month)
patient_df["Visit_date_ym"] = patient_df["Visit_date1"].str.slice(0, 7)

In [None]:
#preview the patient table with the new Visit_date_ym column
patient_df.head()

In [None]:
#This is the dataset with the individual patient level information
#strip the dash from Visit_date_ym to build the visit_year_month column
patient_df["visit_year_month"] = patient_df["Visit_date_ym"].str.replace('-', '', regex=False)

In [None]:
#preview the patient table with the new visit_year_month column
patient_df.head()

In [None]:
#select only relevant fields to join with fire and pm2.5 data
#(drops Visit_date and the intermediate date-text columns)
patient_df1 = patient_df[[
    "Patient Name",
    "Gender",
    "Race",
    "Zipcode",
    "Patient_disease",
    "visit_year_month",
]]

Aggregate individual patient-level data to the zipcode level by month and disease

In [None]:
#summarize the data at the zipcode level
#columns are: Zipcode, visit_year_month, Patient_disease, total_cases (number of patient rows in that group)
patient_data_zipcode = (
    patient_df
    .groupby(['Zipcode', 'visit_year_month', 'Patient_disease'])
    .size()
    .reset_index(name='total_cases')
)

In [None]:
#first rows of the zipcode-level case counts
patient_data_zipcode.head()

In [None]:
#last rows of the zipcode-level case counts
patient_data_zipcode.tail()

In [None]:
#(rows, columns) of the aggregated table; one row per Zipcode / visit_year_month / Patient_disease group
patient_data_zipcode.shape

# Join fire & PM2.5 data to patient medical data

In [None]:
#recap the fire / PM2.5 columns before joining
fd1.head()

In [None]:
#cast year_month from its original int dtype to string so it can be matched
#against the string visit_year_month column in the join
fd1 = fd1.astype(dtype={'year_month': 'string'})

In [None]:
#join patient and fire/pm25 datasets at the zipcode level
#left join: every fire/pm25 row (left) is kept; zipcode-level patient counts (right) are attached where they match
#join fields: ZCTA <-> Zipcode and year_month <-> visit_year_month
df = pd.merge(
    left=fd1,
    right=patient_data_zipcode,
    how='left',
    left_on=['ZCTA', 'year_month'],
    right_on=['Zipcode', 'visit_year_month'],
)

In [None]:
#column dtypes of the fire / PM2.5 table (left side of the join)
fd1.dtypes

In [None]:
#column dtypes of the zipcode-level patient table (right side of the join)
patient_data_zipcode.dtypes

In [None]:
#first rows of the joined fire / PM2.5 + patient-count table
df.head()