In [3]:
library(data.table)
library(arrow)

# Read the pre-assembled OSHA extract; empty strings and "nan" are read as NA.
pre_assembled_file <- "Input/Processed/osha_data.csv"
database <- fread(
  pre_assembled_file,
  na.strings = c("", "nan")
)

# Preview the first rows of the loaded table.
head(database)

Some features are not enabled in this build of Arrow. Run `arrow_info()` for more information.


Attaching package: 'arrow'


The following object is masked from 'package:utils':

    timestamp




V1,AIR_VOLUME_SAMPLED,BLANK_USED,CITY,DATE_REPORTED,DATE_SAMPLED,EIGHT_HOUR_TWA_CALC,ESTABLISHMENT_NAME,FIELD_NUMBER,IMIS_SUBSTANCE_CODE,...,SAMPLE_TYPE,SAMPLE_WEIGHT,SAMPLING_NUMBER,SIC_CODE,STATE,SUBSTANCE,TIME_SAMPLED,UNIT_OF_MEASUREMENT,YEAR,ZIP_CODE
<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,...,<chr>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<dbl>,<chr>,<int>,<dbl>
0,516.0,N,Plainview,1984/06/15 00:00:00,1984/05/07 00:00:00,,CONTROLLED CASTINGS CORP,2LI8491,T100,...,P,,5245543.0,3365,NY,Thorium,258.0,M,1984,11803
1,,Y,Plainview,1984/06/15 00:00:00,1984/05/07 00:00:00,,CONTROLLED CASTINGS CORP,2LI8492,T100,...,P,,5245543.0,3365,NY,Thorium,,,1984,11803
2,76.0,N,Plainview,1984/06/15 00:00:00,1984/05/07 00:00:00,,CONTROLLED CASTINGS CORP,2LI8493,T100,...,P,,5245501.0,3365,NY,Thorium,38.0,M,1984,11803
3,,Y,Plainview,1984/06/15 00:00:00,1984/05/07 00:00:00,,CONTROLLED CASTINGS CORP,2LI8494,T100,...,P,,5245501.0,3365,NY,Thorium,,,1984,11803
4,74.0,N,East Farmingdale,1984/04/27 00:00:00,1984/04/04 00:00:00,,MID ISLAND NON FERROUS FOUNDRY,2LI84-49,1591,...,P,0.0,5245345.0,3365,NY,"Lead, Inorganic (as Pb)",37.0,M,1984,11735
5,0.0,Y,East Farmingdale,1984/04/27 00:00:00,1984/04/04 00:00:00,,MID ISLAND NON FERROUS FOUNDRY,2LI84-50,1591,...,P,0.0,5245345.0,3365,NY,"Lead, Inorganic (as Pb)",0.0,M,1984,11735


In [53]:
################# RE-FORMAT #################

## Re-adjust the type and format of the raw variables.

# Switch the time locale to "C" before parsing the dates (the date formatting
# fails without it); the previous setting is saved in `lct`.
lct <- Sys.getlocale("LC_TIME")
Sys.setlocale("LC_TIME", "C")

# Convert a date column that mixes three text layouts into a Date vector.
# The layout is picked from the string length: 11 characters -> "%Y-%b-%d",
# 19 characters -> "%Y/%m/%d" (a trailing time part is ignored), anything
# else -> "%Y-%m-%d". Values that do not parse become NA.
parse_mixed_date <- function(x) {
  x <- as.character(tolower(x))
  iso <- ifelse(nchar(x) == 11, as.character(as.Date(x, format = "%Y-%b-%d")),
                ifelse(nchar(x) == 19, as.character(as.Date(x, format = "%Y/%m/%d")),
                       as.character(as.Date(x, format = "%Y-%m-%d"))))
  as.Date(iso, format = "%Y-%m-%d")
}

# Left-pad a code with zeros up to `width` characters. Missing codes stay NA:
# sprintf() would otherwise print them as "NA" and the padding would turn them
# into the literal codes "00NA" / "000NA".
zero_pad_code <- function(x, width) {
  padded <- gsub(" ", "0", sprintf(paste0("%", width, "s"), x))
  padded[is.na(x)] <- NA_character_
  padded
}

# Only existing columns are assigned inside within(): every object created in
# the block would become a column of `database`.
database <- within(database, {

  AIR_VOLUME_SAMPLED <- as.numeric(as.character(AIR_VOLUME_SAMPLED))

  BLANK_USED <- factor(BLANK_USED, levels = c('Y', 'N'))

  CITY <- as.character(CITY)

  DATE_REPORTED <- parse_mixed_date(DATE_REPORTED)

  DATE_SAMPLED <- parse_mixed_date(DATE_SAMPLED)

  EIGHT_HOUR_TWA_CALC <- factor(EIGHT_HOUR_TWA_CALC, levels = c('Y', 'N'))

  ESTABLISHMENT_NAME <- as.character(ESTABLISHMENT_NAME)

  FIELD_NUMBER <- as.character(FIELD_NUMBER)

  # Codes are zero-padded to 4 characters
  IMIS_SUBSTANCE_CODE <- factor(zero_pad_code(IMIS_SUBSTANCE_CODE, 4))
  INSPECTION_NUMBER <- factor(INSPECTION_NUMBER)

  INSTRUMENT_TYPE <- as.character(INSTRUMENT_TYPE) # needs to be cleaned before levels

  LAB_NUMBER <- factor(LAB_NUMBER)

  NAICS_CODE <- as.character(NAICS_CODE)
  NAICS_CODE <- ifelse(nchar(NAICS_CODE) < 6, NA, NAICS_CODE) # deletion of missing or 0 NAICS CODE

  OFFICE_ID <- factor(OFFICE_ID)

  QUALIFIER <- as.character(QUALIFIER) # needs to be cleaned before levels

  SAMPLE_RESULT <- as.numeric(as.character(SAMPLE_RESULT))

  ## Should have only 4 levels according to CEHD information
  SAMPLE_TYPE <- factor(SAMPLE_TYPE) #A B BL BU L M P S U W WB Z

  SAMPLE_WEIGHT <- as.numeric(as.character(SAMPLE_WEIGHT))

  ## May need cleaning. Kept as a factor for now.
  SAMPLING_NUMBER <- factor(SAMPLING_NUMBER)

  SIC_CODE <- factor(SIC_CODE)

  STATE <- factor(STATE)

  # Label of IMIS code, not the same number of levels though.
  SUBSTANCE <- as.character(SUBSTANCE)

  # Time in minutes
  TIME_SAMPLED <- as.numeric(as.character(TIME_SAMPLED))

  UNIT_OF_MEASUREMENT <- as.character(UNIT_OF_MEASUREMENT) # needs to be cleaned before levels

  # ZIP codes are zero-padded to 5 characters
  ZIP_CODE <- factor(zero_pad_code(ZIP_CODE, 5))

})


# Sampling year = first four characters of the sampling date
database$YEAR <- factor(substring(database$DATE_SAMPLED, 1, 4))
# saveRDS(database, "database.rds")  

"NAs introduced by coercion"


In [54]:
# Store the identifier columns as plain character strings; the inspection and
# sampling numbers additionally lose their leading / trailing white space.
database$IMIS_SUBSTANCE_CODE <- as.character(database$IMIS_SUBSTANCE_CODE)
database$INSPECTION_NUMBER <- trimws(as.character(database$INSPECTION_NUMBER))
database$SAMPLING_NUMBER <- trimws(as.character(database$SAMPLING_NUMBER))

In [55]:
### Set up the data.frame that counts the eliminated records per year
# One row per calendar year between the first and the last sampling year.
# The years are converted to integers and missing years are ignored: with a
# plain min()/max() a single record without sampling year made both bounds NA
# and the `:` operator failed.
year_span <- range(as.integer(as.character(database$YEAR)), na.rm = TRUE)
reasons <- data.frame(YEAR = year_span[1]:year_span[2])

In [56]:
# ##################### N01: Blanks ####
# N01: remove blanks flagged by the BLANK_USED variable (other blanks are
# identified later through the qualifier).
# Only records explicitly flagged 'N' are kept. %in% returns FALSE for a
# missing flag, so records with BLANK_USED = NA are now part of the N01 tally;
# with `!= 'N'` they were filtered out of `database` but never counted.
keepN01 <- database$BLANK_USED %in% 'N'

vec_eff <- table(database$YEAR[!keepN01])
reasons$N01[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[keepN01, ]

In [57]:
# ##################### N02: Personal measurements ####
# N02: remove the samples that are not personal samples (SAMPLE_TYPE 'P').
# %in% returns FALSE for a missing sample type, so records with
# SAMPLE_TYPE = NA are now part of the N02 tally; with `!= 'P'` they were
# filtered out of `database` but never counted.
keepN02 <- database$SAMPLE_TYPE %in% 'P'

vec_eff <- table(database$YEAR[!keepN02])
reasons$N02[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[keepN02, ] # keeping P only

In [58]:
# ##################### N03 Excluding substances with few samples ####

# Count the records per substance code, most frequent code first
subst <- data.frame(table(database$IMIS_SUBSTANCE_CODE))
colnames(subst) <- c('code', 'n')
subst <- subst[order(-subst$n), ]
nrow(subst) #[1] 978
# Label of each code, taken from the first record carrying that code
subst$name <- database$SUBSTANCE[match(subst$code, database$IMIS_SUBSTANCE_CODE)]
# Keep the substances with at least 100 records
subst <- subst[subst$n >= 100, ]
nrow(subst) #[1] 265
#View(subst)
# Drop the codes which do not correspond to chemical substances
subst_all <- subst[!(subst$code %in% c('G301','G302','Q115','T110','M125','Q116','Q100','S325')), ]
nrow(subst_all) #[1] 257
sub_list_all <- sort(as.character(subst_all$code))

# Tally, then remove, the records whose substance is not in the retained list
vec_eff <- table(database$YEAR[!(database$IMIS_SUBSTANCE_CODE %in% sub_list_all)])
reasons$N03[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[database$IMIS_SUBSTANCE_CODE %in% sub_list_all, ] # restrict to substances in the list

In [59]:
# library(writexl)

# ##################### Preparation of conversion tables for QUALIFIER and UNIT_OF_MEASUREMENT ####

#### new 2020 table ###/#: creation of a new conversion table for QUALIFIER variable
# Missing qualifiers are replaced further down by the placeholder "raw was NA";
# first check how the column is stored.
class(database[["QUALIFIER"]])

In [60]:
# Number of qualifier values (one per record)
length(database[["QUALIFIER"]]) #[1] 1387254

In [61]:
# Number of records without a qualifier
sum(is.na(database[["QUALIFIER"]])) #[1] 751060

In [62]:
# Give missing qualifiers an explicit placeholder, then tabulate the raw values
database$QUALIFIER <- replace(database$QUALIFIER, is.na(database$QUALIFIER), "raw was NA")
qualif <- setNames(data.frame(table(database$QUALIFIER)), c('raw', 'n'))
#nrow(qualif) 
# The frequencies must add up to the number of records
sum(qualif$n) #[1] 1387254
#write_xlsx(x = qualif, path = "qualif_new.xlsx", col_names = TRUE)

In [63]:
# Conversion table for the QUALIFIER variable; the columns used below are
# `raw`, `clean` and `possible_bulk`.
# The path is assembled with file.path() so that it resolves on every
# platform; the former backslash-separated literal was only valid on Windows.
qualif.conv.2020 <- read.csv(
    file.path("Input", "Raw", "OSHA", "CEHD", "CEHD1984_2018",
              "cleaning scripts", "Conversion tables", "qualif_new_2020.csv"),
    sep = ";", header = TRUE
)
# Make sure the lookup columns are character (not factor) before matching
qualif.conv.2020$clean <- as.character(qualif.conv.2020$clean)
qualif.conv.2020$raw <- as.character(qualif.conv.2020$raw)
qualif.conv.2020$possible_bulk <- as.character(qualif.conv.2020$possible_bulk)
table(qualif.conv.2020$clean)
#B       BLK  eliminate        ND         S sample OK         W 
#1         7        137        49         6        12         1 


        B       BLK eliminate        ND         S sample OK         W 
        1         7       137        49         6        12         1 

In [64]:
#### UNIT_OF_MEASUREMENT

#### new 2020 table ###/#: creation of a new conversion table for UNIT_OF_MEASUREMENT variable
# Missing units are replaced further down by the placeholder "raw was NA";
# first check storage class, number of values and number of missing values.
class(database[["UNIT_OF_MEASUREMENT"]])
length(database[["UNIT_OF_MEASUREMENT"]]) #[1] 1387254
sum(is.na(database[["UNIT_OF_MEASUREMENT"]])) #[1] 29836

In [65]:
# Give missing units an explicit placeholder, then tabulate the raw values
database$UNIT_OF_MEASUREMENT <- replace(database$UNIT_OF_MEASUREMENT,
                                        is.na(database$UNIT_OF_MEASUREMENT),
                                        "raw was NA")
unit_2020 <- setNames(data.frame(table(database$UNIT_OF_MEASUREMENT)), c('raw', 'n'))
nrow(unit_2020) 
# The frequencies must add up to the number of records
sum(unit_2020$n) #[1] 1387254

In [66]:
#write_xlsx(x = unit_2020, path = "unit_conv.xlsx", col_names = TRUE)
# Conversion table for the UNIT_OF_MEASUREMENT variable; the columns used
# below are `raw` and `clean`.
# The path is assembled with file.path() so that it resolves on every
# platform; the former backslash-separated literal was only valid on Windows.
unit.conv.2020 <- read.csv(
    file.path("Input", "Raw", "OSHA", "CEHD", "CEHD1984_2018",
              "cleaning scripts", "Conversion tables", "unit_conv_2020.csv"),
    sep = ";", header = TRUE
)
# Make sure the lookup columns are character (not factor) before matching
unit.conv.2020$clean <- as.character(unit.conv.2020$clean)
unit.conv.2020$raw <- as.character(unit.conv.2020$raw)
table(unit.conv.2020$clean)
#          % eliminate         F         M         P         X         Y 
#2         1        22         4         2         2         2         1


                  % eliminate         F         M         P         X         Y 
        2         1        22         4         2         2         2         1 

In [67]:
## CENSORED flags the records that are censored according to the QUALIFIER
## variable ONLY, i.e. whose raw qualifier is one of the approximate '<' signs.
database$CENSORED <- rep('N', length(database$IMIS_SUBSTANCE_CODE))
database$CENSORED[database$QUALIFIER %in% c(
  "-<", "  <", " =<", "@<", "@<=", "@=<", "<",
  "< =", "<@", "<=", "<= 0", "= <", "=<", "=<@"
)] <- "Y"

In [68]:
# The "raw was NA" placeholder is turned into an EMPTY string in QUALIFIER
sum(is.na(database$QUALIFIER)) #0
# Placeholder count before the replacement ...
length(database$QUALIFIER[database$QUALIFIER == "raw was NA"]) #751060
database$QUALIFIER <- replace(database$QUALIFIER, database$QUALIFIER == "raw was NA", '')
# ... and after it
length(database$QUALIFIER[database$QUALIFIER == "raw was NA"]) #0

In [69]:
# SAMPLE_RESULT_2 is a copy of SAMPLE_RESULT in which missing results are 0
sum(is.na(database$SAMPLE_RESULT)) #39131
database$SAMPLE_RESULT_2 <- replace(database$SAMPLE_RESULT, is.na(database$SAMPLE_RESULT), 0)
sum(is.na(database$SAMPLE_RESULT_2)) #0

In [70]:
##################### Standardisation of qualifier variable ##################### 
#####Removing samples: Qualifier suggests ND but result is >0
# ##################### N08 / N29 qualifier suggests ND but sample result is not null ####
# N08: positive result, record not flagged as censored, and a raw qualifier
# that the conversion table maps to "ND". The mask is built once and reused for
# the inspection subset, the yearly tally and the exclusion.
nd_qualifiers <- qualif.conv.2020$raw[qualif.conv.2020$clean == "ND"]
dropN08 <- database$SAMPLE_RESULT_2 > 0 & database$CENSORED != "Y" &
  is.element(database$QUALIFIER, nd_qualifiers)

temp <- database[dropN08, ] # removed records, kept for inspection

vec_eff <- table(database$YEAR[dropN08])
reasons$N08[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN08, ] #exclusion
sum(reasons$N08, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1386387(-867)

In [71]:
# N29: positive result although the record is flagged as censored or carries a
# raw qualifier that the conversion table maps to "ND". The mask is built once
# and reused for the inspection subset, the yearly tally and the exclusion.
nd_qualifiers <- qualif.conv.2020$raw[qualif.conv.2020$clean == "ND"]
dropN29 <- database$SAMPLE_RESULT_2 > 0 &
  (database$CENSORED == "Y" | is.element(database$QUALIFIER, nd_qualifiers))

temp <- database[dropN29, ] # removed records, kept for inspection

vec_eff <- table(database$YEAR[dropN29])
reasons$N29[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN29, ] #exclusion
sum(reasons$N29, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1377502(-8885)

In [72]:
#####Removing samples: Qualifier conflicting with unit of measurement / sample type, or judged 'to be eliminated'
# Cleaning of the unit of measurement: UNIT_OF_MEASUREMENT_2 starts as a copy
# of the raw unit; every raw value listed in the conversion table is then
# replaced by its clean category. Raw values absent from the table are kept.

database$UNIT_OF_MEASUREMENT_2 <- database$UNIT_OF_MEASUREMENT
unik.clean <- unique(unit.conv.2020$clean)

# seq_along() instead of 1:length(): no iteration at all when the conversion
# table is empty (1:0 would run the body for j = 1 and j = 0).
for (j in seq_along(unik.clean)) {
  one.clean <- as.character(unik.clean)[j]
  raw.values <- as.character(unit.conv.2020$raw[unit.conv.2020$clean == one.clean])
  database$UNIT_OF_MEASUREMENT_2[is.element(database$UNIT_OF_MEASUREMENT, raw.values)] <- one.clean
}

# Raw units, then cleaned units
as.data.frame(table(database$UNIT_OF_MEASUREMENT))
table(database$UNIT_OF_MEASUREMENT_2)

Var1,Freq
<fct>,<int>
',1
%,36789
",",1
.,1
@,1
[,1
\,2
],1
0,10
4,1



        % eliminate         F         M      None         P         X         Y 
    36789      2015     18951   1020239     29840    267442      2096       130 

In [73]:
# ##################### N04: qualifier=BLK and not possible bulk # elimination ####
# Raw qualifiers that the conversion table maps to "BLK" with
# possible_bulk == "N". The mask is built once and reused for the inspection
# subset, the yearly tally and the exclusion.
blk_not_bulk <- qualif.conv.2020$raw[qualif.conv.2020$clean == "BLK" &
                                       qualif.conv.2020$possible_bulk == "N"]
dropN04 <- is.element(database$QUALIFIER, blk_not_bulk)

temp <- database[dropN04, ] # removed records, kept for inspection

vec_eff <- table(database$YEAR[dropN04])
reasons$N04[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN04, ] #exclusion
sum(reasons$N04, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1377481(-21)

In [74]:
# ##################### N05: qualifier deemed not interpretable and record is to be eliminated ####
# Raw qualifiers that the conversion table maps to "eliminate". The mask is
# built once and reused for the inspection subset, the tally and the exclusion.
dropN05 <- is.element(database$QUALIFIER,
                      qualif.conv.2020$raw[qualif.conv.2020$clean == "eliminate"])

temp <- database[dropN05, ] # removed records, kept for inspection

vec_eff <- table(database$YEAR[dropN05])
reasons$N05[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN05, ] #exclusion
sum(reasons$N05, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1377022(-459)

In [75]:
# ##################### N06 / N07: qualifier conflicting with sample type ####
# Raw qualifiers that the conversion table maps to "B" or "W". The mask is
# built once and reused for the inspection subset, the tally and the exclusion.
dropN0607 <- is.element(
  database$QUALIFIER,
  qualif.conv.2020$raw[qualif.conv.2020$clean == "B" | qualif.conv.2020$clean == "W"]
)

temp <- database[dropN0607, ] # removed records, kept for inspection

vec_eff <- table(database$YEAR[dropN0607])
reasons$N0607[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN0607, ] #exclusion
sum(reasons$N0607, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1376771(-251)

In [76]:
# ##################### N09: qualifier=BLK and a possible bulk as judged by researcher and variable blank_used says NO and sample type is not bulk, final result=eliminate ####
# Records whose raw qualifier maps to "BLK" with possible_bulk == "Y" while
# BLANK_USED is "N".
blk_possible_bulk <- qualif.conv.2020$raw[qualif.conv.2020$clean == "BLK" &
                                            qualif.conv.2020$possible_bulk == "Y"]
dropN09 <- is.element(database$QUALIFIER, blk_possible_bulk) &
  is.element(database$BLANK_USED, "N")

temp <- database[dropN09, ] # removed records, kept for inspection

vec_eff <- table(database$YEAR[dropN09])
reasons$N09[match(names(vec_eff), reasons$YEAR)] <- vec_eff

# The whole condition is negated. The former `!is.element(...) & is.element(...)`
# negated the first test only (`!` binds tighter than `&`), so it was not the
# complement of the tallied mask: it would also have dropped, uncounted, every
# record whose BLANK_USED is not "N".
database <- database[!dropN09, ] #exclusion
sum(reasons$N09, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1376453(-318)

In [77]:
# ##################### N10: combustion related : eliminate ####
# Records whose qualifier is one of the combustion codes. The mask is built
# once and reused for the inspection subset, the tally and the exclusion.
dropN10 <- is.element(database$QUALIFIER, c("COMB", 'COMD', 'com', 'comb'))

temp <- database[dropN10, ] # removed records, kept for inspection

vec_eff <- table(database$YEAR[dropN10])
reasons$N10[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN10, ] #exclusion
sum(reasons$N10, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1376258(-195)

In [78]:
# ##################### N11: qualifier suggest fibers (F) but F not relevant for substance ####
# Qualifier "F" on any substance other than code 9020. The code is compared as
# a string because IMIS_SUBSTANCE_CODE is a character column.
dropN11 <- database$QUALIFIER == "F" & database$IMIS_SUBSTANCE_CODE != '9020'

temp <- database[dropN11, ] # removed records, kept for inspection

vec_eff <- table(database$YEAR[dropN11])
reasons$N11[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN11, ] #exclusion
sum(reasons$N11, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1376256(-2)

In [79]:
# ##################### N13: 'Y' qualifier value judged possible 'Ytrium', deemed OK for the generic particles category only : 9135 ####
# Qualifier "Y" on any substance other than code 9135. The code is compared as
# a string because IMIS_SUBSTANCE_CODE is a character column.
dropN13 <- database$QUALIFIER == "Y" & database$IMIS_SUBSTANCE_CODE != '9135'

temp <- database[dropN13, ] # removed records, kept for inspection

vec_eff <- table(database$YEAR[dropN13])
reasons$N13[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN13, ] #exclusion
sum(reasons$N13, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1376206(-50)

In [80]:
# ##################### N16: measure is approximate (to be eliminated) ####
# Records whose qualifier marks the result as approximate / estimated. The mask
# is built once and reused for the inspection subset, tally and exclusion.
dropN16 <- is.element(database$QUALIFIER,
                      c("@", " @", "@<", "@=<", "@<=", "<@", "=<@", "EST"))

temp <- database[dropN16, ] # removed records, kept for inspection

vec_eff <- table(database$YEAR[dropN16])
reasons$N16[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN16, ] #exclusion
sum(reasons$N16, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1375371(-835)

In [81]:
# ##################### N19: elimination of records that have "%" in qualifier, but not "%" in the unit (to be removed), same for "M"  ####
# Qualifier "%" with a cleaned unit other than "%", or qualifier "M" with a
# cleaned unit other than "M". The mask is built once and reused.
dropN19 <- (database$UNIT_OF_MEASUREMENT_2 != "%" & database$QUALIFIER == "%") |
  (database$UNIT_OF_MEASUREMENT_2 != "M" & database$QUALIFIER == "M")

temp <- database[dropN19, ] # removed records, kept for inspection

vec_eff <- table(database$YEAR[dropN19])
reasons$N19[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN19, ] #exclusion
sum(reasons$N19, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1375365(-6)

In [82]:
#####Removing samples: Unit of measurement is judged erroneous / missing  but the sample result is not null / conflicting with substance

# ##################### N17 / N18: elimination of records that should not have "F" as unit  ####
# Substances that occur with the cleaned unit "F"
temp <- database[database$UNIT_OF_MEASUREMENT_2=="F",]
class(temp$IMIS_SUBSTANCE_CODE)
temp$IMIS_SUBSTANCE_CODE <- as.character(temp$IMIS_SUBSTANCE_CODE)
table(temp$IMIS_SUBSTANCE_CODE)
#0527  1073  1300  2270  2470  9020  9135  R251 
#   1     5   576     1     1 18105     1   189  

# NOTE(review): this inspection subset lists '0527', but the tally and the
# exclusion below do not, so the single '0527' record with unit "F" stays in
# `database` (recorded count: -8). Confirm which of the two lists is intended.
temp <- database[is.element(database$IMIS_SUBSTANCE_CODE,c('0527','1073','2270','2470','9135')) & database$UNIT_OF_MEASUREMENT_2=="F",]

# Mask built once and reused for the yearly tally and the exclusion
dropN1718 <- is.element(database$IMIS_SUBSTANCE_CODE, c('1073','2270','2470','9135')) &
  database$UNIT_OF_MEASUREMENT_2 == "F"

vec_eff <- table(database$YEAR[dropN1718])
reasons$N1718[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN1718, ] #exclusion
sum(reasons$N1718, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1375357(-8)


 0527  1073  1300  2270  2470  9020  9135  R251 
    1     5   576     1     1 18105     1   189 

In [83]:
# ##################### N23: elimination of records for which the sample result is not null and that the unit is empty  ####
# NOTE(review): the table of UNIT_OF_MEASUREMENT_2 recorded in this notebook
# shows a "None" level and no "" level. Confirm that "" is still the value the
# conversion table assigns to missing units, otherwise this step selects nothing.
dropN23 <- database$UNIT_OF_MEASUREMENT_2 == "" & database$SAMPLE_RESULT_2 > 0

temp <- database[dropN23, ] # removed records, kept for inspection

vec_eff <- table(database$YEAR[dropN23])
reasons$N23[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN23, ] #exclusion
sum(reasons$N23, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1374125(-1232)

In [84]:
# ##################### N24: elimination of cases where unit is "%" in unit, but sample_result is >100  ####
# A result expressed in percent cannot exceed 100. The mask is built once and
# reused for the inspection subset, the yearly tally and the exclusion.
dropN24 <- database$UNIT_OF_MEASUREMENT_2 == "%" & database$SAMPLE_RESULT_2 > 100

temp <- database[dropN24, ] # removed records, kept for inspection

vec_eff <- table(database$YEAR[dropN24])
reasons$N24[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN24, ] #exclusion
sum(reasons$N24, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1374117(-8)

In [85]:
## Modification of variables qualifier and air volume sampled ####
## QUALIFIER_2: detected / not detected, 'ND' when the result is exactly 0.
# nrow() gives the number of records; length(database[, 1]) is 1 on a
# data.table (`database` comes from fread()), so the former rep() call only
# filled the column through recycling.
database$QUALIFIER_2 <- rep('detected', nrow(database))
database$QUALIFIER_2[database$SAMPLE_RESULT_2==0] <- 'ND'

#**** this step is simplified for the 2020 cleanup: AIR_VOLUME_SAMPLED==NA, 0 or empty are deleted at N22 / N27 **** 
class(database$AIR_VOLUME_SAMPLED)
length(database$AIR_VOLUME_SAMPLED[is.na(database$AIR_VOLUME_SAMPLED)]) #[1] 3897
# Records without a usable air volume, kept for inspection
temp <- database[which(is.na(as.numeric(as.character(database$AIR_VOLUME_SAMPLED)))),]
summary(database$AIR_VOLUME_SAMPLED,na.rm=TRUE)
#   Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
#   0.0     30.8    314.4    368.9    606.0 924560.0     3897 
# First 5000 records, kept as a small working sample
cehd_top <- database[1:5000,]


#further standardization of units
#based on the units for the personal cleaned, we keep only the %, F, M, P data
#in addition sample weight must not be null for detected results with perc

#list of 29 substances with most records
subst <- data.frame(table(database$IMIS_SUBSTANCE_CODE))
names(subst) <-c('code','n')
subst <- subst[order(-subst$n),]
nrow(subst) #[1] 257
list_29 <- as.character(sort(subst$code[c(1:29)]))
list_29

#list used in 2011 cleanup
substances.list.29 <-c('0040','0230','0260','0360','0430','0491','0685','0720','0731','1073','1290','1520','1560',
                     '1591','1620','1730','1790','1840','2270','2280','2460','2571','2590','2610','9020','9130','9135','C141','S103')

# Differences between the current top 29 and the 2011 list
setdiff(list_29,substances.list.29) #[1] "9010"
setdiff(substances.list.29,list_29) #[1] "S103"
#9010 is replaced by S103 in the list
sum(subst$n[is.element(subst$code,substances.list.29)]) #[1] 1099503

#list used in 2011 cleanup will also be used in the current 2020 cleanup

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
     0.0     30.8    314.2    368.8    606.0 924560.0     3925 

In [86]:
# ##################### N31: record from the '29' (with 9010 excluded) and unit is not part of '','F','P','M'  ####
#table(database$UNIT_OF_MEASUREMENT_2)
# Substance from the 2011 list of 29 with a cleaned unit other than '', 'F',
# 'P' or 'M'. The mask is built once and reused.
dropN31 <- is.element(database$IMIS_SUBSTANCE_CODE, substances.list.29) &
  !is.element(database$UNIT_OF_MEASUREMENT_2, c('', 'F', 'P', 'M'))

temp <- database[dropN31, ] # removed records, kept for inspection

vec_eff <- table(database$YEAR[dropN31])
reasons$N31[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN31, ] #exclusion
sum(reasons$N31, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1371807(-2310)
#temp <- database[database$UNIT_OF_MEASUREMENT_2=="%",]

In [87]:
# ##################### N32: record from substance '9010' : if not '','%' or 'M' eliminate  ####
table(database$UNIT_OF_MEASUREMENT_2)

# Substance '9010' with a cleaned unit other than '', '%' or 'M'. The mask is
# built once and reused for the inspection subset, tally and exclusion.
dropN32 <- is.element(database$IMIS_SUBSTANCE_CODE, '9010') &
  !is.element(database$UNIT_OF_MEASUREMENT_2, c('', '%', 'M'))

temp <- database[dropN32, ] # removed records, kept for inspection

vec_eff <- table(database$YEAR[dropN32])
reasons$N32[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN32, ] #exclusion
sum(reasons$N32, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1371631(-176)
#temp <- database[database$UNIT_OF_MEASUREMENT_2=="%",]


        % eliminate         F         M      None         P         X         Y 
    35329      1639     18871   1019691     17554    267242       828        71 

In [88]:
# ##################### N33: record not from 29 substances or  '9010' : if not '','P','F','%' or 'M' eliminate  ####
#table(database$UNIT_OF_MEASUREMENT_2)
# `restrict` selects every substance that is neither '9010' nor in the list of 29
restrict <- !database$IMIS_SUBSTANCE_CODE=='9010' & !is.element(database$IMIS_SUBSTANCE_CODE,substances.list.29)
# Among those, records with a cleaned unit other than '', '%', 'M', 'P' or 'F'.
# The mask is built once and reused for the subset, tally and exclusion.
dropN33 <- restrict & !is.element(database$UNIT_OF_MEASUREMENT_2, c('', '%', 'M', 'P', 'F'))

temp <- database[dropN33, ] # removed records, kept for inspection

vec_eff <- table(database$YEAR[dropN33])
reasons$N33[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN33, ] #exclusion
sum(reasons$N33, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1369269(-2362)
#temp <- database[database$UNIT_OF_MEASUREMENT_2=="%",]

In [89]:
# ##################### N34: elimination when unit is perc (%) and the result is non null but sample weight is null ####
# SAMPLE_WEIGHT_2 is a copy of SAMPLE_WEIGHT in which missing weights are 0
database$SAMPLE_WEIGHT_2 <- database$SAMPLE_WEIGHT
length(database$SAMPLE_WEIGHT_2[is.na(database$SAMPLE_WEIGHT)])
database$SAMPLE_WEIGHT_2[is.na(database$SAMPLE_WEIGHT)] <- 0
length(database$SAMPLE_WEIGHT_2[is.na(database$SAMPLE_WEIGHT_2)])

# Percent result above 0 without any sample weight. The mask is built once and
# reused for the inspection subset, the yearly tally and the exclusion.
dropN34 <- database$SAMPLE_WEIGHT_2 == 0 & database$UNIT_OF_MEASUREMENT_2 == "%" &
  database$SAMPLE_RESULT_2 > 0

temp <- database[dropN34, ] # removed records, kept for inspection

vec_eff <- table(database$YEAR[dropN34])
reasons$N34[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN34, ] #exclusion
sum(reasons$N34, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1366423(-2846)

#transformation of percs into mg/m3										
length(database$SAMPLE_WEIGHT_2[is.na(database$SAMPLE_WEIGHT_2)]) #[1] 0
length(database$AIR_VOLUME_SAMPLED[is.na(database$AIR_VOLUME_SAMPLED)]) #[1] 3309
table(database$UNIT_OF_MEASUREMENT_2)
#            %       F       M       P 
#28136   32483   18871 1019691  267242

# Records that can be converted: percent unit, positive result, non-null
# sample weight and a known, positive air volume
restrict.1 <- database$SAMPLE_WEIGHT_2!=0 &
database$UNIT_OF_MEASUREMENT_2=='%' &
database$SAMPLE_RESULT_2>0 &
!is.na(database$SAMPLE_WEIGHT_2) &
!is.na(database$AIR_VOLUME_SAMPLED) & database$AIR_VOLUME_SAMPLED>0

# SAMPLE_RESULT_3 is SAMPLE_RESULT_2 with the convertible percent results
# replaced by result * weight * 10 / volume
# (presumably weight in mg and volume in litres, giving mg/m3 - TODO confirm)
database$SAMPLE_RESULT_3 <- database$SAMPLE_RESULT_2

database$SAMPLE_RESULT_3[restrict.1] <- database$SAMPLE_RESULT_2[restrict.1]*
database$SAMPLE_WEIGHT_2[restrict.1]*10/
database$AIR_VOLUME_SAMPLED[restrict.1]										

# Converted records get their own unit label
database$UNIT_OF_MEASUREMENT_2[restrict.1] <- 'M.from.Perc'
temp <- database[database$UNIT_OF_MEASUREMENT_2=="P",]

table(database$UNIT_OF_MEASUREMENT_2)
#                %           F           M M.from.Perc           P 
#28136       16779       18871     1019691       15704      267242 


      %       F       M       P 
  32483   18871 1019691  267242 


          %           F           M M.from.Perc           P 
      16779       18871     1019691       15704      267242 

In [90]:
# ##################### N20: elimination of the records that have a missing value for the office ID  ####
# The mask is built once and reused for the subset, the tally and the exclusion
dropN20 <- is.na(database$OFFICE_ID)

temp <- database[dropN20, ] # removed records, kept for inspection

vec_eff <- table(database$YEAR[dropN20])
reasons$N20[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN20, ] #exclusion
sum(reasons$N20, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1363807(-2616)

In [91]:
# ##################### N21: records that have a missing time_sampled variable are deleted  ####
# The mask is built once and reused for the subset, the tally and the exclusion
dropN21 <- is.na(database$TIME_SAMPLED)

temp <- database[dropN21, ] # removed records, kept for inspection

vec_eff <- table(database$YEAR[dropN21])
reasons$N21[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN21, ] #exclusion
sum(reasons$N21, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1362164(-1643)

In [92]:
# ##################### N26: elimination of the records that have a null time_sampled variable   ####
# Missing sampling times were already removed at N21, so the comparison only
# yields TRUE / FALSE here. The mask is built once and reused.
dropN26 <- database$TIME_SAMPLED == 0

temp <- database[dropN26, ] # removed records, kept for inspection

vec_eff <- table(database$YEAR[dropN26])
reasons$N26[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN26, ] #exclusion
sum(reasons$N26, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1357348(-4816)

In [93]:
# ##################### N28: elimination of sample results less than 0   ####
# The mask is built once and reused for the subset, the tally and the exclusion
dropN28 <- database$SAMPLE_RESULT_3 < 0

temp <- database[dropN28, ] # removed records, kept for inspection

vec_eff <- table(database$YEAR[dropN28])
reasons$N28[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN28, ] #exclusion
sum(reasons$N28, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1357347(-1)

In [94]:
# ##################### N25: records to be deleted because of a missing or null sampling number   ####
# SAMPLING_NUMBER is a character column, so `== 0` only matched the exact
# string "0" and missed a null number written as "0.0". The value is therefore
# compared numerically; text that is not a number converts to NA and is kept.
sampling_num <- suppressWarnings(as.numeric(database$SAMPLING_NUMBER))
dropN25 <- is.na(database$SAMPLING_NUMBER) |
  (!is.na(sampling_num) & sampling_num == 0)

temp <- database[dropN25, ] # removed records, kept for inspection

vec_eff <- table(database$YEAR[dropN25])
reasons$N25[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN25, ] #exclusion
sum(reasons$N25, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1355916(-1431)

In [95]:
# ##################### N22: records that have a NA or '' volume sampled variable are deleted   ####
summary(database$AIR_VOLUME_SAMPLED)
#  Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
#   0.0     32.1    316.6    370.3    608.9 924560.0     1705 
# AIR_VOLUME_SAMPLED is numeric, so it can never equal ''; testing for NA is
# the whole condition. The mask is built once and reused.
dropN22 <- is.na(database$AIR_VOLUME_SAMPLED)

temp <- database[dropN22, ] #1705

vec_eff <- table(database$YEAR[dropN22])
reasons$N22[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN22, ] #exclusion
sum(reasons$N22, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1354211(-1705)

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
     0.0     36.0    320.0    372.8    613.0 924560.0     1642 

In [96]:
# ##################### N27: elimination of records that have an air volume sampled of zero   ####
summary(database$AIR_VOLUME_SAMPLED)
#  Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     
#   0.0     32.1    316.6    370.3    608.9 924560.0     
# Only strictly positive volumes are kept, so the tally uses the same rule
# (<= 0). It formerly counted `== 0` only, which would have left a negative
# volume removed but uncounted.
dropN27 <- database$AIR_VOLUME_SAMPLED <= 0

temp <- database[dropN27, ] # removed records, kept for inspection

vec_eff <- table(database$YEAR[dropN27])
reasons$N27[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!dropN27, ] #exclusion
sum(reasons$N27, na.rm = TRUE)
# Rows left. nrow() instead of length(database[, 1]): `database` comes from
# fread(), and on a data.table [, 1] is a one-column table of length 1.
nrow(database) #1341727(-12484)

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
     0.0     36.0    320.0    372.8    613.0 924560.0 

In [97]:
# ##################### N35: instrument type is equal to ''   ####
#temp <- database[database$INSTRUMENT_TYPE=='' & !is.na(database$INSTRUMENT_TYPE),] 
#temp <- database[is.na(database$INSTRUMENT_TYPE),] 
#table(temp$YEAR)

# TRUE for rows whose INSTRUMENT_TYPE is an empty string; the !is.na() term
# turns the NA produced by comparing a missing value into FALSE.
restrictN35 <- database$INSTRUMENT_TYPE=='' & !is.na(database$INSTRUMENT_TYPE)
#table(restrictN35)

temp <- database[restrictN35,] # excluded rows, kept for inspection

# Number of excluded rows per YEAR, stored in the matching rows of `reasons`.
vec_eff <- table(database$YEAR[restrictN35])
reasons$N35[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!restrictN35,] #exclusion
sum(reasons$N35,na.rm = TRUE)
# nrow() reports the row count for data.frame, tibble and data.table alike;
# length(database[,1]) returns 1 whenever `[` keeps a one-column table.
nrow(database) #1341727(-0)

# ##################### N37: instrument type is missing   ####
# Nothing is removed here: rows with a missing INSTRUMENT_TYPE are only
# flagged, and are dealt with in the N36 step.
restrictN37 <- is.na(database$INSTRUMENT_TYPE)
# table(restrictN37)
#  FALSE    TRUE 
#1171397  170330 
temp <- database[restrictN37,] # flagged rows, kept for inspection
# table(temp$YEAR)

# ##################### N36: cleaning of instrument type ####
# Missing instrument types are recoded as the empty string, then counted.
database$INSTRUMENT_TYPE[is.na(database$INSTRUMENT_TYPE)] <- ''
blank.it <- database$INSTRUMENT_TYPE==''
sum(blank.it) #[1] 170330
temp <- database[blank.it,]

### cleaning strategy ###
#original 81 substances: clean for 1984-2009 using IT tables
#original 81 substances: copy raw IT for 2010-2011
#new substances: copy raw IT for 1984-2011
#all substances: indicate "not recorded" for 2013-2018

#1) all substances: create INSTRUMENT_TYPE_2 variable + indicate "not recorded" for all years (1984-2018)
database$INSTRUMENT_TYPE_2 <- 'not recorded'

#2) all substances: copy raw IT for 1984-2011
# YEAR is converted to a number once and the resulting mask is used on both
# sides of the assignment.
before.2012 <- as.numeric(as.character(database$YEAR))<2012
database$INSTRUMENT_TYPE_2[before.2012] <- database$INSTRUMENT_TYPE[before.2012]

# Rows still carrying the default label after the copy.
temp <- database[database$INSTRUMENT_TYPE_2=='not recorded',]
# table(temp$YEAR)

In [98]:
#3) original 81 substances: clean for 1984-2009 using IT tables
# Every file found in the conversion-table folder is read as a CSV and stored
# in `data.list`, named by characters 3 to 6 of its file name (the substance
# code). Each table is used through its `raw` and `clean` columns: for records
# of that substance dated before 2010, an INSTRUMENT_TYPE found in `raw` is
# replaced in INSTRUMENT_TYPE_2 by the corresponding `clean` value.

# setwd("Input\\Raw\\OSHA\\CEHD\\CEHD1984_2018\\cleaning scripts\\Conversion tables IT")
# file.path() builds the relative path with a separator that works on every
# platform (a backslash-only path is a single file name outside Windows).
conv.tables.path <- file.path("Input", "Raw", "OSHA", "CEHD", "CEHD1984_2018",
                              "cleaning scripts", "Conversion tables IT")
files <- list.files(path=conv.tables.path)
n <- length(files)

# Fail with an explicit message instead of looping over 1:0 when the folder is
# empty or missing.
if (n == 0) {
  stop("no conversion table found in '", conv.tables.path, "'", call. = FALSE)
}

#store tables in a list object
data.list <- vector(mode="list",n)
for (i in seq_len(n)) {
  csv.path <- file.path(conv.tables.path, files[i])
  data.list[[i]] <- read.csv(csv.path,sep=",", header=TRUE)
}
names(data.list) <- substr(files,3,6)

# data.list[i]

length(database$IMIS_SUBSTANCE_CODE[database$IMIS_SUBSTANCE_CODE=="0040"])
#[1] 10726
length(database$IMIS_SUBSTANCE_CODE[database$IMIS_SUBSTANCE_CODE=="0040" & as.numeric(as.character(database$YEAR))<2010])
#[1] 9691

# The numeric year does not change inside the loop, so it is computed once.
year.num <- as.numeric(as.character(database$YEAR))

for (i in seq_len(n)) {

  # records of the current substance dated before 2010
  restrict <- database$IMIS_SUBSTANCE_CODE==names(data.list)[i] & year.num<2010
  # table(restrict)

  conv.table <- data.list[[i]]
  unik.clean <- unique(conv.table$clean)

  # seq_along() skips the loop entirely when a table has no rows
  for (j in seq_along(unik.clean)) {
    one.clean <- as.character(unik.clean)[j]
    raw.values <- as.character(conv.table$raw[conv.table$clean==one.clean])

    database$INSTRUMENT_TYPE_2[restrict & is.element(database$INSTRUMENT_TYPE,raw.values)] <- one.clean
  }

  # print(i)
}

In [99]:
#verifications
#View(database[,c(9,11,22,26,32,33)])
#View(database[database$INSTRUMENT_TYPE=='',c(9,11,22,26,32,33)])
#View(database[database$INSTRUMENT_TYPE_2=='',c(9,11,22,26,32,33)])

# Masks reused by the inspection subsets and by step 4 below.
blank.it2 <- database$INSTRUMENT_TYPE_2==''
is.9135 <- database$IMIS_SUBSTANCE_CODE=="9135"

# Inspection subsets only; each assignment overwrites the previous `temp`.
temp <- database[blank.it2,]
# table(temp$YEAR)

temp <- database[is.9135,c(9,11,26,32,33)]
temp <- database[is.9135 & database$INSTRUMENT_TYPE=='',c(9,11,26,32,33)]
# table(temp$YEAR)

#4) replace INSTRUMENT_TYPE_2=='' by INSTRUMENT_TYPE_2=='eliminate' (these records correspond to 
#   new substances <2012 which didn't have an IT recorded)
length(database$INSTRUMENT_TYPE_2[blank.it2]) #[1] 129
database$INSTRUMENT_TYPE_2[blank.it2] <- 'eliminate'


#exclusion
# Rows whose cleaned instrument type is the 'eliminate' label are removed, and
# the number of removed rows per YEAR is recorded in reasons$N36.
restrictN36 <- database$INSTRUMENT_TYPE_2=='eliminate'
# table(restrictN36)
#  FALSE    TRUE 
#1330338   11389

temp <- database[restrictN36,] # excluded rows, kept for inspection

vec_eff <- table(database$YEAR[restrictN36])
reasons$N36[match(names(vec_eff), reasons$YEAR)] <- vec_eff

database <- database[!restrictN36,] #exclusion
sum(reasons$N36,na.rm = TRUE)
# nrow() reports the row count for data.frame, tibble and data.table alike;
# length(database[,1]) returns 1 whenever `[` keeps a one-column table.
nrow(database) #1330338(-11389)

In [22]:
# Replace the in-memory `database` with the table stored in
# 'cleaned_database.parquet' (path relative to the working directory).
# NOTE(review): presumably this file holds the output of the cleaning steps
# above - confirm where it is written.
database <- read_parquet('cleaned_database.parquet')

In [23]:
# ##################### Duplicates ####
# HASH joins inspection number, substance code, sampling number and field
# number with '-'; records sharing a HASH are treated as candidate duplicates.
hash.parts <- list(database$INSPECTION_NUMBER,
                   database$IMIS_SUBSTANCE_CODE,
                   database$SAMPLING_NUMBER,
                   database$FIELD_NUMBER)
database$HASH <- do.call(paste, c(hash.parts, sep='-'))

# `bla` lists every HASH that occurs more than once, with its frequency and the
# substance code / substance name of the first record carrying that HASH.
hash.freq <- table(database$HASH)
bla <- data.frame(hash.freq,stringsAsFactors=FALSE)
names(bla) <- c('name','n')
bla <- bla[bla$n>1,]
bla$name <- as.character(bla$name)

first.row <- match(bla$name,database$HASH)
bla$code <- database$IMIS_SUBSTANCE_CODE[first.row]
bla$sub <- database$SUBSTANCE[first.row]

In [24]:
#creating a concatenated variable to identify if there are true duplicates
database$CONCAT <- paste(database$LAB_NUMBER,
                       database$STATE,
                       database$ZIP_CODE,
                       database$YEAR,
                       database$TIME_SAMPLED,
                       database$SAMPLE_WEIGHT_2,
                       sep='-')


#identification of cases where "CONCAT" is the same 
# bla$concatdiff is TRUE when the records sharing a HASH do not all have the
# same CONCAT value. This is computed in one vectorised pass over the
# duplicated records instead of rescanning the whole database for every hash:
# a hash has more than one distinct CONCAT exactly when at least one of its
# records differs from the first record found with that hash.
dup.rows <- is.element(database$HASH, bla$name)
dup.hash <- database$HASH[dup.rows]
dup.concat <- database$CONCAT[dup.rows]

# CONCAT of the first record carrying the same HASH, aligned with dup.hash
first.concat <- dup.concat[match(dup.hash, dup.hash)]

# also correct (an empty logical vector) when `bla` has no rows
bla$concatdiff <- is.element(bla$name, dup.hash[dup.concat != first.concat])

In [25]:
#### M: false duplicates ####
# Hash codes whose records disagree on "CONCAT": every record carrying one of
# these codes is dropped.
Hash.list.diff.concat <- bla$name[bla$concatdiff]

restrictM <- database$HASH %in% Hash.list.diff.concat

# temp <- database[restrictM,] 

# vec_eff <- table(database$YEAR[restrictM])
# reasons$M[match(names(vec_eff), reasons$YEAR)] <- vec_eff

keepM <- !restrictM
database.1 <- database[keepM,] #exclusion

In [26]:
#### N: true duplicates ####
##treatment of cases with equal "CONCAT" variable values

# database.1 is split in two: records whose HASH is unique (OK) and records
# whose HASH is listed in `bla` (still problematic).
in.bla <- database.1$HASH %in% bla$name
database.1.ok <- database.1[!in.bla,]
database.1.nonok <- database.1[in.bla,]

# hash codes whose records all share the same "CONCAT", and the corresponding
# records of the full database
bla.diff <- bla[!bla$concatdiff,]
true.duplicates <- database[database$HASH %in% bla.diff$name,]

# majority is 9010 (e.g. duplicates of "M" and "M.from.Perc" cases) 
# only 9010 treated, remaining cases are deleted
is.9010 <- database.1.nonok$IMIS_SUBSTANCE_CODE=="9010"
database.1.nonok.9010 <- database.1.nonok[is.9010,]

In [27]:
# Records are put in HASH order so that records sharing a HASH are adjacent;
# one record out of two is retained in the next step.
hash.order <- order(database.1.nonok.9010$HASH)
database.1.nonok.9010 <- database.1.nonok.9010[hash.order,]

In [28]:
# Keep the first record of every HASH. This retains the same rows as taking
# every second row of pairs sorted by HASH, but does not depend on a
# hard-coded row count (6083) and stays correct if a HASH has more than two
# records or the input data change.
database.1.nonok.9010 <- database.1.nonok.9010[!duplicated(database.1.nonok.9010$HASH),]

In [29]:
# binding of two subsets of database.1
# database.final = records with a unique HASH (database.1.ok) followed by the
# retained substance-9010 duplicates (database.1.nonok.9010).
database.final <- rbind(database.1.ok,database.1.nonok.9010)

In [30]:
# Write database.final to 'database.parquet' (path relative to the working
# directory).
write_parquet(database.final, 'database.parquet')

In [None]:
#### finalizing database ####
# Show the current structure, then store the five columns below as character.
str(database.final)

for (chr.col in c("LAB_NUMBER", "OFFICE_ID", "SIC_CODE", "STATE", "ZIP_CODE")) {
  database.final[[chr.col]] <- as.character(database.final[[chr.col]])
}

# Save the table with all fields to a user-specific absolute path.
# NOTE(review): the 'R�sultats' folder name contains a replacement character,
# presumably a mis-encoded accented letter - confirm the real directory name.
allfields.rds <- "C:\\Users\\phisar\\Dropbox (IRSST)\\PhD\\Projet IMIS\\R�sultats\\IMIS_ND_predict\\CEHD 84_18\\CEHD_84_18_clean_allfields.rds"
saveRDS(database.final, file=allfields.rds)

#final variable names
#CEHD <- read.csv("C:\\Users\\phisar\\Documents\\Data_CEHD\\CEHD.csv",sep=",", header=T)
#dim(CEHD) #[1] 1037395      23
#names(CEHD)
#[1] "inspection_number"     "establishment_name"    "city"                  "state"                 "zip_code"             
#[6] "sic_code"              "naics_code"            "sampling_number"       "office_id"             "date_sampled"         
#[11] "date_reported"         "field_number"          "time_sampled"          "imis_substance_code"   "substance"            
#[16] "unit_of_measurement_N" "sample_weight_N"       "air_volume_sampled_N"  "sample_result_N"       "instrument_type_N"    
#[21] "hash"                  "data_source"           "position_in_source"   

names(database.final)
#[1] "AIR_VOLUME_SAMPLED"    "BLANK_USED"            "CITY"                  "DATE_REPORTED"         "DATE_SAMPLED"         
#[6] "EIGHT_HOUR_TWA_CALC"   "ESTABLISHMENT_NAME"    "FIELD_NUMBER"          "IMIS_SUBSTANCE_CODE"   "INSPECTION_NUMBER"    
#[11] "INSTRUMENT_TYPE"       "LAB_NUMBER"            "NAICS_CODE"            "OFFICE_ID"             "QUALIFIER"            
#[16] "SAMPLE_RESULT"         "SAMPLE_TYPE"           "SAMPLE_WEIGHT"         "SAMPLING_NUMBER"       "SIC_CODE"             
#[21] "STATE"                 "SUBSTANCE"             "TIME_SAMPLED"          "UNIT_OF_MEASUREMENT"   "ZIP_CODE"             
#[26] "YEAR"                  "CENSORED"              "SAMPLE_RESULT_2"       "UNIT_OF_MEASUREMENT_2" "QUALIFIER_2"          
#[31] "SAMPLE_WEIGHT_2"       "SAMPLE_RESULT_3"       "INSTRUMENT_TYPE_2"     "HASH"                  "CONCAT"    

# Renaming and selection of the final variables, done BY NAME.
# Positional indices silently mislabel or drop the wrong columns as soon as the
# column order or count of database.final changes (for example an extra index
# column added when the data are read back from a file).
# Mapping: current column name -> final column name.
final.names <- c(
  AIR_VOLUME_SAMPLED    = 'air_volume_sampled_N',
  CITY                  = 'city',
  DATE_REPORTED         = 'date_reported',
  DATE_SAMPLED          = 'date_sampled',
  ESTABLISHMENT_NAME    = 'establishment_name',
  FIELD_NUMBER          = 'field_number',
  IMIS_SUBSTANCE_CODE   = 'imis_substance_code',
  INSPECTION_NUMBER     = 'inspection_number',
  LAB_NUMBER            = 'lab_number',
  NAICS_CODE            = 'naics_code',
  OFFICE_ID             = 'office_id',
  SAMPLING_NUMBER       = 'sampling_number',
  SIC_CODE              = 'sic_code',
  STATE                 = 'state',
  SUBSTANCE             = 'substance',
  TIME_SAMPLED          = 'time_sampled',
  ZIP_CODE              = 'zip_code',
  YEAR                  = 'year',
  UNIT_OF_MEASUREMENT_2 = 'unit_of_measurement_N',
  SAMPLE_WEIGHT_2       = 'sample_weight_N',
  SAMPLE_RESULT_3       = 'sample_result_N',
  INSTRUMENT_TYPE_2     = 'instrument_type_N',
  HASH                  = 'hash'
)

# Rename the columns still carrying their original name; columns that were
# already renamed are left alone, so re-running this step is harmless.
old.idx <- match(names(final.names), names(database.final))
to.rename <- !is.na(old.idx)
names(database.final)[old.idx[to.rename]] <- unname(final.names)[to.rename]

# Stop early, with the list of culprits, if a final variable is unavailable.
missing.cols <- setdiff(unname(final.names), names(database.final))
if (length(missing.cols) > 0) {
  stop("database.final is missing the column(s) needed for: ",
       paste(missing.cols, collapse = ", "), call. = FALSE)
}

# selection and ordering of final variables
# database.final.1 keeps only the renamed columns, in their current order.
# NOTE(review): assumes database.final is a data.frame or tibble at this point.
keep.cols <- names(database.final) %in% final.names
database.final.1 <- database.final[,keep.cols,drop=FALSE]
names(database.final.1)
#[1] "air_volume_sampled_N"  "city"                  "date_reported"         "date_sampled"          "establishment_name"   
#[6] "field_number"          "imis_substance_code"   "inspection_number"     "lab_number"            "naics_code"           
#[11] "office_id"             "sampling_number"       "sic_code"              "state"                 "substance"            
#[16] "time_sampled"          "zip_code"              "year"                  "unit_of_measurement_N" "sample_weight_N"      
#[21] "sample_result_N"       "instrument_type_N"     "hash"  

# database.final.2 holds the same columns in the final order.
final.order <- c('inspection_number', 'establishment_name', 'city', 'state',
                 'zip_code', 'sic_code', 'naics_code', 'sampling_number',
                 'office_id', 'lab_number', 'date_sampled', 'date_reported',
                 'field_number', 'year', 'imis_substance_code', 'substance',
                 'instrument_type_N', 'sample_weight_N', 'time_sampled',
                 'air_volume_sampled_N', 'sample_result_N',
                 'unit_of_measurement_N', 'hash')
database.final.2 <- database.final.1[,final.order,drop=FALSE]
dim(database.final.2)
# [1] 1320183      23

# Number of records per substance code, most frequent first.
code.counts <- table(database.final.2$imis_substance_code)
subst <- setNames(data.frame(code.counts), c('code','n'))
subst <- subst[order(-subst$n),]
nrow(subst) #[1] 257

# Number of distinct hash values in the final table.
length(unique(database.final.2$hash))

# Save the final table to a user-specific absolute path, then read the same
# file back into `cehd`.
# NOTE(review): the 'R�sultats' folder name contains a replacement character,
# presumably a mis-encoded accented letter - confirm the real directory name.
saveRDS(database.final.2, file="C:\\Users\\phisar\\Dropbox (IRSST)\\PhD\\Projet IMIS\\R�sultats\\IMIS_ND_predict\\CEHD 84_18\\CEHD_84_18_clean.rds")

cehd <- readRDS("C:\\Users\\phisar\\Dropbox (IRSST)\\PhD\\Projet IMIS\\R�sultats\\IMIS_ND_predict\\CEHD 84_18\\CEHD_84_18_clean.rds")
# Two different dimensions are recorded below.
# NOTE(review): they look like results from two separate runs - confirm which
# one is current.
dim(cehd) #[1] 2395071      26
# [1] 1320183      23

# Number of elements selected by comparing instrument_type_N with the label
# "raw was NA".
length(cehd$instrument_type_N[cehd$instrument_type_N=="raw was NA"])
  
  
  
  

In [31]:
# Load 'CEHD_84_18_clean_original.rds' from a user-specific absolute path into
# `cehd`, then print it.
cehd <- readRDS("C:\\Users\\jmank\\Repositories\\ht_occupational_plus\\Input\\Raw\\OSHA\\CEHD\\CEHD1984_2018\\database\\CEHD_84_18_clean_original.rds")

cehd

Unnamed: 0_level_0,inspection_number,establishment_name,city,state,zip_code,sic_code,naics_code,sampling_number,office_id,lab_number,...,year,imis_substance_code,substance,instrument_type_N,sample_weight_N,time_sampled,air_volume_sampled_N,sample_result_N,unit_of_measurement_N,hash
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,...,<fct>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>
5,111252,MID ISLAND NON FERROUS FOUNDRY,East Farmingdale,NY,11735,3365,,5245345,214700,N00046,...,1984,1591,"Lead, Inorganic (as Pb)",ID121,0,37,74.000,0.0640,M,111252-1591-5245345-2LI84-49
8,111252,MID ISLAND NON FERROUS FOUNDRY,East Farmingdale,NY,11735,3365,,5245345,214700,N00046,...,1984,1840,"Nickel, Metal and Insoluble compounds (as Ni)",ID121,0,37,74.000,0.0000,M,111252-1840-5245345-2LI84-49
10,111252,MID ISLAND NON FERROUS FOUNDRY,East Farmingdale,NY,11735,3365,,5245345,214700,N00046,...,1984,0731,Copper Fume (as Cu),ID121,0,37,74.000,0.0063,M,111252-0731-5245345-2LI84-49
86,113951,PANTASOTE INC,Passaic,NJ,07055,3089,,5269568,214500,J00522,...,1984,2580,Vinyl Chloride,GC-FID,0,15,0.783,1.4000,P,113951-2580-5269568-K167
111,114033,IBG INC BLEACHETTE BLUING DIV,Clifton,NJ,07015,2879,,5269873,214500,K00477,...,1984,2490,Trichloroethylene,GC-FID,0,15,3.000,0.0000,,114033-2490-5269873-976
114,114033,IBG INC BLEACHETTE BLUING DIV,Clifton,NJ,07015,2879,,5269873,214500,K00477,...,1984,2460,Toluene,GC-FID,0,15,3.000,0.0000,,114033-2460-5269873-976
119,114033,IBG INC BLEACHETTE BLUING DIV,Clifton,NJ,07015,2879,,5269873,214500,K00477,...,1984,2280,Styrene,GC-FID,0,15,3.000,19.0000,P,114033-2280-5269873-976
143,114173,TRANS WORLD MANUFACTURING CORP,E RUTHERFORD,NJ,07073,3993,,5269725,214500,K00592,...,1984,0921,Diethylenetriamine,GC/NPD LC/UV,0,108,5.400,0.0000,M,114173-0921-5269725-2NJH84 K158
144,114173,TRANS WORLD MANUFACTURING CORP,E RUTHERFORD,NJ,07073,3993,,5269725,214500,K00594,...,1984,0921,Diethylenetriamine,GC/NPD LC/UV,0,66,3.300,0.0000,M,114173-0921-5269725-K160
145,114173,TRANS WORLD MANUFACTURING CORP,E RUTHERFORD,NJ,07073,3993,,5269741,214500,K00595,...,1984,0921,Diethylenetriamine,GC/NPD LC/UV,0,116,5.800,0.0000,M,114173-0921-5269741-K161


In [32]:
# Write `cehd` to 'cehd.parquet' (path relative to the working directory).
write_parquet(cehd, 'cehd.parquet')