In [1]:
/*********************************************************************************************************************/
/************************************** CLHLS longitudinal dataset survival time *************************************/
/*********************************************************************************************************************/
* Zhengting (Johnathan) He
* July 5th, 2021
* healthy-aging project
* Verify Yaxi's code on generting survival time: 98_14wave.do

In [31]:
// set working directories
global root "F:\Box Sync\Archives2020LLY\Zhengting\Duke Kunshan University Intern (zh133@duke.edu)\4 healthy aging-CLHLS\Group meeting coordination\survival time"
* define path for data sources
global RAW "${root}/raw data"
* define path for output data
global OUT "${root}/out data"
* define path for INTERMEDIATE
global INTER "${root}/inter data"

In [3]:
/*********************************************************************************************************************/
/************************************* I. logical check on death status and date *************************************/
/*********************************************************************************************************************/

/************************************* (1) Extract new added data at current wave *************************************/
use "${RAW}/2005_2014_longitudinal_dataset_released_version1.dta", clear
gen int id_year = mod(id, 100)
keep if id_year == 05 //7459 obs




(8,179 observations deleted)


In [5]:
/************************************* (2) Check the actual values of death date variables, against the codebook for those variables *************************************/
foreach var in d8vyear d11vyear d14vyear d8vmonth d11vmonth d14vmonth d8vday d11vday d14vday dth05_08 dth08_11 dth11_14 {
    codebook `var'
}
// codebook on death variables

// validated death year
* d8vyear, d11vyear, d14vyear: validated year of death*/
* -9: lost to follow up in the 2005/2008/2011/2014 survey*
* -8: died or lost to follow-up in previous waves*
* -7: it is for the deceased persons, not applicable for survivors*
* .: missing*/

// validated death month
* d8vmonth, d11vmonth, d14vmonth: validated month of death*/
* -9: lost to follow-up in the 2005/08/11/14 survey*/
* -8: died or lost to follow-up in previous waves*/
* -7: it is for the deceased persons, not applicable for survivors*/
* .: missing*/

// validated death day
* d8vday, d11vday, d14vday: validated day of death*/
* -9: lost to follow-up in the 2002/05/08/11/14 survey*/ 
* -8: died or lost to follow-up in previous waves*/
* -7: it is for the deceased persons, not applicable for survivors*/
* .: missing*/

// survival status
* dth05_08, dth08_11, dth11_14: status of survival, death, or lost to follow-up from 2002-2005/2005-2008/2008-2011/2011-2014 waves*/
* dth**_##:
* -9: lost to follow-up at the ## survey;
* -8: died or lost to follow-up in previous waves;
* 0: surviving at the ## survey;
* 1: died before the ## survey



--------------------------------------------------------------------------------
d8vyear                            validated year of death of the sampled person
--------------------------------------------------------------------------------

                  type:  numeric (int)

                 range:  [-9,2008]                    units:  1
         unique values:  6                        missing .:  3/7,459

            tabulation:  Freq.  Value
                         1,472  -9
                         3,281  -7
                           327  2005
                         1,022  2006
                           934  2007
                           420  2008
                             3  .

--------------------------------------------------------------------------------
d11vyear                           validated year of death of the sampled person
--------------------------------------------------------------------------------

                  type:  numeric (int)
     

In [6]:
/************************************* (3) Check whether there are logical input mistakes between death status for different waves *************************************/
// check whether there are logical mistakes for dth**_##
* If the current death status is -9/0/1, the previous one can only be 0;
* if the current death status is -8, then the previous can only be -8,-9 and 1.
preserve
rename dth05_08 dth4
rename dth08_11 dth5
rename dth11_14 dth6
label drop _all
forv i = 4/5 {
    local j = `i' + 1
    tab dth`i' if dth`j' == -9 | dth`j' == 0 | dth`j' == 1, missing //0
    tab dth`i' if dth`j' == -8, missing //-8, -9, 1
}
restore








  status of |
  survival, |
  death, or |
    lost to |
  follow-up |
   from the |
    2005 to |
 2008 waves |      Freq.     Percent        Cum.
------------+-----------------------------------
          0 |      3,281      100.00      100.00
------------+-----------------------------------
      Total |      3,281      100.00

  status of |
  survival, |
  death, or |
    lost to |
  follow-up |
   from the |
    2005 to |
 2008 waves |      Freq.     Percent        Cum.
------------+-----------------------------------
         -9 |      1,472       35.23       35.23
          1 |      2,706       64.77      100.00
------------+-----------------------------------
      Total |      4,178      100.00

  status of |
  survival, |
  death, or |
    lost to |
  follow-up |
  from 2008 |
    to 2011 |
      waves |      Freq.     Percent        Cum.
------------+-----------------------------------
          0 |      1,678      100.00      100.00
------------+----------------------

In [7]:
*****************************create work.dta, which has changed the death status according results above, and renanme dth**_##***********
rename dth05_08 dth8
rename dth08_11 dth11
rename dth11_14 dth14

In [8]:
global waves "8 11 14"                                                        //******need to be changed
global year1 "2005 2006 2007 2009 2010 2011 2013 2014"
global year2 "2008 2012"
global months "4 6 9 11"
global wavein "in5 in8 in11 in14"
save "${INTER}/work.dta", replace







file F:\Box Sync\Archives2020LLY\Zhengting\Duke Kunshan University Intern (zh133
> @duke.edu)\4 healthy aging-CLHLS\Group meeting coordination\survival time/inte
> r data/work.dta saved


In [9]:
/************************************* (4) Check whether there are logical input mistakes between death status and the verified death year, month, day *************************************/
// check whether there are logical mistakes between d*vyear d*vmonth d*vday dth**_##
foreach i of global waves {
    // unify missing value to "99"
    recode d`i'vday(. 88=99) 
    recode d`i'vmonth(. 88=99)
    recode d`i'vyear(. 8888 9999=99)  //no 88 for all the 4 vars
    
    replace d`i'vyear = 1 if d`i'vyear > 1997 & d`i'vyear < 2020
    replace d`i'vmonth = 1 if d`i'vmonth > 0 & d`i'vmonth < 13
    replace d`i'vday = 1 if d`i'vday > 0 & d`i'vday < 32
    
    bys d`i'vyear: gen fre`i'_year = _N
    bys d`i'vmonth: gen fre`i'_month = _N
    bys d`i'vday: gen fre`i'_day = _N
    bys dth`i': gen fre`i'_dth = _N
}
label drop _all
save "${INTER}/work1.dta", replace


(d8vday: 3 changes made)
(d8vmonth: 3 changes made)
(d8vyear: 3 changes made)
(2,703 real changes made)
(2,486 real changes made)
(2,628 real changes made)
(d11vday: 23 changes made)
(d11vmonth: 0 changes made)
(d11vyear: 0 changes made)
(1,071 real changes made)
(963 real changes made)
(1,006 real changes made)
(d14vday: 7004 changes made)
(d14vmonth: 6991 changes made)
(d14vyear: 7002 changes made)
(457 real changes made)
(407 real changes made)
(437 real changes made)


(note: file F:\Box Sync\Archives2020LLY\Zhengting\Duke Kunshan University Intern
>  (zh133@duke.edu)\4 healthy aging-CLHLS\Group meeting coordination\survival ti
> me/inter data/work1.dta not found)
file F:\Box Sync\Archives2020LLY\Zhengting\Duke Kunshan University Intern (zh133
> @duke.edu)\4 healthy aging-CLHLS\Group meeting coordination\survival time/inte
> r data/work1.dta saved


In [10]:
foreach i of global waves {
    keep d`i'vyear d`i'vmonth d`i'vday dth`i' fre`i'_year fre`i'_month fre`i'_day fre`i'_dth 
    duplicates drop d`i'vyear d`i'vmonth d`i'vday dth`i', force 
    save "${INTER}/wave`i'.dta", replace
    use "${INTER}/work1.dta", clear
}
use "${INTER}/wave14.dta",clear
append using "${INTER}/wave8.dta" "${INTER}/wave11.dta"



Duplicates in terms of d8vyear d8vmonth d8vday dth8

(7,455 observations deleted)
(note: file F:\Box Sync\Archives2020LLY\Zhengting\Duke Kunshan University Intern
>  (zh133@duke.edu)\4 healthy aging-CLHLS\Group meeting coordination\survival ti
> me/inter data/wave8.dta not found)
file F:\Box Sync\Archives2020LLY\Zhengting\Duke Kunshan University Intern (zh133
> @duke.edu)\4 healthy aging-CLHLS\Group meeting coordination\survival time/inte
> r data/wave8.dta saved

Duplicates in terms of d11vyear d11vmonth d11vday dth11

(7,454 observations deleted)
(note: file F:\Box Sync\Archives2020LLY\Zhengting\Duke Kunshan University Intern
>  (zh133@duke.edu)\4 healthy aging-CLHLS\Group meeting coordination\survival ti
> me/inter data/wave11.dta not found)
file F:\Box Sync\Archives2020LLY\Zhengting\Duke Kunshan University Intern (zh133
> @duke.edu)\4 healthy aging-CLHLS\Group meeting coordination\survival time/inte
> r data/wave11.dta saved

Duplicates in terms of d14vyear d14vmonth d14vday dth1

In [11]:
browse

In [12]:
/* The results show that, in wave0-wave11, -9, -8, 0/-7(alive) have completely the same freq, 
all missing values in d*vyear/month/day occur only when dth*=1(died). Only in wave14, all is missing in d14vyear/month/day when dth14=-9/-8/0. 
There is no logical mistakes between the 4 vars.*/

In [13]:
// tabulate the lost,died and alive number for each wave
use "${INTER}/work1.dta",clear
foreach i of global waves{
    tabulate dth`i' if dth`i' != -8
}




  status of |
  survival, |
  death, or |
    lost to |
  follow-up |
   from the |
    2005 to |
 2008 waves |      Freq.     Percent        Cum.
------------+-----------------------------------
         -9 |      1,472       19.73       19.73
          0 |      3,281       43.99       63.72
          1 |      2,706       36.28      100.00
------------+-----------------------------------
      Total |      7,459      100.00

  status of |
  survival, |
  death, or |
    lost to |
  follow-up |
  from 2008 |
    to 2011 |
      waves |      Freq.     Percent        Cum.
------------+-----------------------------------
         -9 |        532       16.21       16.21
          0 |      1,678       51.14       67.36
          1 |      1,071       32.64      100.00
------------+-----------------------------------
      Total |      3,281      100.00

  status of |
  survival, |
  death, or |
    lost to |
  follow-up |
       from |
  2011/2012 |
    to 2014 |
      waves |      Freq. 

In [14]:
foreach i of global waves{
    erase "${INTER}/wave`i'.dta"
}
erase "${INTER}/work1.dta"

In [15]:
/********************************************************************************************************************/
/************************************* II. Replacement of NA and input mistakes *************************************/
/********************************************************************************************************************/

use "${INTER}/work.dta", clear
/*change all . to 99 for month&day, . to 9999 for year*/
foreach a of global waves {
    recode d`a'vday (. = 99) 
    recode d`a'vmonth (. = 99)
    recode d`a'vyear (. = 9999)
}



(d8vday: 3 changes made)
(d8vmonth: 3 changes made)
(d8vyear: 3 changes made)
(d11vday: 23 changes made)
(d11vmonth: 0 changes made)
(d11vyear: 0 changes made)
(d14vday: 7004 changes made)
(d14vmonth: 6991 changes made)
(d14vyear: 7002 changes made)


In [16]:
****calculate the mid-point between the last interview date of the previous wave and the first interview date of the next wave
capture noisily gen in98 = mdy(month98, date98, year9899)                            
capture noisily gen in0 = mdy(month_0, day_0, 2000)
capture noisily gen in2 = mdy(month02, day02, 2002)
capture noisily gen in5 = mdy(monthin, dayin, 2005)
capture noisily gen in8 = mdy(month_8, day_8, year_8)
gen in11 = mdy(monthin_11, dayin_11, yearin_11)
gen in14 = mdy(monthin_14, dayin_14, yearin_14) 


year9899 not found

day_0 not found

day02 not found

(2 missing values generated)

(4,178 missing values generated)

(5,781 missing values generated)

(6,349 missing values generated)


In [17]:
forv i=1/3 {                                              
    local wavein2 = word("$wavein", `i')
         egen min_`wavein2' = min(`wavein2')
         egen max_`wavein2' = max(`wavein2')
    local j = `i'+1
    local wavein3 = word("$wavein", `j')
         egen min_`wavein3' = min(`wavein3')
         egen max_`wavein3' = max(`wavein3')
    gen mid_`wavein2'_`wavein3' = (max_`wavein2' + min_`wavein3')/2
    gen midyear_`wavein2'_`wavein3' = year(mid_`wavein2'_`wavein3')
    gen midmonth_`wavein2'_`wavein3' = month(mid_`wavein2'_`wavein3')
    gen midday_`wavein2'_`wavein3' = day(mid_`wavein2'_`wavein3')
drop min_`wavein3' max_`wavein3'
}

In [18]:
/************************************* (5) Replacement of the missing death date according to Rule 1 *************************************/
* Rule 1:
* For the three variables, year, month, and day:
* a. if only month is missing, the month is assumed to be July;
* b. if only day is missing, the day is assumed to be 15;
* c. for the rest of all the scenarios, the year/month/day is assumed to be that of the mid-point between the last interview date of the previous wave and
* the first interview date of the next wave. (these scenarios inc, all the three variables are missing, or any two variables are missing, or only year is
* missing.)
local j = 1
foreach i of global waves { 
    local inid = word("$wavein",`j')
        replace d`i'vday = midday_`inid'_in`i' if d`i'vday == 99 & dth`i' == 1
        replace d`i'vmonth = midmonth_`inid'_in`i' if d`i'vmonth == 99 & dth`i' == 1
        replace d`i'vyear = midyear_`inid'_in`i' if d`i'vyear == 9999 & dth`i' == 1
      
        recode d`i'vday (99 = 15) if d`i'vmonth != 99 & d`i'vyear != 9999 & dth`i' == 1 
        recode d`i'vmonth (99 = 7) if d`i'vday != 99 & d`i'vyear != 9999 & dth`i' == 1 
    
    local j = `j'+1
}



(3 real changes made)
(3 real changes made)
(3 real changes made)
(d8vday: 0 changes made)
(d8vmonth: 0 changes made)
(26 real changes made)
(0 real changes made)
(0 real changes made)
(d11vday: 0 changes made)
(d11vmonth: 0 changes made)
(13 real changes made)
(0 real changes made)
(11 real changes made)
(d14vday: 0 changes made)
(d14vmonth: 0 changes made)


In [19]:
/************************************* (6) Modify input mistakes of death date according to Rule 2 *************************************/
* Rule 2:
* a. change day 29/max of Feb to 28 for years 99, 01, 02, 03, 05, 06, 07, 09, 10, 11, 13, 14 (non-leap year);
* b. change day 30/max of Feb to 29 for years 00, 04, 08, 12 (leap year);
* c. change day 31 to 30 for months 4, 6, 9, 11
foreach i of global waves{
    foreach year of global year1{
        recode d`i'vday (29/max=28) if d`i'vyear == `year' & d`i'vmonth == 2
    }
    foreach year of global year2{
        recode d`i'vday (30/max=29) if d`i'vyear == `year' & d`i'vmonth == 2
    }
    foreach month of global months{
        recode d`i'vday (31=30) if d`i'vmonth == `month'
    }
}


(d8vday: 0 changes made)
(d8vday: 0 changes made)
(d8vday: 0 changes made)
(d8vday: 0 changes made)
(d8vday: 0 changes made)
(d8vday: 0 changes made)
(d8vday: 0 changes made)
(d8vday: 0 changes made)
(d8vday: 0 changes made)
(d8vday: 0 changes made)
(d8vday: 0 changes made)
(d8vday: 0 changes made)
(d8vday: 0 changes made)
(d8vday: 0 changes made)
(d11vday: 0 changes made)
(d11vday: 0 changes made)
(d11vday: 0 changes made)
(d11vday: 2 changes made)
(d11vday: 0 changes made)
(d11vday: 0 changes made)
(d11vday: 0 changes made)
(d11vday: 0 changes made)
(d11vday: 0 changes made)
(d11vday: 0 changes made)
(d11vday: 1 changes made)
(d11vday: 3 changes made)
(d11vday: 8 changes made)
(d11vday: 4 changes made)
(d14vday: 0 changes made)
(d14vday: 0 changes made)
(d14vday: 0 changes made)
(d14vday: 0 changes made)
(d14vday: 0 changes made)
(d14vday: 0 changes made)
(d14vday: 0 changes made)
(d14vday: 0 changes made)
(d14vday: 0 changes made)
(d14vday: 0 changes made)
(d14vday: 0 changes made)

In [20]:
/****************************************************************************************************************************************/
/************************************* III. calculating survival time, censor and lost to follow-up *************************************/
/****************************************************************************************************************************************/

/************************************* (7) Replacement of the missing interview baseline date according to Rule 3 *************************************/
* Rule 3:
* a. if only the interview day is missing, then the day is assumed to be 15th
* b. if both month and day are missing and the year isn't missing, or only the month is missing, the month/day is assumed to be that of the mid-point between the earliest interview date
* and the latest interiew date of that year
* c. no interview year is missing

codebook dayin    // no missing interview day
codebook monthin  // no missing interview month



--------------------------------------------------------------------------------
dayin                                                  day of the 2005 interview
--------------------------------------------------------------------------------

                  type:  numeric (byte)

                 range:  [1,31]                       units:  1
         unique values:  31                       missing .:  0/7,459

                  mean:   15.1337
              std. dev:   9.20308

           percentiles:        10%       25%       50%       75%       90%
                                 3         6        16        23        28


--------------------------------------------------------------------------------
monthin                                              month of the 2005 interview
--------------------------------------------------------------------------------

                  type:  numeric (byte)

                 range:  [3,10]                       units:  1
        

In [21]:
/************************************* (8) Modify input mistakes of interview baseline date according to Rule 2 *************************************/
* Rule 2:
* a. change day 29/max of Feb to 28 for years 99, 01, 02, 03, 05, 06, 07, 09, 10, 11, 13, 14 (non-leap year);
* b. change day 30/max of Feb to 29 for years 00, 04, 08, 12 (leap year);
* c. change day 31 to 30 for months 4, 6, 9, 11
recode dayin (29/max=28) if monthin == 2

foreach month of global months{
    recode dayin (31=30) if monthin == `month'
}


(dayin: 0 changes made)

(dayin: 0 changes made)
(dayin: 1 changes made)
(dayin: 1 changes made)
(dayin: 0 changes made)


In [22]:
****set interview baseline
**codebook on interview date variables
* datein: day of interview of the 2005 survey; 1~31, 99=missing
* monthin: month of the interview of the 2005 survey*; 1~12, 99=missing
gen interview_baseline = mdy(monthin, dayin, 2005)                                  //******need to be changed      

In [23]:
/************************************* (9) Calculate survival time for each person according to Rule 4 *************************************/
* Rule 4:
* Generate two different survival time (**for data sets with suffix_14**):

* One is `survival_bas', from interview baseline to death or censored, **up to 2014 wave**.
* a. For those died in the study: survival time = death date - interview date at baseline;
* b. For those lost in the study: survival time = the mid-point of the two adjacent waves - interview date at baseline;
* (the mid-point of the two adjacent waves is generated according to Rule 1)
* c. For those still alive at the end of the study: survival time = interview date in the last wave - interview date at baseline;
* d. If survival_bas < 0, change survival time to 0.

* Another one is `survival_bth', from birth to death or censored, **up to 2014 wave**.
* e. survival_bth = survival_bas + verified age (*trueage*)

* Variables for death/lost status
* `censor' is coded as: 1 = died, 0 = not died (alive or lost);
* `lost' is coed as: 1 = lost, . = not lost

* gen survival_bas,means the years from baseline to death or censored
* generate dthyear/month/day, means the exact death year/month/day of those who died during the whole period (2000-2014)
* gen lostdate, means the lost date for those lost in the survey, and equals to the mid-point of last day of the previous interview and the first day of the next one

gen dthyear = .
gen dthmonth = .
gen dthday = .
gen lostdate = .
gen survival_bas = .


(7,459 missing values generated)

(7,459 missing values generated)

(7,459 missing values generated)

(7,459 missing values generated)

(7,459 missing values generated)


In [24]:
local j=1
foreach i of global waves {
    replace dthyear = d`i'vyear if d`i'vyear > 0 & d`i'vyear < 2020
    replace dthmonth = d`i'vmonth if d`i'vmonth > 0 & d`i'vmonth < 13
    replace dthday = d`i'vday if d`i'vday > 0 & d`i'vday < 32
local inid = word("$wavein", `j')
    replace lostdate = mdy(midmonth_`inid'_in`i', midday_`inid'_in`i', midyear_`inid'_in`i') if dth`i' == -9
local j = `j' + 1
}



(2,706 real changes made)
(2,706 real changes made)
(2,706 real changes made)
(1,472 real changes made)
(1,071 real changes made)
(1,071 real changes made)
(1,071 real changes made)
(532 real changes made)
(468 real changes made)
(468 real changes made)
(468 real changes made)
(100 real changes made)


In [25]:
gen dthdate = mdy(dthmonth, dthday, dthyear)
replace survival_bas = (dthdate - interview_baseline)/365.25
gen censor = 0
replace censor = 1 if survival_bas != .  //generate censor=1 if die, censor=0 if survived until end of the wave or lost to follow


(3,214 missing values generated)

(4,245 real changes made)


(4,245 real changes made)


In [26]:
replace survival_bas = (lostdate - interview_baseline)/365.25 if lostdate != .
gen lost = 1
replace lost = . if lostdate == .


(2,104 real changes made)


(5,355 real changes made, 5,355 to missing)


In [27]:
gen interview2014 = mdy(monthin_14, dayin_14, yearin_14) if dth14 == 0
replace survival_bas = (interview2014 - interview_baseline)/365.25 if interview2014 != .


(6,349 missing values generated)

(1,110 real changes made)


In [28]:
**************replace the survival time to 0 for those whose survival was negative
sum survival_bas
* gen survival_bth,means the years from birth to death or censored
replace survival_bas = 0 if survival_bas < 0  
* gen survival_bth,means the years from birth to death or censored
gen survival_bth = survival_bas + trueage                                                            
erase "${INTER}/work.dta"
macro drop _all



    Variable |        Obs        Mean    Std. Dev.       Min        Max
-------------+---------------------------------------------------------
survival_bas |      7,459    3.747085    2.886631  -.5968515   9.492129

(25 real changes made)





In [32]:
/************************************* (10) calc survival time to 2018 *************************************/

merge 1:1 id using "${OUT}/dat14_18surtime.dta", keepus(id survival_bas14_18 survival_bth14_18 censor14_18 lost14_18) nolabel //47, 96, 1110, 821 merged for dat98/00/05/11_14

(note: variable id was long, now double to accommodate using data's values)

    Result                           # of obs.
    -----------------------------------------
    not matched                        12,431
        from master                     6,349  (_merge==1)
        from using                      6,082  (_merge==2)

    matched                             1,110  (_merge==3)
    -----------------------------------------


In [33]:
ren (survival_bas survival_bth lost censor) (survival_bas05_14 survival_bth05_14 lost05_14 censor05_14)

In [34]:
gen survival_bas05_18 = survival_bas05_14
replace survival_bas05_18 = survival_bas05_14 + survival_bas14_18 if censor05_14 == 0 & _merge == 3  //1538 replaced, one died in 2011.6.30, but was recorded as missing in d18vyear/month/day, so no need to be changed
preserve


(6,082 missing values generated)

(1,110 real changes made)



In [35]:
gen survival_bth05_18 = survival_bth05_14
replace survival_bth05_18 = survival_bth05_14 + survival_bas14_18 if censor05_14 == 0 & _merge == 3


(6,082 missing values generated)

(1,110 real changes made)


In [36]:
gen censor05_18 = censor05_14
replace censor05_18 = censor14_18 if _merge == 3


(6,082 missing values generated)

(282 real changes made)


In [37]:
gen lost05_18 = lost05_14
replace lost05_18 = lost14_18 if _merge==3


(11,437 missing values generated)

(288 real changes made)


In [38]:
drop if _merge==2
drop _merge


(6,082 observations deleted)



In [39]:
save "${OUT}/dat05_18surtime.dta", replace

file F:\Box Sync\Archives2020LLY\Zhengting\Duke Kunshan University Intern (zh133
> @duke.edu)\4 healthy aging-CLHLS\Group meeting coordination\survival time/out 
> data/dat05_18surtime.dta saved
