In [1]:
# Load the UK Biobank phenotype extract; character fields stay plain strings.
dat <- read.csv(
    file = "/share/storage/FTND/UKBiobank/observed/phenotype/ukb23019.csv",
    stringsAsFactors = FALSE
)

# Heaviness of Smoking Index (HSI)

## 1. On the days that you smoke, how soon after you wake up do you have your first cigarette?
 - A. Within 5 minutes (3 points)
 - B. 6-30 minutes (2 points)
 - C. 31-60 minutes (1 point)
 - D. After 60 minutes (0 points)

## 2. How many cigarettes do you typically smoke per day?
 - A. 10 or fewer (0 points)
 - B. 11-20 (1 point)
 - C. 21-30 (2 points)
 - D. 31 or more (3 points)

## SCORING:
 - 0-2: low addiction
 - 3-4: moderate addiction
 - 5-6: high addiction


## how soon after wake up to first cigarette
 - 1 Less than 5 minutes
 - 2   Between 5-15 minutes
 - 3   Between 30 minutes - 1 hour
 - 4   Between 1 and 2 hours
 - 5   Longer than 2 hours
 - -1  Do not know
 - -3  Prefer not to answer

In [2]:
# Points for time-to-first-cigarette (X3466.0.0), indexed by answer code 1..5:
#   1 -> 3, 2 -> 2, 3 -> 1, 4 and 5 -> 0.
# Any other value (-1, -3, missing) finds no match and is scored NA.
time.points <- c(3, 2, 1, 0, 0)
dat$score.time <- time.points[match(dat$X3466.0.0, 1:5)]


## Number of cigarettes currently smoked daily (current cigarette smokers)
 - -10   Less than one a day
 - -1  Do not know
 - -3  Prefer not to answer


In [3]:
# Points for cigarettes per day (X3456.0.0):
#   (0, 10] -> 0, (10, 20] -> 1, (20, 30] -> 2, above 30 -> 3.
# cut() numbers the bins 1..4, hence the "- 1". Values outside the bins
# (-1, -3, 0, missing) come out as NA.
dat$score.cpd <- cut(dat$X3456.0.0, breaks = c(0, 10, 20, 30, Inf), labels = FALSE) - 1
# The special code -10 ("less than one a day") scores 0 points.
dat$score.cpd[dat$X3456.0.0 %in% -10] <- 0


In [4]:
# HSI total = time-to-first-cigarette points + cigarettes-per-day points.
# The sum is NA whenever either component is NA.
dat$heaviness <- with(dat, score.time + score.cpd)

# Kinship

Kinship coefficient ranges of >0.354, [0.177, 0.354], [0.0884, 0.177], and [0.0442, 0.0884] correspond to duplicate/MZ twin, 1st-degree, 2nd-degree, and 3rd-degree relationships, respectively.

In [5]:
# Load the pairwise relatedness table (one row per related pair: ID1, ID2,
# HetHet, IBS0, Kinship). The argument is spelled out as `header` rather than
# relying on partial matching of `head`.
related <- read.table("/share/storage/FTND/UKBiobank/observed/rel/ukb24603_rel_s488295.dat", header = TRUE, stringsAsFactors = FALSE)

In [6]:
# Preview the first rows of the relatedness table.
head(related)

ID1,ID2,HetHet,IBS0,Kinship
1000031,3813341,0.067,0.0002,0.2451
1000237,1231015,0.045,0.0143,0.0595
1000251,5910072,0.046,0.0141,0.0677
1000262,5259319,0.076,0.0051,0.2465
1000286,5319336,0.05,0.0098,0.1165
1000295,5338158,0.045,0.0151,0.0569


In [7]:
# IDs of participants whose HSI total could be computed (non-missing heaviness).
has.hsi <- with(dat, eid[!is.na(heaviness)])

In [8]:
# Number of participants with a computable HSI.
length(has.hsi)

In [9]:
# Keep only related pairs in which BOTH members have an HSI score.
related.hsi <- related[related$ID1 %in% has.hsi & related$ID2 %in% has.hsi, ]
dim(related.hsi)

### Degrees

In [10]:
# Degree distribution: the inner table counts how many pairs each individual
# appears in; the outer table counts how many individuals have each degree.
table(table(unlist(related.hsi[c("ID1", "ID2")], use.names = FALSE)))


   1    2    3    4    6    7    8 
1622   82   15    2    1    1    1 

### Remove one with 8 relatives

In [11]:
# Start the exclusion list with every individual who appears in exactly
# 8 related pairs (IDs are the table names, i.e. character strings).
remove.list <- c()
remove.list <- c(
    remove.list,
    names(which(table(unlist(related.hsi[c("ID1", "ID2")], use.names = FALSE)) == 8))
)
remove.list

In [12]:
# Drop every pair that involves an individual already marked for removal.
related.hsi <- related.hsi[!(related.hsi$ID1 %in% remove.list | related.hsi$ID2 %in% remove.list), ]
# Recompute the degree distribution on the remaining pairs.
table(table(unlist(related.hsi[c("ID1", "ID2")], use.names = FALSE)))


   1    2    3    5    7 
1620   85   14    1    1 

### Remove one with 7 relatives

In [13]:
# Mark every individual who now appears in exactly 7 related pairs.
remove.list <- c(
    remove.list,
    names(which(table(unlist(related.hsi[c("ID1", "ID2")], use.names = FALSE)) == 7))
)
remove.list
# Drop every pair that involves a marked individual.
related.hsi <- related.hsi[!(related.hsi$ID1 %in% remove.list | related.hsi$ID2 %in% remove.list), ]
# Recompute the degree distribution on the remaining pairs.
table(table(unlist(related.hsi[c("ID1", "ID2")], use.names = FALSE)))


   1    2    3    5 
1613   85   14    1 

### Remove one with 5 relatives

In [14]:
# Mark every individual who now appears in exactly 5 related pairs.
remove.list <- c(
    remove.list,
    names(which(table(unlist(related.hsi[c("ID1", "ID2")], use.names = FALSE)) == 5))
)
remove.list
# Drop every pair that involves a marked individual.
related.hsi <- related.hsi[!(related.hsi$ID1 %in% remove.list | related.hsi$ID2 %in% remove.list), ]
# Recompute the degree distribution on the remaining pairs.
table(table(unlist(related.hsi[c("ID1", "ID2")], use.names = FALSE)))


   1    2    3 
1614   85   12 

### Remove ones with 3 relatives
Turns out to be three families of 4 people each

In [15]:
# Individuals who appear in exactly 3 related pairs.
degree.3 <- names(which(table(unlist(related.hsi[c("ID1", "ID2")], use.names = FALSE)) == 3))
# All pairs touching at least one of them, annotated with both members' HSI
# totals so the families can be inspected by eye.
tmp <- related.hsi[related.hsi$ID1 %in% degree.3 | related.hsi$ID2 %in% degree.3, ]
tmp[c("ID1.hsi", "ID2.hsi")] <- lapply(
    tmp[c("ID1", "ID2")],
    function(id) dat$heaviness[match(id, dat$eid)]
)
tmp

Unnamed: 0,ID1,ID2,HetHet,IBS0,Kinship,ID1.hsi,ID2.hsi
18716,1872767,3945318,0.048,0.0128,0.0822,2,0
18717,1872767,4490044,0.071,0.0053,0.226,2,2
18718,1872767,5849383,0.046,0.0147,0.0576,2,3
51668,3428259,6024683,0.078,0.004,0.2638,0,1
52469,3465351,3603014,0.08,0.0047,0.2661,2,4
52471,3465351,5110221,0.079,0.0055,0.2573,2,3
53452,3509789,3465351,0.078,0.0052,0.2514,0,2
53453,3509789,3603014,0.076,0.0045,0.2526,0,4
53455,3509789,5110221,0.074,0.0038,0.2505,0,3
59678,3807272,3428259,0.079,0.0049,0.259,5,0


In [16]:
# Retain one hand-picked member from each of the three degree-3 families and
# mark every other degree-3 individual for removal.
remove.list <- c(remove.list, setdiff(degree.3, c("5849383", "3807272", "3603014")))
remove.list
# Drop every pair that involves a marked individual.
related.hsi <- related.hsi[!(related.hsi$ID1 %in% remove.list | related.hsi$ID2 %in% remove.list), ]
# Recompute the degree distribution on the remaining pairs.
table(table(unlist(related.hsi[c("ID1", "ID2")], use.names = FALSE)))


   1    2 
1614   85 

### Remove ones with 2 relatives
One complicated family, removed manually

In [17]:
# Individuals who appear in exactly 2 related pairs.
degree.2 <- names(which(table(unlist(related.hsi[c("ID1", "ID2")], use.names = FALSE)) == 2))
# All pairs touching at least one of them, annotated with both members' HSI totals.
tmp <- related.hsi[related.hsi$ID1 %in% degree.2 | related.hsi$ID2 %in% degree.2, ]
tmp[c("ID1.hsi", "ID2.hsi")] <- lapply(
    tmp[c("ID1", "ID2")],
    function(id) dat$heaviness[match(id, dat$eid)]
)
head(tmp)
dim(tmp)

Unnamed: 0,ID1,ID2,HetHet,IBS0,Kinship,ID1.hsi,ID2.hsi
1128,1054543,3901066,0.046,0.0151,0.0561,4,2
2480,1118785,5960697,0.053,0.0094,0.1274,2,0
2481,1118785,5997372,0.076,0.0035,0.2562,2,2
5876,1274655,3492974,0.075,0.0053,0.2387,3,3
6935,1322350,1048890,0.045,0.0144,0.0598,6,3
6936,1322350,4432199,0.043,0.0154,0.0444,6,3


In [18]:
# Number of distinct individuals appearing in the degree-2 pair table.
length(unique(c(tmp$ID1, tmp$ID2)))

# Resolve the degree-2 pairs one connected family at a time. From each simple
# family (at most 3 pairs) keep the member with the highest HSI total, breaking
# ties at random, and queue everyone else for removal. Larger families are only
# printed so they can be resolved by hand.
# NOTE: tie-breaking uses the RNG; call set.seed() beforehand if the resulting
# sample list must be reproducible.
degree.2.remove <- c()
while(dim(tmp)[1] > 0){
    # Seed the family with every pair touching either member of the first remaining pair.
    tmp.fam <- tmp[(tmp$ID1 %in% c(tmp$ID1[1], tmp$ID2[1])) | (tmp$ID2 %in% c(tmp$ID1[1], tmp$ID2[1])),]
    fam <- unique(c(tmp.fam$ID1, tmp.fam$ID2))
    l <- 0
    # Family structures can be chains, so keep expanding the member set
    # until a pass adds no new individual.
    while(length(fam) != l){
        l <- length(fam)
        tmp.fam <- tmp[tmp$ID1 %in% fam | tmp$ID2 %in% fam,]
        fam <- unique(c(tmp.fam$ID1, tmp.fam$ID2))
    }

    tmp.fam <- tmp[tmp$ID1 %in% fam | tmp$ID2 %in% fam,]
    if(dim(tmp.fam)[1] > 3) {
        # Too complicated to resolve automatically: show it for manual handling.
        print(tmp.fam)
    } else {
        fam.hsi <- dat$heaviness[match(fam, dat$eid)]
        top <- which(fam.hsi == max(fam.hsi))
        # Do NOT use sample(top, 1): when `top` is a single index k > 1,
        # sample() draws from 1:k and may keep a member without the highest HSI.
        # Sampling a position within `top` is safe for any length.
        keep <- top[sample.int(length(top), 1)]
        degree.2.remove <- c(degree.2.remove, fam[-keep])
    }
    # This family is done: drop all of its pairs from the work list.
    tmp <- tmp[(! tmp$ID1 %in% fam) & (! tmp$ID2 %in% fam), ]
}
length(degree.2.remove)

           ID1     ID2 HetHet   IBS0 Kinship ID1.hsi ID2.hsi
73107  4452717 1904744  0.042 0.0148  0.0447       2       3
73149  4452717 6002751  0.041 0.0136  0.0475       2       0
89659  5216767 4358391  0.041 0.0146  0.0444       2       1
106762 6002751 4358391  0.041 0.0141  0.0456       0       1


### Check
143 - 5 (special family) = 138

138 / 3 = 46 families of three

number removed = 46*2 = 92

number of independent ids = 46*3 + 5 = 143

Correct

In [19]:
# Manual resolution of the complicated family printed above: remove these two
# individuals in addition to the automatically selected ones.
degree.2.remove <- append(degree.2.remove, c("4452717", "4358391"))


### Remove ones with 1 relative

In [20]:
# Add the individuals selected from the degree-2 families to the exclusion list.
remove.list <- c(remove.list, degree.2.remove)
# Drop every pair that involves a marked individual.
related.hsi <- related.hsi[!(related.hsi$ID1 %in% remove.list | related.hsi$ID2 %in% remove.list), ]
# Recompute the degree distribution on the remaining pairs.
table(table(unlist(related.hsi[c("ID1", "ID2")], use.names = FALSE)))



   1 
1556 

In [21]:
# Number of related pairs (rows) still to be resolved.
dim(related.hsi)

In [22]:
# Remove degree = 1
related.hsi$ID1.hsi <- dat$heaviness[match(related.hsi$ID1, dat$eid)]
related.hsi$ID2.hsi <- dat$heaviness[match(related.hsi$ID2, dat$eid)]
head(related.hsi)

Unnamed: 0,ID1,ID2,HetHet,IBS0,Kinship,ID1.hsi,ID2.hsi
92,1005357,2112322,0.077,0.0059,0.2404,3,0
140,1007749,2606200,0.055,0.0086,0.1394,1,3
223,1011536,3959448,0.047,0.0136,0.0716,0,1
311,1015603,4854910,0.043,0.0152,0.0444,2,1
544,1026903,2646986,0.081,0.0043,0.271,1,3
686,1034249,3995206,0.045,0.0148,0.058,2,6


In [23]:
# For each pair, choose the member to drop: ID2 when ID1 has the strictly
# higher HSI total, otherwise ID1 (so ties drop ID1).
related.hsi$remove <- with(
    related.hsi,
    replace(ID1, ID1.hsi > ID2.hsi, ID2[ID1.hsi > ID2.hsi])
)
head(related.hsi)

Unnamed: 0,ID1,ID2,HetHet,IBS0,Kinship,ID1.hsi,ID2.hsi,remove
92,1005357,2112322,0.077,0.0059,0.2404,3,0,2112322
140,1007749,2606200,0.055,0.0086,0.1394,1,3,1007749
223,1011536,3959448,0.047,0.0136,0.0716,0,1,1011536
311,1015603,4854910,0.043,0.0152,0.0444,2,1,4854910
544,1026903,2646986,0.081,0.0043,0.271,1,3,1026903
686,1034249,3995206,0.045,0.0148,0.058,2,6,1034249


In [24]:
# Add the member chosen from each remaining pair to the exclusion list.
remove.list <- append(remove.list, related.hsi$remove)

In [25]:
# Analysis set: participants with an HSI total who are not on the exclusion list.
unrelated <- dat[!(is.na(dat$heaviness) | dat$eid %in% remove.list), ]

# Age, sex, PCs

In [26]:
# Collapse the 0-6 HSI total into three categories:
#   0 = low (default), 1 = moderate (3-4), 2 = high (5-6).
unrelated$hsi <- 0
unrelated$hsi[which(unrelated$heaviness >= 3 & unrelated$heaviness <= 4)] <- 1
unrelated$hsi[which(unrelated$heaviness >= 5 & unrelated$heaviness <= 6)] <- 2
# Cross-tabulate category against raw total as a sanity check.
table(unrelated$hsi, unrelated$heaviness)

# Covariates: sex, standardized age, standardized squared (standardized) age,
# and their products with sex.
unrelated$Sex <- unrelated$X31.0.0
unrelated$Age <- scale(unrelated$X21022.0.0)
unrelated$Age2 <- scale(unrelated$Age * unrelated$Age)
unrelated$Age.Sex <- unrelated$Sex * unrelated$Age
unrelated$Age2.Sex <- unrelated$Sex * unrelated$Age2
# Copy the 20 principal-component fields X22009.0.1..20 to PC1..PC20.
unrelated[paste0("PC", 1:20)] <- unrelated[paste0("X22009.0.", 1:20)]

   
       0    1    2    3    4    5    6
  0 5696 5328 7492    0    0    0    0
  1    0    0    0 9126 4627    0    0
  2    0    0    0    0    0 1819  592

In [27]:
## Fit the covariate model on the unrelated participants with an HSI score
## (the `unrelated` data frame, not the full UK Biobank cohort).
fit <- lm(hsi ~ Age + Age2 + Sex + Age.Sex + Age2.Sex +
          PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
          PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20,
          data = unrelated)
# Coefficient table (estimate, std. error, t value, p-value). The element is
# named in full instead of relying on `$` partial matching of "coefficient".
summary(fit)$coefficients


Unnamed: 0,Estimate,Std. Error,t value,Pr(>|t|)
(Intercept),0.4558371938,0.004753419,95.896694,0.0
Age,-0.0146036021,0.004779602,-3.05540125,0.002249345
Age2,-0.0297690165,0.004731022,-6.2923019,3.16617e-10
Sex,0.1394813877,0.006704344,20.80462808,1.560534e-95
Age.Sex,0.0032795079,0.006687094,0.49042346,0.6238375
Age2.Sex,-0.007116818,0.006678506,-1.06563031,0.2865984
PC1,-0.0008588929,6.380421e-05,-13.46138192,3.370176e-41
PC2,0.000764333,0.0001385915,5.51500488,3.513168e-08
PC3,-0.0013244145,0.0002613074,-5.06841599,4.032505e-07
PC4,0.0012022078,0.0003456371,3.47823733,0.000505357


In [28]:
# Phenotype + covariate table: ID, HSI category, sex/age terms and PC1..PC20.
pheno.cols <- c("eid", "hsi", "Sex", "Age", "Age2", "Age.Sex", "Age2.Sex", paste0("PC", 1:20))
keep <- unrelated[, pheno.cols]
# Complete cases only: drop any row with at least one missing value.
keep <- keep[!(rowSums(is.na(keep)) > 0), ]
# Write the full table (with header) and the bare sample-ID list (no header).
write.table(
    keep,
    "/share/storage/FTND/UKBiobank/HeavinessSmokingIndex/phenotypes/HSI~Sex.Age.Age2.AgeSex.Age2Sex.20EVs.txt",
    row.names = FALSE, col.names = TRUE, quote = FALSE
)
write.table(
    keep$eid,
    "/share/storage/FTND/UKBiobank/HeavinessSmokingIndex/phenotypes/HSI.samples",
    row.names = FALSE, col.names = FALSE, quote = FALSE
)

In [29]:
# Total number of entries accumulated on the exclusion list.
length(remove.list)

In [30]:
# Read the rs16969968 dosage file and attach column V3 to each sample by
# matching eid against column V1 (samples without a match get NA).
mrBig <- read.table(
    "/share/storage/FTND/UKBiobank/observed/rs16969968/ukb_rs16969968.mach.mldose",
    stringsAsFactors = FALSE
)
keep$rs16969968 <- mrBig[match(keep$eid, mrBig$V1), "V3"]
# Reorder so the variant sits right after the phenotype, then write with header.
snp.cols <- c("eid", "hsi", "rs16969968", "Sex", "Age", "Age2", "Age.Sex", "Age2.Sex", paste0("PC", 1:20))
keep <- keep[, snp.cols]
write.table(
    keep,
    "/share/storage/FTND/UKBiobank/HeavinessSmokingIndex/phenotypes/HSI~rs16969968.Sex.Age.Age2.AgeSex.Age2Sex.20EVs.txt",
    row.names = FALSE, col.names = TRUE, quote = FALSE
)


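## Update 03/27/2019 - Keep self-reported White participants only (the code filters on X21000.0.0, codes 1/1001/1002/1003, not on X22006) and keep only the PCs explaining 75% of the variance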

In [41]:
# Restrict to participants whose X21000.0.0 value is one of the White codes
# (1, 1001, 1002, 1003), then keep complete cases of the model variables.
unrelated_white <- unrelated[(unrelated$X21000.0.0 %in% c(1, 1001, 1002, 1003)),]
keep <- unrelated_white[, c("eid", "hsi", "Sex", "Age", paste0("PC", 1:20))]
keep <- keep[ rowSums(is.na(keep)) == 0, ]
# Fit on the white-only complete-case table. The previous version passed
# `data = unrelated`, so the model (and the PC ranking derived from it) was
# computed on all ancestries instead of the subset prepared above.
# NOTE(review): the PC selection used further down was derived from the old
# fit; re-check it after re-running this cell.
fit <- lm(hsi ~ Age + Sex + 
          PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
          PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20,
          data = keep)


In [55]:
# Sequential sum of squares attributed to each PC term in `fit`.
# Rows and column are selected by name rather than by position (3:22, 2), so
# the extraction stays correct if terms are added to or removed from the model.
pc.terms <- paste0("PC", 1:20)
ss <- anova(fit)[pc.terms, "Sum Sq"]
names(ss) <- pc.terms
# Rank the PCs by sum of squares (TRUE spelled out; `T` can be reassigned).
ss_sorted <- sort(ss, decreasing = TRUE)
# Share of the total PC sum of squares carried by each PC, largest first.
ss_pct <- ss_sorted / sum(ss_sorted)
ss_pct

In [57]:
# White-only subset (X21000.0.0 in 1, 1001, 1002, 1003) with the four selected
# PCs, in the order PC1, PC5, PC2, PC3.
white.codes <- c(1, 1001, 1002, 1003)
unrelated_white <- unrelated[unrelated$X21000.0.0 %in% white.codes, ]
keep <- unrelated_white[, c("eid", "hsi", "Sex", "Age", paste0("PC", c(1,5,2,3)))]
# Complete cases only: drop any row with at least one missing value.
keep <- keep[!(rowSums(is.na(keep)) > 0), ]

In [58]:
# Write the white-only phenotype table (with header) and its sample-ID list (no header).
write.table(
    keep,
    "/share/storage/FTND/UKBiobank/HeavinessSmokingIndex/phenotypes/HSI~Sex.Age.4EVs.white.txt",
    row.names = FALSE, col.names = TRUE, quote = FALSE
)
write.table(
    keep$eid,
    "/share/storage/FTND/UKBiobank/HeavinessSmokingIndex/phenotypes/HSI.white.samples",
    row.names = FALSE, col.names = FALSE, quote = FALSE
)
# Attach the rs16969968 dosage (column V3) by matching eid against column V1.
mrBig <- read.table(
    "/share/storage/FTND/UKBiobank/observed/rs16969968/ukb_rs16969968.mach.mldose",
    stringsAsFactors = FALSE
)
keep$rs16969968 <- mrBig[match(keep$eid, mrBig$V1), "V3"]
# Reorder so the variant sits right after the phenotype, then write with header.
keep <- keep[, c("eid", "hsi", "rs16969968", "Sex", "Age", paste0("PC", c(1,5,2,3)))]
write.table(
    keep,
    "/share/storage/FTND/UKBiobank/HeavinessSmokingIndex/phenotypes/HSI~rs16969968.Sex.Age.4EVs.white.txt",
    row.names = FALSE, col.names = TRUE, quote = FALSE
)


In [59]:
# Rows and columns of the final white-only table.
dim(keep)