In [1]:
#Pvalue for table 1 manuscript, 23.09.08
library(dplyr)

set.seed(1)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
# Load the tab-separated patient table used for the statistics.
data_df <- read.delim("../../../preprocessed_data/meta/patient_info_for_statistics.v3.tsv")

# Subset by the acpa code: 0 = control, 1 = acpa-pos, 2 = acpa-neg.
control_df <- filter(data_df, acpa == 0)
acpa_pos_df <- filter(data_df, acpa == 1)
acpa_neg_df <- filter(data_df, acpa == 2)

# Both acpa-pos and acpa-neg rows together (everything except controls and NA).
ra_df <- filter(data_df, acpa %in% c(1, 2))

In [3]:
# Run Fisher's exact test on a 2 x 3 table of yes/no counts for one condition
# across the acpa-neg, acpa-pos and control groups.
#
# Args:
#   acpa_neg_df, acpa_pos_df, control_df: one data frame per group; each must
#     contain the column named by `condition_of_interest`.
#   condition_of_interest: name of the column to tabulate. Entries equal to 1
#     are counted as "yes" and entries equal to 0 as "no"; NA and any other
#     value are not counted.
#
# Returns:
#   The "htest" object returned by fisher.test(). The count table is printed
#   as a side effect before the test is run.
#
# Raises:
#   An error when the column is absent from any group. Without this check a
#   misspelled column name produces an all-zero table instead of a failure.
run_ftest_3class <- function(acpa_neg_df, acpa_pos_df, control_df, condition_of_interest) {
    groups <- list("acpa-neg" = acpa_neg_df,
                   "acpa-pos" = acpa_pos_df,
                   "control" = control_df)

    # `[[` returns NULL for an unknown column name, which would silently
    # turn every count below into 0.
    has_column <- vapply(groups,
                         function(group_df) !is.null(group_df[[condition_of_interest]]),
                         logical(1))
    if (!all(has_column)) {
        stop("column '", condition_of_interest, "' not found in group(s): ",
             paste(names(groups)[!has_column], collapse = ", "),
             call. = FALSE)
    }

    # c(yes, no) counts for a single group.
    count_yes_no <- function(group_df) {
        values <- group_df[[condition_of_interest]]
        c(sum(values == 1, na.rm = TRUE), sum(values == 0, na.rm = TRUE))
    }

    dat <- data.frame(lapply(groups, count_yes_no),
                      row.names = paste0(condition_of_interest, c("_yes", "_no")),
                      check.names = FALSE,
                      stringsAsFactors = FALSE)
    colnames(dat) <- names(groups)

    print(dat)
    # The argument must stay named `dat`: fisher.test() records the deparsed
    # argument as the "data:" label of the printed result.
    fisher.test(dat)
}

# Run Fisher's exact test on a 2 x 2 table of yes/no counts for one condition
# in the acpa-neg and acpa-pos groups.
#
# Args:
#   acpa_neg_df, acpa_pos_df: one data frame per group; each must contain the
#     column named by `condition_of_interest`.
#   condition_of_interest: name of the column to tabulate. Entries equal to 1
#     are counted as "yes" and entries equal to 0 as "no"; NA and any other
#     value are not counted.
#
# Returns:
#   The "htest" object returned by fisher.test(). The count table is printed
#   as a side effect before the test is run.
#
# Raises:
#   An error when the column is absent from either group. Without this check
#   a misspelled column name produces an all-zero table (and a p-value of 1)
#   instead of a failure.
run_ftest_2class <- function(acpa_neg_df, acpa_pos_df, condition_of_interest) {
    groups <- list("acpa-neg" = acpa_neg_df,
                   "acpa-pos" = acpa_pos_df)

    # `[[` returns NULL for an unknown column name, which would silently
    # turn every count below into 0.
    has_column <- vapply(groups,
                         function(group_df) !is.null(group_df[[condition_of_interest]]),
                         logical(1))
    if (!all(has_column)) {
        stop("column '", condition_of_interest, "' not found in group(s): ",
             paste(names(groups)[!has_column], collapse = ", "),
             call. = FALSE)
    }

    # c(yes, no) counts for a single group.
    count_yes_no <- function(group_df) {
        values <- group_df[[condition_of_interest]]
        c(sum(values == 1, na.rm = TRUE), sum(values == 0, na.rm = TRUE))
    }

    dat <- data.frame(lapply(groups, count_yes_no),
                      row.names = paste0(condition_of_interest, c("_yes", "_no")),
                      check.names = FALSE,
                      stringsAsFactors = FALSE)
    colnames(dat) <- names(groups)

    print(dat)
    # The argument must stay named `dat`: fisher.test() records the deparsed
    # argument as the "data:" label of the printed result.
    fisher.test(dat)
}


# Run Fisher's exact test on a 2 x 2 table of yes/no counts in the acpa-neg
# and acpa-pos groups, for a column that codes "no" as 2 rather than 0.
#
# Args:
#   acpa_neg_df, acpa_pos_df: one data frame per group; each must contain the
#     column named by `condition_of_interest`.
#   condition_of_interest: name of the column to tabulate. Entries equal to 1
#     are counted as "yes" and entries equal to 2 as "no"; NA and any other
#     value (including 0) are not counted.
#
# Returns:
#   The "htest" object returned by fisher.test(). The count table is printed
#   as a side effect before the test is run.
#
# Raises:
#   An error when the column is absent from either group. Without this check
#   a misspelled column name produces an all-zero table (and a p-value of 1)
#   instead of a failure.
run_ftest_2class_rf <- function(acpa_neg_df, acpa_pos_df, condition_of_interest) {
    groups <- list("acpa-neg" = acpa_neg_df,
                   "acpa-pos" = acpa_pos_df)

    # `[[` returns NULL for an unknown column name, which would silently
    # turn every count below into 0.
    has_column <- vapply(groups,
                         function(group_df) !is.null(group_df[[condition_of_interest]]),
                         logical(1))
    if (!all(has_column)) {
        stop("column '", condition_of_interest, "' not found in group(s): ",
             paste(names(groups)[!has_column], collapse = ", "),
             call. = FALSE)
    }

    # c(yes, no) counts for a single group; "no" is coded as 2 here.
    count_yes_no <- function(group_df) {
        values <- group_df[[condition_of_interest]]
        c(sum(values == 1, na.rm = TRUE), sum(values == 2, na.rm = TRUE))
    }

    dat <- data.frame(lapply(groups, count_yes_no),
                      row.names = paste0(condition_of_interest, c("_yes", "_no")),
                      check.names = FALSE,
                      stringsAsFactors = FALSE)
    colnames(dat) <- names(groups)

    print(dat)
    # The argument must stay named `dat`: fisher.test() records the deparsed
    # argument as the "data:" label of the printed result.
    fisher.test(dat)
}

# kruskal.test(bmi ~ group, data = my_data)



In [4]:
# Fisher's exact tests

# Columns compared across the acpa-neg, acpa-pos and control groups.
ftest_3class_list <- c("sex", "smoking")
for (condition_of_interest in ftest_3class_list) {
    print(condition_of_interest)
    results <- run_ftest_3class(acpa_neg_df, acpa_pos_df, control_df,
                                condition_of_interest)
    print(results)
}

# rf goes through the dedicated helper, which counts 2 (not 0) as "no".
results <- run_ftest_2class_rf(acpa_neg_df, acpa_pos_df, "rf")
print(results)

# Columns compared between the acpa-neg and acpa-pos groups only.
ftest_2class_list <- c("mtx", "pred")
for (condition_of_interest in ftest_2class_list) {
    print(condition_of_interest)
    results <- run_ftest_2class(acpa_neg_df, acpa_pos_df,
                                condition_of_interest)
    print(results)
}

[1] "sex"
        acpa-neg acpa-pos control
sex_yes       12       11      12
sex_no        28       29      28

	Fisher's Exact Test for Count Data

data:  dat
p-value = 1
alternative hypothesis: two.sided

[1] "smoking"
            acpa-neg acpa-pos control
smoking_yes        2        1       4
smoking_no        38       39      32

	Fisher's Exact Test for Count Data

data:  dat
p-value = 0.2933
alternative hypothesis: two.sided

       acpa-neg acpa-pos
rf_yes       14       28
rf_no        26       12

	Fisher's Exact Test for Count Data

data:  dat
p-value = 0.003345
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.08101359 0.64725178
sample estimates:
odds ratio 
 0.2354311 

[1] "mtx"
        acpa-neg acpa-pos
mtx_yes       19       22
mtx_no        21       18

	Fisher's Exact Test for Count Data

data:  dat
p-value = 0.6549
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.2803446 1.9495582

In [94]:
# Kruskal-Wallis rank sum tests with acpa as the grouping variable.
# bmi is tested on data_df, i.e. every row of the patient table
# (acpa codes 0, 1 and 2 are the ones subset from it earlier in the notebook).
kruskal.test(bmi ~ acpa, data = data_df)
# esr, crp and das28crp are tested on ra_df, which holds only the rows
# with acpa == 1 or acpa == 2, so these are two-group comparisons.
kruskal.test(esr ~ acpa, data = ra_df)
kruskal.test(crp ~ acpa, data = ra_df)
kruskal.test(das28crp ~ acpa, data = ra_df)


	Kruskal-Wallis rank sum test

data:  bmi by acpa
Kruskal-Wallis chi-squared = 2.643, df = 2, p-value = 0.2667



	Kruskal-Wallis rank sum test

data:  esr by acpa
Kruskal-Wallis chi-squared = 0.16892, df = 1, p-value = 0.6811



	Kruskal-Wallis rank sum test

data:  crp by acpa
Kruskal-Wallis chi-squared = 2.8427, df = 1, p-value = 0.09179



	Kruskal-Wallis rank sum test

data:  das28crp by acpa
Kruskal-Wallis chi-squared = 3.2547, df = 1, p-value = 0.07122


In [79]:
# Display the first rows of data_df to inspect its columns and value coding.
head(data_df)

Unnamed: 0_level_0,sample_ID,mayo_ID,acpa,sex,age,bmi,rf,smoking,rf_tit,acpa_tit,⋯,adalimumab,golimumab,certolizumab,tocilizumab,anakinra,abatacept,rituximab,pred,csDMARDs,bDMARDs
Unnamed: 0_level_1,<dbl>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,1402101782,10663503,0,0,41.01027,47.8923,,0.0,,,⋯,0,0,0,0,0,0,0,0,0,0
2,1401405787,1867679,0,0,63.00068,21.71,,,,,⋯,0,0,0,0,0,0,0,0,0,0
3,1305800706,2939561,0,1,59.99452,28.7801,,0.0,,,⋯,0,0,0,0,0,0,0,0,0,0
4,1431803035,9898318,0,0,44.78576,21.2299,,0.0,,,⋯,0,0,0,0,0,0,0,0,0,0
5,1315600833,2919944,0,1,63.00068,26.7382,,0.0,,,⋯,0,0,0,0,0,0,0,0,0,0
6,1321207870,7448718,0,0,48.0,29.79,,0.0,,,⋯,0,0,0,0,0,0,0,0,0,0


In [37]:
# Analysis added after receiving feedback: 2024.02.20
# Kruskal-Wallis test between the ACPA.neg and ACPA.pos columns of the
# disease-duration table, passed as a list of two samples.
disease_duration_df <- read.csv(
    "../../../analysis_addressing_feedback/disease_duration/data/disease_duration_years.csv"
)
kruskal.test(list(disease_duration_df$ACPA.neg, disease_duration_df$ACPA.pos))


	Kruskal-Wallis rank sum test

data:  list(disease_duration_df$ACPA.neg, disease_duration_df$ACPA.pos)
Kruskal-Wallis chi-squared = 0.18781, df = 1, p-value = 0.6647


In [35]:
# Cross-check using the formula interface on a table with `value` and
# `acpa_status` columns.
# NOTE(review): presumably the same disease-duration data in long format
# (the printed statistic matches the list-based call) -- confirm.
disease_duration_test_df <- read.csv(
    "../../../analysis_addressing_feedback/disease_duration/data/disease_duration_years_test.csv"
)
kruskal.test(value ~ acpa_status, data = disease_duration_test_df)



	Kruskal-Wallis rank sum test

data:  value by acpa_status
Kruskal-Wallis chi-squared = 0.18781, df = 1, p-value = 0.6647
