In [1]:
#disease_duration   24.02.05
#
#Perform Kruskal–Wallis test to see the difference between ACPA– and ACPA+ RA.
#EAC (Early arthritis cohort = disease duration = 50 day)
#ACPA+ RA = 1
#ACPA– RA = 2

#This analysis was done after recieving internal review from the Drs.
#And will be inserted into Table 1

import pandas as pd
import re
from datetime import datetime
import numpy as np

In [51]:
def get_disease_duration(patient_ID, all_visit_df):

    temp_df = all_visit_df[all_visit_df["Study ID"] == patient_ID]
    diagnosis_date = temp_df["Date of RA physician diagnosis"].tolist()[0]
    visit_date = temp_df["Date of study visit"].tolist()[0]

    date_format = "%m/%d/%y"
    
    diagnosis_date = datetime.strptime(diagnosis_date, date_format)
    visit_date = datetime.strptime(visit_date, date_format)
    
    delta_day = (visit_date - diagnosis_date).days

    return delta_day

def report_disease_duration_statistics(day_list):
    average = np.mean(day_list)
    
    # Calculate standard deviation
    std_dev = np.std(day_list)
    
    # Calculate interquartile range (IQR)
    Q1 = np.percentile(day_list, 25)
    Q3 = np.percentile(day_list, 75)

    print (average, "+-", std_dev)
    print (Q1, "-", Q3)
    print (min(day_list), max(day_list))
        

In [52]:
manifest_file = "../../analysis_addressing_feedback/disease_duration/data/sample_manifest.csv"
all_visit_file = "../../analysis_addressing_feedback/disease_duration/data/Rheum_Biobank_All_Visits.csv"

manifest_df = pd.read_csv(manifest_file)
r, c = manifest_df.shape

all_visit_df = pd.read_csv(all_visit_file)

In [54]:
count_missing_info = 0
data_dict = {}

for i in range(r):
    sample_ID = int(manifest_df["Sample Id"][i])
    ra_status = manifest_df["RA_status"][i]
    temp_patient_ID = manifest_df["Patient Ids"][i]
    study_info = manifest_df["Study"][i]
    ext_study_ID = manifest_df["External Participant Id"][i]

    # print (temp_patient_ID)
    # print (ext_study_ID)

    if ra_status != "control":
        try: 
            # Extract only the matched part for elements containing the pattern
            pattern = r'RHB\.RAM\.\d{4}\.PT'
            matched_pattern = re.search(pattern, temp_patient_ID)
            patient_ID = matched_pattern.group()
            
            # print (sample_ID, i, ra_status, patient_ID)
            
            disease_duration = get_disease_duration(patient_ID, all_visit_df)/365
            print (disease_duration)
            try: 
                data_dict[ra_status].append(disease_duration)
            except KeyError: 
                data_dict[ra_status] = [disease_duration]
            
        except TypeError :

            disease_duration = 50/365
            print (disease_duration)
            try: 
                data_dict[ra_status].append(disease_duration)
            except KeyError: 
                data_dict[ra_status] = [disease_duration]

        except AttributeError:

            disease_duration = 50/365
            print (disease_duration)
            try: 
                data_dict[ra_status].append(disease_duration)
            except KeyError: 
                data_dict[ra_status] = [disease_duration]

####TODO 24.02.19
#value: 0???

0.2684931506849315
0.5424657534246575
2.3698630136986303
6.4301369863013695
8.668493150684931
3.2465753424657535
4.063013698630137
7.909589041095891
0.8410958904109589
0.5753424657534246
0.7232876712328767
0.136986301369863
3.180821917808219
5.054794520547945
3.0876712328767124
0.136986301369863
8.027397260273972
0.7835616438356164
0.6904109589041096
0.136986301369863
1.5726027397260274
1.0054794520547945
9.463013698630137
4.794520547945205
4.750684931506849
5.273972602739726
2.0931506849315067
2.408219178082192
0.9863013698630136
3.106849315068493
4.2
9.306849315068494
8.164383561643836
3.1945205479452055
4.471232876712329
5.964383561643835
2.1315068493150684
7.531506849315068
5.635616438356164
0.7671232876712328
3.1890410958904107
7.052054794520548
1.441095890410959
4.375342465753425
0.9753424657534246
1.9123287671232876
7.134246575342465
0.136986301369863
3.8054794520547945
3.989041095890411
7.994520547945205
2.0657534246575344
0.16986301369863013
0.136986301369863
0.136986301369863

In [55]:
print ("Missing information: ", count_missing_info)
print ("data_dict keys:" , list(data_dict.keys()))
print ("ACPA-pos #:", len(data_dict["'ACPA-pos'"]))
report_disease_duration_statistics(data_dict["'ACPA-pos'"])

print ("ACPA-neg #:", len(data_dict["'ACPA-neg'"]))
report_disease_duration_statistics(data_dict["'ACPA-neg'"])


Missing information:  0
data_dict keys: ["'ACPA-pos'", "'ACPA-neg'"]
ACPA-pos #: 40
2.773698630136986 +- 2.6446204243645317
0.136986301369863 - 4.473287671232876
0.0 9.306849315068494
ACPA-neg #: 40
2.636575342465753 +- 2.8433096981171464
0.136986301369863 - 3.8965753424657534
0.136986301369863 9.463013698630137


In [45]:
result

KruskalResult(statistic=0.2727272727272734, pvalue=0.6015081344405895)

In [58]:
series_neg = pd.Series(data_dict["'ACPA-neg'"])
series_pos = pd.Series(data_dict["'ACPA-pos'"])

# Create a DataFrame
tmp_df = pd.DataFrame({'ACPA-neg': series_neg, 'ACPA-pos': series_pos})

# Save DataFrame to CSV
tmp_df.to_csv('../../analysis_addressing_feedback/disease_duration/data/disease_duration_years.csv', index=False)

In [60]:
stats.kruskal(data_dict["'ACPA-neg'"], data_dict["'ACPA-pos'"])

KruskalResult(statistic=0.1878140944835067, pvalue=0.6647421887996687)

In [62]:
x = [2.9, 3.0, 2.5, 2.6, 3.2] # normal subjects
z = [2.8, 3.4, 3.7, 2.2, 2.0]

stats.kruskal(x, z)

KruskalResult(statistic=0.010909090909095198, pvalue=0.9168149485280722)