In [1]:
import cudf
import numpy as np
import os 
import shutil
import pandas as pd 
from scipy import stats
from scipy.stats import chi2_contingency

# I use rapids 21-12 in Hipergator

# 1. Get array of population IDs

## PD or HC or subset of PD (prevalent, or incident)

# 1.1 Overall PD 

In [9]:
#PD subjects guaranteede
subject_ids = ["1114277",
"1178636",
"1227372",
"1293346",
"1328327",
"1344871",
"1385201",
"1448986",
"1604568",
"1683568",
"1686653",
"1702601",
"1729129",
"1779711",
"1851133",
"2045185",
"2063006",
"2167017",
"2234655",
"2253228",
"2259013",
"2311988",
"2517130",
"2583580",
"2791680",
"2890808",
"2932050",
"2984777",
"3033844",
"3134693",
"3156084",
"3198287",
"3382506",
"3383380",
"3391006",
"3554033",
"3561789",
"3568167",
"3598466",
"3706056",
"3756652",
"3802938",
"3826232",
"4011374",
"4038762",
"4069794",
"4127426",
"4130938",
"4215662",
"4244556",
"4322638",
"4375090",
"4415530",
"4472792",
"4530859",
"4532453",
"4535648",
"4548991",
"4614800",
"4619705",
"4738821",
"4751589",
"4798230",
"4868455",
"4907629",
"5035719",
"5101325",
"5157987",
"5208752",
"5221906",
"5229530",
"5424494",
"5471094",
"5527470",
"5563399",
"5572442",
"5640521",
"5651883",
"5659619",
"5749875",
"5776153",
"5841706",
"5861534",
"5953087"]

print(len(subject_ids))

84


# 1.2 HC 

In [5]:
#Healthy Subjects guaranteed incase I revisit this
subject_ids = ["1034220",
"1112794",
"1136758",
"1143056",
"1207549",
"1291253",
"1300508",
"1338245",
"1385711",
"1445308",
"1454985",
"1463230",
"1523027",
"1523223",
"1555296",
"1558677",
"1561855",
"1562567",
"1578746",
"1605541",
"1619489",
"1646019",
"1714891",
"1743307",
"1756582",
"1762289",
"1786145",
"1823712",
"1855467",
"1956515",
"1991498",
"1998264",
"2003452",
"2005964",
"2022296",
"2054405",
"2100562",
"2252178",
"2317118",
"2325512",
"2338505",
"2341415",
"2518197",
"2558526",
"2573039",
"2682558",
"2747870",
"2770153",
"2770564",
"2789351",
"2819601",
"2822488",
"2848575",
"2873857",
"2923883",
"2950751",
"3056090",
"3214958",
"3247304",
"3486864",
"3573739",
"3595368",
"3687402",
"3821117",
"3850168",
"3943027",
"3946341",
"3963896",
"4001612",
"4287033",
"4356152",
"4528562",
"4627046",
"4652340",
"4754281",
"4855507",
"4959352",
"5168495",
"5208893",
"5286204",
"5318943",
"5394841",
"5444877",
"5462419",
"5602865",
"5623257",
"5745206",
"5749166",
"5810422",
"5887044",
"5955668"]

print(len(subject_ids))

91


# 1.3 Prevalent PD

In [6]:
subject_ids = ["1178636",
"1227372",
"1448986",
"1604568",
"1683568",
"1702601",
"1779711",
"1851133",
"2045185",
"2234655",
"2253228",
"2259013",
"2311988",
"2517130",
"2791680",
"2932050",
"3033844",
"3134693",
"3156084",
"3383380",
"3554033",
"3561789",
"3568167",
"3706056",
"3756652",
"3826232",
"4011374",
"4038762",
"4127426",
"4130938",
"4415530",
"4472792",
"4530859",
"4535648",
"4548991",
"4619705",
"4738821",
"4798230",
"4907629",
"5035719",
"5157987",
"5208752",
"5221906",
"5229530",
"5471094",
"5527470",
"5640521",
"5651883",
"5749875",
"5776153",
"5841706",
"5861534",
"5953087"]

print(len(subject_ids))

53


# 1.4 Incident PD 

In [7]:
subject_ids = ["1114277",
"1293346",
"1328327",
"1344871",
"1385201",
"1686653",
"1729129",
"2063006",
"2167017",
"2583580",
"2890808",
"2984777",
"3198287",
"3382506",
"3391006",
"3598466",
"3802938",
"4069794",
"4215662",
"4244556",
"4322638",
"4375090",
"4532453",
"4614800",
"4751589",
"4868455",
"5101325",
"5424494",
"5563399",
"5572442",
"5659619"]

# 2. Use RAPIDS Cudf to read csv and load data statistics

In [10]:
# 


csv_dir = '/blue/ruogu.fang/share/RetinaPD/csv/ukb669006.csv'

df4 = cudf.read_csv(csv_dir)
df4 = df4.loc[df4['eid'].isin(subject_ids)]


print('------------------------------------------------------------------------------')
print('Townsend Indices')
print('Mean of Townsend Indices', df4['189-0.0'].astype('float64').mean())
print('Standard Deviation of Townsend Indices', df4['189-0.0'].astype('float64').std())
print('------------------------------------------------------------------------------')
print('Stroke History')
print('Number of Stroke Subjects', df4['42006-0.0'].notnull().sum())
print('Number of Non-Stroke Subjects', df4['42006-0.0'].isnull().sum())
print('------------------------------------------------------------------------------')
print('Smoking History')
print('Number of Smoking Subjects', (df4['20116-0.0'].astype('uint8') != 0).sum())
print('Number of Non-Smoking Subjects', (df4['20116-0.0'].astype('uint8') == 0).sum())
print('------------------------------------------------------------------------------')
print('Alcohol Status')
print('Number of Alcohol Subjects', (df4['20117-0.0'].astype('uint8') != 0).sum())
print('Number of Non-Alcohol Subjects', (df4['20117-0.0'].astype('uint8') == 0).sum())
print('------------------------------------------------------------------------------')
print('Obesity-Diabetes Status')
print('Number of Obesity Subjects', (df4['21001-0.0'].astype('float64') > 30).sum())
print('Number of Non-Obesity Subjects', (df4['21001-0.0'].astype('float64') < 30).sum())
print('------------------------------------------------------------------------------')
print('Psychotropic Medication')
print('Number of Medication Subjects', (df4['20466-0.0'].astype('uint8') == 1).sum())
print('Number of Non-Medication Subjects', (df4['20466-0.0'].astype('uint8') != 1).sum() 
      + df4['20466-0.0'].astype('uint8').isnull().sum())

#df4.to_csv('/blue/ruogu.fang/charlietran/UKB/csv/PD_characteristics.csv')


------------------------------------------------------------------------------
Townsend Indices
Mean of Townsend Indices -1.4788360714285713
Standard Deviation of Townsend Indices 2.947206113955094
------------------------------------------------------------------------------
Stroke History
Number of Stroke Subjects 3
Number of Non-Stroke Subjects 81
------------------------------------------------------------------------------
Smoking History
Number of Smoking Subjects 30
Number of Non-Smoking Subjects 54
------------------------------------------------------------------------------
Alcohol Status
Number of Alcohol Subjects 80
Number of Non-Alcohol Subjects 4
------------------------------------------------------------------------------
Obesity-Diabetes Status
Number of Obesity Subjects 18
Number of Non-Obesity Subjects 66
------------------------------------------------------------------------------
Psychotropic Medication
Number of Medication Subjects 0
Number of Non-Medication Subj

In [11]:
csv_dir = '/blue/ruogu.fang/share/RetinaPD/csv/ukb52184.csv'

df4 = cudf.read_csv(csv_dir)
df4 = df4.loc[df4['eid'].isin(subject_ids)]

A = df4[['eid', '21003-0.0', '31-0.0', '21000-0.0', '5206-0.0', '5199-0.0',]]
A['sum_total'] = A['5206-0.0'].astype('float64') + A['5199-0.0'].astype('float64')

print('Mean Age:', A['21003-0.0'].astype('float64').mean())
print('Stdev Age:', A['21003-0.0'].astype('float64').std())
print('----------------------------------------------------------------')
print('Number of Males', (A['31-0.0'].astype('float64') == 1).sum(), '| Percentage ', (A['31-0.0'].astype('float64') == 1).sum() / len(df4))
print('Number of Females', (A['31-0.0'].astype('float64') == 0).sum(), '| Percentage ', (A['31-0.0'].astype('float64') == 0).sum() / len(df4))
print('----------------------------------------------------------------')
print('Number of White Ethnicity',  (A['21000-0.0'].astype('int32') == 1001).sum() + (A['21000-0.0'].astype('int32') == 1002).sum(),
     ' | Percentage ', 100* ((A['21000-0.0'].astype('int32') == 1001).sum() + (A['21000-0.0'].astype('int32') == 1002).sum()) / len(df4) 
     )
print('Number of White Ethnicity',  len(df4) - ((A['21000-0.0'].astype('int32') == 1001).sum() + (A['21000-0.0'].astype('int32') == 1002).sum()),
     ' | Percentage ', 100* (len(df4) - ((A['21000-0.0'].astype('int32') == 1001).sum() + (A['21000-0.0'].astype('int32') == 1002).sum())) / len(df4)
     )
print('----------------------------------------------------------------')
print('Mean Visual Acuity',  A['sum_total'].mean())
print('Std Visual Acuity',  A['sum_total'].std())

Mean Age: 61.82142857142857
Stdev Age: 5.929628920886516
----------------------------------------------------------------
Number of Males 49 | Percentage  0.5833333333333334
Number of Females 35 | Percentage  0.4166666666666667
----------------------------------------------------------------
Number of White Ethnicity 82  | Percentage  97.61904761904762
Number of White Ethnicity 2.0  | Percentage  2.380952380952381
----------------------------------------------------------------
Mean Visual Acuity 0.28400000000000003
Std Visual Acuity 0.3834551314532611


In [12]:
csv_dir = '/blue/ruogu.fang/charlietran/UKB/csv/ukb52152.csv'
df4 = cudf.read_csv(csv_dir)
df4 = df4.loc[df4['eid'].isin(subject_ids)]

A = df4[['6148-0.0']]

# healthy accounts is

print('Number of Vision Problem Subjects', (len(df4) - (A['6148-0.0'].astype('float64').isin([-3, -7, -1]).sum() 
                                            + A['6148-0.0'].astype('float64').isnull().sum())),
     ' | Percentage ', 100 * (len(df4) - (A['6148-0.0'].astype('float64').isin([-3, -7, -1]).sum() 
                                            + A['6148-0.0'].astype('float64').isnull().sum())) / len(df4))

print('Number of Healthy Vision Subjects', (A['6148-0.0'].astype('float64').isin([-3, -7, -1]).sum() 
                                            + A['6148-0.0'].astype('float64').isnull().sum()),
     ' | Percentage ', 100 * (A['6148-0.0'].astype('float64').isin([-3, -7, -1]).sum() 
                                            + A['6148-0.0'].astype('float64').isnull().sum()) / len(df4))

Number of Vision Problem Subjects 9.0  | Percentage  10.714285714285714
Number of Healthy Vision Subjects 75  | Percentage  89.28571428571429


In [14]:
from scipy.stats import chi2_contingency

yes = np.array([1, 2])
no = np.array([52, 29])

obs = obs = np.vstack([yes, no])
chi, p, _, expected = chi2_contingency(obs, correction = False)
print('pvalue', p)
expected
#_, p _, _ = chi2_contingency(true_obs, false_obs)
#p


pvalue 0.2766503035738431


array([[ 1.89285714,  1.10714286],
       [51.10714286, 29.89285714]])