In [None]:
from scipy.stats import fisher_exact
from scipy.stats import norm
import scipy
import numpy as np

In [None]:
def remove_na_vals(ls1, ls2):
  """Drop every position at which either of two paired lists holds "n/a".

  The lists are treated as paired observations: element i of ``ls1`` belongs
  with element i of ``ls2``. A pair survives only when neither value equals
  the string "n/a".

  Args:
    ls1: first sequence of observations.
    ls2: second sequence of observations; must be as long as ``ls1``.

  Returns:
    A tuple ``(new_ls1, new_ls2)`` of two new lists of equal length holding
    the surviving values in their original order. The inputs are not modified.

  Raises:
    ValueError: if the two sequences differ in length. Pairing by position is
      meaningless in that case, and quietly truncating the longer one would
      misalign the data.
  """
  if len(ls1) != len(ls2):
    raise ValueError(
      f'paired lists must have the same length, got {len(ls1)} and {len(ls2)}'
    )

  new_ls1 = []
  new_ls2 = []
  for val1, val2 in zip(ls1, ls2):
    if val1 != "n/a" and val2 != "n/a":
      new_ls1.append(val1)
      new_ls2.append(val2)
  return new_ls1, new_ls2

In [None]:
def perform_fisher_test(ls1, ls2, alpha=0.05, verbose=True):
  """Fisher's exact test plus a Woolf (logit) confidence interval for two paired 0/1 lists.

  Positions where either list holds "n/a" are dropped first (via
  ``remove_na_vals``). The remaining pairs are tallied into a 2x2 table whose
  rows are ``ls1 == 1`` / ``ls1 == 0`` and whose columns are ``ls2 == 1`` /
  ``ls2 == 0``. Pairs containing any value other than 0 or 1 are not counted
  in the table.

  Args:
    ls1: first list of 0/1 values (may contain "n/a").
    ls2: second list of 0/1 values (may contain "n/a"), paired with ``ls1``
      by position.
    alpha: significance level of the confidence interval; the default 0.05
      gives a 95% interval.
    verbose: when True (default) print the filtered lists, the 2x2 table and
      the number of retained pairs.

  Returns:
    ``(fisher_odds_ratio, p_value, confidence_interval)`` where the first two
    come straight from ``scipy.stats.fisher_exact`` and
    ``confidence_interval`` is ``[lower, upper]`` on the odds-ratio scale.

    If any cell of the table is zero, the Woolf standard error is undefined
    (division by zero). In that case 0.5 is added to every cell
    (Haldane-Anscombe correction) for the interval only; the returned odds
    ratio and p-value are still computed from the uncorrected table.

  Raises:
    ValueError: if ``alpha`` is not strictly between 0 and 1.
  """
  if not 0 < alpha < 1:
    raise ValueError(f'alpha must be between 0 and 1 (exclusive), got {alpha}')

  new_ls1, new_ls2 = remove_na_vals(ls1, ls2)

  # set up 2x2 contingency table
  top_left  = 0 # new_ls1 = 1 AND new_ls2 = 1
  top_right = 0 # new_ls1 = 1 AND new_ls2 = 0
  bot_left  = 0 # new_ls1 = 0 AND new_ls2 = 1
  bot_right = 0 # new_ls1 = 0 AND new_ls2 = 0

  # the four cases are mutually exclusive, so an elif chain is enough
  for val1, val2 in zip(new_ls1, new_ls2):
    if val1 == 1 and val2 == 1: top_left += 1
    elif val1 == 1 and val2 == 0: top_right += 1
    elif val1 == 0 and val2 == 1: bot_left += 1
    elif val1 == 0 and val2 == 0: bot_right += 1

  if verbose:
    print(f'{new_ls1}')
    print(f'{new_ls2}')
    print()

    print(f'{top_left} {top_right}')
    print(f'{bot_left} {bot_right}')
    print(f'Sum: {len(new_ls1)}')

  contingency_tbl = np.array([[top_left, top_right],
                              [bot_left, bot_right]])

  # perform Fisher's Exact Test
  fisher_odds_ratio, p_value = fisher_exact(contingency_tbl)

  # confidence interval (Woolf logit method)
  cells = [float(count) for count in contingency_tbl.flatten()]
  if min(cells) == 0:
    # Haldane-Anscombe correction: keeps log(OR) and its standard error finite
    cells = [count + 0.5 for count in cells]
  a, b, c, d = cells

  log_odds_ratio = np.log((a * d) / (b * c))
  se_log_odds_ratio = np.sqrt(1/a + 1/b + 1/c + 1/d)

  # two-sided critical value of the standard normal distribution
  z_crit = norm.ppf(1 - alpha / 2)

  lower_log_ci = log_odds_ratio - z_crit * se_log_odds_ratio
  upper_log_ci = log_odds_ratio + z_crit * se_log_odds_ratio

  # back-transform from the log scale to the odds-ratio scale
  confidence_interval = [np.exp(lower_log_ci), np.exp(upper_log_ci)]

  return fisher_odds_ratio, p_value, confidence_interval

In [None]:
# Combined study data: every list in this cell has 44 entries. The lists are
# paired by position when passed to perform_fisher_test, so position i is
# presumably the same subject in every list - confirm against the source sheet.
# Entries 0-23 have prog == 1 and entries 24-43 have prog == 0.
# The string "n/a" marks a missing value.
# progressed (y/n), PM TRPV4 (+/-), HG (y/n), PR (+/-), cardiovascular disease (y/n), smoking (y/n), metabolic disease (y/n)
# NOTE(review): the legend above says "PR (+/-)" but the lists below are named er / neg_er - confirm which receptor is meant.
prog = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
trpv4 = [1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1]
# NOTE(review): hgis1 / hgorimgis1 are not explained in this notebook (possibly "HG is 1" and "HG or IMG is 1") - confirm.
hgis1 = [1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0]
hgorimgis1 = [1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0]
er = [1, 1, 1, 0, "n/a", 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, "n/a", 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1]
# neg_er is er with 0 and 1 swapped ("n/a" entries stay in place)
neg_er = [0, 0, 0, 1, "n/a", 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "n/a", 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0]
card_disease = [1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1]
smoking = [1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0]
meta_disease = [0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0]
# NOTE(review): units of time are not stated here - presumably days of follow-up / time to event; TODO confirm.
time = [2166, 1032, 3627, 5429, 1786, 1024, 3429, 4675, 3296, 2612, 4321, 2737, 3835, 1133, 2597, 2283, 1518, 750, 505, 1524, 1636, 726, 1394, 146, 2327, 4822, 4698, 4418, 4440, 2634, 4145, 4030, 3982, 3841, 2798, 3720, 4045, 3380, 3373, 3267, 2986, 2490, 2408, 2040]

In [None]:
# PROGRESSED only
# Each list has 24 entries and repeats, value for value, entries 0-23 of the
# matching 44-entry list (the positions where prog == 1). The copies are
# written out by hand, so any correction to the combined lists must be
# repeated here.
yes_prog = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
yes_trpv4 = [1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1]
yes_hgis1 = [1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0]
yes_hgorimgis1 = [1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0]
# one "n/a" (missing) entry; len() of this list still counts it
yes_neg_er = [0, 0, 0, 1, "n/a", 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
yes_card_disease = [1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1]
yes_smoking = [1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0]
yes_meta_disease = [0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1]
yes_time = [2166, 1032, 3627, 5429, 1786, 1024, 3429, 4675, 3296, 2612, 4321, 2737, 3835, 1133, 2597, 2283, 1518, 750, 505, 1524, 1636, 726, 1394, 146]

In [None]:
# NOT PROGRESSED (5 YEAR FOLLOW UP) only
# Each list has 20 entries and repeats, value for value, entries 24-43 of the
# matching 44-entry list (the positions where prog == 0). The copies are
# written out by hand, so any correction to the combined lists must be
# repeated here.
no_prog = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
no_trpv4 = [1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1]
no_hgis1 = [1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0]
no_hgorimgis1 = [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0]
# one "n/a" (missing) entry; len() of this list still counts it
no_neg_er = [0, 0, 0, 0, 0, 0, "n/a", 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0]
no_card_disease = [0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1]
no_smoking = [0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0]
no_meta_disease = [0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0]
no_time = [2327, 4822, 4698, 4418, 4440, 2634, 4145, 4030, 3982, 3841, 2798, 3720, 4045, 3380, 3373, 3267, 2986, 2490, 2408, 2040]

In [None]:
# Lump-related indicator lists.
# NOTE(review): the yes_* lists here have 35 entries and the no_* lists 9,
# which matches the number of 1s (35) and 0s (9) in trpv4 - so in this cell
# the yes_/no_ prefix presumably splits by TRPV4 status, not by progression
# as in the earlier cells. Confirm before reusing these names.
# NOTE(review): the meaning of "lump alone" vs "lump other" is not documented
# in this notebook - confirm with the data owner.
yes_lump_alone = [0,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0]
no_lump_alone = [0,1,0,0,1,0,0,0,1]
yes_lump_other = [1,1,1,1,1,1,1,1,0,1,0,0,0,1,1,1,1,1,1,0,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1]
no_lump_other = [1,0,1,1,0,1,1,1,0]
# 44 entries, the same length as the combined per-subject lists
lump_other = [1,1,1,1,1,1,1,1,1,0,1,0,0,0,1,1,1,1,1,1,0,1,0,0,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,0,1,1,1,1]
# constant group labels: 35 ones and 9 zeros, the same lengths as the yes_*/no_* lump lists above
yes_trpv4_alone = [1] * 35
no_trpv4_alone = [0] * 9

In [None]:
# Fisher's exact test: progression vs. metabolic disease
fisher_odds_ratio, p_value, confidence_interval = perform_fisher_test(prog, meta_disease)

# one blank line, then the three results on separate lines
print(
  '',
  f'Odds Ratio: {fisher_odds_ratio}',
  f'Confidence Interval: {confidence_interval}', # 95% confidence interval
  f'P-Value: {p_value}',
  sep='\n',
)

In [None]:
def find_cnt_and_pct(ls, binary_num):
  """Count how often ``binary_num`` occurs in ``ls`` and express it as a percentage.

  Args:
    ls: list of coded values.
    binary_num: the code to count; represents positive/negative, yes/no,
      etc. (ex: ER negative would be 0 while ER positive would be 1).

  Returns:
    A tuple ``(cnt, pct)`` where ``cnt`` is the string
    ``'<matches>/<len(ls)>'`` and ``pct`` is the same ratio times 100.
    For an empty list the ratio is undefined, so ``pct`` is NaN instead of
    raising ZeroDivisionError.

  Note:
    The denominator is ``len(ls)``, so any "n/a" entries in ``ls`` are
    included in the total.
  """
  subset_cnt = ls.count(binary_num)
  total_cnt = len(ls)

  cnt = f'{subset_cnt}/{total_cnt}'
  # guard the empty list: 0/0 has no meaningful percentage
  pct = (subset_cnt / total_cnt) * 100 if total_cnt else float('nan')

  return cnt, pct

In [None]:
# print(f'# Prog: {len(yes_prog)}')
# print(f'# Non-Prog: {len(no_prog)}')
# blank line, then the (count, percent) tuple for value 1 in no_meta_disease
print('', find_cnt_and_pct(no_meta_disease, 1), sep='\n')