In [None]:
import os 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import dataframe_image as dfi

import statsmodels.api as sm

from lifelines import CoxPHFitter
import seaborn as sns

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer 

from scipy.stats import ttest_rel
from scipy.stats import shapiro
from scipy.stats import kruskal
from scipy import stats
from scipy.stats import ttest_ind
from scipy.stats import friedmanchisquare
from scipy.stats import fisher_exact
from scipy.stats import chi2_contingency

from datetime import datetime

import rpy2.robjects as ro
from rpy2.robjects import numpy2ri 
from rpy2.robjects.packages import importr


from statsmodels.stats.contingency_tables import mcnemar

In [None]:
# Load the study spreadsheet into a DataFrame.
# NOTE(review): the path is literally '.xlsx' — presumably the real file name was removed; confirm before running.
df=pd.read_excel('.xlsx')

In [None]:
# Keep only the first 265 rows of the sheet.
# NOTE(review): presumably the rows after that are not patient records — confirm, and consider a named constant.
df=df[:265]

In [None]:
# Drop rows that have no value in the 'No' column.
df = df.dropna(subset=['No'])

In [None]:
# Encode gender numerically: '남' (male) -> 1, '여' (female) -> 0; any other value becomes NaN.
df['gender']= df['gender'].map({'남': 1, '여': 0})

In [None]:
# Replace every '.' in the column names with '_' so the names match the keys used in the rename below.
df.columns = [col.replace('.', '_') for col in df.columns]

In [None]:
# Re-wrap the data as a DataFrame (kept from the original cell).
df = pd.DataFrame(df)

# Measurements recorded at all five visits. The un-suffixed source column is
# visit 1 and the '_1' ... '_4' suffixed columns are visits 2 ... 5.
# Maps the source stem to the stem used in the new 'VisitN_<stem>' name.
five_visit_measures = {
    'SBP': 'SBP',
    'HR': 'HR',
    'DBP': 'DBP',
    'heartburn': 'heartburn',
    'nausea': 'nausea',
    'vomiting': 'vomiting',
    'diarrhea': 'diarrhea',
    'abdominal pain': 'abdominal_pain',
    'dyspepsia': 'dyspepsia',
    'other': 'other',
    'hematochezia': 'hematochezia',
    'melena': 'melena',
    'hematemesis': 'hematemesis',
}

visit_renames = {}
for source_stem, target_stem in five_visit_measures.items():
    for visit in range(1, 6):
        source_col = source_stem if visit == 1 else f'{source_stem}_{visit - 1}'
        visit_renames[source_col] = f'Visit{visit}_{target_stem}'

# Lab values are only mapped for the first and the last visit.
# NOTE(review): for platelets the un-suffixed column is mapped to Visit5 and
# 'platelet_1' to Visit1, which is the opposite orientation of the five-visit
# columns above — confirm this matches the spreadsheet layout.
visit_renames.update({
    'platelet_1': 'Visit1_PLT',
    'platelet': 'Visit5_PLT',
    'WBC_1': 'Visit1_WBC',
    'WBC_2': 'Visit5_WBC',
    'Hct_1': 'Visit1_HCT',
    'Hct_2': 'Visit5_HCT',
    'Hb_1': 'Visit1_HB',
    'Hb_2': 'Visit5_HB',
})

# The 'hospitalization' columns are intentionally left un-renamed
# (their mapping was commented out in the original cell).
df = df.rename(columns=visit_renames)



In [None]:
# Manual data correction: a Visit5_HB value of exactly 23.2 is overwritten with 13.2.
# NOTE(review): presumably a data-entry typo — confirm against the source records.
df.loc[df['Visit5_HB'] == 23.2, 'Visit5_HB'] = 13.2
# Print the maximum after the correction as a sanity check.
print(df['Visit5_HB'].max())

In [None]:
# Manual data correction: the value 2009.0 in these two symptom columns is replaced by 0.
# NOTE(review): 2009 looks like a year entered in the wrong cell — confirm that 0 is the right replacement.
df.loc[df['Visit1_melena'] == 2009.0, 'Visit1_melena'] = 0
df.loc[df['Visit1_hematemesis'] == 2009.0, 'Visit1_hematemesis'] = 0

In [None]:
# Convert columns to numeric, but leave pure-text columns untouched.
# A blanket `pd.to_numeric(..., errors='coerce')` turns every text column
# (e.g. the prescription columns analysed further down) into all-NaN, so a
# column is only converted when at least one of its values survives.
def _to_numeric_keep_text(col):
    """Return `col` coerced to numeric unless that would wipe out all its values."""
    converted = pd.to_numeric(col, errors='coerce')
    if col.notna().any() and converted.isna().all():
        return col
    return converted

df = df.apply(_to_numeric_keep_text)

# Model-based imputation with IterativeImputer. With the default settings this
# produces ONE completed data set (single imputation), not multiple imputation.
imputer = IterativeImputer(max_iter=10, random_state=0)

# Only the vital-sign and lab columns are imputed.
cols_to_impute = [
    'Visit1_SBP', 'Visit2_SBP', 'Visit3_SBP', 'Visit4_SBP', 'Visit5_SBP',
    'Visit1_HR', 'Visit2_HR', 'Visit3_HR', 'Visit4_HR', 'Visit5_HR',
    'Visit1_DBP', 'Visit2_DBP', 'Visit3_DBP', 'Visit4_DBP', 'Visit5_DBP',
    'Visit1_PLT', 'Visit5_PLT', 'Visit1_WBC', 'Visit5_WBC',
    'Visit1_HCT', 'Visit5_HCT', 'Visit1_HB', 'Visit5_HB'
]

# Fill the missing values on a copy, then make the imputed frame the working `df`.
df_imputed = df.copy()
df_imputed[cols_to_impute] = imputer.fit_transform(df[cols_to_impute])
df=df_imputed
# Show the imputed data.
print(df)

In [None]:
# Summary statistics for the 'Age' column.
print(df['Age'].describe())

In [None]:
# Median of the 'Age' column.
df['Age'].median()

In [None]:
# Replace missing 'smoking_now' values with 0.
# NOTE(review): presumably a blank cell means "not currently smoking" — confirm.
df['smoking_now']=df['smoking_now'].fillna(0)

In [None]:
# Frequency of each 'smoking_now' value.
df['smoking_now'].value_counts()

In [None]:
# Frequency of each 'smoking' value (missing values are not counted by value_counts).
df['smoking'].value_counts()

In [None]:
# Replace missing 'Peripheral artery disease' values with 0, then show the frequency of each value.
# NOTE(review): presumably a blank cell means the condition is absent — confirm.
df['Peripheral artery disease']=df['Peripheral artery disease'].fillna(0)
df['Peripheral artery disease'].value_counts()

In [None]:
# Replace missing 'GI Bleeding history' values with 0, then show the frequency of each value.
# NOTE(review): presumably a blank cell means no history — confirm.
df['GI Bleeding history']=df['GI Bleeding history'].fillna(0)
df['GI Bleeding history'].value_counts()

In [None]:
# Show the first 50 column names.
df.columns[:50]

In [None]:
# Add a BMI column: weight divided by the square of (height / 100).
# Assumes weight is in kg and height in cm — TODO confirm.
df['bmi'] = df['weight'] / (df['height'] / 100) ** 2
# Standard deviation of BMI.
df['bmi'].std()

In [None]:
# Replace missing 'alcohol' values with 0, then show the frequency of each value.
df['alcohol']=df['alcohol'].fillna(0)
df['alcohol'].value_counts()

In [None]:
# Replace missing 'AF' values with 0, then show the frequency of each value.
df['AF']=df['AF'].fillna(0)
df['AF'].value_counts()

In [None]:
# Replace missing 'angina' values with 0, then show the frequency of each value.
df['angina']=df['angina'].fillna(0)
df['angina'].value_counts()

In [None]:
# Replace missing 'MI' values with 0, then show the frequency of each value.
df['MI']=df['MI'].fillna(0)
df['MI'].value_counts()

In [None]:
# Replace missing 'CI' values with 0, then show the frequency of each value.
df['CI']=df['CI'].fillna(0)
df['CI'].value_counts()

In [None]:
def classify(row):
    """Return the first current condition flagged with 1 for this row.

    The columns are checked in the fixed priority order AF, MI, CI, angina,
    so a row with several flags is labelled with the first one only.
    Returns the string 'None' when no column equals 1.
    """
    for condition in ('AF', 'MI', 'CI', 'angina'):
        if row[condition] == 1:
            return condition
    return 'None'

# Label every row with its highest-priority current condition.
df['condition'] = df.apply(classify, axis=1)

# Count the rows per label.
condition_counts = df['condition'].value_counts()

condition_counts

In [None]:
# # 현재 질환 
# AF	angina	MI	CI

In [None]:
# # 과거력
# HTN
# systemic embolism
# DM
# DL
# HF
# angina_1
# MI
# PCI
# Ischemic stroke
# Hemorrhagic stroke
# TIA

In [None]:
def classify_past_conditions(row):
    """Return the first past-history condition flagged with 1 for this row.

    Columns are checked in a fixed priority order (listed below); only the
    first match is returned. Returns the string 'None' when nothing matches.

    NOTE(review): 'MI' is the same column that `classify` treats as a current
    condition, while angina uses a separate 'angina_1' column here — confirm
    that 'MI' is the intended past-history column.
    """
    priority = (
        'Ischemic stroke',
        'Hemorrhagic stroke',
        'MI',
        'HF',
        'systemic embolism',
        'TIA',
        'PCI',
        'angina_1',
        'DM',
        'DL',
        'HTN',
    )
    # next() stops at the first flagged column, like the original if/elif chain.
    return next((name for name in priority if row[name] == 1), 'None')

# Label every row with its highest-priority past-history condition.
df['past_condition'] = df.apply(classify_past_conditions, axis=1)

# Count the rows per label.
past_condition_counts = df['past_condition'].value_counts()

# Share of all rows per label, as a percentage rounded to one decimal place.
total_count = len(df)
past_condition_percentages = (past_condition_counts / total_count) * 100
past_condition_percentages = past_condition_percentages.round(1)

# Print the counts and the percentages.
print(past_condition_counts)
print(past_condition_percentages)

In [None]:
# Frequency of the two bleeding-history columns.
# Note: a notebook cell only displays its last expression, so the first result is computed but not shown.
df['GI Bleeding history'].value_counts()
df['previous bleeding'].value_counts()

In [None]:
# Build a bleeding-history label using a fixed priority.
def classify_past_conditions(row):
    """Return the bleeding-history label for this row, or None.

    'GI Bleeding history' takes priority over 'previous bleeding'; rows with
    neither column equal to 1 get None.

    The second check reads the 'previous bleeding' column. The original
    looked at 'Hemorrhagic stroke' while returning the 'previous bleeding'
    label, which appears to be a copy-paste from the past-history classifier.
    NOTE(review): confirm that the 'previous bleeding' column is the intended
    source for this label.

    This definition reuses the name of the past-history classifier defined
    earlier in the notebook and therefore replaces it from this cell onwards.
    """
    if row['GI Bleeding history'] == 1:
        return 'GI Bleeding history'
    elif row['previous bleeding'] == 1:
        return 'previous bleeding'
    else:
        return None

# Label every row with its bleeding-history category (None when there is none).
df['bleed_condition'] = df.apply(classify_past_conditions, axis=1)

# Count the rows per label (rows labelled None are not counted by value_counts).
# Note: this overwrites the past-history counts stored under the same names by the earlier cell.
past_condition_counts = df['bleed_condition'].value_counts()

# Share of all rows per label, as a percentage rounded to one decimal place.
total_count = len(df)
past_condition_percentages = (past_condition_counts / total_count) * 100
past_condition_percentages = past_condition_percentages.round(1)

# Print the counts and the percentages.
print(past_condition_counts)
print(past_condition_percentages)

In [None]:
def calculate_percentage(group_count, total_count):
    """Return `group_count` as a percentage of `total_count`, rounded to one decimal.

    Returns 0 when `total_count` is 0 so that the division is never attempted.
    """
    if total_count != 0:
        return round((group_count / total_count) * 100, 1)
    return 0

# Hand-entered counts for a one-off percentage calculation.
# NOTE(review): 265 matches the row limit applied when the data was loaded; the source of 256 is not visible here — confirm.
group_count = 256
total_count = 265

percentage = calculate_percentage(group_count, total_count)
print(f"Group Count: {group_count}, Total Count: {total_count}, Percentage: {percentage}%")


In [None]:
# Normalise product names to the generic drug name, anywhere in the frame.
# The match is on the exact cell value, so any spelling not listed here is left unchanged.
# NOTE(review): an earlier cell coerces columns with pd.to_numeric(errors='coerce');
# this replacement only has an effect if the prescription columns still hold text at this point.
df.replace({
    # Aspirin products
    'Aspirin protect 100mg': 'Aspirin',
    'Aspirin Enteric coated Tab': 'Aspirin',
    'Aspirin Enteric coated Tab 100mg' : 'Aspirin',
    'Aspirin Enteric coated 100mg' : 'Aspirin',
    'Astrix Cap 100mg': 'Aspirin',
    
    # Clopidogrel products
    'PlaVIX 75mg': 'Clopidogrel',
    'Plavix': 'Clopidogrel',
    'Platless 75mg': 'Clopidogrel',
    'PlaVIX' :'Clopidogrel',
    'Plavix A tab': 'Clopidogrel',
    'Pidogul Tab 75mg': 'Clopidogrel',
    'Pidogul 75mg': 'Clopidogrel',
    'PlaviTOR 75mg': 'Clopidogrel',
    'PlaviTOR': 'Clopidogrel',
    
    # Fixed-dose combination
    'Closone 75/100mg': 'Clopidogrel + Aspirin',

    
    # Ticagrelor products
    'Brilinta 90mg': 'Ticagrelor',
    'Brilinta 60mg': 'Ticagrelor',
    
    # Oral anticoagulants
    'Eliquis 2.5mg': 'Apixaban',
    'Lixiana 60mg': 'Edoxaban',
    # NOTE(review): 'Xareloto' may be a misspelling of the product name — confirm it matches the raw data.
    'Xareloto 20mg': 'Rivaroxaban',
    
}, inplace=True)

In [None]:
# Frequency of each prescription value at visit 1.
df['Visit1_처방'].value_counts()

In [None]:
# Treat the '-' placeholder as a missing value, anywhere in the frame.
df.replace('-', np.nan, inplace=True)

# Flag patients whose prescription is identical at all five visits.
# A missing value never compares equal, so a patient with any missing
# prescription is flagged False.
prescription_cols = ['Visit1_처방', 'Visit2_처방', 'Visit3_처방', 'Visit4_처방', 'Visit5_처방']
matches_first_visit = df[prescription_cols].eq(df['Visit1_처방'], axis=0)
df['처방_동일여부'] = matches_first_visit.all(axis=1)

# Number and share of patients with an unchanged prescription.
total_count = df.shape[0]
same_prescription_count = df['처방_동일여부'].sum()
same_prescription_ratio = same_prescription_count / total_count

print(f"처방이 동일한 환자 수: {same_prescription_count}")
print(f"전체 환자 수: {total_count}")
print(f"처방이 동일한 환자의 비율: {same_prescription_ratio:.2%}")

In [None]:
# IDs of the patients whose prescription was the same at every visit.
same_prescription_patients = df[df['처방_동일여부'] == True]['ID'].tolist()

# IDs of the patients whose prescription differed between visits.
different_prescription_patients = df[df['처방_동일여부'] == False]['ID'].tolist()

print("처방이 동일한 환자의 ID 리스트:")
print(same_prescription_patients)

print("처방이 다른 환자의 ID 리스트:")
print(different_prescription_patients)


In [None]:
# ID plus the five prescription columns for patients with an unchanged prescription.
same_prescription_data = df[df['처방_동일여부'] == True][['ID', 'Visit1_처방', 'Visit2_처방', 'Visit3_처방', 'Visit4_처방', 'Visit5_처방']]

# The same columns for patients whose prescription changed.
different_prescription_data = df[df['처방_동일여부'] == False][['ID', 'Visit1_처방', 'Visit2_처방', 'Visit3_처방', 'Visit4_처방', 'Visit5_처방']]

print("동일한 처방을 받은 환자들의 데이터:")
print(same_prescription_data)

print("다른 처방을 받은 환자들의 데이터:")
print(different_prescription_data)


In [None]:
# Mean and confidence-interval helper.
def mean_confidence_interval(data, confidence=0.95):
    """Return the mean of `data` and its two-sided t-based confidence interval.

    Missing values (NaN) are ignored, so they neither inflate the sample size
    nor turn the interval into NaN.

    Args:
        data: 1-D sequence / Series of numbers.
        confidence: confidence level of the interval (default 0.95).

    Returns:
        (mean, lower, upper). The bounds are NaN when fewer than two
        non-missing values are available; the mean is NaN as well when there
        are none.
    """
    values = np.asarray(data, dtype=float)
    values = values[~np.isnan(values)]
    n = values.size
    if n == 0:
        return np.nan, np.nan, np.nan
    m = np.mean(values)
    if n < 2:
        # The standard error is undefined for a single observation.
        return m, np.nan, np.nan
    se = stats.sem(values)
    # Half-width of the interval from the t distribution with n-1 degrees of freedom.
    h = se * stats.t.ppf((1 + confidence) / 2., n-1)
    return m, m-h, m+h

# Run the paired significance tests and print the results.
def perform_tests(group1, group2, label):
    """Print a paired t-test and a Wilcoxon signed-rank test for two paired samples.

    Args:
        group1, group2: paired samples of equal length.
        label: text shown in the printed headings.

    Nothing is returned; exceptions raised by the SciPy tests propagate.
    """
    def _print_interpretation(p):
        print(f'p-value interpretation: {p_value_stars(p)}')

    # Parametric comparison of the paired samples.
    print(f"\nPaired t-test ({label}):")
    t_result = stats.ttest_rel(group1, group2)
    stat, p = t_result
    print(f'Paired t-test : Statistics={stat:.5f}, p={p:.5f}')
    _print_interpretation(p)

    # Non-parametric counterpart.
    print(f"\nWilcoxon signed-rank test ({label}):")
    w_result = stats.wilcoxon(group1, group2)
    stat, p = w_result
    print(f'Wilcoxon signed-rank test: Statistics={stat:.5f}, p={p:.5f}')
    _print_interpretation(p)

def p_value_stars(p_value):
    """Format a p-value as a significance label.

    Values below 0.001 / 0.01 / 0.05 get a fixed 'p < ...' label with three /
    two / one stars; anything else is printed with two decimals.
    """
    levels = (
        (0.001, 'p < 0.001***'),
        (0.01, 'p < 0.01**'),
        (0.05, 'p < 0.05*'),
    )
    for cutoff, text in levels:
        if p_value < cutoff:
            return text
    return f"p = {p_value:.2f}"

# Prepare the data and split it into a high and a low baseline group.
def process_data(df, column):
    """Split `df` at the mean of the Visit1 value of `column`.

    Returns:
        cleaned_df_total: the Visit1/Visit5 columns with incomplete rows dropped.
        high_group: copy of the rows whose Visit1 value is above the mean.
        low_group: copy of the rows whose Visit1 value is at or below the mean.
        Both groups get a '<column>_diff' column (Visit1 minus Visit5). Rows
        with a missing Visit1 value end up in neither group.
    """
    visit1_values = df[f'Visit1_{column}']
    cleaned_df_total = df[[f'Visit1_{column}', f'Visit5_{column}']].dropna()
    mean_threshold = visit1_values.mean()

    def _with_diff(mask):
        # Copy so the new column never touches the caller's frame.
        subset = df[mask].copy()
        subset[f'{column}_diff'] = subset[f'Visit1_{column}'] - subset[f'Visit5_{column}']
        return subset

    high_group = _with_diff(visit1_values > mean_threshold)
    low_group = _with_diff(visit1_values <= mean_threshold)

    return cleaned_df_total, high_group, low_group

# Plotting helper.
def visualize_data(df, high_group, low_group, column):
    """Draw three figures for `column`: overall change, change distributions, group comparison.

    Args:
        df: frame with 'Visit1_<column>' and 'Visit5_<column>'.
        high_group, low_group: group frames that also carry '<column>_diff'.
        column: measurement stem used to build the column names.

    Each figure is shown with plt.show(); nothing is returned.
    """
    # Figure 1: overall mean at Visit1 and Visit5 with error bars.
    plt.figure(figsize=(8, 12))

    total_means = [df[f'Visit1_{column}'].mean(), df[f'Visit5_{column}'].mean()]
    total_ci = [mean_confidence_interval(df[f'Visit1_{column}']), mean_confidence_interval(df[f'Visit5_{column}'])]

    # The group means / intervals below are only used by the commented-out errorbar calls.
    high_means = [high_group[f'Visit1_{column}'].mean(), high_group[f'Visit5_{column}'].mean()]
    high_ci = [mean_confidence_interval(high_group[f'Visit1_{column}']), mean_confidence_interval(high_group[f'Visit5_{column}'])]

    low_means = [low_group[f'Visit1_{column}'].mean(), low_group[f'Visit5_{column}'].mean()]
    low_ci = [mean_confidence_interval(low_group[f'Visit1_{column}']), mean_confidence_interval(low_group[f'Visit5_{column}'])]
    
    # Only the top panel of the 2x1 grid is drawn; yerr is the half-width of each interval.
    plt.subplot(2, 1, 1)
    plt.errorbar(['Visit1', 'Visit5'], total_means, yerr=[(ci[2]-ci[1])/2 for ci in total_ci], fmt='-o', label=f'Total {column}', color='green', capsize=5)
    # plt.errorbar(['Visit1', 'Visit5'], high_means, yerr=[(ci[2]-ci[1])/2 for ci in high_ci], fmt='-o', label=f'High {column}', color='blue', capsize=5)
    # plt.errorbar(['Visit1', 'Visit5'], low_means, yerr=[(ci[2]-ci[1])/2 for ci in low_ci], fmt='-o', label=f'Low {column}', color='red', capsize=5)
    plt.title(f'Total Mean {column} Levels with 95% Confidence Intervals')
    plt.ylabel(f'{column} Level')
    plt.legend()

    plt.tight_layout()
    plt.show()

    # Figure 2: histogram of the Visit1 - Visit5 difference for each group.
    plt.figure(figsize=(14, 6))

    plt.subplot(1, 2, 1)
    sns.histplot(high_group[f'{column}_diff'], bins=10, kde=True, color='blue')
    plt.title(f'High {column} Group - {column}_diff')
    plt.xlabel(f'{column}_diff (Visit1_{column} - Visit5_{column})')
    plt.ylabel('Count')

    plt.subplot(1, 2, 2)
    sns.histplot(low_group[f'{column}_diff'], bins=10, kde=True, color='red')
    plt.title(f'Low {column} Group - {column}_diff')
    plt.xlabel(f'{column}_diff (Visit1_{column} - Visit5_{column})')
    plt.ylabel('Count')

    plt.tight_layout()
    plt.show()

    # Figure 3: group means per visit (left) and box plot of the differences (right).
    plt.figure(figsize=(14, 6))

    plt.subplot(1, 2, 1)
    plt.plot(['Visit1', 'Visit5'], [high_group[f'Visit1_{column}'].mean(), high_group[f'Visit5_{column}'].mean()], marker='o', label=f'High {column}', color='blue')
    plt.plot(['Visit1', 'Visit5'], [low_group[f'Visit1_{column}'].mean(), low_group[f'Visit5_{column}'].mean()], marker='o', label=f'Low {column}', color='red')
    plt.title(f'Mean {column} Levels by Group')
    plt.ylabel(f'{column} Level')
    plt.legend()

    plt.subplot(1, 2, 2)
    sns.boxplot(data=[high_group[f'{column}_diff'], low_group[f'{column}_diff']], palette=['blue', 'red'])
    plt.xticks([0, 1], [f'High {column}', f'Low {column}'])
    plt.title(f'{column}_diff Box Plot by Group')
    plt.ylabel(f'{column}_diff (Visit1_{column} - Visit5_{column})')

    plt.tight_layout()
    plt.show()

# Run the tests, the plots and the summary statistics for one measurement.
def analyze_and_visualize(df, column):
    """Test, plot and summarise the Visit1 -> Visit5 change of `column`.

    The tests are printed for the whole frame and for the high / low groups
    returned by `process_data`; the figures follow, then the describe()
    tables. Nothing is returned.
    """
    visit_cols = [f'Visit1_{column}', f'Visit5_{column}']
    cleaned_df_total, high_group, low_group = process_data(df, column)

    # Same pair of tests for the whole cohort and for each group.
    cohorts = (
        (df, 'Total'),
        (high_group, f'High {column}'),
        (low_group, f'Low {column}'),
    )
    for frame, label in cohorts:
        perform_tests(frame[visit_cols[0]], frame[visit_cols[1]], label)

    visualize_data(df, high_group, low_group, column)

    # Descriptive statistics.
    print(f"\nTotal {column} Group Statistics:")
    print(cleaned_df_total.describe())

    print(f"\nHigh {column} Group Statistics:")
    print(high_group[visit_cols].describe())

    print(f"\nLow {column} Group Statistics:")
    print(low_group[visit_cols].describe())

# Run the analysis and the plots for the 'HB' columns
# (the original comment mentioned WBC, but the call analyses 'HB').
analyze_and_visualize(df, 'HB')


In [None]:
def p_value_stars(p_value):
    """Format a p-value as a significance label.

    Values below 0.001 / 0.01 / 0.05 get a fixed 'p < ...' label with stars.
    Anything else is printed with ONE decimal, so e.g. 0.06 is shown as
    'p = 0.1'.
    """
    levels = (
        (0.001, 'p < 0.001***'),
        (0.01, 'p < 0.01**'),
        (0.05, 'p < 0.05*'),
    )
    for cutoff, text in levels:
        if p_value < cutoff:
            return text
    return f"p = {p_value:.1f}"
        
# Mean and confidence-interval helper.
def mean_confidence_interval(data, confidence=0.95):
    """Return the mean of `data` and its two-sided t-based confidence interval.

    Missing values (NaN) are ignored, so they neither inflate the sample size
    nor turn the interval into NaN.

    Args:
        data: 1-D sequence / Series of numbers.
        confidence: confidence level of the interval (default 0.95).

    Returns:
        (mean, lower, upper). The bounds are NaN when fewer than two
        non-missing values are available; the mean is NaN as well when there
        are none.
    """
    values = np.asarray(data, dtype=float)
    values = values[~np.isnan(values)]
    n = values.size
    if n == 0:
        return np.nan, np.nan, np.nan
    m = np.mean(values)
    if n < 2:
        # The standard error is undefined for a single observation.
        return m, np.nan, np.nan
    se = stats.sem(values)
    # Half-width of the interval from the t distribution with n-1 degrees of freedom.
    h = se * stats.t.ppf((1 + confidence) / 2., n-1)
    return m, m-h, m+h
    
# Run the significance tests and collect the formatted p-values.
def perform_tests(group1, group2, label):
    """Run five two-sample tests on `group1` vs `group2` and return one result row.

    Returns a one-row DataFrame with the columns 'Group' (= `label`) and one
    formatted p-value per test. Exceptions raised by SciPy propagate.

    NOTE(review): the callers in this notebook pass the Visit1 and Visit5
    values of the same rows, i.e. paired data; the three independent-sample
    tests are computed on that same input — confirm which results are meant
    to be reported.
    """
    tests = (
        # Paired comparison of the means at the two time points.
        ('Paired t-test', lambda: stats.ttest_rel(group1, group2)),
        # Non-parametric test for paired samples.
        ('Wilcoxon signed-rank test', lambda: stats.wilcoxon(group1, group2)),
        # Comparison of the means of two independent groups.
        ('Independent Samples t-test', lambda: ttest_ind(group1, group2)),
        # Same, without assuming equal variances.
        ("Welch's t-test", lambda: ttest_ind(group1, group2, equal_var=False)),
        # Non-parametric test for two independent groups.
        ('Mann-Whitney U Test', lambda: stats.mannwhitneyu(group1, group2)),
    )

    results = {'Group': [label]}
    for column_name, run_test in tests:
        stat, p = run_test()
        results[column_name] = p_value_stars(p)

    return pd.DataFrame(results)
    
# Prepare the data and split it into a "low" and a "normal" baseline group.
def process_data(df, column):
    """Split `df` into low / normal groups by the Visit1 value of `column`.

    Cut-offs (a value below the cut-off is "low"):
        HB  : 13 for gender == 1 (male), 12 for gender == 0 (female)
        WBC : 4.8
        PLT : 150
        any other column : the HCT rule is applied to 'Visit1_HCT'
                           (42 for male, 37 for female)
    NOTE(review): the fall-through means a column other than HB/WBC/PLT/HCT
    is grouped by HCT — confirm that only these four are ever passed.

    Rows with a missing baseline value (or, for the sex-specific rules, a
    gender other than 0/1) belong to neither group.

    Returns:
        cleaned_df_total: the Visit1/Visit5 columns with incomplete rows dropped.
        Normal_group, low_group: independent copies of the matching rows with
        an added '<column>_diff' column (Visit1 minus Visit5). Copies are used
        so that adding the column never writes through a slice of `df`.
    """
    cleaned_df_total = df[[f'Visit1_{column}', f'Visit5_{column}']].dropna()

    single_cutoffs = {'WBC': 4.8, 'PLT': 150}

    if column in single_cutoffs:
        baseline = df[f'Visit1_{column}']
        cutoff = single_cutoffs[column]
        low_mask = baseline < cutoff
        normal_mask = baseline >= cutoff
    else:
        if column == 'HB':
            baseline, male_cutoff, female_cutoff = df['Visit1_HB'], 13, 12
        else:
            baseline, male_cutoff, female_cutoff = df['Visit1_HCT'], 42, 37
        is_male = df['gender'] == 1
        is_female = df['gender'] == 0
        low_mask = ((baseline < male_cutoff) & is_male) | ((baseline < female_cutoff) & is_female)
        normal_mask = ((baseline >= male_cutoff) & is_male) | ((baseline >= female_cutoff) & is_female)

    # .copy() makes the groups independent frames before the new column is added.
    Normal_group = df[normal_mask].copy()
    low_group = df[low_mask].copy()

    Normal_group[f'{column}_diff'] = Normal_group[f'Visit1_{column}'] - Normal_group[f'Visit5_{column}']
    low_group[f'{column}_diff'] = low_group[f'Visit1_{column}'] - low_group[f'Visit5_{column}']

    return cleaned_df_total, Normal_group, low_group
    
# Plotting helper.
def visualize_data(df, Normal_group, low_group, column):
    """Draw three figures for `column`: mean change, change distributions, group comparison.

    Args:
        df: frame with 'Visit1_<column>' and 'Visit5_<column>'.
        Normal_group, low_group: group frames that also carry '<column>_diff'.
        column: measurement stem used to build the column names.

    Each figure is shown with plt.show(); nothing is returned.
    """
    
    # Figure 1: mean at Visit1 and Visit5 with error bars for the total, normal and low groups.
    plt.figure(figsize=(8, 12))
    total_means = [df[f'Visit1_{column}'].mean(), df[f'Visit5_{column}'].mean()]
    total_ci = [mean_confidence_interval(df[f'Visit1_{column}']), mean_confidence_interval(df[f'Visit5_{column}'])]
    Normal_means = [Normal_group[f'Visit1_{column}'].mean(), Normal_group[f'Visit5_{column}'].mean()]
    Normal_ci = [mean_confidence_interval(Normal_group[f'Visit1_{column}']), mean_confidence_interval(Normal_group[f'Visit5_{column}'])]
    low_means = [low_group[f'Visit1_{column}'].mean(), low_group[f'Visit5_{column}'].mean()]
    low_ci = [mean_confidence_interval(low_group[f'Visit1_{column}']), mean_confidence_interval(low_group[f'Visit5_{column}'])]
    
    # Only the top panel of the 2x1 grid is drawn; yerr is the half-width of each interval.
    plt.subplot(2, 1, 1)
    plt.errorbar(['Visit1', 'Visit5'], total_means, yerr=[(ci[2]-ci[1])/2 for ci in total_ci], fmt='-o', label=f'Total {column}', color='green', capsize=5)
    plt.errorbar(['Visit1', 'Visit5'], Normal_means, yerr=[(ci[2]-ci[1])/2 for ci in Normal_ci], fmt='-o', label=f'Normal {column}', color='blue', capsize=5)
    plt.errorbar(['Visit1', 'Visit5'], low_means, yerr=[(ci[2]-ci[1])/2 for ci in low_ci], fmt='-o', label=f'Low {column}', color='red', capsize=5)
    plt.title(f'Total Mean {column} Levels with 95% Confidence Intervals')
    plt.ylabel(f'{column} Level')
    plt.legend()
    plt.tight_layout()
    plt.show()
    
    # Figure 2: histogram of the Visit1 - Visit5 difference for each group.
    plt.figure(figsize=(14, 6))
    plt.subplot(1, 2, 1)
    sns.histplot(Normal_group[f'{column}_diff'], bins=10, kde=True, color='blue')
    plt.title(f'Normal {column} Group - {column}_diff')
    plt.xlabel(f'{column}_diff (Visit1_{column} - Visit5_{column})')
    plt.ylabel('Count')
    plt.subplot(1, 2, 2)
    sns.histplot(low_group[f'{column}_diff'], bins=10, kde=True, color='red')
    plt.title(f'Low {column} Group - {column}_diff')
    plt.xlabel(f'{column}_diff (Visit1_{column} - Visit5_{column})')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.show()
    
    # Figure 3: group means per visit (left) and box plot of the differences (right).
    plt.figure(figsize=(14, 6))
    plt.subplot(1, 2, 1)
    plt.plot(['Visit1', 'Visit5'], [Normal_group[f'Visit1_{column}'].mean(), Normal_group[f'Visit5_{column}'].mean()], marker='o', label=f'Normal {column}', color='blue')
    plt.plot(['Visit1', 'Visit5'], [low_group[f'Visit1_{column}'].mean(), low_group[f'Visit5_{column}'].mean()], marker='o', label=f'Low {column}', color='red')
    plt.title(f'Mean {column} Levels by Group')
    plt.ylabel(f'{column} Level')
    plt.legend()
    plt.subplot(1, 2, 2)
    sns.boxplot(data=[Normal_group[f'{column}_diff'], low_group[f'{column}_diff']], palette=['blue', 'red'])
    plt.xticks([0, 1], [f'Normal {column}', f'Low {column}'])
    plt.title(f'{column}_diff Box Plot by Group')
    plt.ylabel(f'{column}_diff (Visit1_{column} - Visit5_{column})')
    plt.tight_layout()
    plt.show()
    
# Run the tests, the plots and the summary statistics for one measurement.
def analyze_and_visualize(df, column):
    """Test, plot and summarise the Visit1 -> Visit5 change of `column`.

    Returns:
        (total, Normal, low, sf_total, sf_Normal, sf_low): the three test
        result rows from `perform_tests` followed by the describe() tables of
        the cleaned total, the normal group and the low group.
    """
    visit_cols = [f'Visit1_{column}', f'Visit5_{column}']
    cleaned_df_total, Normal_group, low_group = process_data(df, column)

    # Same battery of tests for the whole cohort and for each group.
    cohorts = [
        (df, 'Total'),
        (Normal_group, f'Normal {column}'),
        (low_group, f'Low {column}'),
    ]
    total, Normal, low = [
        perform_tests(frame[visit_cols[0]], frame[visit_cols[1]], label)
        for frame, label in cohorts
    ]

    visualize_data(df, Normal_group, low_group, column)

    # Descriptive statistics.
    sf_total = cleaned_df_total.describe()
    sf_Normal = Normal_group[visit_cols].describe()
    sf_low = low_group[visit_cols].describe()

    return total, Normal, low, sf_total, sf_Normal, sf_low

# Round every value to one decimal place.
def round_to_one_decimal(df):
    """Return a copy of `df` with every numeric value rounded to one decimal."""
    rounded = df.round(decimals=1)
    return rounded

# Run the whole analysis for 'HB'.
total,Normal,low,sf_total,sf_Normal,sf_low = analyze_and_visualize(df, 'HB')

# Round the summary tables to one decimal place.
sf_total = round_to_one_decimal(sf_total)
sf_Normal = round_to_one_decimal(sf_Normal)
sf_low = round_to_one_decimal(sf_low)

# Print the rounded tables.
print(sf_total)
print(sf_Normal)
print(sf_low)


# Prefix the group column names so they stay distinguishable after the tables are joined.
sf_Normal.columns = [f'Normal_{col}' for col in sf_Normal.columns]
sf_low.columns = [f'Low_{col}' for col in sf_low.columns]

# Combine: result1 stacks the test rows, result2 puts the summary tables side by side.
result1 = pd.concat([total,Normal,low])
result2 = pd.concat([sf_total,sf_Normal,sf_low],axis=1)

In [None]:
# 결과 1 검정 결과 저장
result1.to_excel('기준점_test_result.xlsx',index=False)
# 결과 2 전체 기초 통계 저장
result2.to_excel('기준점_sf_result.xlsx')

In [None]:
def p_value_stars(p_value):
    """Format a p-value as a significance label.

    Values below 0.001 / 0.01 / 0.05 get a fixed 'p < ...' label with stars.
    Anything else is printed with ONE decimal, so e.g. 0.06 is shown as
    'p = 0.1'.
    """
    if p_value >= 0.05:
        return f"p = {p_value:.1f}"
    if p_value >= 0.01:
        return 'p < 0.05*'
    if p_value >= 0.001:
        return 'p < 0.01**'
    if p_value < 0.001:
        return 'p < 0.001***'
    # Only reached for NaN, where every comparison above is False.
    return f"p = {p_value:.1f}"
        
# Mean and confidence-interval helper.
def mean_confidence_interval(data, confidence=0.95):
    """Return the mean of `data` and its two-sided t-based confidence interval.

    Missing values (NaN) are ignored. Empty or single-value groups (which the
    fixed-width bins in this cell can produce) yield NaN bounds without
    triggering NumPy / SciPy warnings.

    Args:
        data: 1-D sequence / Series of numbers.
        confidence: confidence level of the interval (default 0.95).

    Returns:
        (mean, lower, upper). The bounds are NaN when fewer than two
        non-missing values are available; the mean is NaN as well when there
        are none.
    """
    values = np.asarray(data, dtype=float)
    values = values[~np.isnan(values)]
    n = values.size
    if n == 0:
        return np.nan, np.nan, np.nan
    m = np.mean(values)
    if n < 2:
        # The standard error is undefined for a single observation.
        return m, np.nan, np.nan
    se = stats.sem(values)
    # Half-width of the interval from the t distribution with n-1 degrees of freedom.
    h = se * stats.t.ppf((1 + confidence) / 2., n-1)
    return m, m-h, m+h
    
# Run the significance tests and collect the formatted p-values.
def perform_tests(group1, group2, label):
    """Run five two-sample tests on `group1` vs `group2` and return one result row.

    Returns a one-row DataFrame with the columns 'Group' (= `label`) and one
    entry per test: the formatted p-value, or 'N/A' when the test (or the
    formatting of its p-value) raises ValueError. Other exceptions propagate.

    NOTE(review): the callers in this notebook pass the Visit1 and Visit5
    values of the same rows, i.e. paired data; the three independent-sample
    tests are computed on that same input — confirm which results are meant
    to be reported.
    """
    tests = (
        # Paired comparison of the means at the two time points.
        ('Paired t-test', lambda: stats.ttest_rel(group1, group2)),
        # Non-parametric test for paired samples.
        ('Wilcoxon signed-rank test', lambda: stats.wilcoxon(group1, group2)),
        # Comparison of the means of two independent groups.
        ('Independent Samples t-test', lambda: ttest_ind(group1, group2)),
        # Same, without assuming equal variances.
        ("Welch's t-test", lambda: ttest_ind(group1, group2, equal_var=False)),
        # Non-parametric test for two independent groups.
        ('Mann-Whitney U Test', lambda: stats.mannwhitneyu(group1, group2)),
    )

    results = {'Group': [label]}
    for column_name, run_test in tests:
        try:
            stat, p = run_test()
            results[column_name] = p_value_stars(p)
        except ValueError:
            results[column_name] = 'N/A'

    return pd.DataFrame(results)
    
# Prepare the data and split it into baseline bins.
def process_data(df, column, ranges=None):
    """Split `df` into bins of the Visit1 value of `column`.

    Args:
        df: frame with 'Visit1_<column>' and 'Visit5_<column>'.
        column: measurement stem used to build the column names.
        ranges: optional iterable of (low, high) pairs; a row falls into a
            bin when low <= Visit1 value < high. Defaults to the ten 1-unit
            bins from 9 to 19 that were previously hard-coded.

    Returns:
        cleaned_df_total: the Visit1/Visit5 columns with incomplete rows dropped.
        groups: list of (label, frame) pairs, one per bin, in the order of
            `ranges`. Each frame is a copy with an added '<column>_diff'
            column (Visit1 minus Visit5). Bins may be empty, and rows outside
            every bin (or with a missing Visit1 value) appear in no group.
    """
    if ranges is None:
        ranges = [(9, 10), (10, 11), (11, 12), (12, 13), (13, 14), (14, 15), (15, 16), (16, 17), (17, 18), (18, 19)]

    cleaned_df_total = df[[f'Visit1_{column}', f'Visit5_{column}']].dropna()

    groups = []
    for low, high in ranges:
        condition = (df[f'Visit1_{column}'] >= low) & (df[f'Visit1_{column}'] < high)
        group = df[condition].copy()
        group[f'{column}_diff'] = group[f'Visit1_{column}'] - group[f'Visit5_{column}']
        groups.append((f'{low} ≤ {column} < {high}', group))

    return cleaned_df_total, groups
    
# Plotting helper.
def visualize_data(df, groups, column):
    """Draw three figures for `column`: mean change, change distributions, group comparison.

    Args:
        df: frame with 'Visit1_<column>' and 'Visit5_<column>'.
        groups: list of (label, frame) pairs; each frame also carries '<column>_diff'.
        column: measurement stem used to build the column names.

    The first two figures are shown with plt.show(); the third is left open
    because its plt.show() is part of the commented-out block at the end.
    Nothing is returned.
    """
    # Figure 1: mean at Visit1 and Visit5 with error bars, for the total and for every group.
    plt.figure(figsize=(8, 12))
    total_means = [df[f'Visit1_{column}'].mean(), df[f'Visit5_{column}'].mean()]
    total_ci = [mean_confidence_interval(df[f'Visit1_{column}']), mean_confidence_interval(df[f'Visit5_{column}'])]
    
    # Only the top panel of the 2x1 grid is drawn; yerr is the half-width of each interval.
    plt.subplot(2, 1, 1)
    plt.errorbar(['Visit1', 'Visit5'], total_means, yerr=[(ci[2]-ci[1])/2 for ci in total_ci], fmt='-o', label=f'Total {column}', color='green', capsize=5)
    
    for label, group in groups:
        means = [group[f'Visit1_{column}'].mean(), group[f'Visit5_{column}'].mean()]
        ci = [mean_confidence_interval(group[f'Visit1_{column}']), mean_confidence_interval(group[f'Visit5_{column}'])]
        # The comprehension variable reuses the name `ci`; it iterates over the list built on the line above.
        plt.errorbar(['Visit1', 'Visit5'], means, yerr=[(ci[2]-ci[1])/2 for ci in ci], fmt='-o', label=label, capsize=5)
    
    plt.title(f'Total Mean {column} Levels with 95% Confidence Intervals')
    plt.ylabel(f'{column} Level')
    plt.legend()
    plt.tight_layout()
    plt.show()
    
    # Figure 2: one histogram of the Visit1 - Visit5 difference per group, stacked vertically.
    plt.figure(figsize=(14, 6))
    for i, (label, group) in enumerate(groups, start=1):
        plt.subplot(len(groups), 1, i)
        sns.histplot(group[f'{column}_diff'], bins=10, kde=True)
        plt.title(f'{label} - {column}_diff')
        plt.xlabel(f'{column}_diff (Visit1_{column} - Visit5_{column})')
        plt.ylabel('Count')
    plt.tight_layout()
    plt.show()
    
    # Figure 3: group means per visit (left panel only; the box plot is disabled below).
    plt.figure(figsize=(14, 6))
    plt.subplot(1, 2, 1)
    for label, group in groups:
        plt.plot(['Visit1', 'Visit5'], [group[f'Visit1_{column}'].mean(), group[f'Visit5_{column}'].mean()], marker='o', label=label)
    plt.title(f'Mean {column} Levels by Group')
    plt.ylabel(f'{column} Level')
    # Place the legend below the axes, centred, in five columns.
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=5)
    
    # Disabled: box plot of the differences per group (right panel).
    # plt.subplot(1, 2, 2)
    # sns.boxplot(data=[group[f'{column}_diff'] for label, group in groups], palette='Set2')
    # plt.xticks(range(len(groups)), [label for label, group in groups])
    # plt.title(f'{column}_diff Box Plot by Group')
    # plt.ylabel(f'{column}_diff (Visit1_{column} - Visit5_{column})')
    # plt.tight_layout()
    # plt.show()
    
def analyze_and_visualize(df, column):
    """Test, plot and summarise the Visit1 -> Visit5 change of `column` per group.

    Returns:
        (total, group_results, sf_total, sf_group_stats): the test row for the
        whole frame, one test row per group, the describe() table of the
        cleaned total, and one describe() table per group whose column names
        are prefixed with the group label.
    """
    visit_cols = [f'Visit1_{column}', f'Visit5_{column}']
    cleaned_df_total, groups = process_data(df, column)

    total = perform_tests(df[visit_cols[0]], df[visit_cols[1]], 'Total')
    group_results = [
        perform_tests(group[visit_cols[0]], group[visit_cols[1]], label)
        for label, group in groups
    ]

    visualize_data(df, groups, column)

    # Descriptive statistics.
    sf_total = cleaned_df_total.describe()
    sf_group_stats = []
    for label, group in groups:
        summary = group[visit_cols].describe()
        summary.columns = [f'{label}_{col}' for col in summary.columns]
        sf_group_stats.append(summary)

    return total, group_results, sf_total, sf_group_stats

# Round every value to one decimal place.
def round_to_one_decimal(df):
    """Return a copy of `df` with every numeric value rounded to one decimal."""
    rounded = df.round(decimals=1)
    return rounded

# Run the whole analysis for 'HB'.
total, group_results, sf_total, sf_group_stats = analyze_and_visualize(df, 'HB')

# Round the summary tables to one decimal place.
sf_total = round_to_one_decimal(sf_total)
sf_group_stats = [round_to_one_decimal(sf_group_stat) for sf_group_stat in sf_group_stats]

# Print the rounded tables.
print(sf_total)
for sf_group_stat in sf_group_stats:
    print(sf_group_stat)

# Combine: group summaries side by side, test rows stacked (total first).
sf_group_stats_combined = pd.concat(sf_group_stats, axis=1)
result = pd.concat([total] + group_results)

# Print the combined results.
print(result)
print(sf_group_stats_combined)

In [None]:
# Save the per-bin test results and descriptive statistics to Excel.
# Note: unlike the earlier test-result export, `result` is written with its index column.
result.to_excel('구간별로테스트.xlsx')
sf_group_stats_combined.to_excel('구간별로기본통계.xlsx')

In [None]:
def p_value_stars(p_value):
    """Format a p-value as a significance label.

    Values below 0.001 / 0.01 / 0.05 get a fixed 'p < ...' label with stars.
    Anything else is printed with ONE decimal, so e.g. 0.06 is shown as
    'p = 0.1'.
    """
    levels = (
        (0.001, 'p < 0.001***'),
        (0.01, 'p < 0.01**'),
        (0.05, 'p < 0.05*'),
    )
    matched = [text for cutoff, text in levels if p_value < cutoff]
    # The tightest threshold that applies comes first in `levels`.
    return matched[0] if matched else f"p = {p_value:.1f}"
        
# Mean and confidence-interval helper.
def mean_confidence_interval(data, confidence=0.95):
    """Return the mean of `data` and its two-sided t-based confidence interval.

    Missing values (NaN) are ignored. Empty or single-value groups (which the
    threshold groups in this cell can produce) yield NaN bounds without
    triggering NumPy / SciPy warnings.

    Args:
        data: 1-D sequence / Series of numbers.
        confidence: confidence level of the interval (default 0.95).

    Returns:
        (mean, lower, upper). The bounds are NaN when fewer than two
        non-missing values are available; the mean is NaN as well when there
        are none.
    """
    values = np.asarray(data, dtype=float)
    values = values[~np.isnan(values)]
    n = values.size
    if n == 0:
        return np.nan, np.nan, np.nan
    m = np.mean(values)
    if n < 2:
        # The standard error is undefined for a single observation.
        return m, np.nan, np.nan
    se = stats.sem(values)
    # Half-width of the interval from the t distribution with n-1 degrees of freedom.
    h = se * stats.t.ppf((1 + confidence) / 2., n-1)
    return m, m-h, m+h
    
# Run the significance tests and collect the formatted p-values.
def perform_tests(group1, group2, label):
    """Run five two-sample tests on `group1` vs `group2` and return one result row.

    Returns a one-row DataFrame with the columns 'Group' (= `label`) and one
    entry per test: the formatted p-value, or 'N/A' when the test (or the
    formatting of its p-value) raises ValueError. Other exceptions propagate.
    """
    def _safe_label(run_test):
        # Format the p-value of one test, falling back to 'N/A' on ValueError.
        try:
            stat, p = run_test()
            return p_value_stars(p)
        except ValueError:
            return 'N/A'

    results = {'Group': [label]}
    # Paired comparison of the means at the two time points.
    results['Paired t-test'] = _safe_label(lambda: stats.ttest_rel(group1, group2))
    # Non-parametric test for paired samples.
    results['Wilcoxon signed-rank test'] = _safe_label(lambda: stats.wilcoxon(group1, group2))
    # Comparison of the means of two independent groups.
    results['Independent Samples t-test'] = _safe_label(lambda: ttest_ind(group1, group2))
    # Same, without assuming equal variances.
    results["Welch's t-test"] = _safe_label(lambda: ttest_ind(group1, group2, equal_var=False))
    # Non-parametric test for two independent groups.
    results['Mann-Whitney U Test'] = _safe_label(lambda: stats.mannwhitneyu(group1, group2))

    return pd.DataFrame(results)
    
# Prepare the data and build cumulative "below threshold" groups.
def process_data(df, column, thresholds=None):
    """Build one group per upper threshold of the Visit1 value of `column`.

    Args:
        df: frame with 'Visit1_<column>' and 'Visit5_<column>'.
        column: measurement stem used to build the column names.
        thresholds: optional iterable of upper bounds; a row belongs to a
            group when its Visit1 value is strictly below the bound. Defaults
            to the integers 10 through 19 that were previously hard-coded.

    Returns:
        cleaned_df_total: the Visit1/Visit5 columns with incomplete rows dropped.
        groups: list of (label, frame) pairs in the order of `thresholds`.
            The groups are cumulative (a row below 10 is also in every later
            group). Each frame is a copy with an added '<column>_diff' column
            (Visit1 minus Visit5).
    """
    if thresholds is None:
        thresholds = [10,11,12, 13, 14, 15, 16, 17, 18, 19]

    cleaned_df_total = df[[f'Visit1_{column}', f'Visit5_{column}']].dropna()

    groups = []
    for high in thresholds:
        condition = (df[f'Visit1_{column}'] < high)
        group = df[condition].copy()
        group[f'{column}_diff'] = group[f'Visit1_{column}'] - group[f'Visit5_{column}']
        groups.append((f'{column} < {high}', group))

    return cleaned_df_total, groups
    
# Visualization
def visualize_data(df, groups, column):
    """Plot Visit 1 vs Visit 5 levels of ``column`` for the whole cohort and each group.

    Three figures are drawn:
    1. means with 95% confidence-interval error bars (total and every group),
    2. one histogram of ``<column>_diff`` per group,
    3. mean levels per group as plain lines.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Visit1_<column>`` and ``Visit5_<column>``.
    groups : list of (str, pandas.DataFrame)
        Labelled subsets as returned by ``process_data``; each must also have
        a ``<column>_diff`` column.
    column : str
        Measurement suffix, e.g. ``'HB'``.
    """
    visit_cols = [f'Visit1_{column}', f'Visit5_{column}']
    visit_labels = ['Visit1', 'Visit5']

    def means_and_halfwidths(frame):
        """Return per-visit means and CI half-widths, ignoring missing values."""
        means, halfwidths = [], []
        for col in visit_cols:
            # Drop NaN first; otherwise one missing value makes the interval NaN
            # and the error bar silently disappears.
            _, lower, upper = mean_confidence_interval(frame[col].dropna())
            means.append(frame[col].mean())
            halfwidths.append((upper - lower) / 2)
        return means, halfwidths

    # Figure 1: overall and per-group means with 95% CI
    plt.figure(figsize=(8, 12))
    plt.subplot(2, 1, 1)
    total_means, total_err = means_and_halfwidths(df)
    plt.errorbar(visit_labels, total_means, yerr=total_err, fmt='-o', label=f'Total {column}', color='green', capsize=5)

    for label, group in groups:
        group_means, group_err = means_and_halfwidths(group)
        plt.errorbar(visit_labels, group_means, yerr=group_err, fmt='-o', label=label, capsize=5)

    plt.title(f'Total Mean {column} Levels with 95% Confidence Intervals')
    plt.ylabel(f'{column} Level')
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Figure 2: distribution of the change per group (one row per group).
    # The height grows with the number of groups so the panels stay readable.
    plt.figure(figsize=(14, max(6, 3 * len(groups))))
    for i, (label, group) in enumerate(groups, start=1):
        plt.subplot(len(groups), 1, i)
        sns.histplot(group[f'{column}_diff'], bins=10, kde=True)
        plt.title(f'{label} - {column}_diff')
        plt.xlabel(f'{column}_diff (Visit1_{column} - Visit5_{column})')
        plt.ylabel('Count')
    plt.tight_layout()
    plt.show()

    # Figure 3: mean level per group (left panel; the right panel is the
    # disabled box plot below)
    plt.figure(figsize=(14, 6))
    plt.subplot(1, 2, 1)
    for label, group in groups:
        plt.plot(visit_labels, [group[col].mean() for col in visit_cols], marker='o', label=label)
    plt.title(f'Mean {column} Levels by Group')
    plt.ylabel(f'{column} Level')
    # Legend below the axes, centred, wrapped onto two rows
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=5)

    # plt.subplot(1, 2, 2)
    # sns.boxplot(data=[group[f'{column}_diff'] for label, group in groups], palette='Set2')
    # plt.xticks(range(len(groups)), [label for label, group in groups])
    # plt.title(f'{column}_diff Box Plot by Group')
    # plt.ylabel(f'{column}_diff (Visit1_{column} - Visit5_{column})')
    plt.show()
    
def analyze_and_visualize(df, column):
    """Run the Visit 1 vs Visit 5 tests, draw the plots and collect summary statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Visit1_<column>`` and ``Visit5_<column>``.
    column : str
        Measurement suffix, e.g. ``'HB'``.

    Returns
    -------
    total : pandas.DataFrame
        Test results for the whole cohort (one row).
    group_results : list of pandas.DataFrame
        One single-row test result per threshold group.
    sf_total : pandas.DataFrame
        ``describe()`` of the complete-case cohort.
    sf_group_stats : list of pandas.DataFrame
        ``describe()`` per group, columns prefixed with the group label.
    """
    visit1_col = f'Visit1_{column}'
    visit5_col = f'Visit5_{column}'

    cleaned_df_total, groups = process_data(df, column)

    # Test on complete cases only: the raw columns contain NaN, which makes
    # the scipy tests return NaN instead of a p-value.
    total = perform_tests(cleaned_df_total[visit1_col], cleaned_df_total[visit5_col], 'Total')

    group_results = []
    for label, group in groups:
        complete = group.dropna(subset=[visit1_col, visit5_col])
        group_results.append(perform_tests(complete[visit1_col], complete[visit5_col], label))

    visualize_data(df, groups, column)

    # Descriptive statistics
    sf_total = cleaned_df_total.describe()
    sf_group_stats = []
    for label, group in groups:
        sf_group_stat = group[[visit1_col, visit5_col]].describe()
        sf_group_stat.columns = [f'{label}_{col}' for col in sf_group_stat.columns]
        sf_group_stats.append(sf_group_stat)

    return total, group_results, sf_total, sf_group_stats

# Round every value to one decimal place
def round_to_one_decimal(df):
    """Return a rounded copy of ``df`` (one decimal place); ``df`` itself is not modified."""
    one_decimal = df.round(decimals=1)
    return one_decimal

# Run the full analysis for haemoglobin (HB)
total, group_results, sf_total, sf_group_stats = analyze_and_visualize(df, 'HB')

# Round the descriptive statistics to one decimal place
sf_total = round_to_one_decimal(sf_total)
sf_group_stats = list(map(round_to_one_decimal, sf_group_stats))

# Show the rounded statistics: cohort first, then each group
print(sf_total)
for sf_group_stat in sf_group_stats:
    print(sf_group_stat)

# Combine: group statistics side by side, test results stacked row-wise
sf_group_stats_combined = pd.concat(sf_group_stats, axis=1)
result = pd.concat([total, *group_results])

# Show the combined tables
print(result)
print(sf_group_stats_combined)

In [None]:
# Write the test results and the combined per-group statistics to Excel files
# in the current working directory.
result.to_excel('미만 테스트.xlsx')
sf_group_stats_combined.to_excel('미만 통계.xlsx')

In [None]:
def calculate_percentage(group_count, total_count):
    """
    Express ``group_count`` as a percentage of ``total_count``.

    :param group_count: size of the group
    :param total_count: size of the whole population
    :return: percentage rounded to one decimal place; 0 when ``total_count`` is 0
    """
    return 0 if total_count == 0 else round(group_count / total_count * 100, 1)


# Share of 264 out of 265
group_count, total_count = 264, 265
percentage = calculate_percentage(group_count, total_count)

print(f"Group Count: {group_count}, Total Count: {total_count}, Percentage: {percentage}%")


In [None]:
# Extract the gastrointestinal-bleeding columns
bleeding_cols = [
    f'Visit{visit}_{kind}'
    for visit in range(1, 6)
    for kind in ('hematochezia', 'melena', 'hematemesis')
] + ['previous bleeding']

# Work on an independent copy: new columns are added below, and assigning into
# a slice of ``df`` triggers SettingWithCopyWarning / may not stick.
bleeding_data = df[bleeding_cols].copy()

# Per-visit flag: 1 if the patient had any kind of bleeding at that visit, else 0.
# Visit 1 additionally counts the 'previous bleeding' column.
for visit in range(1, 6):
    flag_cols = [f'Visit{visit}_{kind}' for kind in ('hematochezia', 'melena', 'hematemesis')]
    if visit == 1:
        flag_cols.append('previous bleeding')
    bleeding_data[f'Visit{visit}'] = bleeding_data[flag_cols].any(axis=1).astype(int)

# Number of patients with bleeding at each visit
bleeding_summary = {
    f'Visit{visit}': bleeding_data[f'Visit{visit}'].sum() for visit in range(1, 6)
}

bleeding_summary_df = pd.DataFrame(list(bleeding_summary.items()), columns=['Visit', 'Total_Bleeding'])

# Line chart of the bleeding frequency per visit
plt.figure(figsize=(10, 6))
plt.plot(bleeding_summary_df['Visit'], bleeding_summary_df['Total_Bleeding'], marker='o', linestyle='-', color='red')
plt.title('Total Bleeding Events per Visit')
plt.xlabel('Visit')
plt.ylabel('Total Bleeding Events')
plt.grid(True)
plt.show()


In [None]:
# Bleeding counts per visit, one row per bleeding type
bleeding_summary = {
    'Bleeding': ['Hematochezia', 'Melena', 'Hematemesis'],
    **{
        f'Visit{visit}': [
            bleeding_data[f'Visit{visit}_{kind}'].sum()
            for kind in ('hematochezia', 'melena', 'hematemesis')
        ]
        for visit in range(1, 6)
    },
}

bleeding_summary_df = pd.DataFrame(bleeding_summary)

# Plain-text table
print(bleeding_summary_df)

# HTML table (for inclusion in the paper)
bleeding_summary_df.to_html('bleeding_summary.html', index=False)

# Render the table as a figure
fig, ax = plt.subplots(figsize=(10, 6))
ax.axis('tight')
ax.axis('off')
ax.table(
    cellText=bleeding_summary_df.values,
    colLabels=bleeding_summary_df.columns,
    cellLoc='center',
    loc='center',
)
plt.title('Bleeding Events Summary per Visit')
plt.show()

# One line per bleeding type across the visits
plt.figure(figsize=(10, 6))
for symptom in bleeding_summary_df['Bleeding']:
    symptom_row = bleeding_summary_df.loc[bleeding_summary_df['Bleeding'] == symptom]
    plt.plot(
        bleeding_summary_df.columns[1:],
        symptom_row.values.flatten()[1:],
        marker='o',
        linestyle='-',
        label=symptom,
    )

plt.title('Total Bleeding Events per Visit')
plt.xlabel('Visit')
plt.ylabel('Total Bleeding Events')
plt.legend()
plt.show()


In [None]:
# Bleeding counts per visit, one row per bleeding type
bleeding_summary = {
    'Bleeding': ['Hematochezia', 'Melena', 'Hematemesis'],
    **{
        f'Visit{visit}': [
            bleeding_data[f'Visit{visit}_{kind}'].sum()
            for kind in ('hematochezia', 'melena', 'hematemesis')
        ]
        for visit in range(1, 6)
    },
}

bleeding_summary_df = pd.DataFrame(bleeding_summary)

# Plain-text table
print(bleeding_summary_df)

# HTML table (for inclusion in the paper)
bleeding_summary_df.to_html('bleeding_summary.html', index=False)


def friedman_report(name, samples):
    """Run Friedman's test on per-patient measurements and print the outcome.

    Parameters
    ----------
    name : str
        Label used in the printed messages.
    samples : list of array-like
        One sequence per visit, aligned by patient.

    Returns
    -------
    tuple
        ``(statistic, p_value)``; ``(nan, nan)`` when no visit shows more than
        one distinct value, in which case the test is skipped.
    """
    if np.any([len(np.unique(x)) > 1 for x in samples]):
        stat, p = friedmanchisquare(*samples)
        print('%s - Statistics=%.3f, p=%.3f' % (name, stat, p))
        if p < 0.05:
            print(f'The differences between visits for {name} are statistically significant (reject H0)')
        else:
            print(f'The differences between visits for {name} are not statistically significant (fail to reject H0)')
        return stat, p
    print(f'{name} has no variance across visits, cannot perform Friedman’s test')
    return np.nan, np.nan


# Friedman's test for hematochezia
hematochezia = [bleeding_data[f'Visit{visit}_hematochezia'] for visit in range(1, 6)]
stat_h, p_h = friedman_report('Hematochezia', hematochezia)

# Friedman's test for melena
melena = [bleeding_data[f'Visit{visit}_melena'] for visit in range(1, 6)]
stat_m, p_m = friedman_report('Melena', melena)

# Friedman's test for hematemesis.
# Uses the per-patient columns like the other two types; previously the five
# visit totals from the summary table (single scalars) were passed, so the
# test could never run and always reported "no variance".
hematemesis = [bleeding_data[f'Visit{visit}_hematemesis'] for visit in range(1, 6)]
stat_he, p_he = friedman_report('Hematemesis', hematemesis)


In [None]:
# Bleeding counts per visit with an added "Total" row.
# Requires bleeding_data from the extraction cell.
bleeding_summary = {
    'Bleeding': ['Hematochezia', 'Melena', 'Hematemesis'],
    **{
        f'Visit{visit}': [
            bleeding_data[f'Visit{visit}_{kind}'].sum()
            for kind in ('hematochezia', 'melena', 'hematemesis')
        ]
        for visit in range(1, 6)
    },
}

bleeding_summary_df = pd.DataFrame(bleeding_summary)

# Total bleeding events per visit (Series indexed by 'Visit1'..'Visit5')
total_bleeding = bleeding_summary_df.iloc[:, 1:].sum(axis=0)
# Build the row from the labels; integer-position access such as
# total_bleeding[0] on a label-indexed Series is deprecated in pandas.
total_row = pd.DataFrame({'Bleeding': 'Total', **total_bleeding.to_dict()}, index=[0])

# Append total row to the dataframe
bleeding_summary_df = pd.concat([bleeding_summary_df, total_row], ignore_index=True)

# Display table
print(bleeding_summary_df)

# Save as HTML table
bleeding_summary_df.to_html('bleeding_summary.html', index=False)

# Visualize table
fig, ax = plt.subplots(figsize=(10, 6))
ax.axis('tight')
ax.axis('off')
ax.table(cellText=bleeding_summary_df.values, colLabels=bleeding_summary_df.columns, cellLoc='center', loc='center')
plt.title('Bleeding Events Summary per Visit')
plt.show()

# Plot one line per bleeding type (and the total) across visits
plt.figure(figsize=(10, 6))
for symptom in bleeding_summary_df['Bleeding']:
    plt.plot(bleeding_summary_df.columns[1:], bleeding_summary_df.loc[bleeding_summary_df['Bleeding'] == symptom].values.flatten()[1:], marker='o', linestyle='-', label=symptom)

plt.title('Total Bleeding Events per Visit')
plt.xlabel('Visit')
plt.ylabel('Total Bleeding Events')
plt.legend()
plt.show()


In [None]:
# Display the bleeding table for inspection.
bleeding_data

In [None]:
# Friedman's test across the five visits, once per bleeding type
for symptom in ['Hematochezia', 'Melena', 'Hematemesis']:
    column_suffix = symptom.lower()
    symptom_data = [bleeding_data[f'Visit{visit}_{column_suffix}'] for visit in range(1, 6)]

    stat, p = friedmanchisquare(*symptom_data)

    print(f'{symptom} - Statistics={stat:.3f}, p={p:.3f}')
    verdict = (
        'statistically significant (reject H0)'
        if p < 0.05
        else 'not statistically significant (fail to reject H0)'
    )
    print(f'The differences between visits for {symptom} are {verdict}')

In [None]:

# Replace NaN with 0
bleeding_data = bleeding_data.fillna(0)

bleeding_list = ['hematochezia', 'melena', 'hematemesis']
kruskal_results = {}

for symptom in bleeding_list:
    visit_data = [bleeding_data[f'Visit{visit}_{symptom}'] for visit in range(1, 6)]

    # When every value is identical scipy raises; record H = 0, p = 1.0 instead.
    # Any other ValueError is re-raised.
    try:
        outcome = kruskal(*visit_data)
    except ValueError as e:
        if str(e) != 'All numbers are identical in kruskal':
            raise
        outcome = (0, 1.0)  # H-statistic = 0, p-value = 1.0
    kruskal_results[symptom] = outcome

# Print the results
for symptom, result in kruskal_results.items():
    if isinstance(result, tuple):
        print(f"{symptom.capitalize()} Kruskal-Wallis test result: H-statistic = {result[0]}, p-value = {result[1]}")
    else:
        print(f"{symptom.capitalize()} Kruskal-Wallis test result: H-statistic = {result.statistic}, p-value = {result.pvalue}")

In [None]:
# Replace NaN with 0
bleeding_data = bleeding_data.fillna(0)

bleeding_list = ['hematochezia', 'melena', 'hematemesis']
fisher_results = {}

for symptom in bleeding_list:
    fisher_results[symptom] = []
    # Every pair of visits (i < j)
    for i in range(1, 5):
        for j in range(i + 1, 6):
            visit_i = bleeding_data[f'Visit{i}_{symptom}']
            visit_j = bleeding_data[f'Visit{j}_{symptom}']

            # Cross-tabulate the two visits. This line was commented out, so
            # fisher_exact ran on whatever `contingency_table` another cell had
            # left behind (or raised NameError on a fresh kernel).
            contingency_table = pd.crosstab(visit_i, visit_j)

            # Fisher's exact test; tables that are not 2x2 raise ValueError
            # and are recorded as 'NA'.
            try:
                odds_ratio, p_value = fisher_exact(contingency_table)
                fisher_results[symptom].append((f'Visit{i}_vs_Visit{j}', odds_ratio, p_value))
            except ValueError:
                fisher_results[symptom].append((f'Visit{i}_vs_Visit{j}', 'NA', 'NA'))

# Print the results
for symptom, results in fisher_results.items():
    print(f"\n{symptom.capitalize()} Fisher's exact test results:")
    for result in results:
        print(f"{result[0]}: Odds Ratio = {result[1]}, p-value = {result[2]}")



In [None]:
# Symptom counts per visit (all five visits), built from df
symptom_keys = ['heartburn', 'nausea', 'vomiting', 'diarrhea', 'abdominal_pain', 'dyspepsia', 'other']
symptoms_cols = [f'Visit{visit}_{key}' for visit in range(1, 6) for key in symptom_keys]

symptoms_data = df[symptoms_cols]

# One list per visit, in the same order as the 'Symptom' labels.
# Generating the lists from symptom_keys keeps rows and labels aligned; the
# hand-written version summed Visit1_dyspepsia twice, so the Visit 1 "Other"
# count was wrong.
symptoms_summary = {
    'Symptom': ['Heartburn', 'Nausea', 'Vomiting', 'Diarrhea', 'Abdominal Pain', 'Dyspepsia', 'Other'],
    **{
        f'Visit{visit}': [symptoms_data[f'Visit{visit}_{key}'].sum() for key in symptom_keys]
        for visit in range(1, 6)
    },
}

symptoms_summary_df = pd.DataFrame(symptoms_summary)

# Total symptoms per visit (Series indexed by 'Visit1'..'Visit5')
total_symptoms = symptoms_summary_df.iloc[:, 1:].sum(axis=0)
# Build the row from the labels; integer-position access such as
# total_symptoms[0] on a label-indexed Series is deprecated in pandas.
total_row = pd.DataFrame({'Symptom': 'Total', **total_symptoms.to_dict()}, index=[0])

# Append total row to the dataframe
symptoms_summary_df = pd.concat([symptoms_summary_df, total_row], ignore_index=True)

# Display table
print(symptoms_summary_df)

# Save as HTML table
symptoms_summary_df.to_html('symptoms_summary.html', index=False)

# Visualize table
fig, ax = plt.subplots(figsize=(10, 6))
ax.axis('tight')
ax.axis('off')
ax.table(cellText=symptoms_summary_df.values, colLabels=symptoms_summary_df.columns, cellLoc='center', loc='center')
plt.title('Symptoms Summary per Visit')
plt.show()

# Plot one line per symptom (and the total) across visits
plt.figure(figsize=(10, 6))
for symptom in symptoms_summary_df['Symptom']:
    plt.plot(symptoms_summary_df.columns[1:], symptoms_summary_df.loc[symptoms_summary_df['Symptom'] == symptom].values.flatten()[1:], marker='o', linestyle='-', label=symptom)

plt.title('Total Symptoms per Visit')
plt.xlabel('Visit')
plt.ylabel('Total Symptoms')
plt.legend()
plt.show()


In [None]:
# Symptom counts at the first and last visit only
symptom_suffixes = ('heartburn', 'nausea', 'vomiting', 'diarrhea', 'abdominal_pain', 'dyspepsia', 'other')
symptoms_cols = [
    f'Visit{visit}_{suffix}'
    for visit in range(1, 6)
    for suffix in symptom_suffixes
]

# symptoms_data from the earlier cell is reused as is.
symptoms_summary = {
    'Symptom': ['Heartburn', 'Nausea', 'Vomiting', 'Diarrhea', 'Abdominal Pain', 'Dyspepsia', 'Other'],
}
for visit_label in ('Visit1', 'Visit5'):
    symptoms_summary[visit_label] = [
        symptoms_data[f'{visit_label}_{suffix}'].sum() for suffix in symptom_suffixes
    ]

symptoms_summary_df = pd.DataFrame(symptoms_summary)

# Total number of symptom occurrences per visit
total_symptoms = symptoms_summary_df.iloc[:, 1:].sum(axis=0)
total_row = pd.DataFrame({
    'Symptom': ['Total'],
    'Visit1': [total_symptoms['Visit1']],
    'Visit5': [total_symptoms['Visit5']],
})

# Append the total row
symptoms_summary_df = pd.concat([symptoms_summary_df, total_row], ignore_index=True)

# Print the table (still with the Visit1/Visit5 headers)
print(symptoms_summary_df)

# Reader-friendly headers for the exported table and the figures
symptoms_summary_df.columns = ['Symptom', 'Initial visit', 'Final visit']

# Save as HTML table
symptoms_summary_df.to_html('symptoms_summary.html', index=False)

# Render the table as a figure
fig, ax = plt.subplots(figsize=(10, 6))
ax.axis('tight')
ax.axis('off')
ax.table(
    cellText=symptoms_summary_df.values,
    colLabels=symptoms_summary_df.columns,
    cellLoc='center',
    loc='center',
)
plt.title('Symptoms Summary per Visit')
plt.show()

# One line per symptom (and the total) from the initial to the final visit
plt.figure(figsize=(12, 8))
for symptom in symptoms_summary_df['Symptom']:
    symptom_row = symptoms_summary_df[symptoms_summary_df['Symptom'] == symptom]
    plt.plot(
        symptoms_summary_df.columns[1:],
        symptom_row.values.flatten()[1:],
        marker='o',
        linestyle='-',
        label=symptom,
    )

plt.ylabel('Total Symptoms')
plt.legend(bbox_to_anchor=(0.5, -0.1), loc='upper center', ncol=4)  # legend below the axes
plt.subplots_adjust(bottom=0.2)  # leave room for the legend
plt.savefig('img.jpg', dpi=600)
plt.show()


In [None]:
# Show the visit column labels of the summary table (everything after the first column).
symptoms_summary_df.columns[1:]

In [None]:
# Rename the columns starting from the second column onwards


In [None]:
# Show all column labels of the summary table.
symptoms_summary_df.columns

In [None]:


# Fisher's exact test per symptom (Visit 1 vs Visit 5 only)
for symptom in ['heartburn', 'nausea', 'vomiting', 'diarrhea', 'abdominal_pain', 'dyspepsia', 'other']:
    # Counts of value 0 and value 1 at each of the two visits
    symptom_data_visit1, symptom_data_visit5 = (
        symptoms_data[f'Visit{visit}_{symptom}'].value_counts().sort_index().reindex([0, 1], fill_value=0).values
        for visit in (1, 5)
    )

    # 2x2 table: rows = visit, columns = value 0 / value 1
    contingency_table = np.array([symptom_data_visit1, symptom_data_visit5])

    odds_ratio, p_value = fisher_exact(contingency_table)

    symptom_title = symptom.capitalize()
    print(f'{symptom_title} - Odds Ratio={odds_ratio:.3f}, p={p_value:.3f}')
    verdict = (
        'statistically significant (reject H0)'
        if p_value < 0.05
        else 'not statistically significant (fail to reject H0)'
    )
    print(f'The differences between visits for {symptom_title} are {verdict}')


In [None]:
# Symptoms pooled into one Visit 1 vs Visit 5 comparison
symptoms = ['heartburn', 'nausea', 'vomiting', 'diarrhea', 'abdominal_pain', 'dyspepsia', 'other']

# Running [count of 0, count of 1] totals over all symptoms
total_visit1 = np.zeros(2, dtype=int)
total_visit5 = np.zeros(2, dtype=int)

for symptom in symptoms:
    symptom_data_visit1, symptom_data_visit5 = (
        symptoms_data[f'Visit{visit}_{symptom}'].value_counts().sort_index().reindex([0, 1], fill_value=0).values
        for visit in (1, 5)
    )

    total_visit1 += symptom_data_visit1
    total_visit5 += symptom_data_visit5

# 2x2 table: rows = visit, columns = value 0 / value 1
contingency_table_total = np.array([total_visit1, total_visit5])

# Fisher's exact test on the pooled table
odds_ratio_total, p_value_total = fisher_exact(contingency_table_total)

print(f'Total Symptoms - Odds Ratio={odds_ratio_total:.3f}, p={p_value_total:.3f}')
total_verdict = (
    'statistically significant (reject H0)'
    if p_value_total < 0.05
    else 'not statistically significant (fail to reject H0)'
)
print(f'The differences between visits for total symptoms are {total_verdict}')


In [None]:
# Display the pooled 2x2 table (rows: Visit 1, Visit 5; columns: count of 0, count of 1).
contingency_table_total

In [None]:
# Friedman's test across the five visits, once per symptom

symptoms_cols = [
    f'Visit{visit}_{suffix}'
    for visit in range(1, 6)
    for suffix in ('heartburn', 'nausea', 'vomiting', 'diarrhea', 'abdominal_pain', 'dyspepsia', 'other')
]

for symptom in ['heartburn', 'nausea', 'vomiting', 'diarrhea', 'abdominal_pain', 'dyspepsia', 'other']:
    column_suffix = symptom.lower()
    symptom_data = [symptoms_data[f'Visit{visit}_{column_suffix}'] for visit in range(1, 6)]

    stat, p = friedmanchisquare(*symptom_data)

    print(f'{symptom} - Statistics={stat:.3f}, p={p:.3f}')
    verdict = (
        'statistically significant (reject H0)'
        if p < 0.05
        else 'not statistically significant (fail to reject H0)'
    )
    print(f'The differences between visits for {symptom} are {verdict}')

In [None]:


# Chi-square test per symptom (Visit 1 vs Visit 5 only)
for symptom in ['heartburn', 'nausea', 'vomiting', 'diarrhea', 'abdominal_pain', 'dyspepsia', 'other']:
    visit_counts = []
    for visit in (1, 5):
        # Prefer the `<symptom>_score` column when it exists. symptoms_data as
        # built in this notebook only has the plain `Visit<N>_<symptom>`
        # columns, so the original lookup raised KeyError.
        # NOTE(review): confirm the plain column is the intended fallback.
        column = f'Visit{visit}_{symptom}_score'
        if column not in symptoms_data.columns:
            column = f'Visit{visit}_{symptom}'
        visit_counts.append(
            symptoms_data[column].value_counts().sort_index().reindex([0, 1], fill_value=0).values
        )
    symptom_data_visit1, symptom_data_visit5 = visit_counts

    # 2x2 table: rows = visit, columns = value 0 / value 1
    contingency_table = np.array([symptom_data_visit1, symptom_data_visit5])

    # Add 0.5 to every cell so that empty cells do not break the test
    contingency_table = contingency_table + 0.5

    chi2, p, _, _ = chi2_contingency(contingency_table)

    print(f'{symptom.capitalize()} - Chi2={chi2:.3f}, p={p:.3f}')
    if p < 0.05:
        print(f'The differences between visits for {symptom.capitalize()} are statistically significant (reject H0)')
    else:
        print(f'The differences between visits for {symptom.capitalize()} are not statistically significant (fail to reject H0)')


In [None]:

# McNemar's test per symptom (Visit 1 vs Visit 5 only)
for symptom in ['heartburn', 'nausea', 'vomiting', 'diarrhea', 'abdominal_pain', 'dyspepsia', 'other']:
    visit_series = []
    for visit in (1, 5):
        # Prefer the `<symptom>_score` column when it exists. symptoms_data as
        # built in this notebook only has the plain `Visit<N>_<symptom>`
        # columns, so the original lookup raised KeyError.
        # NOTE(review): confirm the plain column is the intended fallback.
        column = f'Visit{visit}_{symptom}_score'
        if column not in symptoms_data.columns:
            column = f'Visit{visit}_{symptom}'
        visit_series.append(symptoms_data[column])
    symptom_data_visit1, symptom_data_visit5 = visit_series

    # Paired table: rows = Visit 1 value, columns = Visit 5 value
    contingency_table = pd.crosstab(symptom_data_visit1, symptom_data_visit5)

    # The test is only run when the table is 2x2
    if contingency_table.shape == (2, 2):
        result = mcnemar(contingency_table, exact=True)

        print(f'{symptom.capitalize()} - Statistic={result.statistic:.3f}, p={result.pvalue:.3f}')
        if result.pvalue < 0.05:
            print(f'The differences between visits for {symptom.capitalize()} are statistically significant (reject H0)')
        else:
            print(f'The differences between visits for {symptom.capitalize()} are not statistically significant (fail to reject H0)')
    else:
        print(f'{symptom.capitalize()} - Not enough data to form a 2x2 table for McNemar\'s test')


In [None]:
# Replace NaN with 0
symptoms_data = symptoms_data.fillna(0)

symptoms_list = ['heartburn', 'nausea', 'vomiting', 'diarrhea', 'abdominal_pain', 'dyspepsia', 'other']
fisher_results = {}

for symptom in symptoms_list:
    pair_outcomes = fisher_results[symptom] = []
    # Every pair of visits (i < j)
    for i in range(1, 5):
        for j in range(i + 1, 6):
            pair_label = f'Visit{i}_vs_Visit{j}'
            visit_i = symptoms_data[f'Visit{i}_{symptom}']
            visit_j = symptoms_data[f'Visit{j}_{symptom}']

            # Cross-tabulate the two visits
            contingency_table = pd.crosstab(visit_i, visit_j)
            print(f"\n{symptom.capitalize()} Visit{i} vs Visit{j} Contingency Table:\n", contingency_table)

            # Fisher's exact test; a ValueError is recorded as 'NA'
            try:
                odds_ratio, p_value = fisher_exact(contingency_table)
            except ValueError:
                pair_outcomes.append((pair_label, 'NA', 'NA'))
            else:
                pair_outcomes.append((pair_label, odds_ratio, p_value))

# Print the results
for symptom, results in fisher_results.items():
    print(f"\n{symptom.capitalize()} Fisher's exact test results:")
    for result in results:
        print(f"{result[0]}: Odds Ratio = {result[1]}, p-value = {result[2]}")



In [None]:
# Write the symptom table to fisher.csv in the current working directory.
symptoms_data.to_csv('fisher.csv')

In [None]:
# Run Fisher's exact test through R (via rpy2) and collect the confidence
# interval, the estimate and the p-value into a pandas Series.
# NOTE(review): `qew` is not defined anywhere in the visible part of this
# notebook — confirm where this table comes from before running.
# NOTE(review): the assignment below rebinds the global name `stats` (imported
# from scipy at the top of the notebook) to R's stats package; functions that
# call stats.sem / stats.ttest_rel / stats.wilcoxon will fail if they are
# re-run after this cell. Consider a distinct name such as `r_stats`.
numpy2ri.activate()
stats=importr('stats')


test=stats.fisher_test(ro.r.matrix(qew.values, nrow=qew.shape[0], ncol=qew.shape[1]), alternative='two')
test=pd.Series({
   'conf_int_lower':test.rx2('conf.int')[0],
   'conf_int_upper':test.rx2('conf.int')[1],
   'statistic':test.rx2('estimate')[0],
   'pvalue':test.rx2('p.value')[0],
})

In [None]:
# HB_Difference = Visit 5 HB - Visit 1 HB (non-numeric entries become NaN)
df['HB_Difference'] = pd.to_numeric(df['Visit5_HB'], errors='coerce') - pd.to_numeric(df['Visit1_HB'], errors='coerce')

# Patients whose HB decreased (HB_Difference < 0).
# .copy() because a later cell assigns converted columns into these frames;
# assigning into a slice of df triggers SettingWithCopyWarning.
dropped_df = df[df['HB_Difference'] < 0].copy()

# Patients whose HB stayed the same or increased (HB_Difference >= 0).
# Rows with a NaN difference fall into neither group.
not_dropped_df = df[df['HB_Difference'] >= 0].copy()

In [None]:
# Display the rows with HB_Difference < 0.
dropped_df

In [None]:
# Display the rows with HB_Difference >= 0.
not_dropped_df

In [None]:
# Variables compared between the two HB groups
variables_to_compare = [
    'Visit1_SBP', 'Visit2_SBP', 'Visit3_SBP', 'Visit4_SBP', 'Visit5_SBP',
    'Visit1_DBP', 'Visit2_DBP', 'Visit3_DBP', 'Visit4_DBP', 'Visit5_DBP','gender'
]

# Coerce every compared column to numeric (non-numeric entries become NaN)
for hb_group in (dropped_df, not_dropped_df):
    for var in variables_to_compare:
        hb_group[var] = pd.to_numeric(hb_group[var], errors='coerce')

# Keep only rows that have a value for every compared variable
dropped_df_clean = dropped_df[variables_to_compare].dropna()
not_dropped_df_clean = not_dropped_df[variables_to_compare].dropna()

# Welch's t-test (unequal variances) per variable
t_test_results = []
for var in variables_to_compare:
    t_stat, p_value = ttest_ind(dropped_df_clean[var], not_dropped_df_clean[var], equal_var=False)
    t_test_results.append((var, t_stat, p_value))
    print(f"Variable: {var}")
    print(f"T-test statistics: t={t_stat}, p={p_value}\n")

# Summary
print("\nSummary of T-test results:")
for var, t_stat, p_value in t_test_results:
    print(f"{var}: t={t_stat:.4f}, p={p_value:.4f}")


In [None]:
def calculate_percentage(group_count, total_count):
    """Return group_count as a percentage of total_count, rounded to one decimal.

    A total_count of zero yields 0 instead of raising ZeroDivisionError.
    """
    if total_count != 0:
        return round(group_count / total_count * 100, 1)
    return 0

# Example inputs, hard-coded rather than derived from the data.
# NOTE(review): 265 presumably mirrors the `df[:265]` cut at the top of the notebook —
# confirm it still equals the row count after the later dropna.
group_count = 5
total_count = 265

# Share of the group in the total, in percent with one decimal
percentage = calculate_percentage(group_count, total_count)
print(f"Group Count: {group_count}, Total Count: {total_count}, Percentage: {percentage}%")


In [None]:


# Today's date (current local time, so the result changes from run to run)
today = datetime.now()

# Deadline date (midnight at the start of 2026-03-01)
deadline = datetime(2026, 3, 1)

# Calculate the number of days remaining.
# `.days` is the whole-day component of the timedelta; it becomes negative once the deadline has passed.
days_remaining = (deadline - today).days

days_remaining


In [None]:
# Inspect the Visit5 heartburn column
df['Visit5_heartburn']

In [None]:
# Display the working DataFrame
df

In [None]:
# Symptom and bleeding columns recorded at visit 1 and visit 5
visit1_symptoms = [
    'Visit1_heartburn', 'Visit1_nausea', 'Visit1_vomiting', 'Visit1_diarrhea',
    'Visit1_abdominal_pain', 'Visit1_dyspepsia', 'Visit1_other',
]
visit5_symptoms = [
    'Visit5_heartburn', 'Visit5_nausea', 'Visit5_vomiting', 'Visit5_diarrhea',
    'Visit5_abdominal_pain', 'Visit5_dyspepsia', 'Visit5_other',
]
visit1_bleeding = ['Visit1_hematochezia', 'Visit1_melena', 'Visit1_hematemesis']
visit5_bleeding = ['Visit5_hematochezia', 'Visit5_melena', 'Visit5_hematemesis']

# Binarize each column: 1 where the recorded value is positive, 0 otherwise
# (NaN does not compare as positive, so missing values become 0)
visit1_symptoms_binary = df[visit1_symptoms].gt(0).astype(int)
visit5_symptoms_binary = df[visit5_symptoms].gt(0).astype(int)
visit1_bleeding_binary = df[visit1_bleeding].gt(0).astype(int)
visit5_bleeding_binary = df[visit5_bleeding].gt(0).astype(int)

# Per row: was at least one symptom or bleeding flag set at that visit?
visit1_total_binary = visit1_symptoms_binary.any(axis=1) | visit1_bleeding_binary.any(axis=1)
visit5_total_binary = visit5_symptoms_binary.any(axis=1) | visit5_bleeding_binary.any(axis=1)

# Number of affected rows per visit
visit1_occurrences = visit1_total_binary.sum()
visit5_occurrences = visit5_total_binary.sum()

# Both visits use the same denominator: every row of df
total_visit1 = total_visit5 = len(df)

# 2x2 table: one row per visit, columns = (affected, not affected)
contingency_table = [
    [visit1_occurrences, total_visit1 - visit1_occurrences],
    [visit5_occurrences, total_visit5 - visit5_occurrences],
]

# Fisher's exact test on the table; only the p-value is kept.
# NOTE(review): both table rows are built from the same df rows, i.e. the two
# samples are paired — confirm that an unpaired test is what is wanted here.
_, p_value = fisher_exact(contingency_table)

# Report the result
results = {'p_value': p_value}
print(results)

# Debugging: inspect the intermediate results of each step
print("visit1_symptoms_sum\n", visit1_symptoms_binary.sum(axis=1).value_counts())
print("visit1_bleeding_sum\n", visit1_bleeding_binary.sum(axis=1).value_counts())
print("visit1_total_binary\n", visit1_total_binary.value_counts())


In [None]:
# Wrap visit1_symptoms_sum in a DataFrame for tabulation.
# NOTE(review): visit1_symptoms_sum is not assigned in the analysis cell above, which only
# prints visit1_symptoms_binary.sum(axis=1) under that label — confirm it is defined earlier,
# otherwise this raises NameError on a fresh kernel.
qwe=pd.DataFrame(visit1_symptoms_sum)

In [None]:
# Count how often each distinct row occurs in qwe
qwe.value_counts()

In [None]:
# Wrap visit1_bleeding_sum in a DataFrame for tabulation.
# NOTE(review): visit1_bleeding_sum is not assigned in the analysis cell above, which only
# prints visit1_bleeding_binary.sum(axis=1) under that label — confirm it is defined earlier,
# otherwise this raises NameError on a fresh kernel.
qwe=pd.DataFrame(visit1_bleeding_sum)

In [None]:
# Count how often each distinct row occurs in qwe
qwe.value_counts()

In [None]:
# Wrap visit1_total in a DataFrame for tabulation.
# NOTE(review): the analysis cell above assigns visit1_total_binary, not visit1_total —
# confirm visit1_total is defined earlier, otherwise this raises NameError on a fresh kernel.
qwe=pd.DataFrame(visit1_total)

In [None]:
# Count how often each distinct row occurs in qwe
qwe.value_counts()

In [None]:
# Frequency of each recorded Visit1_hematemesis value (NaN is excluded by default)
df['Visit1_hematemesis'].value_counts()

In [None]:
# Frequency of each recorded Visit1_nausea value (NaN is excluded by default)
df['Visit1_nausea'].value_counts()

In [None]:
# Frequency of each recorded Visit1_diarrhea value (NaN is excluded by default)
df['Visit1_diarrhea'].value_counts()

In [None]:
# Frequency of each recorded Visit1_abdominal_pain value (NaN is excluded by default)
df['Visit1_abdominal_pain'].value_counts()

In [None]:
# Frequency of each recorded Visit1_dyspepsia value (NaN is excluded by default)
df['Visit1_dyspepsia'].value_counts()

In [None]:
# Frequency of each recorded Visit1_other value (NaN is excluded by default)
df['Visit1_other'].value_counts()

In [None]:
# Synthetic what-if calculation: 265 rows with all-zero Visit1 / Visit5 event flags
data = {
    'Visit1': [0]*265,
    'Visit5': [0]*265
}

# Keep the synthetic table under its own name so that the study DataFrame `df`
# is not overwritten by this calculation.
simulated_df = pd.DataFrame(data)

# Flag 7 randomly chosen rows as Visit1 events (fixed seed: same rows on every run)
visit1_indices = simulated_df.sample(7, random_state=0).index
simulated_df.loc[visit1_indices, 'Visit1'] = 1

# Flag 1 randomly chosen row as a Visit5 event
visit5_indices = simulated_df.sample(1, random_state=1).index
simulated_df.loc[visit5_indices, 'Visit5'] = 1

# Event counts per visit
visit1_occurrences = simulated_df['Visit1'].sum()
visit5_occurrences = simulated_df['Visit5'].sum()

# Total number of rows (same denominator for both visits)
total_visit1 = len(simulated_df)
total_visit5 = len(simulated_df)

# 2x2 table: one row per visit, columns = (events, non-events)
contingency_table = [
    [visit1_occurrences, total_visit1 - visit1_occurrences],
    [visit5_occurrences, total_visit5 - visit5_occurrences]
]

# Fisher's exact test; only the p-value is kept
_, p_value = fisher_exact(contingency_table)

# Report the result
results = {
    'p_value': p_value
}

print(results)

print("visit1_occurrences: ", visit1_occurrences)
print("visit5_occurrences: ", visit5_occurrences)
print("contingency_table: ", contingency_table)


In [None]:
# Show the most recently built 2x2 contingency table
contingency_table