In [None]:
#load some libraries
import pandas as pd
import seaborn as sns
from ccf.box import LifespanBox
import yaml
from functions import *
from config import *
import numpy as np
import matplotlib.pyplot as plt
from datetime import date
import plotly.io as pio
import plotly.express as px


In [None]:
# Set plotly's default renderer to the combined "notebook+pdf" value so that
# figures displayed with .show() render inline in the notebook
# (the "pdf" part presumably keeps them visible in PDF exports - confirm).
pio.renderers.default = "notebook+pdf"

In [None]:
#load aabc dataset
config = LoadSettings()
secret = pd.read_csv(config['config_files']['secrets'])
#box = LifespanBox(cache="./tmp")

# Pick the API key stored for the 'aabcarms' source. Taking the first match
# positionally avoids the reset_index/drop/label-0 round trip, and the explicit
# check reports a missing row clearly instead of failing on a label lookup.
aabc_keys = secret.loc[secret.source == 'aabcarms', 'api_key']
if aabc_keys.empty:
    raise KeyError("no 'aabcarms' row found in the secrets file")

# redjson / getframe come from the project's star imports; the first builds the
# request structure from the token, the second fetches it from the REDCap URL
# configured under config['Redcap']['api_url10'] (semantics assumed - confirm).
aabcarms = redjson(tok=aabc_keys.iloc[0])
aabc = getframe(struct=aabcarms, api_url=config['Redcap']['api_url10'])

In [None]:
# Load the data dictionary that was downloaded from REDCap
all_dict = pd.read_csv('AABC_REDCap_DataDictionary_2023-10-13.csv')

# Keep the field-name / form-name columns for rows on the lab_results form only
is_lab_field = all_dict['Form Name'] == "lab_results"
part_dict = all_dict.loc[is_lab_field, ["Variable / Field Name", "Form Name"]]

# Field names of the lab_results form, in dictionary order
vars_list = part_dict['Variable / Field Name'].tolist()
#print(vars_list)

In [None]:
# Drop the dictionary entries that (per the original note) are not present in
# the REDCap dataset
elements_to_remove = ['cmp', 'lipid', 'hormones']
vars_list = list(filter(lambda name: name not in elements_to_remove, vars_list))

In [None]:
# Include the other necessary variables in front of the lab_results variables
keepsies = ['site', 'subject_id', 'sex', 'lab_results_complete', 'event_date'] + vars_list

#function used to clean the dataframe
def idvisits(aabcarmsdf,keepsies):
    """Attach registration info to every event row and drop test subjects.

    Parameters
    ----------
    aabcarmsdf : DataFrame
        Source frame; must contain every column in ``keepsies`` plus
        'redcap_event_name' and 'study_id'.
    keepsies : list of str
        Columns to carry over from ``aabcarmsdf``.

    Returns
    -------
    DataFrame
        One row per input event row (right merge on 'study_id'), with:
        'subject' / 'sex' / 'site' taken from the rows whose
        'redcap_event_name' contains 'register'; the event row's own values
        kept as 'subject_id' and 'sex_y'; and 'redcap_event', the event name
        translated through the AABCeventmap entry of the module-level
        ``config``. Rows whose 'subject' contains 'TEST' (case-insensitive)
        are removed.
    """
    wanted_columns = keepsies + ['redcap_event_name', 'study_id']
    visits = aabcarmsdf[wanted_columns].copy()

    # Rows of the registration event hold the subject id, site and sex to propagate.
    is_register = visits['redcap_event_name'].str.contains('register')
    registers = visits.loc[is_register, ['subject_id', 'study_id', 'site', 'sex']]

    # Right merge keeps every event row; overlapping columns get the default
    # suffixes: _x from the registration side, _y from the event side.
    merged = registers.merge(visits.drop(columns=['site']), on='study_id', how='right')
    merged = merged.rename(columns={'subject_id_x':'subject','subject_id_y':'subject_id','sex_x':'sex'})

    # Translate the raw event names through the configured event map.
    event_map = config['Redcap']['datasources']['aabcarms']['AABCeventmap']
    merged['redcap_event'] = merged.replace({'redcap_event_name': event_map})['redcap_event_name']

    # Discard test records.
    is_test = merged.subject.astype(str).str.upper().str.contains('TEST')
    return merged.loc[~is_test]

In [None]:
records = idvisits(aabc, keepsies)

# Keep only visit events (V1, V2, V3) whose lab_results_complete is "2" and
# whose bld_drawresults is "1" (i.e. subjects who visited and completed the
# lab test, per the original note).
is_visit = records['redcap_event'].isin(["V1", "V2", "V3"])
labs_complete = records['lab_results_complete'] == "2"
blood_drawn = records['bld_drawresults'] == "1"

records = (
    records.loc[is_visit & labs_complete & blood_drawn]
    .drop(columns=['sex_y', 'subject_id'])
    .reset_index(drop=True)
)
#records

In [None]:
# Save the filtered records to records.csv in the working directory,
# without writing the row index.
records.to_csv('records.csv', index=False)

In [None]:
#functions to classify numerical or categorical columns
def classify_column_types_with_heuristic(df, unique_threshold=10, unique_percentage=0.05):
    """Split the columns of ``df`` into numerical and categorical name lists.

    A column counts as numerical when it holds numbers (or strings that parse
    as numbers) AND has enough distinct values; everything else is categorical.

    Parameters
    ----------
    df : DataFrame
        Frame whose columns are classified.
    unique_threshold : int, default 10
        Columns with at most this many distinct non-null values are categorical.
    unique_percentage : float, default 0.05
        Columns whose distinct-value count divided by ``len(df)`` is at most
        this fraction are categorical.

    Returns
    -------
    (list, list)
        ``(numerical_columns, categorical_columns)``, each in column order.

    Notes
    -----
    Any numeric dtype (int32, float32, nullable Int64, ...) is handled, not
    only int64/float64, and string-dtype columns are treated like object
    columns. Boolean columns stay categorical.
    """
    n_rows = len(df)

    def _is_low_cardinality(series):
        # Few distinct values, in absolute terms or relative to the row count.
        n_unique = series.nunique(dropna=True)
        if n_unique <= unique_threshold:
            return True
        return n_rows > 0 and n_unique / n_rows <= unique_percentage

    numerical_columns = []
    categorical_columns = []

    for col in df.columns:
        series = df[col]

        if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
            # Mixed/text column: try to read it as numbers, unparseable -> NaN.
            try:
                numeric_col = pd.to_numeric(series, errors='coerce')
            except (TypeError, ValueError):
                categorical_columns.append(col)
                continue
            # Nothing parsed as a number, or too few distinct numbers.
            if numeric_col.isnull().all() or _is_low_cardinality(numeric_col):
                categorical_columns.append(col)
            else:
                numerical_columns.append(col)
        elif pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
            if _is_low_cardinality(series):
                categorical_columns.append(col)
            else:
                numerical_columns.append(col)
        else:
            # Booleans, datetimes, categoricals, ...
            categorical_columns.append(col)

    return numerical_columns, categorical_columns

# Classify every column of `records` using the function's default thresholds
# (unique_threshold=10, unique_percentage=0.05).
numerical_cols, categorical_cols = classify_column_types_with_heuristic(records)


#numerical_cols, categorical_cols


In [None]:
# Exclude the record identifier from the numerical lab columns. Rebuilding the
# list (instead of list.remove) keeps this cell safe to re-run and does not
# fail when 'study_id' was not classified as numerical.
numerical_cols = [col for col in numerical_cols if col != "study_id"]
num = records[numerical_cols].reset_index(drop=True)
#num

In [None]:
# Save the numerical columns to numerical_cols.csv in the working directory,
# without writing the row index.
num.to_csv('numerical_cols.csv', index=False)

In [None]:
# Function to handle special values and convert columns to numeric
def preprocess_column(column):
    """Return ``column`` converted to numeric values.

    Processing order:
      1. "<" and ">" characters are stripped from string values
         (so "<5" becomes "5").
      2. Values equal to "ND" become 0.
      3. The result is passed through ``pd.to_numeric``; anything that still
         cannot be parsed becomes NaN.
    """
    without_bounds = column.replace(to_replace={'<': '', '>': ''}, regex=True)
    nd_as_zero = without_bounds.replace({'ND': 0})
    return pd.to_numeric(nd_as_zero, errors='coerce')

# Apply preprocessing to each column
for col in numerical_cols:
    num[col] = preprocess_column(num[col])

# Set the style of seaborn
sns.set(style="whitegrid")

# Plotting: one histogram (with KDE) per numerical column, stacked vertically.
# Skipped entirely when there is nothing to plot, since a 0-row subplot grid
# is invalid.
if numerical_cols:
    # squeeze=False makes plt.subplots always return a 2-D array of Axes, so
    # indexing also works when there is exactly one column; ravel() flattens
    # it to the 1-D layout used below.
    fig, axes = plt.subplots(len(numerical_cols), 1,
                             figsize=(10, 5 * len(numerical_cols)), squeeze=False)
    axes = axes.ravel()

    for i, col in enumerate(numerical_cols):
        sns.histplot(num[col], kde=True, ax=axes[i], binwidth=0.5)
        axes[i].set_title(f'Distribution of {col}')
        axes[i].set_xlabel(col)
        axes[i].set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

In [None]:
# Flag outliers per numerical column and show them in a violin plot
for col in numerical_cols:
    mean = num[col].mean()
    std = num[col].std()
    lower_bound = mean - 4 * std
    upper_bound = mean + 4 * std

    # Outliers are values more than 4 standard deviations away from the column
    # mean; the 'outlier' flag column is overwritten on every iteration.
    num['outlier'] = num[col].lt(lower_bound) | num[col].gt(upper_bound)

    # Violin plot colored by the flag; the hover value is the row position + 2
    fig_violin = px.violin(
        num,
        y=col,
        color='outlier',
        box=True,
        points="all",
        hover_data=[num.index+2],
        title=f'Violin Plot of {col}',
    )
    fig_violin.show()

In [None]:
# Numerical columns plus the columns that identify a record in the outlier
# report; concatenation builds a new list, so numerical_cols itself is untouched.
num_all = numerical_cols + ['study_id', 'subject', 'site', 'event_date', 'redcap_event']
num_outlier = records[num_all].reset_index(drop=True)


# Function to handle special values (no numeric conversion)
def preprocess_column1(column):
    """Return ``column`` with special lab-value markers cleaned up.

    "<" and ">" characters are stripped from string values (so "<5" becomes
    "5") and values equal to "ND" become 0. No numeric conversion is applied
    here: other values keep whatever type they had.
    """
    without_bounds = column.replace(to_replace={'<': '', '>': ''}, regex=True)
    return without_bounds.replace({'ND': 0})

# Apply preprocessing to each column.
# NOTE: this loops over every column of num_outlier, so the "<" / ">" stripping
# and "ND" -> 0 replacement are applied to all of its columns, not only to the
# numerical ones.
for col in num_outlier.columns:
    num_outlier[col] = preprocess_column1(num_outlier[col])

#num_outlier


In [None]:
# Number of standard deviations from the column mean beyond which a value is
# reported as an outlier.
OUTLIER_STD_THRESHOLD = 4

# Columns identifying the record an outlier belongs to (also the grouping keys).
id_cols = ['subject', 'redcap_event', 'study_id', 'site', 'event_date']

# Initialize an empty list to hold the outlier information
outliers_info = []

# Iterate through each numerical column to identify outliers
for col in numerical_cols:
    # Only the numeric conversion is guarded: a column that cannot be converted
    # is reported and skipped, while unrelated errors are not masked.
    try:
        num_outlier[col] = pd.to_numeric(num_outlier[col], errors='coerce')
    except (TypeError, ValueError) as e:
        print(f"Skipping column {col} due to error: {e}")
        continue

    mean = num_outlier[col].mean()
    std = num_outlier[col].std()

    # Define outliers: values further than the threshold from the column mean
    is_outlier = ((num_outlier[col] < (mean - OUTLIER_STD_THRESHOLD * std)) |
                  (num_outlier[col] > (mean + OUTLIER_STD_THRESHOLD * std)))

    # One report row per outlying value: identifying columns + "column=value"
    for index, row in num_outlier[is_outlier].iterrows():
        reason = f"{col}={row[col]}"
        outliers_info.append([row[key] for key in id_cols] + [reason])

# Convert the outliers_info list into a DataFrame
outliers_df = pd.DataFrame(outliers_info, columns=id_cols + ['reason'])

# Merge the reasons of the same subject / event into one row. dropna=False is
# required because groupby drops rows with a missing key by default, which
# would silently lose outliers whose site or event_date is missing.
outliers_df = (
    outliers_df.groupby(id_cols, dropna=False)['reason']
    .agg('; '.join)
    .reset_index()
)

# Adding "lab_results: " prefix only once per grouped entry for tidiness
outliers_df['reason'] = 'lab_results: ' + outliers_df['reason']

outliers_df['datatype'] = 'REDCap'

In [None]:
# Display the outlier report (bare last expression, rendered as the cell output).
outliers_df

In [None]:
# Save the outlier report to outliers.csv in the working directory,
# without writing the row index.
outliers_df.to_csv('outliers.csv', index=False)

In [None]:
# Size of the outlier report as (rows, columns), then the number of distinct
# values in its 'subject' column (a missing value counts as one).
print(outliers_df.shape)
print(outliers_df['subject'].nunique(dropna=False))

In [None]:
# Exclude the columns that should not get a count plot
elements_to_remove1 = ['subject', 'lab_results_complete', 'event_date', 'bld_drawresults', 'ldl', 'ldl_notes', 'blood_notes', 'labs_returned', 'labs_returned_notreason', 'redcap_event_name']
categorical_cols = [elem for elem in categorical_cols if elem not in elements_to_remove1]

# Plot distributions for selected categorical columns, one count plot per
# column stacked vertically. Skipped when no column is left, since a 0-row
# subplot grid is invalid.
if categorical_cols:
    # squeeze=False makes plt.subplots always return a 2-D array of Axes, so
    # indexing also works when exactly one column remains; ravel() flattens it
    # to the 1-D layout used below.
    fig, axs = plt.subplots(len(categorical_cols), 1,
                            figsize=(10, len(categorical_cols)*4), squeeze=False)
    axs = axs.ravel()

    for i, col in enumerate(categorical_cols):
        sns.countplot(x=records[col], ax=axs[i])
        axs[i].set_title(f'Count Plot of {col}')
        axs[i].set_xlabel(col)
        axs[i].set_ylabel('Count')

    plt.tight_layout()
    plt.show()
