In [1]:
import pandas as pd
from tabulate import tabulate

def print_null_columns_info(file_path):
    # Read the CSV file
    df = pd.read_csv(file_path)
    
    # Find rows with any null values
    rows_with_nulls = df[df.isnull().any(axis=1)]
    
    # Print patient ID, visit date, and count of null columns
    if not rows_with_nulls.empty:
        print(f"Rows with null values in {file_path}:")
        for index, row in rows_with_nulls.iterrows():
            null_columns_count = row.isnull().sum()
            patient_id = row['Patient ID']
            visit_date = row['Visit Date']
            print(f"Patient ID: {patient_id}, Visit Date: {visit_date}, Null Columns Count: {null_columns_count}")
    else:
        print(f"No rows with null values in {file_path}.")

# Example usage
file_path = r"B:\Projects\PycharmProjects\PPMI_Research_on_Parkinson's\src\mahbub\ppmi_master_dataset.csv"
print_null_columns_info(file_path)


Rows with null values in B:\Projects\PycharmProjects\PPMI_Research_on_Parkinson's\src\mahbub\ppmi_master_dataset.csv:
Patient ID: 3059, Visit Date: 28/02/2012, Null Columns Count: 2
Patient ID: 3078, Visit Date: 26/04/2016, Null Columns Count: 25
Patient ID: 3105, Visit Date: 10/08/2011, Null Columns Count: 63
Patient ID: 3105, Visit Date: 17/04/2012, Null Columns Count: 63
Patient ID: 3105, Visit Date: 19/04/2013, Null Columns Count: 63
Patient ID: 3105, Visit Date: 31/03/2015, Null Columns Count: 63
Patient ID: 3168, Visit Date: 09/10/2012, Null Columns Count: 1
Patient ID: 3168, Visit Date: 27/08/2013, Null Columns Count: 1
Patient ID: 3212, Visit Date: 14/07/2011, Null Columns Count: 63
Patient ID: 3212, Visit Date: 27/06/2012, Null Columns Count: 63
Patient ID: 3212, Visit Date: 26/07/2013, Null Columns Count: 63
Patient ID: 3212, Visit Date: 22/07/2015, Null Columns Count: 63
Patient ID: 3323, Visit Date: 06/06/2012, Null Columns Count: 1
Patient ID: 3323, Visit Date: 10/08/2016,

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.decomposition import PCA
from lifelines import KaplanMeierFitter
import os

In [3]:
def preprocess_data(file_path):
    data = pd.read_csv(file_path)
    data['Visit Date'] = pd.to_datetime(data['Visit Date'], dayfirst=True)
    data = data.sort_values(by=['Patient ID', 'Visit Date'])
    data.fillna(data.mode().iloc[0], inplace=True)
    data['VisitNumber'] = data.groupby('Patient ID').cumcount() + 1
    non_feature_columns = ['Patient ID', 'Visit Date', 'VisitNumber', 'EVENT_ID']
    features = [col for col in data.columns if col not in non_feature_columns]
    return data, features

In [4]:
if __name__ == "__main__":
    file_path = r"B:\Projects\PycharmProjects\PPMI_Research_on_Parkinson's\src\mahbub\ppmi_master_dataset.csv"
    data,features = preprocess_data(file_path)
      
    # Save the preprocessed data to a CSV file
    data.to_csv('preprocessed_ppmi_master_dataset.csv', index=False)
    output_dir = "plots"

    # Ensure the output directory exists or create it
    os.makedirs(output_dir, exist_ok=True)

In [5]:

def plot_line_plots(data, features, pdf_filename):
    """
    Generate line plots for each feature across four visits and save them to a PDF.

    Parameters:
    - data (DataFrame): The input DataFrame containing the dataset.
    - features (list): List of features (columns) in the dataset to plot.
    - pdf_filename (str): File name for the PDF to save the plots.
    """
    with PdfPages(pdf_filename) as pdf:
        for feature in features:
            fig, axes = plt.subplots(1, 4, figsize=(20, 5))  # Create subplots for 4 visits
            
            for visit in range(1, 5):
                ax = axes[visit - 1]  # Select the current subplot
                visit_data = data[data['VisitNumber'] == visit][feature]
                ax.plot(visit_data)
                ax.set_title(f'Visit {visit}')
                ax.set_xlabel('Observation')
                ax.set_ylabel(feature)
                ax.legend([f'Visit {visit}'])
            
            fig.suptitle(f'Line Plot of {feature} Across Visits', y=1.05)
            fig.tight_layout()
            pdf.savefig()
            plt.close()

        print(f"PDF saved successfully: {pdf_filename}")

print('Working with line plot')
plot_line_plots(data, features, os.path.join(output_dir, "line_plots.pdf"))


Working with line plot
PDF saved successfully: plots\line_plots.pdf


In [6]:
def plot_scatter_plots(data, features, pdf_filename):
    """
    Generate scatter plots for pairs of features between consecutive visits and save them to a PDF.

    Parameters:
    - data (DataFrame): The input DataFrame containing the dataset.
    - features (list): List of features (columns) in the dataset to plot.
    - pdf_filename (str): File name for the PDF to save the plots.
    """
    with PdfPages(pdf_filename) as pdf:
        for feature in features:
            fig, axes = plt.subplots(1, 3, figsize=(18, 5))  # Create subplots for 3 pairs (1-2, 2-3, 3-4)

            for i, visit in enumerate(range(1, 4)):
                ax = axes[i]  # Select the current subplot
                visit_data_x = data[data['VisitNumber'] == visit][feature]
                visit_data_y = data[data['VisitNumber'] == visit + 1][feature]
                ax.scatter(visit_data_x, visit_data_y, label=f'Visit {visit}-{visit + 1}')
                ax.set_title(f'Scatter Plot of {feature} Between Visits {visit} and {visit + 1}')
                ax.set_xlabel(f'Visit {visit}')
                ax.set_ylabel(f'Visit {visit + 1}')
                ax.legend()

            fig.suptitle(f'Scatter Plots of {feature} Between Consecutive Visits', y=1.05)
            fig.tight_layout()

            # Save the figure with all three subplots to the PDF
            pdf.savefig()
            plt.close()

        print(f"PDF saved successfully: {pdf_filename}")
        
print('Working with scatter plot')
plot_scatter_plots(data, features, os.path.join(output_dir, "scatter_plots.pdf"))


Working with scatter plot
PDF saved successfully: plots\scatter_plots.pdf


In [7]:

def plot_box_plots(data, features, pdf_filename):
    """
    Generate box plots for each feature across all visits and save them to a PDF.

    Parameters:
    - data (DataFrame): The input DataFrame containing the dataset.
    - features (list): List of features (columns) in the dataset to plot.
    - pdf_filename (str): File name for the PDF to save the plots.
    """
    with PdfPages(pdf_filename) as pdf:
        for feature in features:
            plt.figure(figsize=(10, 6))
            sns.boxplot(x='VisitNumber', y=feature, data=data)
            plt.title(f'Box Plot of {feature} Across Visits')
            plt.xlabel('Visit Number')
            plt.ylabel(feature)
            pdf.savefig()
            plt.close()
        print(f"PDF saved successfully: {pdf_filename}")

print('Working with box plot')
plot_box_plots(data, features, os.path.join(output_dir, "box_plots.pdf"))


Working with box plot
PDF saved successfully: plots\box_plots.pdf


In [8]:
def plot_violin_plots(data, features, pdf_filename):
    """
    Generate violin plots for each feature across all visits and save them to a PDF.

    Parameters:
    - data (DataFrame): The input DataFrame containing the dataset.
    - features (list): List of features (columns) in the dataset to plot.
    - pdf_filename (str): File name for the PDF to save the plots.
    """
    with PdfPages(pdf_filename) as pdf:
        for feature in features:
            plt.figure(figsize=(10, 6))
            sns.violinplot(x='VisitNumber', y=feature, data=data)
            plt.title(f'Violin Plot of {feature} Across Visits')
            plt.xlabel('Visit Number')
            plt.ylabel(feature)
            pdf.savefig()
            plt.close()
        print(f"PDF saved successfully: {pdf_filename}")

print('Working with violin plot')
plot_violin_plots(data, features, os.path.join(output_dir, "violin_plots.pdf"))


Working with violin plot
PDF saved successfully: plots\violin_plots.pdf


In [9]:
def plot_histograms(data, features, pdf_filename):
    """
    Generate histograms for each feature across all visits and save them to a PDF.

    Parameters:
    - data (DataFrame): The input DataFrame containing the dataset.
    - features (list): List of features (columns) in the dataset to plot.
    - pdf_filename (str): File name for the PDF to save the plots.
    """
    with PdfPages(pdf_filename) as pdf:
        for feature in features:
            fig, axes = plt.subplots(1, 4, figsize=(20, 5))  # Create subplots for 4 visits
            
            for visit in range(1, 5):
                ax = axes[visit - 1]  # Select the current subplot
                visit_data = data[data['VisitNumber'] == visit][feature]
                ax.hist(visit_data, bins=20, alpha=0.5, label=f'Visit {visit}')
                ax.set_title(f'Histogram of {feature}, Visit {visit}')
                ax.set_xlabel(feature)
                ax.set_ylabel('Frequency')
                ax.legend()

            fig.suptitle(f'Histograms of {feature} Across Visits', y=1.05)
            fig.tight_layout()

            # Save the figure with all four subplots to the PDF
            pdf.savefig()
            plt.close()

        print(f"PDF saved successfully: {pdf_filename}")

print('Working with histograms')
plot_histograms(data, features, os.path.join(output_dir, "histograms.pdf"))



Working with histograms
PDF saved successfully: plots\histograms.pdf


In [10]:
def plot_density_plots(data, features, pdf_filename):
    """
    Generate probability density function (PDF) plots for each feature across all visits and save them to a PDF.

    Parameters:
    - data (DataFrame): The input DataFrame containing the dataset.
    - features (list): List of features (columns) in the dataset to plot.
    - pdf_filename (str): File name for the PDF to save the plots.
    """
    with PdfPages(pdf_filename) as pdf:
        for feature in features:
            plt.figure(figsize=(10, 6))
            for visit in range(1, 5):
                visit_data = data[data['VisitNumber'] == visit][feature]
                sns.kdeplot(visit_data, label=f'Visit {visit}')
            plt.title(f'Probability Density Function (PDF) of {feature} Across Visits')
            plt.xlabel(feature)
            plt.ylabel('Density')
            plt.legend()
            pdf.savefig()
            plt.close()
        print(f"PDF saved successfully: {pdf_filename}")
        
print('Working with density_plots')
plot_density_plots(data, features, os.path.join(output_dir, "density_plots.pdf"))


Working with density_plots


  sns.kdeplot(visit_data, label=f'Visit {visit}')
  sns.kdeplot(visit_data, label=f'Visit {visit}')
  sns.kdeplot(visit_data, label=f'Visit {visit}')
  sns.kdeplot(visit_data, label=f'Visit {visit}')
  sns.kdeplot(visit_data, label=f'Visit {visit}')
No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


PDF saved successfully: plots\density_plots.pdf


In [11]:
def plot_heatmaps(data, features, pdf_filename):
    """
    Generate heatmaps for selected features and save them to a PDF.

    Parameters:
    - data (DataFrame): The input DataFrame containing the dataset.
    - features (list): List of features (columns) in the dataset to plot as heatmap.
    - pdf_filename (str): File name for the PDF to save the plots.
    """
    # Compute correlation matrix
    corr_matrix = data[features].corr()

    with PdfPages(pdf_filename) as pdf:
        plt.figure(figsize=(50, 40))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
        plt.title('Correlation Matrix Heatmap')
        pdf.savefig()
        plt.close()
        print(f"PDF saved successfully: {pdf_filename}")
        
print('Working with heatmaps')
plot_heatmaps(data, features, os.path.join(output_dir, "heatmaps.pdf"))

Working with heatmaps
PDF saved successfully: plots\heatmaps.pdf


In [16]:
def plot_pair_plots(data, features, pdf_filename):
    """
    Generate pair plots for selected features and save them to a PDF.

    Parameters:
    - data (DataFrame): The input DataFrame containing the dataset.
    - features (list): List of features (columns) in the dataset to plot in pair plots.
    - pdf_filename (str): File name for the PDF to save the plots.
    """
    sns.set(style="ticks")
    pair_plot_data = data.loc[:, features + ['VisitNumber']]

    with PdfPages(pdf_filename) as pdf:
        sns.pairplot(pair_plot_data, hue='VisitNumber')
        pdf.savefig()
        plt.close()
        print(f"PDF saved successfully: {pdf_filename}")
        
print('Working with pair plots')
plot_pair_plots(data, features, os.path.join(output_dir, "pair_plots.pdf"))

Working with pair plots


  with PdfPages(pdf_filename) as pdf:

KeyboardInterrupt



Error in callback <function _draw_all_if_interactive at 0x000001F3344B63E0> (for post_execute), with arguments args (),kwargs {}:



KeyboardInterrupt



Error in callback <function flush_figures at 0x000001F34D76A340> (for post_execute), with arguments args (),kwargs {}:


KeyboardInterrupt: 

In [12]:
def plot_pca_biplots(data, features, pdf_filename):
    """
    Generate PCA biplots for selected features and save them to a PDF.

    Parameters:
    - data (DataFrame): The input DataFrame containing the dataset.
    - features (list): List of features (columns) in the dataset to use for PCA.
    - pdf_filename (str): File name for the PDF to save the plots.
    """
    pca = PCA(n_components=2)
    pca_data = data[features].dropna()
    pca_components = pca.fit_transform(pca_data)

    with PdfPages(pdf_filename) as pdf:
        plt.figure(figsize=(10, 6))
        plt.scatter(pca_components[:, 0], pca_components[:, 1], c=data['VisitNumber'], cmap='viridis')
        plt.title('PCA Biplots')
        plt.xlabel('Principal Component 1')
        plt.ylabel('Principal Component 2')
        pdf.savefig()
        plt.close()
        print(f"PDF saved successfully: {pdf_filename}")
        
print('Working with pca biplots')
plot_pca_biplots(data, features, os.path.join(output_dir, "pca_biplots.pdf"))

Working with pca biplots
PDF saved successfully: plots\pca_biplots.pdf


In [13]:
def plot_time_series_plots(data, time_col, group_col, pdf_filename):
    print("Generating time series plots for all features...")
    with PdfPages(pdf_filename) as pdf:
        features = [col for col in data.columns if col not in [time_col, group_col, 'Patient ID', 'EVENT_ID']]
        for feature in features:
            for group in data[group_col].unique():
                group_data = data[data[group_col] == group]
                plt.figure(figsize=(10, 6))
                plt.plot(group_data[time_col], group_data[feature], marker='o', linestyle='-')
                plt.title(f'Time Series Plot of {feature} for Visit {group}')
                plt.xlabel(time_col)
                plt.ylabel(feature)
                pdf.savefig()
                plt.close()
    print(f"PDF saved successfully: {pdf_filename}")

# Call the function
plot_time_series_plots(data, 'Visit Date', 'VisitNumber', os.path.join(output_dir, "time_series_plots.pdf"))


Generating time series plots for all features...
PDF saved successfully: plots\time_series_plots.pdf


In [14]:
def plot_area_plots(data, time_col, group_col, pdf_filename):
    print("Generating area plots for all features...")
    with PdfPages(pdf_filename) as pdf:
        features = [col for col in data.columns if col not in [time_col, group_col, 'Patient ID', 'EVENT_ID']]
        for feature in features:
            for group in data[group_col].unique():
                group_data = data[data[group_col] == group]
                plt.figure(figsize=(10, 6))
                plt.fill_between(group_data[time_col], group_data[feature], alpha=0.5)
                plt.plot(group_data[time_col], group_data[feature], marker='', linestyle='-')
                plt.title(f'Area Plot of {feature} for Visit {group}')
                plt.xlabel(time_col)
                plt.ylabel(feature)
                pdf.savefig()
                plt.close()
    print(f"PDF saved successfully: {pdf_filename}")

# Call the function
plot_area_plots(data, 'Visit Date', 'VisitNumber', os.path.join(output_dir, "area_plots.pdf"))


Generating area plots for all features...
PDF saved successfully: plots\area_plots.pdf


In [15]:
def plot_ridgeline_plots(data, time_col, group_col, pdf_filename):
    print("Generating ridgeline plots for all features...")
    with PdfPages(pdf_filename) as pdf:
        features = [col for col in data.columns if col not in [time_col, group_col, 'Patient ID', 'EVENT_ID']]
        for feature in features:
            plt.figure(figsize=(10, 6))
            sns.kdeplot(data=data, x=time_col, hue=group_col, fill=True)
            plt.title(f'Ridgeline Plot of {feature} Over {time_col} by {group_col}')
            plt.xlabel(time_col)
            plt.ylabel(feature)
            pdf.savefig()
            plt.close()
    print(f"PDF saved successfully: {pdf_filename}")

# Call the function
plot_ridgeline_plots(data, 'Visit Date', 'VisitNumber', os.path.join(output_dir, "ridgeline_plots.pdf"))


Generating ridgeline plots for all features...
PDF saved successfully: plots\ridgeline_plots.pdf


In [None]:

def plot_kaplan_meier_plots(data, event_col, time_col, group_col, pdf_filename):
    """
    Generate Kaplan-Meier plots for survival analysis and save them to a PDF.

    Parameters:
    - data (DataFrame): The input DataFrame containing the survival data.
    - event_col (str): Column name indicating the event/censorship status.
    - time_col (str): Column name indicating the survival time.
    - group_col (str): Column name indicating the group/category.
    - pdf_filename (str): File name for the PDF to save the plots.
    """
    from lifelines import KaplanMeierFitter
    from matplotlib.backends.backend_pdf import PdfPages
    
    kmf = KaplanMeierFitter()

    with PdfPages(pdf_filename) as pdf:
        plt.figure(figsize=(10, 6))
        for group in data[group_col].unique():
            group_data = data[data[group_col] == group]
            kmf.fit(group_data[time_col], event_observed=group_data[event_col], label=group)
            kmf.plot()
        plt.title('Kaplan-Meier Survival Curves')
        plt.xlabel('Time')
        plt.ylabel('Survival Probability')
        pdf.savefig()
        plt.close()
        print(f"PDF saved successfully: {pdf_filename}")

# Ensure you pass the correct column names when calling the function
plot_kaplan_meier_plots(data, 'EVENT_ID', 'Time', 'Group', "kaplan_meier_plots.pdf")