# Packages

In [None]:
import numpy as np
from pandas import DataFrame
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
import sklearn 
import sspa
import sspa.utils
import gseapy.plot as gp
import networkx
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import scipy.stats as stats
import statsmodels.api as sm
import plotly.graph_objects as go
import plotly.express as px
import urllib.request
import statsmodels
import networkx as nx
import math
import itertools 
from scipy.stats import hypergeom as hg
import textwrap
from itertools import chain
import missforest

# Multi-Omics Data

In [None]:
# Location of the supplementary workbook holding the omics sheets and the clinical metadata.
file_path = '/Users/judepops/Documents/PathIntegrate/Datasets/Covid/tzydswhhb5-5/Table S1. Human subject details, plasma proteomic and metabolomic datasets and analysis, and CITE-seq antibodies. Related to Figures 1 and S1.xlsx'
excel_file = pd.ExcelFile(file_path)
sheet_names = excel_file.sheet_names
print(sheet_names)


def _load_covid_samples(sheet):
    """Read one omics sheet and keep the rows used downstream.

    Rows whose ``sample_id`` ends in '-AC' are removed, then rows labelled
    'Healthy Donor ' (with the trailing space) are removed.
    """
    table = pd.read_excel(file_path, sheet_name=sheet)
    table['sample_id'] = table['sample_id'].astype(str)
    table = table[~table['sample_id'].str.endswith('-AC')]
    # NOTE(review): the label is compared including its trailing space - confirm
    # that this matches the spelling used in the workbook.
    keep = table['Healthy donor sample or COVID19 sample'] != 'Healthy Donor '
    return table[keep]


# Omics tables, subset as described in the helper.
metabolomics_data = _load_covid_samples('S1.4 Plasma Metabolomic Data')
proteomics_data = _load_covid_samples('S1.3 Plasma Proteomic Data')

# Clinical metadata, merged onto the omics tables further down.
metadata = pd.read_excel(file_path, sheet_name='S1.1 Patient Clinical Data')

# Persist the subset tables (the DataFrame index is written as the first CSV column).
metabolomics_data.to_csv('../Processing_Cleaned/metabolomics_data_covid.csv')
proteomics_data.to_csv('../Processing_Cleaned/proteomics_data_covid.csv')
metadata.to_csv('../Processing_Cleaned/metadata_covid.csv')

# Filtering

In [None]:
# Keep only the measurement columns.
# NOTE(review): assumes the first two columns are identifiers/labels rather than
# metabolite abundances - confirm against the sheet layout.
metabolomics_data = metabolomics_data.iloc[:, 2:]

# data shape
print(metabolomics_data.shape)

# Keep samples (rows) whose fraction of missing values is below 50%.
metabolomics_data = metabolomics_data.loc[metabolomics_data.isnull().mean(axis=1) < 0.5, :]
print("samples with NA values < 50%:", metabolomics_data.shape)

# Keep metabolites (columns) whose fraction of missing values is below 60%.
# The number removed is computed rather than hard-coded so the printed message
# always matches the data that was actually filtered.
n_metabolites_before = metabolomics_data.shape[1]
metabolomics_data = metabolomics_data.loc[:, metabolomics_data.isnull().mean() < 0.6]
n_metabolites_dropped = n_metabolites_before - metabolomics_data.shape[1]
print("metabolites with NA < 60%:", metabolomics_data.shape, f" -> dropped {n_metabolites_dropped} metabolites ")

In [None]:
# Keep only the measurement columns.
# NOTE(review): assumes the first two columns are identifiers/labels rather than
# protein abundances - confirm against the sheet layout.
proteomics_data = proteomics_data.iloc[:, 2:]

# data shape
print(proteomics_data.shape)

# Keep samples (rows) whose fraction of missing values is below 50%.
proteomics_data = proteomics_data.loc[proteomics_data.isnull().mean(axis=1) < 0.5, :]
print("samples with NA values < 50%", proteomics_data.shape)

# Keep proteins (columns) whose fraction of missing values is below 40%.
# The number removed is computed rather than hard-coded so the printed message
# always matches the data that was actually filtered.
n_proteins_before = proteomics_data.shape[1]
proteomics_data = proteomics_data.loc[:, proteomics_data.isnull().mean() < 0.4]
n_proteins_dropped = n_proteins_before - proteomics_data.shape[1]
print("proteins with NA < 40%:", proteomics_data.shape, f" -> Dropped {n_proteins_dropped} proteins! ")

# Imputation

In [None]:
# Metabolomics - fill the remaining missing values with fancyimpute's SoftImpute

from fancyimpute import SoftImpute

soft_impute = SoftImpute()

# The imputer is given the raw value matrix, so the column labels are
# re-attached afterwards; the result carries a fresh default row index.
imputed_data_met = pd.DataFrame(
    soft_impute.fit_transform(metabolomics_data.values),
    columns=metabolomics_data.columns,
)
imputed_data_met.to_csv('imputed_metabolomics_data.csv')

In [None]:
# Proteomics - fill the remaining missing values with missingpy's MissForest

import sys
import sklearn.neighbors._base

# Register the private module under the name 'sklearn.neighbors.base' before
# missingpy is imported (presumably missingpy still imports that older path -
# confirm against the installed versions).
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest

randomFores = MissForest()

# Wrap the imputed matrix in a DataFrame that carries the protein column labels;
# the result has a fresh default row index.
imputed_data_prot = pd.DataFrame(
    randomFores.fit_transform(proteomics_data),
    columns=proteomics_data.columns,
)

# Log Transformation + Scaling 

In [None]:
# Metabolomics

# Per-column flag: does the imputed table contain any value below zero?
negative_values_check = imputed_data_met.lt(0).any()
negative_values_check


# Shift the whole table so that its global minimum becomes exactly 1; every
# value is then strictly positive and the logarithm below is defined.
min_value = imputed_data_met.min().min()
shift = 1 - min_value
adjusted_data = imputed_data_met + shift

# natural-log transform of the shifted values
numeric_data_log_transformed = np.log(adjusted_data)

# Standardise each column and restore the sample index and metabolite labels
# of the filtered table.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_data_log_transformed)
scaled_data_met = pd.DataFrame(
    scaled_data,
    index=metabolomics_data.index,
    columns=metabolomics_data.columns,
)

In [None]:
# Proteomics - no log step here (the original author notes the data are already log transformed)

# Standardise every protein column; kept as an unlabelled DataFrame first.
prot_standard = pd.DataFrame(
    data=sklearn.preprocessing.StandardScaler().fit_transform(imputed_data_prot)
)

# Same values, now labelled with the protein IDs and the sample index of the
# filtered proteomics table.
scaled_data_prot = pd.DataFrame(
    data=prot_standard.values,
    columns=proteomics_data.columns,
    index=proteomics_data.index,
)


# Adding in and manipulating metadata 

### Metabolomics

In [None]:
# Reload the clinical metadata and the two raw omics tables from CSV, in that order.
metadata, met_original, prot_original = (
    pd.read_csv(csv_name)
    for csv_name in (
        'metadata_covid.csv',
        'metabolomics_data_covid.csv',
        'proteomics_data_covid.csv',
    )
)

In [None]:
def categorise_who(value):
    """Map a WHO ordinal scale entry to a coarse condition label.

    Args:
        value: The scale entry. It may be a string such as '3' or '1 or 2',
            or a number such as 3 or 3.0. Whole-valued numbers are compared
            by their integer text form, so 0, 0.0 and '0' are equivalent.

    Returns:
        'Mild' for '1' or '1 or 2', 'Severe' for '3' to '7', 'Healthy' for
        '0', and 'Other' for anything else (including missing values).
    """
    # NOTE(review): a bare '2' is not listed in any group and therefore maps to
    # 'Other' - confirm that this is intended.
    if isinstance(value, bool):
        # Booleans are numbers in Python but are never valid scale entries.
        return 'Other'

    try:
        number = float(value)
    except (TypeError, ValueError, OverflowError):
        # Not numeric (e.g. '1 or 2', None): compare the trimmed text as-is.
        key = str(value).strip()
    else:
        # Whole numbers are normalised to their integer text ('3.0' -> '3');
        # NaN, infinities and fractional values keep their own text and so
        # fall through to 'Other'.
        key = str(int(number)) if number.is_integer() else str(value).strip()

    if key in ('1', '1 or 2'):
        return 'Mild'
    if key in ('3', '4', '5', '6', '7'):
        return 'Severe'
    if key == '0':
        return 'Healthy'
    return 'Other'
    
def categorize_race(value):
    """Return *value* unchanged if it is one of the three retained race labels, else 'Other'."""
    retained_labels = ('White', 'Asian', 'Black or African American')
    return value if value in retained_labels else 'Other'

In [None]:
# Re-attach sample ids to the scaled metabolomics table.
# The ids are copied over by row position, so both tables must have the same
# number of rows; otherwise ids would silently be paired with the wrong samples.
if len(scaled_data_met) != len(met_original):
    raise ValueError(
        "scaled_data_met and met_original have different numbers of rows; "
        "sample ids cannot be re-attached by position"
    )
scaled_data_met.reset_index(drop=True, inplace=True)
# Keep only the part of the id before the first '-' (used as the merge key below).
scaled_data_met['sample_id'] = met_original['sample_id'].astype(str).str.split('-').str[0]

# preparing metadata: shorten the sample id the same way and keep one row per subject
metadata['Sample ID'] = metadata['Sample ID'].str.split('-').str[0]
metadata = metadata.drop_duplicates(subset='Study Subject ID', keep='first')

# columns to add on
columns_to_merge = ['Study Subject ID', 'Who Ordinal Scale', 'Race', 'Age']

# Left merge: every omics row is kept, metadata is attached where the subject id matches.
merged_df = pd.merge(scaled_data_met, metadata[columns_to_merge],
                     left_on='sample_id', right_on='Study Subject ID', how='left')

# Fill missing metadata. The result is assigned back to the column because a
# chained `df[col].fillna(..., inplace=True)` acts on a temporary object and
# leaves the frame unchanged when pandas Copy-on-Write is active.
merged_df['Who Ordinal Scale'] = merged_df['Who Ordinal Scale'].fillna(0)
merged_df['Race'] = merged_df['Race'].fillna('Unknown')
# NOTE(review): 'Unknown' is not numeric; any later numeric conversion of Age
# has to cope with this placeholder.
merged_df['Age'] = merged_df['Age'].fillna('Unknown')

# tidy the merged table: drop the duplicate key, shorten the WHO column name,
# add the donor/COVID label (aligned by row position) and index by sample id
merged_df.drop(columns=['Study Subject ID'], inplace=True)
merged_df.rename(columns={'Who Ordinal Scale': 'Who'}, inplace=True)
merged_df['Group'] = met_original['Healthy donor sample or COVID19 sample']
merged_df.set_index('sample_id', inplace=True)

final_data_met = merged_df.copy()

### Proteomics

In [None]:
# Re-attach sample ids to the scaled proteomics table.
# The ids are copied over by row position, so both tables must have the same
# number of rows; otherwise ids would silently be paired with the wrong samples.
if len(scaled_data_prot) != len(prot_original):
    raise ValueError(
        "scaled_data_prot and prot_original have different numbers of rows; "
        "sample ids cannot be re-attached by position"
    )
scaled_data_prot.reset_index(drop=True, inplace=True)
# Keep only the part of the id before the first '-' (used as the merge key below).
scaled_data_prot['sample_id'] = prot_original['sample_id'].astype(str).str.split('-').str[0]

# preparing metadata: shorten the sample id the same way and keep one row per subject
metadata['Sample ID'] = metadata['Sample ID'].str.split('-').str[0]
metadata = metadata.drop_duplicates(subset='Study Subject ID', keep='first')

# columns to add on
columns_to_merge = ['Study Subject ID', 'Who Ordinal Scale', 'Race', 'Age']

# Left merge: every omics row is kept, metadata is attached where the subject id matches.
merged_df = pd.merge(scaled_data_prot, metadata[columns_to_merge],
                     left_on='sample_id', right_on='Study Subject ID', how='left')

# Fill missing metadata. The result is assigned back to the column because a
# chained `df[col].fillna(..., inplace=True)` acts on a temporary object and
# leaves the frame unchanged when pandas Copy-on-Write is active.
merged_df['Who Ordinal Scale'] = merged_df['Who Ordinal Scale'].fillna(0)
merged_df['Race'] = merged_df['Race'].fillna('Unknown')
# NOTE(review): 'Unknown' is not numeric; any later numeric conversion of Age
# has to cope with this placeholder.
merged_df['Age'] = merged_df['Age'].fillna('Unknown')

# tidy the merged table: drop the duplicate key, shorten the WHO column name,
# add the donor/COVID label (aligned by row position) and index by sample id
merged_df.drop(columns=['Study Subject ID'], inplace=True)
merged_df.rename(columns={'Who Ordinal Scale': 'Who'}, inplace=True)
merged_df['Group'] = prot_original['Healthy donor sample or COVID19 sample']
merged_df.set_index('sample_id', inplace=True)

final_data_prot = merged_df.copy()


### Adding Groupings

In [None]:
def _add_group_columns(frame):
    """Add Condition_Group, Age_Group and Race_Group to *frame* in place.

    Age is converted to whole years. Entries that cannot be read as a number
    (such as the 'Unknown' placeholder) become missing values instead of
    raising, and receive no age group.
    """
    frame['Condition_Group'] = frame['Who'].apply(categorise_who)

    # Non-numeric ages turn into NaN; truncation mirrors the float -> int cast.
    ages = np.trunc(pd.to_numeric(frame['Age'], errors='coerce'))
    # Plain integers when every age is known; the nullable integer dtype is
    # only used when missing ages have to be represented.
    frame['Age'] = ages.astype(int) if ages.notna().all() else ages.astype('Int64')

    # Ten-year bins closed on the left: 0-9, 10-19, ..., 90-99.
    frame['Age_Group'] = pd.cut(
        ages,
        bins=range(0, 101, 10),
        right=False,
        labels=[f"{i}-{i+9}" for i in range(0, 100, 10)],
    )
    frame['Race_Group'] = frame['Race'].apply(categorize_race)


# metabolomics
_add_group_columns(final_data_met)

# proteomics
_add_group_columns(final_data_prot)

# saving
final_data_met.to_csv('cleaned_metabolomics_data_covid.csv')
final_data_prot.to_csv('cleaned_proteomics_data_covid.csv')

# Outlier Detection

In [None]:
# metabolomics

# Everything except the last seven columns of the cleaned table.
# NOTE(review): the 7 presumably corresponds to the metadata/grouping columns
# appended after the measurements - confirm if columns are added or removed.
met_numeric = final_data_met.iloc[:, :-7]
met_numeric

# per-column mean of the selected columns
met_numeric_mean = met_numeric.mean()

In [None]:
# Plot the distribution of per-sample overall z-scores (metabolomics).

# Column-wise z-scores of the metabolite values.
means = met_numeric.mean()
stds = met_numeric.std()
z_scores = (met_numeric - means) / stds

# One score per sample: square root of the summed squared z-scores across columns.
overall_z_scores = np.sqrt((z_scores**2).sum(axis=1))

# The three samples with the largest overall score are marked on the plot.
top_indices = overall_z_scores.nlargest(3).index

# Sample drawn in red if it is among the top three.
row_index = 'INCOV064'

if row_index in overall_z_scores.index:
    row_overall_z_score = overall_z_scores.loc[row_index]

    plt.figure(figsize=(10, 6))
    plt.hist(overall_z_scores, bins=30, color='gray', alpha=0.7, label='Overall Z-Scores')

    for idx in top_indices:
        plt.axvline(overall_z_scores.loc[idx], color='red' if idx == row_index else 'blue', linestyle='dashed', linewidth=2)
        plt.text(overall_z_scores.loc[idx], 5, f'{idx}', horizontalalignment='right')

    plt.xlabel('Overall Z-Score')
    plt.ylabel('Frequency')
    plt.title('Distribution of Overall Z-Scores with Top 3 Samples Highlighted')
    plt.legend()
    plt.show()
else:
    # Report the missing sample instead of silently skipping the plot
    # (same message as the proteomics cell).
    print(f"Row index {row_index} not found in DataFrame.")


In [None]:
# Plot the distribution of per-sample overall z-scores (proteomics).

# Everything except the last seven columns of the cleaned table.
prot_numeric = final_data_prot.iloc[:, :-7]

# Column-wise z-scores of the protein values. The standard deviations must be
# computed from the proteomics table itself: reusing `stds` from the
# metabolomics cell would divide by a Series with different column labels and
# turn every z-score into NaN.
means = prot_numeric.mean()
stds = prot_numeric.std()
z_scores = (prot_numeric - means) / stds

# One score per sample: square root of the summed squared z-scores across columns.
overall_z_scores = np.sqrt((z_scores**2).sum(axis=1))

# The three samples with the largest overall score are marked on the plot.
top_indices = overall_z_scores.nlargest(3).index

# Sample drawn in red if it is among the top three.
row_index = 'INCOV064'

if row_index in overall_z_scores.index:
    row_overall_z_score = overall_z_scores.loc[row_index]

    plt.figure(figsize=(10, 6))
    plt.hist(overall_z_scores, bins=30, color='gray', alpha=0.7, label='Overall Z-Scores')
    for idx in top_indices:
        plt.axvline(overall_z_scores.loc[idx], color='red' if idx == row_index else 'blue', linestyle='dashed', linewidth=2)
        plt.text(overall_z_scores.loc[idx], 5, f'{idx}', horizontalalignment='right')
    plt.xlabel('Overall Z-Score')
    plt.ylabel('Frequency')
    plt.title('Distribution of Overall Z-Scores with Top 3 Samples Highlighted')
    plt.legend()
    plt.show()
else:
    print(f"Row index {row_index} not found in DataFrame.")

In [None]:
# Remove the samples treated as outliers from each cleaned table.

# Three samples are removed from the metabolomics table.
met_outlier_ids = ['INCOV064', 'INCOV090', 'INCOV028']
final_data_met = final_data_met.drop(index=met_outlier_ids)

# Two samples are removed from the proteomics table.
prot_outlier_ids = ['INCOV064', 'INCOV090']
final_data_prot = final_data_prot.drop(index=prot_outlier_ids)