# Linear Mixed Models

Build some linear mixed models about our data

## Import data

In [1]:
import os
import sys

import re
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
import matplotlib.colors as mcolors
import matplotlib.dates as mdates
from matplotlib.colors import ListedColormap
import pandas as pd
import seaborn as sns
import json

from itertools import cycle

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.inspection import permutation_importance

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error

from scipy.stats import linregress

import statsmodels.api as sm
import statsmodels.formula.api as smf

# there is a FutureWarning in sklearn StandardScalar which is really annoying. This ignores it.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

try:
  import google.colab
  IN_COLAB = True
  !pip install adjustText
  from google.colab import drive
  drive.mount('/content/drive')
  datadir = '/content/drive/MyDrive/Projects/CF/Adelaide/CF_Data_Analysis'
except ImportError:
  IN_COLAB = False
  datadir = '..'


sys.path.append('..')
import cf_analysis_lib


## Read the data frames

In [2]:
sequence_type = "MGI"
datadir = '..'
#sslevel = 'level2_norm_ss.tsv.gz'
sslevel = 'subsystems_norm_ss.tsv.gz'
ss_df = cf_analysis_lib.read_subsystems(os.path.join(datadir, sequence_type, "FunctionalAnalysis", "subsystems", sslevel), sequence_type)
ss_df = ss_df.T
print(f"The subsystems df has shape: {ss_df.shape}")

taxa = "genus"
genus_otu = cf_analysis_lib.read_taxonomy(datadir, sequence_type, taxa)
genus_otu = genus_otu.T
print(f"The taxonomy df has shape: {genus_otu.shape}")
metadata = cf_analysis_lib.read_metadata(datadir, sequence_type)
print(f"The metadata df has shape: {metadata.shape}")

df = ss_df.merge(genus_otu, left_index=True, right_index=True, how='inner')

The subsystems df has shape: (127, 769)
The taxonomy df has shape: (127, 3581)
The metadata df has shape: (127, 166)


Setting IP vs OP to category
Setting Hospital to category
Setting Age groups to category
Setting Paediatric vs Adult to category
Setting Gender to category
Setting Sample_Type to category
Setting NTM to category
Setting FEV1_RATIO_SCORE to category
Setting FEV1_Obstruction_Rank to category
Setting Cystic Fibrosis related diabetes (CFRD) to category
Setting Pancreatic insufficiency (PI) to category
Setting CF gene 1 to category
Setting CFLD to category
Setting CS_mucoid to category
Setting CS_non-mucoid to category
Setting CS_Pseudomonas aeruginosa to category
Setting CS_Oral flora to category
Setting CS_Stenophotomonas maltophilia to category
Setting CS_Aspergillus fumigatus to category
Setting CS_Aspergillus flavus to category
Setting CS_Candida albicans to category
Setting CS_Mycobacteroides abscessus to category
Setting CS_Mycobacterium intracellulare to category
Setting CS_Staphylococcus  aureus to category
Setting CS_Inquilinus limosus to category
Setting CS_Achromobacter xylosoxi

In [3]:
metadata.dtypes

minion                                       object
MGI                                          object
pwCF_ID                                       int64
Sample date                                  object
IP vs OP                                   category
                                             ...   
DNA Conc. (ng/ul)                           float64
Index I7                                     object
Index I5                                     object
Mean_Size_BP                                  int64
Total Clusters Passing Filter (Million)     float64
Length: 166, dtype: object

In [4]:
metadata['IPc'] = metadata['IP vs OP'].astype('category')
metadata.dtypes

minion                                       object
MGI                                          object
pwCF_ID                                       int64
Sample date                                  object
IP vs OP                                   category
                                             ...   
Index I7                                     object
Index I5                                     object
Mean_Size_BP                                  int64
Total Clusters Passing Filter (Million)     float64
IPc                                        category
Length: 167, dtype: object

In [5]:

mdx_types = cf_analysis_lib.metadata_definitions()
mdx_types

{'Column header': 'Type of data',
 'NAME': 'Text',
 'minion': 'Text',
 'MGI': 'Text',
 'pwCF_ID': 'Text',
 'Sample date': 'Date',
 'IP vs OP': 'Categorical',
 'Hospital': 'Categorical',
 'Room': 'Text',
 'Age': 'Numeric',
 'Age groups': 'Categorical',
 'Paediatric vs Adult': 'Categorical',
 'Gender': 'Categorical',
 'Sample_Type': 'Categorical',
 'H2_Uncorrected': 'Numeric',
 'CH4_Uncorrected': 'Numeric',
 'CO2': 'Numeric',
 'H2_Corrected': 'Numeric',
 'CH4_Corrected': 'Numeric',
 'CH4/H2 ratio_corrected': 'Numeric',
 'Corr.': 'Numeric',
 'Culture Result (Matched with sequenced sample)': 'Text',
 'NTM': 'Categorical',
 'Pseudomonas': 'Categorical',
 'Cutured in previous 12 months': 'Text',
 'Others cultured': 'Text',
 'IgE': 'Numeric',
 'Spec IgE': 'Numeric',
 'Spec IgG': 'Numeric',
 'Precipitins': 'Numeric',
 'FVC': 'Numeric',
 'FEV1': 'Numeric',
 'Best FEV1': 'Numeric',
 'FEV1/best FEV1': 'Numeric',
 'FEV1_RATIO_SCORE': 'Categorical',
 'FEV1_Obstruction_Rank': 'Categorical',
 'Cystic

## Get rid of the spaces!

For statsmodels, it really helps if we don't have spaces. We can get rid of them here and then things become easier later

In [20]:
dcolumns_no_spaces = {}
for c in df.columns:
    if ' ' in c:
        dcolumns_no_spaces[c] = c.replace(' ', '_')
df.rename(columns=dcolumns_no_spaces, inplace=True)

mcolumns_no_spaces = {}
for c in metadata.columns:
    if ' ' in c:
        mcolumns_no_spaces[c] = c.replace(' ', '_')
metadata.rename(columns=mcolumns_no_spaces, inplace=True)

In [21]:
df.head(3)

Unnamed: 0,"2,3-diacetamido-2,3-dideoxy-d-mannuronic_acid",2-O-alpha-mannosyl-D-glycerate_utilization,2-aminophenol_Metabolism,2-ketoacid_oxidoreductases_disambiguation,2-oxoglutarate_dehydrogenase_,2-phosphoglycolate_salvage,3-amino-5-hydroxybenzoic_Acid_Synthesis,4-hydroxybenzoyl-CoA_reductase,5-methylaminomethyl-2-thiouridine,A_Hypothetical_Protein_Related_to_Proline_Metabolism,...,Oceaniferula,Persicirhabdus,Phragmitibacter,Prosthecobacter,Roseibacillus,Roseimicrobium,Sulfuriroseicoccus,Verrucomicrobium,Eremiobacter,Methylomirabilis
1068841_20180306_S,10.085904,2784.895948,516.160945,231.778018,122.573392,342.327431,783.041862,0.0,4136.703664,827.439614,...,1.347606,0.0,0.0,0.0,0.539042,0.0,0.0,0.539042,0.0,0.0
1447437_20171212_S,59.260325,1065.438272,543.947408,554.136026,428.171446,301.915763,679.310468,0.0,3913.676651,732.956657,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1128691_20171206_S,0.0,426.619709,912.76775,49.606943,213.640568,277.79888,423.31258,0.0,2093.412992,236.459761,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
metadata.head(3)

Unnamed: 0_level_0,minion,MGI,pwCF_ID,Sample_date,IP_vs_OP,Hospital,Room,Age,Age_groups,Paediatric_vs_Adult,...,Sum_of_meds,Sum_of_antifungals,Sum_of_steroid_+_mabs,DNA_extraction__conc,SAGC_ULN,DNA_Conc._(ng/ul),Index_I7,Index_I5,Mean_Size_BP,Total_Clusters_Passing_Filter_(Million)
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
634207_20180510_S,,634207_20180510_S,634207,5/10/2018,IP,WCH,Adol Rm9,17,3,Paediatric,...,1,0,0,0.0,SAGCFN_22_01856,7.82,CGGACGATTC,CCACCACCTA,651,2.9
634207_20180517_S,,634207_20180517_S,634207,5/17/2018,IP,WCH,Adol Rm9,17,3,Paediatric,...,1,0,0,0.134,SAGCFN_22_01827,22.8,AGCGATAG,CCTATCCT,633,2.4
715927_20180205_S,715927_20180205_S,715927_20180205_S,715927,2/05/2018,OP,WCH,Level 6 DK Office,13,3,Paediatric,...,1,0,0,0.326,SAGCFN_22_01797,16.5,TAATGCGC,AGGCGAAG,516,3.4


# Convert our metadata to categories

# Predict something "simple"



# Early trial

Ignore the stuff down here, its probably wrong :)

In [3]:
metadata

Unnamed: 0_level_0,minion,MGI,pwCF_ID,Sample date,IP vs OP,Hospital,Room,Age,Age groups,Paediatric vs Adult,...,Sum of meds,Sum of antifungals,Sum of steroid + mabs,DNA_extraction_ conc,SAGC ULN,DNA Conc. (ng/ul),Index I7,Index I5,Mean_Size_BP,Total Clusters Passing Filter (Million)
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
634207_20180510_S,,634207_20180510_S,634207,5/10/2018,IP,WCH,Adol Rm9,17,3,Paediatric,...,1,0,0,0.000,SAGCFN_22_01856,7.82,CGGACGATTC,CCACCACCTA,651,2.9
634207_20180517_S,,634207_20180517_S,634207,5/17/2018,IP,WCH,Adol Rm9,17,3,Paediatric,...,1,0,0,0.134,SAGCFN_22_01827,22.80,AGCGATAG,CCTATCCT,633,2.4
715927_20180205_S,715927_20180205_S,715927_20180205_S,715927,2/05/2018,OP,WCH,Level 6 DK Office,13,3,Paediatric,...,1,0,0,0.326,SAGCFN_22_01797,16.50,TAATGCGC,AGGCGAAG,516,3.4
715927_20180213_S,,715927_20180213_S,715927,2/13/2018,IP,WCH,Adol Room 11,13,3,Paediatric,...,3,0,0,0.234,SAGCFN_22_01811,31.00,TCCGCGAA,CCTATCCT,443,2.7
715927_20180226_S,,715927_20180226_S,715927,2/26/2018,OP,WCH,OPD 8,13,3,Paediatric,...,2,0,0,0.108,SAGCFN_22_01833,15.10,TAACTTGGTC,GATTCACGAC,510,2.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1651490_20180206_S,1651490_20180206_S,1651490_20180206_S,1651490,2/06/2018,OP,RAH,Chest Clinic 1,27,5,Adult,...,1,0,0,4.760,SAGCFN_22_01741,26.20,ATTACTCG,AGGCGAAG,507,6.4
1651490_20171215_S,1651490_20171215_S,1651490_20171215_S,1651490,12/15/2017,OP,RAH,Chest Clinic 4,26,5,Adult,...,1,0,0,7.760,SAGCFN_22_01738,34.20,ATTACTCG,ATAGAGGC,564,6.0
1658447_20171006_S,,1658447_20171006_S,1658447,10/06/2017,OP,RAH,Chest Clinic 3,26,5,Adult,...,1,0,0,0.098,SAGCFN_22_01837,13.10,CAGCAGGTCA,TACCTAAGTG,576,2.9
1664053_20180406_S,,1664053_20180406_S,1664053,4/06/2018,OP,RAH,Chest Clinic 1,26,5,Adult,...,3,1,0,0.159,SAGCFN_22_01822,30.80,TCTCGCGC,TAATCTTA,374,1.9


In [11]:
df_combined = df.merge(metadata, left_index=True, right_index=True, how='inner')
print(f"df: {df.shape}")
print(f"metadata: {metadata.shape}")
print(f"df_combined: {df_combined.shape}")


df: (127, 4350)
metadata: (127, 166)
df_combined: (127, 4516)


In [4]:
df_long = df.reset_index().melt(id_vars='index', var_name='taxa_functions', value_name='abundance')
df_long.rename(columns={'index': 'sample_id'}, inplace=True)
df_long

Unnamed: 0,sample_id,taxa_functions,abundance
0,1068841_20180306_S,"2,3-diacetamido-2,3-dideoxy-d-mannuronic acid",10.085904
1,1447437_20171212_S,"2,3-diacetamido-2,3-dideoxy-d-mannuronic acid",59.260325
2,1128691_20171206_S,"2,3-diacetamido-2,3-dideoxy-d-mannuronic acid",0.000000
3,1128691_20171218_S,"2,3-diacetamido-2,3-dideoxy-d-mannuronic acid",0.000000
4,1128691_20180116_S,"2,3-diacetamido-2,3-dideoxy-d-mannuronic acid",14.478968
...,...,...,...
552445,895293_20180502_S,Methylomirabilis,0.000000
552446,896213_20180427_S,Methylomirabilis,0.000000
552447,913873_20180417_S,Methylomirabilis,0.000000
552448,980574_20180403_S,Methylomirabilis,0.000000


In [5]:
merged_data =  pd.merge(df_long, metadata, left_on='sample_id', right_on=sequence_type)
merged_data.head()

Unnamed: 0,sample_id,taxa_functions,abundance,minion,MGI,pwCF_ID,Sample date,IP vs OP,Hospital,Room,...,Sum of meds,Sum of antifungals,Sum of steroid + mabs,DNA_extraction_ conc,SAGC ULN,DNA Conc. (ng/ul),Index I7,Index I5,Mean_Size_BP,Total Clusters Passing Filter (Million)
0,1068841_20180306_S,"2,3-diacetamido-2,3-dideoxy-d-mannuronic acid",10.085904,1068841_20180306_S,1068841_20180306_S,1068841,3/06/2018,OP,RAH,Chest Clinic 7,...,0,0,0,1.07,SAGCFN_22_01754,42.8,CGCTCATT,ATAGAGGC,417,2.8
1,1068841_20180306_S,2-O-alpha-mannosyl-D-glycerate utilization,2784.895948,1068841_20180306_S,1068841_20180306_S,1068841,3/06/2018,OP,RAH,Chest Clinic 7,...,0,0,0,1.07,SAGCFN_22_01754,42.8,CGCTCATT,ATAGAGGC,417,2.8
2,1068841_20180306_S,2-aminophenol Metabolism,516.160945,1068841_20180306_S,1068841_20180306_S,1068841,3/06/2018,OP,RAH,Chest Clinic 7,...,0,0,0,1.07,SAGCFN_22_01754,42.8,CGCTCATT,ATAGAGGC,417,2.8
3,1068841_20180306_S,2-ketoacid oxidoreductases disambiguation,231.778018,1068841_20180306_S,1068841_20180306_S,1068841,3/06/2018,OP,RAH,Chest Clinic 7,...,0,0,0,1.07,SAGCFN_22_01754,42.8,CGCTCATT,ATAGAGGC,417,2.8
4,1068841_20180306_S,2-oxoglutarate dehydrogenase,122.573392,1068841_20180306_S,1068841_20180306_S,1068841,3/06/2018,OP,RAH,Chest Clinic 7,...,0,0,0,1.07,SAGCFN_22_01754,42.8,CGCTCATT,ATAGAGGC,417,2.8


In [12]:
columns_no_spaces = {}
for c in merged_data.columns:
    if ' ' in c:
        columns_no_spaces[c] = c.replace(' ', '_')
        
merged_data.rename(columns = columns_no_spaces, inplace=True)


df_combined.rename(columns = columns_no_spaces, inplace=True)


In [13]:
list(df_combined.columns)

['2,3-diacetamido-2,3-dideoxy-d-mannuronic acid',
 '2-O-alpha-mannosyl-D-glycerate utilization',
 '2-aminophenol Metabolism',
 '2-ketoacid oxidoreductases disambiguation',
 '2-oxoglutarate dehydrogenase ',
 '2-phosphoglycolate salvage',
 '3-amino-5-hydroxybenzoic Acid Synthesis',
 '4-hydroxybenzoyl-CoA reductase',
 '5-methylaminomethyl-2-thiouridine',
 'A Hypothetical Protein Related to Proline Metabolism',
 'A new toxin - antitoxin system',
 'ABC transporter YeiABEF',
 'ABC transporter YxeMNO',
 'ABC transporter of unknown substrate X',
 'ABC transporter tungstate (TC 3.A.1.6.2)',
 'ABC-type iron transport system',
 'AMP to 3-phosphoglycerate',
 'ATP-dependent Nuclease',
 'ATP-dependent RNA helicases, bacterial',
 'AaeAB efflux system for hydroxylated, aromatic carboxylic acids',
 'Accessory colonization factor',
 'Acetoin, butanediol metabolism',
 'Acetolactate synthase subunits',
 'Acetophenone carboxylase 1',
 'Acetyl-CoA Pathway Wood-Ljungdahl',
 'Actinobacterial signal transducti

In [None]:
model = smf.mixedlm(
    'N12M_Pseudomonas_aeruginosa ~ abundance + Pseudomonas_Culture + CS_Pseudomonas_aeruginosa', 
    merged_data,
    groups=merged_data["pwCF_ID"] 
)
result = model.fit()

# Print the model summary
print(result.summary())


In [None]:
if False:
    model = smf.mixedlm(
        'abundance ~ CS_Pseudomonas_aeruginosa + taxa_functions', 
        merged_data,
        groups=merged_data["pwCF_ID"] 
    )
    result = model.fit()

    # Print the model summary
    print(result.summary())


## Write the results

In [None]:
coefficients = pd.DataFrame({
    "Variable": result.params.index,
    "Estimate": result.params.values,
    "Std Error": result.bse.values,
    "P-value": result.pvalues.values
})
coefficients.to_csv(os.path.join('lmm', f'{taxa}_model_coefficients.tsv'), sep="\t", index=False)

with open(os.path.join('lmm', f'{taxa}_model_results.tsv'), 'w') as f:
    f.write(result.summary().as_text())

In [None]:
"""
# Extract fixed effect coefficients and confidence intervals
params = result.fe_params  # Fixed effect coefficients
conf_int = result.conf_int()  # Confidence intervals
conf_int.columns = ['lower', 'upper']

# Combine coefficients and confidence intervals
coefficients = pd.DataFrame({
    'coef': params,
    'lower': conf_int['lower'],
    'upper': conf_int['upper']
})

# Plot the coefficients with error bars
plt.figure(figsize=(8, 6))
plt.errorbar(coefficients.index, coefficients['coef'], 
             yerr=(coefficients['coef'] - coefficients['lower'], coefficients['upper'] - coefficients['coef']), 
             fmt='o', capsize=5)
plt.axhline(0, color='gray', linestyle='--', linewidth=1)
plt.xticks(rotation=45)
plt.title("Fixed Effect Coefficients with Confidence Intervals")
plt.ylabel("Coefficient Value")
plt.xlabel("Predictor")
plt.tight_layout()
plt.show()
"""