# Linear Mixed Models

Fit linear mixed-effects models to the merged functional (subsystem) and taxonomic abundance data, using `pwCF_ID` as the grouping variable.

## Import libraries and set the data directory

In [1]:
import os
import sys

import re
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
import matplotlib.colors as mcolors
import matplotlib.dates as mdates
from matplotlib.colors import ListedColormap
import pandas as pd
import seaborn as sns
import json

from itertools import cycle

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.inspection import permutation_importance

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error

from scipy.stats import linregress


# scikit-learn's StandardScaler raises a FutureWarning that clutters the output.
# Ignore every FutureWarning from this point on.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Decide where the data lives: `google.colab` only imports when running on
# Colab, so a successful import selects the Google Drive path and an
# ImportError selects the local relative path.
try:
  import google.colab
  IN_COLAB = True
  # adjustText is imported right after this block; install it on Colab first.
  # NOTE(review): the install is unpinned -- consider pinning a version.
  !pip install adjustText
  from google.colab import drive
  # Mount Google Drive so the project folder below is reachable.
  drive.mount('/content/drive')
  datadir = '/content/drive/MyDrive/Projects/CF/Adelaide/CF_Data_Analysis'
except ImportError:
  # Not on Colab: read data relative to the notebook's parent directory.
  IN_COLAB = False
  datadir = '..'

from adjustText import adjust_text

import cf_analysis_lib


## Read the data frames

In [2]:
# Sequencing platform whose tables are loaded; also the name of the metadata
# column that holds the matching sample IDs (used as the join key later on).
sequence_type = "MGI"

# `datadir` is intentionally NOT reassigned here. The setup cell already picks
# the Google Drive path on Colab and '..' locally; resetting it to '..' in this
# cell silently discarded the Colab path.

# Functional (subsystem) abundances. Swap the file name to change resolution.
#sslevel = 'level2_norm_ss.tsv.gz'
sslevel = 'subsystems_norm_ss.tsv.gz'
ss_df = cf_analysis_lib.read_subsystems(os.path.join(datadir, sequence_type, "FunctionalAnalysis", "subsystems", sslevel), sequence_type)
# Transpose so that each row is a sample and each column a subsystem.
ss_df = ss_df.T
print(f"The subsystems df has shape: {ss_df.shape}")

# Taxonomic abundances at the rank named by `taxa`. The variable is called
# `genus_otu` for historical reasons but holds whatever rank `taxa` selects.
taxa = "phylum"
genus_otu = cf_analysis_lib.read_taxonomy(datadir, sequence_type, taxa)
# Transpose so that rows are samples, matching `ss_df`.
genus_otu = genus_otu.T
print(f"The taxonomy df has shape: {genus_otu.shape}")
metadata = cf_analysis_lib.read_metadata(datadir, sequence_type)
print(f"The metadata df has shape: {metadata.shape}")

# Join functions and taxa side by side on the sample index; the inner join
# keeps only samples present in both tables.
df = ss_df.merge(genus_otu, left_index=True, right_index=True, how='inner')
print(df.shape)
df.head(5)

The subsystems df has shape: (127, 769)
The taxonomy df has shape: (127, 164)
The metadata df has shape: (127, 166)
(127, 933)


Unnamed: 0,"2,3-diacetamido-2,3-dideoxy-d-mannuronic acid",2-O-alpha-mannosyl-D-glycerate utilization,2-aminophenol Metabolism,2-ketoacid oxidoreductases disambiguation,2-oxoglutarate dehydrogenase,2-phosphoglycolate salvage,3-amino-5-hydroxybenzoic Acid Synthesis,4-hydroxybenzoyl-CoA reductase,5-methylaminomethyl-2-thiouridine,A Hypothetical Protein Related to Proline Metabolism,...,Verrucomicrobiota,Vulcanimicrobiota,candidate division CPR1,candidate division CPR2,candidate division CPR3,candidate division KD3-62,candidate division LCP-89,candidate division NC10,candidate division WOR-3,candidate division WWE3
1068841_20180306_S,10.085904,2784.895948,516.160945,231.778018,122.573392,342.327431,783.041862,0.0,4136.703664,827.439614,...,64.14605,0.0,0.0,0.0,1.886649,0.0,0.0,6.468509,5.120903,8.8942
1447437_20171212_S,59.260325,1065.438272,543.947408,554.136026,428.171446,301.915763,679.310468,0.0,3913.676651,732.956657,...,99.787903,0.0,0.635592,0.0,0.0,0.0,0.0,5.084734,4.449142,3.81355
1128691_20171206_S,0.0,426.619709,912.76775,49.606943,213.640568,277.79888,423.31258,0.0,2093.412992,236.459761,...,85.998388,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1128691_20171218_S,0.0,659.087578,864.137047,139.140711,235.318676,355.174973,538.254855,0.0,2050.494687,341.749115,...,45.140868,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1128691_20180116_S,14.478968,159.268644,593.637673,48.263225,358.354449,202.705547,348.701804,0.0,1570.967988,205.118708,...,67.634358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Display the per-sample metadata table to see which covariates are available.
metadata

Unnamed: 0_level_0,minion,MGI,pwCF_ID,Sample date,IP vs OP,Hospital,Room,Age,Age groups,Paediatric vs Adult,...,Sum of meds,Sum of antifungals,Sum of steroid + mabs,DNA_extraction_ conc,SAGC ULN,DNA Conc. (ng/ul),Index I7,Index I5,Mean_Size_BP,Total Clusters Passing Filter (Million)
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
634207_20180510_S,,634207_20180510_S,634207,5/10/2018,IP,WCH,Adol Rm9,17,3,Paediatric,...,1,0,0,0.000,SAGCFN_22_01856,7.82,CGGACGATTC,CCACCACCTA,651,2.9
634207_20180517_S,,634207_20180517_S,634207,5/17/2018,IP,WCH,Adol Rm9,17,3,Paediatric,...,1,0,0,0.134,SAGCFN_22_01827,22.80,AGCGATAG,CCTATCCT,633,2.4
715927_20180205_S,715927_20180205_S,715927_20180205_S,715927,2/05/2018,OP,WCH,Level 6 DK Office,13,3,Paediatric,...,1,0,0,0.326,SAGCFN_22_01797,16.50,TAATGCGC,AGGCGAAG,516,3.4
715927_20180213_S,,715927_20180213_S,715927,2/13/2018,IP,WCH,Adol Room 11,13,3,Paediatric,...,3,0,0,0.234,SAGCFN_22_01811,31.00,TCCGCGAA,CCTATCCT,443,2.7
715927_20180226_S,,715927_20180226_S,715927,2/26/2018,OP,WCH,OPD 8,13,3,Paediatric,...,2,0,0,0.108,SAGCFN_22_01833,15.10,TAACTTGGTC,GATTCACGAC,510,2.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1651490_20180206_S,1651490_20180206_S,1651490_20180206_S,1651490,2/06/2018,OP,RAH,Chest Clinic 1,27,5,Adult,...,1,0,0,4.760,SAGCFN_22_01741,26.20,ATTACTCG,AGGCGAAG,507,6.4
1651490_20171215_S,1651490_20171215_S,1651490_20171215_S,1651490,12/15/2017,OP,RAH,Chest Clinic 4,26,5,Adult,...,1,0,0,7.760,SAGCFN_22_01738,34.20,ATTACTCG,ATAGAGGC,564,6.0
1658447_20171006_S,,1658447_20171006_S,1658447,10/06/2017,OP,RAH,Chest Clinic 3,26,5,Adult,...,1,0,0,0.098,SAGCFN_22_01837,13.10,CAGCAGGTCA,TACCTAAGTG,576,2.9
1664053_20180406_S,,1664053_20180406_S,1664053,4/06/2018,OP,RAH,Chest Clinic 1,26,5,Adult,...,3,1,0,0.159,SAGCFN_22_01822,30.80,TCTCGCGC,TAATCTTA,374,1.9


In [4]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [5]:
# Reshape the wide sample x feature table into long form: one row per
# (sample, feature) pair, with the feature name in `taxa_functions` and its
# value in `abundance`. The former index column is renamed to `sample_id`.
df_long = (
    df.reset_index()
    .melt(id_vars='index', var_name='taxa_functions', value_name='abundance')
    .rename(columns={'index': 'sample_id'})
)
df_long

Unnamed: 0,sample_id,taxa_functions,abundance
0,1068841_20180306_S,"2,3-diacetamido-2,3-dideoxy-d-mannuronic acid",10.085904
1,1447437_20171212_S,"2,3-diacetamido-2,3-dideoxy-d-mannuronic acid",59.260325
2,1128691_20171206_S,"2,3-diacetamido-2,3-dideoxy-d-mannuronic acid",0.000000
3,1128691_20171218_S,"2,3-diacetamido-2,3-dideoxy-d-mannuronic acid",0.000000
4,1128691_20180116_S,"2,3-diacetamido-2,3-dideoxy-d-mannuronic acid",14.478968
...,...,...,...
118486,895293_20180502_S,candidate division WWE3,4.604184
118487,896213_20180427_S,candidate division WWE3,23.970085
118488,913873_20180417_S,candidate division WWE3,0.000000
118489,980574_20180403_S,candidate division WWE3,20.923730


In [6]:
# Attach the metadata to every long-format row by matching `sample_id`
# against the metadata column named after the sequencing platform.
# No `how` is given, so pandas performs its default inner join.
merged_data = df_long.merge(
    metadata,
    left_on='sample_id',
    right_on=sequence_type,
)
merged_data.head()

Unnamed: 0,sample_id,taxa_functions,abundance,minion,MGI,pwCF_ID,Sample date,IP vs OP,Hospital,Room,...,Sum of meds,Sum of antifungals,Sum of steroid + mabs,DNA_extraction_ conc,SAGC ULN,DNA Conc. (ng/ul),Index I7,Index I5,Mean_Size_BP,Total Clusters Passing Filter (Million)
0,1068841_20180306_S,"2,3-diacetamido-2,3-dideoxy-d-mannuronic acid",10.085904,1068841_20180306_S,1068841_20180306_S,1068841,3/06/2018,OP,RAH,Chest Clinic 7,...,0,0,0,1.07,SAGCFN_22_01754,42.8,CGCTCATT,ATAGAGGC,417,2.8
1,1447437_20171212_S,"2,3-diacetamido-2,3-dideoxy-d-mannuronic acid",59.260325,1447437_20171212_S,1447437_20171212_S,1447437,12/12/2017,OP,RAH,Chest Clinic 4,...,0,0,0,1.51,SAGCFN_22_01750,32.6,TCCGGAGA,TAATCTTA,498,4.7
2,1128691_20171206_S,"2,3-diacetamido-2,3-dideoxy-d-mannuronic acid",0.0,,1128691_20171206_S,1128691,12/06/2017,IP,RAH,RAH 8E 05,...,3,0,0,1.75,SAGCFN_22_01748,33.4,TCCGGAGA,GGCTCTGA,535,4.5
3,1128691_20171218_S,"2,3-diacetamido-2,3-dideoxy-d-mannuronic acid",0.0,1128691_20171218_S,1128691_20171218_S,1128691,12/18/2017,OP,RAH,Chest Clinic 1,...,1,0,0,3.4,SAGCFN_22_01743,30.8,ATTACTCG,CAGGACGT,352,4.8
4,1128691_20180116_S,"2,3-diacetamido-2,3-dideoxy-d-mannuronic acid",14.478968,1128691_20180116_S,1128691_20180116_S,1128691,1/16/2018,OP,RAH,Chest Clinic 4,...,1,0,0,0.708,SAGCFN_22_01772,27.4,ATTCAGAA,GGCTCTGA,500,3.8


In [14]:
# Replace the space in the culture-status column name with an underscore so
# the column can be written as a plain name in a model formula.
merged_data.rename(
    columns={'CS_Pseudomonas aeruginosa': 'CS_Pseudomonas_aeruginosa'},
    inplace=True,
)

In [15]:
# Linear mixed model: abundance as a function of CS_Pseudomonas_aeruginosa and
# the feature label (`taxa_functions`) as fixed effects, with observations
# grouped by `pwCF_ID`.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so no
# fitted result is recorded. `taxa_functions` presumably expands into one
# indicator column per distinct feature, which would make the fit very
# expensive -- TODO confirm, and consider fitting per feature or on a subset.
model = smf.mixedlm(
    formula='abundance ~ CS_Pseudomonas_aeruginosa + taxa_functions',
    data=merged_data,
    groups=merged_data["pwCF_ID"],
)
result = model.fit()

# Show the fitted model's summary table.
print(result.summary())



KeyboardInterrupt

