# MGI Subsystem Counts

Explore some data, MGI style

In [1]:
import os
import sys

import re
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
import matplotlib.colors as mcolors
import matplotlib.dates as mdates
from matplotlib.colors import ListedColormap
import pandas as pd
import seaborn as sns
import json

from itertools import cycle

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.inspection import permutation_importance

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error

from scipy.stats import linregress


# sklearn's StandardScaler emits a FutureWarning that clutters the cell output.
# Silence every FutureWarning for the rest of the session.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

try:
  import google.colab
  IN_COLAB = True
  !pip install adjustText
  from google.colab import drive
  drive.mount('/content/drive')
  datadir = '/content/drive/MyDrive/Projects/CF/Adelaide/CF_Data_Analysis'
except ImportError:
  IN_COLAB = False
  datadir = '..'

from adjustText import adjust_text

In [2]:
import cf_analysis_lib

In [3]:
# Sequencing type to explore; it is also a directory component of the data path below.
sequence_type = "MGI"

# Subsystem table to load (alternative file: 'level2_norm_ss.tsv.gz')
sslevel = 'subsystems_norm_ss.tsv.gz'

# <datadir>/<sequence_type>/FunctionalAnalysis/subsystems/<sslevel>
subsystems_file = os.path.join(
    datadir,
    sequence_type,
    "FunctionalAnalysis",
    "subsystems",
    sslevel,
)
df = cf_analysis_lib.read_subsystems(subsystems_file)
df.head()

In [4]:
# Read the metadata for this sequencing type with the project helper, then preview the first rows.
metadata = cf_analysis_lib.read_metadata(datadir, sequence_type)
metadata.head()

In [5]:
# Swap rows and columns of df. The cells below parse dft's index as sample names
# and use dft's columns as the features of the PCA.
dft = df.transpose()
dft.head()

### Make a simple data frame with patient and date as separate columns so we can use them as labels

In [6]:
# Index entries are matched as <Patient>_<Date>_<rest>; the two named groups
# become the 'Patient' and 'Date' columns of the extracted frame.
pattern = r'(?P<Patient>\d+)_(?P<Date>\d+)_(\S+)'
cols = ['Patient', 'Date']
tmp = dft.index.str.extract(pattern)

# Keep only the named groups, as a fresh frame with a default positional index
labels = pd.DataFrame(tmp[cols].values, columns=cols)
labels.head(2)

## Run a PCA

We use a standard PCA, and then add the labels from the previous df

In [7]:
# Standardise each column of dft (zero mean, unit variance) before the PCA.
scaler = StandardScaler()
dft_scaled = scaler.fit_transform(dft)

# Two-component PCA. random_state is fixed so that the result is reproducible
# if sklearn picks a randomised SVD solver for this matrix; it is ignored by
# the deterministic solvers.
pca = PCA(n_components=2, random_state=42)
pca.fit(dft_scaled)
dft_pca = pca.transform(dft_scaled)

# One row per row of dft: its two PC scores plus the original index entry as 'Label'
pct_df = pd.DataFrame({
    'PC1': dft_pca[:, 0],
    'PC2': dft_pca[:, 1],
    'Label': list(dft.index),
})
# labels has a default positional index in the same row order as dft,
# so a column-wise concat lines Patient/Date up with the scores.
pct_df = pd.concat([pct_df, labels], axis=1)

# Percentage of the total variance explained by each component (used for the axis labels)
explained_variance = pca.explained_variance_ratio_ * 100
pc1_variance = explained_variance[0]
pc2_variance = explained_variance[1]

pct_df.head(3)

## Calculate the PCA loadings

In [8]:
# Loadings table: one row per column of dft, one column per principal component.
loadings = pd.DataFrame(
    data=pca.components_.T,
    index=dft.columns,
    columns=['PC1', 'PC2'],
)
# Ten rows with the largest (signed) PC1 loading; PC2 is only used to break ties.
loadings.nlargest(n=10, columns=['PC1', 'PC2'])

In [9]:
# For each component, the index labels of the five loadings with the largest magnitude
n_top = 5
top_loadings_pc1, top_loadings_pc2 = [
    loadings[component].abs().nlargest(n_top).index
    for component in ('PC1', 'PC2')
]
# Combine both sets; a label that is in both lists appears only once
top_loadings = top_loadings_pc1.union(top_loadings_pc2)
loadings.loc[top_loadings]

In [10]:
# Biplot: samples coloured by patient, with the top loadings overlaid as labelled arrows.
fig, ax = plt.subplots(figsize=(9,9))
sns.scatterplot(x="PC1", y="PC2", hue='Patient', legend=False, data=pct_df, ax=ax)
ax.set_xlabel(f"PC1 ({pc1_variance:.2f}%)")
ax.set_ylabel(f"PC2 ({pc2_variance:.2f}%)")

ldf = loadings.loc[top_loadings]

# Loadings are rescaled so the longest arrow reaches half of the largest score on
# each axis. The loading scale must use the largest *magnitude*: the top loadings
# were chosen by absolute value, so the signed maximum can be negative (which would
# flip every arrow) or tiny (which would blow the arrows up).
mlp1 = ldf.PC1.abs().max()
mlp2 = ldf.PC2.abs().max()
mpp1 = pct_df.PC1.max()
mpp2 = pct_df.PC2.max()

# One Tableau colour per arrow, reused cyclically if there are more arrows than colours
colour_cycle = cycle(mcolors.TABLEAU_COLORS)

texts = []
for feature, loading in ldf.iterrows():
    colour = next(colour_cycle)
    ex = (loading.PC1 * (mpp1 / mlp1)) / 2
    ey = (loading.PC2 * (mpp2 / mlp2)) / 2
    texts.append(ax.text(ex, ey, feature, color=colour, fontsize=9))
    ax.plot([0, ex], [0, ey], color=colour, lw=1)

# Nudge the text labels apart so that they do not overlap
adjust_text(texts, ax=ax)
fig.tight_layout()

plt.show()

## Outlier alert!

There is one pwCF who is an outlier in this data: their PC1 is > 40 for all three of their points.

In [11]:
# Rows of the PCA table that sit far out along PC1
far_out_on_pc1 = pct_df['PC1'] > 25
pct_df[far_out_on_pc1]

In [12]:
# Every column of df whose name starts with the outlier's ID, with the rows
# ordered by the '1128691_20171206_S' column, highest first.
outlier_samples = [col for col in df.columns if col.startswith('1128691_')]
df[outlier_samples].sort_values(by='1128691_20171206_S', ascending=False)

**Read the taxa data**

Here we read the taxonomy data so we can look at these samples

In [14]:
taxa = "genus"
# Fall back to the parent directory when datadir is empty
datadir = datadir or '..'
# Transposed on load: the cells below select rows by sample name and columns by genus
taxdf = cf_analysis_lib.read_taxonomy(datadir, sequence_type, taxa).T

Here's the abundance of Pseudomonas and Streptococcus in that pwCF

In [16]:
# Show floats with thousands separators and two decimals (global pandas display setting)
pd.set_option('display.float_format', '{:,.2f}'.format)

# Rows of taxdf that belong to the outlier pwCF
patient_prefix = '1128691_'
wants = [sample for sample in taxdf.index if sample.startswith(patient_prefix)]

genera = ['Pseudomonas', 'Streptococcus']
taxdf.loc[wants, genera]

In [22]:
# Restrict to the outlier's samples and keep only the columns whose total
# across those samples exceeds 2000.
patient_taxa = taxdf.loc[wants, :]
intpwcf = patient_taxa.loc[:, patient_taxa.sum() > 2000]

# Order the columns from the largest to the smallest total, once, for both layers
by_total = intpwcf.sum().sort_values(ascending=False).index
ordered = intpwcf.loc[:, by_total]

# Box plot with the individual samples drawn on top as black points
ax = sns.boxplot(data=ordered)
sns.stripplot(data=ordered, color='k', ax=ax)
plt.xticks(rotation=90)
ax.set_title("Abundance of different organisms in pwCF 1128691")
plt.show()