# Principal Component Analysis of the Biolog Data
## Ancestral clones

After processing the data according to our normalization and filtering protocol, we perform dimensionality reduction using PCA.

In [1]:
#Libraries
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx
from scipy import interpolate
from matplotlib.backends.backend_pdf import PdfPages
from mpl_toolkits.mplot3d import Axes3D
import sys
from pylab import *
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
import os 

#Functions
def talus(singular):
    """Return the successive log-differences used for a talus plot.

    For consecutive entries ``s[i]`` and ``s[i+1]`` of ``singular`` this
    collects ``log(s[i]) - log(s[i+1])``. The last consecutive pair is left
    out, so an input of length ``n`` yields ``max(n - 2, 0)`` values.
    """
    # Pair every entry except the last two with its right-hand neighbour.
    return [np.log(upper) - np.log(lower)
            for upper, lower in zip(singular[:-2], singular[1:-1])]
        

First, we will look at the data of the clones derived from the ancestral population for both lineages.

In [15]:
# Read the collated measurements from disk.
name = "606_collated.csv"
df = pd.read_csv(name)

# Experimental factors kept as plain arrays for colouring and labelling plots
# (any other factor column could be pulled out the same way).
temperature, strain, treatment = (
    df[factor].values for factor in ("Condition", "Strain", "Treatment")
)

# Matrix handed to the PCA: every column from position 6 onward.
# NOTE(review): the table preview under this cell lists seven leading
# non-measurement columns (including 'Unnamed: 0'); confirm that position 6
# is the first measurement column for the CSV actually in use.
data = df.iloc[:, 6:]
X = data.values

Unnamed: 0,Replicate,Clone,Strain,Treatment,Condition,Evolved,Well2,Well3,Well4,Well5,...,37W83,37W84,37W85,37W86,37W87,37W88,37W89,37W90,37W91,37W92
0,X606P.15.1,606p,606,ancestor,15,ancestor,-0.12969,0.057385,0.009833,-0.003774,...,-0.066159,-0.002317,-0.160611,-0.184079,0.072298,-0.233313,0.102421,-0.108393,0.156698,0.095345
1,X606P.15.2,606p,606,ancestor,15,ancestor,-0.11769,0.050385,0.049833,0.005226,...,-0.094159,-0.047317,0.015389,-0.212079,-0.075702,-0.246313,0.093421,-0.041393,0.141698,0.098345
2,X606P.15.3,606p,606,ancestor,15,ancestor,-0.12769,0.055385,0.021833,-0.006774,...,0.006841,-0.026317,0.070389,-0.113079,-0.084702,-0.170313,0.132421,-0.048393,0.177698,0.147345
3,X606P2.15.1,606p,606,ancestor,15,ancestor,-0.16969,0.013385,-0.031167,-0.027774,...,0.003841,-0.059317,-0.312611,-0.252079,-0.164702,-0.141313,-0.040579,-0.131393,-0.026302,0.052345
4,X606P2.15.2,606p,606,ancestor,15,ancestor,-0.16569,-0.008615,-0.032167,-0.033774,...,-0.010159,0.041683,-0.100611,-0.259079,-0.105702,-0.154313,0.028421,-0.000393,-0.012302,0.124345


Given the talus plot, we can see that the variability of our data is mostly explained with 4 principal components.

In [16]:
%matplotlib notebook
#PCA
pca = PCA()
X_reduced = pca.fit_transform(scale(X,with_std=False))
#Variance explained
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance');
plt.grid();
plt.xticks(np.arange(0, 32, step=2));
df.head()

<IPython.core.display.Javascript object>

Unnamed: 0,Replicate,Clone,Strain,Treatment,Condition,Evolved,Well2,Well3,Well4,Well5,...,37W83,37W84,37W85,37W86,37W87,37W88,37W89,37W90,37W91,37W92
0,X606P.15.1,606p,606,ancestor,15,ancestor,-0.12969,0.057385,0.009833,-0.003774,...,-0.066159,-0.002317,-0.160611,-0.184079,0.072298,-0.233313,0.102421,-0.108393,0.156698,0.095345
1,X606P.15.2,606p,606,ancestor,15,ancestor,-0.11769,0.050385,0.049833,0.005226,...,-0.094159,-0.047317,0.015389,-0.212079,-0.075702,-0.246313,0.093421,-0.041393,0.141698,0.098345
2,X606P.15.3,606p,606,ancestor,15,ancestor,-0.12769,0.055385,0.021833,-0.006774,...,0.006841,-0.026317,0.070389,-0.113079,-0.084702,-0.170313,0.132421,-0.048393,0.177698,0.147345
3,X606P2.15.1,606p,606,ancestor,15,ancestor,-0.16969,0.013385,-0.031167,-0.027774,...,0.003841,-0.059317,-0.312611,-0.252079,-0.164702,-0.141313,-0.040579,-0.131393,-0.026302,0.052345
4,X606P2.15.2,606p,606,ancestor,15,ancestor,-0.16569,-0.008615,-0.032167,-0.033774,...,-0.010159,0.041683,-0.100611,-0.259079,-0.105702,-0.154313,0.028421,-0.000393,-0.012302,0.124345


In [17]:
%matplotlib notebook
#Talus Plot
tal=talus(pca.singular_values_)
plot(tal,'o-')
plt.xlabel('PC')
plt.ylabel(r"$\frac{\log\left[{\lambda\left(PC_i\right)}\right]}{\log\left[{\lambda\left(PC_{i+1}\right)}\right]}$");
plt.grid();
plt.xticks(np.arange(0, 32, step=2));
#plt.xticks(np.arange(0, 32, step=2));
plt.axvline(x=4, linestyle = '--', color = 'firebrick')


<IPython.core.display.Javascript object>

<matplotlib.lines.Line2D at 0x7ff3d41d8048>

We will first inspect the first three principal components.

In [20]:
%matplotlib notebook
fig = plt.figure(1, figsize=(8, 6))
ax = Axes3D(fig, elev=-150, azim=110)
colors1 = ['royalblue', 'sandybrown', 'firebrick','olive','black']


ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=treatment,
          cmap=plt.cm.Set1, edgecolor='k', s=40)
ax.set_title("First three PCA directions")
ax.set_xlabel("1st eigenvector")
ax.w_xaxis.set_ticklabels([])
ax.set_ylabel("2nd eigenvector")
ax.w_yaxis.set_ticklabels([])
ax.set_zlabel("3rd eigenvector")
ax.w_zaxis.set_ticklabels([])


<IPython.core.display.Javascript object>

ValueError: 'c' argument must either be valid as mpl color(s) or as numbers to be mapped to colors. Here c = ['ancestor' 'ancestor' 'ancestor' 'ancestor' 'ancestor' 'ancestor' 'slow'
 'slow' 'slow' 'fast' 'fast' 'fast' 'random' 'random' 'random' 'random'
 'random' 'random' 'slow' 'slow' 'slow' 'fast' 'fast' 'fast' 'slow' 'slow'
 'slow' 'slow' 'slow' 'slow' 'fast' 'fast' 'fast' 'random' 'random'
 'random' 'fast' 'fast' 'fast' 'random' 'random' 'random'].

The samples are separated well by both temperature and strain.

In [21]:
%matplotlib notebook
# Create 3 different clusters
kmeans = KMeans(n_clusters=6)
kmeans.fit(X_reduced)
# Get the cluster centroids
centers = kmeans.cluster_centers_

# 2D PCA
ig, ax = plt.subplots()
colors = ['royalblue', 'sandybrown', 'firebrick']
lw = 2
conditions=unique(temperature)


for color, i, conditions in zip(colors, [15, 37, 43], conditions):
    plt.scatter(X_reduced[temperature == i, 0], X_reduced[temperature == i, 1], color=color, alpha=.8, lw=lw,
                label=conditions)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.grid(linestyle='dotted')

# Overlay the centroids on the scatter plot
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=100,marker='x')
ax.set_xlabel('PC1',fontsize=12, fontweight='bold')
ax.set_ylabel('PC2',fontsize=12, fontweight='bold')
fontsize = 12
ax = gca()
for tick in ax.xaxis.get_major_ticks():
    tick.label1.set_fontsize(fontsize)
    tick.label1.set_fontweight('bold')
for tick in ax.yaxis.get_major_ticks():
     tick.label1.set_fontsize(fontsize)
     tick.label1.set_fontweight('bold')


labels=strain
for label, x, y in zip(labels, X_reduced[:, 0], X_reduced[:, 1]):
	plt.annotate(
		label,
		xy=(x, y),
        textcoords='data',fontsize=7)


<IPython.core.display.Javascript object>

PC1 separates the three temperatures, while PC2 separates the perturbed temperatures from the optimal one.

In [22]:
# Pair plot of the first 12 principal components, coloured by treatment.
components = pd.DataFrame(X_reduced[:, 0:12])
variables = df.iloc[:, 0:6]
variables = variables.reset_index()

# Put the leading df columns next to the component scores, turn the four
# factor columns into categoricals, and drop the helper 'index' column that
# reset_index() added.
factor_columns = ['Condition', 'Strain', 'Treatment', 'Evolved']
projected = (
    pd.concat([variables, components], axis=1)
    .astype({column: 'category' for column in factor_columns})
    .drop(columns=['index'])
)

g = sns.pairplot(projected, hue="Treatment", diag_kind='hist')

# Save the current figure into a PDF; the file is closed on leaving the block.
with PdfPages('lolg.pdf') as pp:
    plt.savefig(pp, format='pdf')

<IPython.core.display.Javascript object>

In [8]:
%matplotlib notebook
sns.swarmplot(x='Condition',y='Lithium Chloride',data=df)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f4a7831f8d0>

In [10]:
%matplotlib notebook
# Create 3 different clusters
kmeans = KMeans(n_clusters=6)
kmeans.fit(X_reduced)
# Get the cluster centroids
centers = kmeans.cluster_centers_

# 2D PCA
ig, ax = plt.subplots()
colors = ['royalblue', 'sandybrown', 'firebrick']
lw = 2
conditions=unique(temperature)


for color, i, conditions in zip(colors, [15, 37, 43], conditions):
    plt.scatter(X_reduced[temperature == i, 1], X_reduced[temperature == i, 2], color=color, alpha=.8, lw=lw,
                label=conditions)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.grid(linestyle='dotted')


ax.set_xlabel('PC2',fontsize=12, fontweight='bold')
ax.set_ylabel('PC3',fontsize=12, fontweight='bold')
fontsize = 12
ax = gca()
for tick in ax.xaxis.get_major_ticks():
    tick.label1.set_fontsize(fontsize)
    tick.label1.set_fontweight('bold')
for tick in ax.yaxis.get_major_ticks():tric = 'cosine'
     tick.label1.set_fontsize(fontsize)
     tick.label1.set_fontweight('bold')


labels=strain
for label, x, y in zip(labels, X_reduced[:, 1], X_reduced[:, 2]):
	plt.annotate(
		label,
		xy=(x, y),
        textcoords='data',fontsize=7)

IndentationError: unexpected indent (<ipython-input-10-b7a261e16116>, line 30)

In [11]:
# Wells with the largest absolute loading on component row 1 (the second PC),
# clustered and annotated by temperature.
loadings = np.abs(pca.components_)
n_wells = len(loadings[1])
# Position in the sorted order where the top 20% of wells starts
percent = int(np.floor(0.8 * n_wells))
order = np.argsort(loadings[1])
# argsort is ascending, so the tail of `order` holds the strongest wells
pc1 = data.iloc[:, order[percent:]]

# One colour per temperature level, used for the row annotation strip
colors1 = ['royalblue',  'firebrick', 'sandybrown','olive']
y1 = pd.Series(temperature, dtype="category")
lut = {level: shade for level, shade in zip(y1.unique(), colors1)}
colors = y1.map(lut)

# Clustered heatmap of the selected wells, z-scored along axis 0
sns.set(font_scale=0.5)
sns.clustermap(pc1, cmap="RdYlGn", row_colors=colors.values, z_score=0)

# Save the current figure into a PDF; the file is closed on leaving the block.
with PdfPages('pc2_clust.pdf') as pp:
    plt.savefig(pp, format='pdf')

<IPython.core.display.Javascript object>

In [20]:
%matplotlib notebook
sns.swarmplot(x='Condition',y='L-Alanine',data=df)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f4a783a6860>

The fourth PC separates the strains.

In [12]:
%matplotlib notebook
# 2D PCA
ig, ax = plt.subplots()
colors = ['royalblue', 'sandybrown', 'firebrick']
lw = 2
conditions=unique(strain)


for color, i, conditions in zip(colors, [606,607], conditions):
    plt.scatter(X_reduced[strain == i, 0], X_reduced[strain == i, 1], color=color, alpha=.8, lw=lw,
                label=conditions)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.grid(linestyle='dotted')


ax.set_xlabel('PC4',fontsize=12, fontweight='bold')
ax.set_ylabel('PC1',fontsize=12, fontweight='bold')
fontsize = 12
ax = gca()
for tick in ax.xaxis.get_major_ticks():
    tick.label1.set_fontsize(fontsize)
    tick.label1.set_fontweight('bold')
for tick in ax.yaxis.get_major_ticks():
     tick.label1.set_fontsize(fontsize)
     tick.label1.set_fontweight('bold')
#Save


<IPython.core.display.Javascript object>

The talus plot indicates that all the signal must be around 5 principal components.

In [14]:
%matplotlib notebook
#Inspect each PC and look at the top wells, cluster them and inspect treatment
loadings = np.absolute(pca.components_)
percent=int(floor(0.8*len(loadings[4])))
order=np.argsort(loadings[4])
pc1=data.iloc[:,order[percent:len(loadings[4])]]
#Color by factor
colors1 = ['royalblue', 'sandybrown', 'firebrick','olive']
y1= pd.Series(strain, dtype="category")
lut = dict(zip(y1.unique(), colors1))
colors=y1.map(lut)
#We can see how these variables effectively cluster our data
sns.set(font_scale=0.5) 
sns.clustermap(pc1,cmap="RdYlGn",row_colors=colors.values,z_score=0)  


pp = PdfPages('pc4_clust.pdf')
plt.savefig(pp,format='pdf')
pp.close()

         

<IPython.core.display.Javascript object>

We can see that the third principal component separates the strains.

In [36]:
# Write rows 0-3 of `loadings` (the first four PCs) to CSV, one file each.
wells = list(data)

# One two-column frame (well name, loading) per component row
pc1_load, pc2_load, pc3_load, pc4_load = (
    pd.DataFrame({'wells': wells, 'loadings': loadings[row]}) for row in range(4)
)

for frame, target in zip(
    (pc1_load, pc2_load, pc3_load, pc4_load),
    ('pc1_loadings.csv', 'pc2_loadings.csv', 'pc3_loadings.csv', 'pc4_loadings.csv'),
):
    frame.to_csv(target)

Look at particular wells

In [22]:
%matplotlib notebook
sns.swarmplot(x='Strain',y='pH6',data=df)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7fb2a09aa978>

In [37]:
%%capture
#Plot all components
components=pd.DataFrame(X_reduced[:,0:12])
variables=df.iloc[:,0:4]
variables=variables.reset_index()
projected['Condition'] = projected['Condition'].astype('category')
projected['Strain'] = projected['Strain'].astype('category')
projected=projected.drop(columns=['index'])

g = sns.pairplot(projected, hue="Condition")

pp = PdfPages('pc_temperature.pdf')
plt.savefig(pp,format='pdf')
pp.close()

components=pd.DataFrame(X_reduced[:,0:12])
variables=df.iloc[:,0:4]
variables=variables.reset_index()
projected = pd.concat([variables, components], axis=1)
projected['Condition'] = projected['Condition'].astype('category')
projected['Strain'] = projected['Strain'].astype('category')
projected=projected.drop(columns=['index'])

g = sns.pairplot(projected, hue="Strain")


array([[0.06310827, 0.03083421, 0.05401369, ..., 0.02035889, 0.06877486,
        0.10399594],
       [0.16359619, 0.03029754, 0.05972509, ..., 0.04918483, 0.30018165,
        0.26292903],
       [0.04352521, 0.05235326, 0.0452744 , ..., 0.22073253, 0.03385885,
        0.14680419],
       ...,
       [0.14553997, 0.07087434, 0.01905164, ..., 0.0714289 , 0.01368785,
        0.05671379],
       [0.05952424, 0.0496513 , 0.19928638, ..., 0.02675414, 0.07089794,
        0.07012135],
       [0.33315007, 0.02573336, 0.01126048, ..., 0.12716133, 0.00170717,
        0.05240905]])