# Principal Component Analysis of the Biolog Data
## Ancestral clones

After processing the data according to our normalization and filtering protocol, we perform dimensionality reduction using PCA.

In [1]:
#Libraries
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cmx
from scipy import interpolate
from matplotlib.backends.backend_pdf import PdfPages
from mpl_toolkits.mplot3d import Axes3D
import sys
from pylab import *
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
import os 
import dplython

#Functions
def talus(singular):
    """Return successive log-differences of a sequence of singular values.

    For every index ``i`` in ``range(len(singular) - 2)`` the entry is
    ``log(singular[i]) - log(singular[i + 1])``, i.e. the log of the ratio of
    two neighbouring values.

    Parameters
    ----------
    singular : sequence of positive numbers (e.g. ``pca.singular_values_``).

    Returns
    -------
    list
        ``len(singular) - 2`` values (empty when fewer than three inputs are
        given). The last neighbouring pair is not evaluated.
        NOTE(review): presumably because the final singular value of centred
        data is close to zero - confirm.
    """
    # np.log is used explicitly instead of the bare `log` that was only
    # available through `from pylab import *`.
    return [np.log(singular[i]) - np.log(singular[i + 1])
            for i in range(len(singular) - 2)]
        

First, we will look at the data of the clones derived from the ancestral population for both lineages.

In [2]:
#Load the data
#Two normalized Biolog tables, one per input file (606 and 607 in the file names)
name="normalizedAncestor606Wells86&94RemovedBiologData.csv"
name2="normalizedAncestor607Wells86&94RemovedBiologData.csv"
df2=pd.read_csv(name2)
df=pd.read_csv(name)
#Stack both tables row-wise; the row labels of each file are kept, so index values can repeat
df=pd.concat([df,df2])
temperature = df.Condition.values #Or any other factor
strain = df.Strain.values
#Annotation table; the file name really contains a space before ".csv"
biolog=pd.read_csv("gen_biolog .csv")
#Nested dict {column name -> {Well -> value}} built from the annotation table
dictbio=biolog.set_index('Well').to_dict()
#Rename every column of df through the Well -> Assay mapping
#NOTE(review): a column label that is absent from the 'Well' index becomes NaN here;
#presumably the annotation file also lists the metadata columns - confirm
df.columns=df.columns.to_series().map(dictbio['Assay'])
#for PCA
#assumes the first four columns are sample metadata and the remaining ones are well measurements - TODO confirm
data=df.iloc[:,4:]
X = df.iloc[:,4:].values

Given the talus plot, we can see that the variability of our data is mostly explained with 4 principal components.

In [3]:
%matplotlib notebook
#PCA on the mean-centred data (with_std=False: columns are not rescaled to unit variance)
pca = PCA()
X_reduced = pca.fit_transform(scale(X, with_std=False))
#Variance explained
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.plot(cumulative_variance)
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.grid()
plt.xticks(np.arange(0, 32, step=2))
#Percentage of variance carried by each of the first five PCs, rounded to one decimal;
#these values are reused in the axis labels of the later figures
PC1_v, PC2_v, PC3_v, PC4_v, PC5_v = [
    round(100 * pca.explained_variance_ratio_[k], 1) for k in range(5)
]

<IPython.core.display.Javascript object>

In [4]:
%matplotlib notebook
#Talus Plot
tal = talus(pca.singular_values_)
#x positions follow the length of `tal` instead of a hard-coded 1..32,
#so the cell keeps working when the number of components changes
plt.plot(np.arange(1, len(tal) + 1), tal, 'o-')
plt.xlabel('PC')
#talus() returns log(s_i) - log(s_{i+1}), i.e. the log of a ratio (not a ratio of logs),
#so the label is written accordingly
plt.ylabel(r"$\log\left[\frac{\lambda\left(PC_i\right)}{\lambda\left(PC_{i+1}\right)}\right]$")
plt.grid()
plt.xticks(np.arange(0, 32, step=2))
#Dashed marker at the fourth component
plt.axvline(x=4, linestyle='--', color='firebrick')

#Save; the context manager closes the PDF even if savefig raises
with PdfPages('Talus_ancestral.pdf') as pp:
    plt.savefig(pp, format='pdf')

<IPython.core.display.Javascript object>

We will first inspect the first three principal components.

In [7]:
%matplotlib notebook
#3D scatter of the samples on the first three principal components
fig = plt.figure(1, figsize=(8, 6))
#Create the 3D axes through the figure so they are attached to it: recent Matplotlib
#no longer adds an Axes3D(fig) instance to the figure automatically
ax = fig.add_subplot(111, projection='3d')
ax.view_init(elev=-150, azim=110)
#Color by factor
colors1 = ['royalblue','sandybrown', 'firebrick']
y1 = pd.Series(temperature, dtype="category")
#Levels are paired with colours in order of first appearance in the data
lut = dict(zip(y1.unique(), colors1))
colors = y1.map(lut)

#No cmap here: a colormap is ignored when explicit colour names are passed through `c`
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=colors,
           edgecolor='k', s=40)

#Axis titles carry the percentage of variance of each PC; tick labels are hidden.
#ax.xaxis / ax.yaxis / ax.zaxis replace the deprecated w_xaxis / w_yaxis / w_zaxis aliases
ax.set_xlabel("PC1 ({})".format(PC1_v))
ax.xaxis.set_ticklabels([])
ax.set_ylabel("PC2 ({})".format(PC2_v))
ax.yaxis.set_ticklabels([])
ax.set_zlabel("PC3 ({})".format(PC3_v))
ax.zaxis.set_ticklabels([])


<IPython.core.display.Javascript object>

[]

The samples are separated well by both temperature and strain.

In [8]:
%matplotlib notebook
# Group the samples into 6 k-means clusters in PCA space
# (fixed seed so the centroids are the same on every run)
kmeans = KMeans(n_clusters=6, random_state=0)
kmeans.fit(X_reduced)
# Get the cluster centroids
centers = kmeans.cluster_centers_

# 2D PCA
fig, ax = plt.subplots()
colors = ['royalblue', 'sandybrown', 'firebrick']
lw = 2
conditions = np.unique(temperature)

# One scatter series per temperature. The loop variable is named `condition`
# so that it no longer overwrites the `conditions` array being iterated.
for color, temp, condition in zip(colors, [15, 37, 43], conditions):
    mask = temperature == temp
    ax.scatter(X_reduced[mask, 0], X_reduced[mask, 1], color=color, alpha=.8, lw=lw,
               label=condition)
ax.legend(loc='best', shadow=False, scatterpoints=1)
ax.grid(linestyle='dotted')

# Overlay the centroids on the scatter plot (first two coordinates only)
ax.scatter(centers[:, 0], centers[:, 1], c='black', s=100, marker='x')
ax.set_xlabel("PC1 ({})".format(PC1_v), fontsize=12, fontweight='bold')
ax.set_ylabel("PC2 ({})".format(PC2_v), fontsize=12, fontweight='bold')
fontsize = 12
# Bold tick labels on both axes
for axis in (ax.xaxis, ax.yaxis):
    for tick in axis.get_major_ticks():
        tick.label1.set_fontsize(fontsize)
        tick.label1.set_fontweight('bold')

# Write the strain next to every sample
labels = strain
for label, x, y in zip(labels, X_reduced[:, 0], X_reduced[:, 1]):
    ax.annotate(
        label,
        xy=(x, y),
        textcoords='data', fontsize=7)

#Save; the context manager closes the PDF even if savefig raises
with PdfPages('Ancestral_2D.pdf') as pp:
    fig.savefig(pp, format='pdf')

<IPython.core.display.Javascript object>

PC1 separates the three temperatures, PC2 separates the perturbed temperatures from the optimal.

KeyError: 'Wells'

KeyError: 'Well'

In [78]:
# Read the table of selected wells (its 'Well' column is used below to pick columns of `data`)
sig = pd.read_csv('strain_ancestral_wells.csv')
# Keep only the measurement columns listed in sig['Well']; these values must match the column labels of `data`
pc1 = data[sig['Well'].values]
#Color by factor
# Row colours: one colour per strain level, paired in order of first appearance
colors1 = ['royalblue', 'sandybrown', 'firebrick','olive']
y1= pd.Series(strain, dtype="category")
lut = dict(zip(y1.unique(), colors1))
colors=y1.map(lut)
# Color by well type
# NOTE(review): wells_f is built here but not read again in the visible cells - confirm it is still needed
wells_f = pd.read_csv('gen_biolog .csv')
# Drop the first six rows of the annotation table - presumably non-well entries; TODO confirm
biolog2 =  biolog.iloc[6:,:]
# NOTE: this rebinds `dictbio`; it is now indexed by 'Assay' instead of 'Well'
dictbio=biolog2.set_index('Assay').to_dict()
wells_f['Type'] = wells_f['Assay'].map(dictbio['Other'])
# Column colours: one colour per well type taken from the 'Other' annotation column
colors2 = ['lightskyblue','seagreen']
sig['Type'] = sig['Well'].map(dictbio['Other'])
other = sig['Type'].values
y2= pd.Series(other, dtype="category")
lut2 = dict(zip(y2.unique(), colors2))
# NOTE: `colors2` changes from a list of colour names to a Series of per-column colours
colors2=y2.map(lut2)

In [81]:
# Clustered heat map of the selected wells: row colours come from `colors`,
# column colours from `colors2`, and z_score=1 standardises each column.
sns.set(font_scale=0.5)
sns.clustermap(
    pc1,
    cmap="RdYlGn",
    row_colors=colors.values,
    col_colors=colors2.values,
    z_score=1,
)
# Write the current figure to a single-page PDF
with PdfPages('strain_effect_ancestral.pdf') as pp:
    plt.savefig(pp, format='pdf')

<IPython.core.display.Javascript object>

In [28]:
# Row colours: one colour per strain level, paired in order of first appearance
colors1 = ['royalblue', 'sandybrown', 'firebrick', 'olive']
y1 = pd.Series(strain, dtype="category")
lut = {level: shade for level, shade in zip(y1.unique(), colors1)}
colors = y1.map(lut)
# Clustered heat map of `pc1` with per-column z-scores (z_score=1);
# `pc1` and `colors2` must already exist from an earlier cell
sns.set(font_scale=0.5)
sns.clustermap(
    pc1,
    cmap="RdYlGn",
    row_colors=colors.values,
    col_colors=colors2.values,
    z_score=1,
)
# Write the current figure to a single-page PDF
with PdfPages('sig_all_clust.pdf') as pp:
    plt.savefig(pp, format='pdf')

<IPython.core.display.Javascript object>

IndexError: index 8 is out of bounds for axis 1 with size 8

In [7]:
%matplotlib notebook
# Wells with the largest absolute loadings on the first PC, clustered and
# annotated by temperature
loadings = np.absolute(pca.components_)
first_pc = loadings[0]
# Position in the ascending ranking where the top 20 % of the loadings starts
percent = int(np.floor(0.8 * len(first_pc)))
# argsort is ascending, so the strongest wells sit at the end of `order`
order = np.argsort(first_pc)
pc1 = data.iloc[:, order[percent:]]
# Row colours: one colour per temperature level, paired in order of first appearance
colors1 = ['royalblue', 'firebrick', 'sandybrown', 'olive']
y1 = pd.Series(temperature, dtype="category")
lut = {level: shade for level, shade in zip(y1.unique(), colors1)}
colors = y1.map(lut)
# Clustered heat map with per-row z-scores (z_score=0)
sns.set(font_scale=0.5)
sns.clustermap(pc1, cmap="RdYlGn", row_colors=colors.values, z_score=0)
# Write the current figure to a single-page PDF
with PdfPages('pc1_clust.pdf') as pp:
    plt.savefig(pp, format='pdf')

<IPython.core.display.Javascript object>

In [8]:
%matplotlib notebook
# Distribution of the 'Lithium Chloride' well at each condition
sns.swarmplot(data=df, x='Condition', y='Lithium Chloride')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f4a7831f8d0>

In [23]:
%matplotlib notebook
# Group the samples into 6 k-means clusters in PCA space
# (fixed seed so the centroids are the same on every run)
kmeans = KMeans(n_clusters=6, random_state=0)
kmeans.fit(X_reduced)
# Get the cluster centroids (they are not drawn on this figure)
centers = kmeans.cluster_centers_

# 2D PCA: second against third principal component
fig, ax = plt.subplots()
colors = ['royalblue', 'sandybrown', 'firebrick']
lw = 2
conditions = np.unique(temperature)

# One scatter series per temperature. The loop variable is named `condition`
# so that it no longer overwrites the `conditions` array being iterated.
for color, temp, condition in zip(colors, [15, 37, 43], conditions):
    mask = temperature == temp
    ax.scatter(X_reduced[mask, 1], X_reduced[mask, 2], color=color, alpha=.8, lw=lw,
               label=condition)
ax.legend(loc='best', shadow=False, scatterpoints=1)
ax.grid(linestyle='dotted')

ax.set_xlabel("PC2 ({})".format(PC2_v), fontsize=12, fontweight='bold')
ax.set_ylabel("PC3 ({})".format(PC3_v), fontsize=12, fontweight='bold')
fontsize = 12


<IPython.core.display.Javascript object>

In [11]:
# Wells with the largest absolute loadings on the second PC, clustered and
# annotated by temperature
loadings = np.absolute(pca.components_)
second_pc = loadings[1]
# Position in the ascending ranking where the top 20 % of the loadings starts
percent = int(np.floor(0.8 * len(second_pc)))
# argsort is ascending, so the strongest wells sit at the end of `order`
order = np.argsort(second_pc)
pc1 = data.iloc[:, order[percent:]]
# Row colours: one colour per temperature level, paired in order of first appearance
colors1 = ['royalblue', 'firebrick', 'sandybrown', 'olive']
y1 = pd.Series(temperature, dtype="category")
lut = {level: shade for level, shade in zip(y1.unique(), colors1)}
colors = y1.map(lut)
# Clustered heat map with per-row z-scores (z_score=0)
sns.set(font_scale=0.5)
sns.clustermap(pc1, cmap="RdYlGn", row_colors=colors.values, z_score=0)
#Save
with PdfPages('pc2_clust.pdf') as pp:
    plt.savefig(pp, format='pdf')

<IPython.core.display.Javascript object>

In [20]:
%matplotlib notebook
# Distribution of the 'L-Alanine' well at each condition
sns.swarmplot(data=df, x='Condition', y='L-Alanine')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f4a783a6860>

The fourth PC separates the strains.

In [25]:
%matplotlib notebook
# 2D PCA: fourth against first principal component, coloured by strain
fig, ax = plt.subplots()
colors = ['royalblue', 'sandybrown', 'firebrick']
lw = 2
conditions = np.unique(strain)

# One scatter series per strain; zip stops after the two strains, so the third
# colour is unused. The loop variable is named `condition` so that it no longer
# overwrites the `conditions` array being iterated.
for color, strain_id, condition in zip(colors, [606, 607], conditions):
    mask = strain == strain_id
    ax.scatter(X_reduced[mask, 3], X_reduced[mask, 0], color=color, alpha=.8, lw=lw,
               label=condition)
ax.legend(loc='best', shadow=False, scatterpoints=1)
ax.grid(linestyle='dotted')

ax.set_xlabel("PC4 ({})".format(PC4_v), fontsize=12, fontweight='bold')
ax.set_ylabel("PC1 ({})".format(PC1_v), fontsize=12, fontweight='bold')
fontsize = 12
# Bold tick labels on both axes
for axis in (ax.xaxis, ax.yaxis):
    for tick in axis.get_major_ticks():
        tick.label1.set_fontsize(fontsize)
        tick.label1.set_fontweight('bold')
#Save; the context manager closes the PDF even if savefig raises
with PdfPages('pc4_strain.pdf') as pp:
    fig.savefig(pp, format='pdf')
# The stray `tric = 'cosine'` assignment that followed was an unused, truncated
# fragment and has been dropped.

<IPython.core.display.Javascript object>

The talus plot indicates that essentially all of the signal is captured by about the first five principal components.

In [14]:
%matplotlib notebook
#Inspect each PC and look at the top wells, cluster them and inspect treatment
loadings = np.absolute(pca.components_)
# Rows of `loadings` are zero-based, so the fourth PC is row 3. The previous
# index (4) selected the fifth PC although the figure is saved as
# 'pc4_clust.pdf' and the sibling cells use row 0 for pc1 and row 1 for pc2.
pc_row = 3
# Position in the ascending ranking where the top 20 % of the loadings starts
percent = int(np.floor(0.8 * len(loadings[pc_row])))
# argsort is ascending, so the strongest wells sit at the end of `order`
order = np.argsort(loadings[pc_row])
pc1 = data.iloc[:, order[percent:]]
#Color by factor
colors1 = ['royalblue', 'sandybrown', 'firebrick', 'olive']
y1 = pd.Series(strain, dtype="category")
lut = dict(zip(y1.unique(), colors1))
colors = y1.map(lut)
#We can see how these variables effectively cluster our data
sns.set(font_scale=0.5)
sns.clustermap(pc1, cmap="RdYlGn", row_colors=colors.values, z_score=0)

#Save; the context manager closes the PDF even if savefig raises
with PdfPages('pc4_clust.pdf') as pp:
    plt.savefig(pp, format='pdf')

         

<IPython.core.display.Javascript object>

We can see that the fourth principal component separates the strains.

In [32]:
# Export the absolute loadings of the first four PCs, one CSV file per component
loadings = np.absolute(pca.components_)
wells = list(data)
# One table per PC: well label next to its absolute loading
pc1_load, pc2_load, pc3_load, pc4_load = [
    pd.DataFrame({'wells': wells, 'loadings': loadings[row]}) for row in range(4)
]
pc1_load.to_csv('pc1_loadings_ances.csv')
pc2_load.to_csv('pc2_loadings_ances.csv')
pc3_load.to_csv('pc3_loadings_ances.csv')
pc4_load.to_csv('pc4_loadings_ances.csv')

Look at particular wells

In [46]:
%matplotlib notebook
# Distribution of the 'pH5' well at each condition, split by strain
sns.swarmplot(data=df, x='Condition', y='pH5', hue='Strain')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f936c4d79b0>

In [37]:
%%capture
#Plot all components
# Scores on the first 12 PCs, side by side with the first four columns of df
components = pd.DataFrame(X_reduced[:, 0:12])
variables = df.iloc[:, 0:4]
# reset_index gives the rows a fresh 0..n-1 index so they line up with `components`
variables = variables.reset_index()
# `projected` has to be built before it is used: this concat was missing in
# front of the first pair plot, so the cell raised NameError on a fresh kernel.
projected = pd.concat([variables, components], axis=1)
projected['Condition'] = projected['Condition'].astype('category')
projected['Strain'] = projected['Strain'].astype('category')
# Drop the old row labels that reset_index turned into an 'index' column
projected = projected.drop(columns=['index'])

# Pairwise scatter plots coloured by condition, written to PDF
g = sns.pairplot(projected, hue="Condition")

with PdfPages('pc_temperature.pdf') as pp:
    plt.savefig(pp, format='pdf')

# Same table coloured by strain (this figure is not written to disk)
g = sns.pairplot(projected, hue="Strain")


array([[0.06310827, 0.03083421, 0.05401369, ..., 0.02035889, 0.06877486,
        0.10399594],
       [0.16359619, 0.03029754, 0.05972509, ..., 0.04918483, 0.30018165,
        0.26292903],
       [0.04352521, 0.05235326, 0.0452744 , ..., 0.22073253, 0.03385885,
        0.14680419],
       ...,
       [0.14553997, 0.07087434, 0.01905164, ..., 0.0714289 , 0.01368785,
        0.05671379],
       [0.05952424, 0.0496513 , 0.19928638, ..., 0.02675414, 0.07089794,
        0.07012135],
       [0.33315007, 0.02573336, 0.01126048, ..., 0.12716133, 0.00170717,
        0.05240905]])