## Necessary Imports

In [1]:
%matplotlib notebook
import pandas as pd, numpy as np, random
import matplotlib.pyplot as plt, matplotlib.cm as cm
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing
from sklearn.cluster import KMeans
import seaborn as sns

## Reading Data

We have data for 6 different species. Here we read in and analyze just one of them.

In [2]:
# Location of the gene-level summary for the one species analysed here.
summary_path = 'data/gene_level/DC10_GL_clust-smry.csv'

# The four time-point expression columns, plus the q value used for filtering.
tpm_columns = ['sum_meanTPM_Ti', 'sum_meanTPM_Tii', 'sum_meanTPM_Tiii', 'sum_meanTPM_Tiv']
qval_column = 'TC_qval'

# Read the csv, keep only the relevant columns, turn infinite values into
# missing ones, drop every row with a missing value, and convert to doubles.
df = (
    pd.read_csv(summary_path)[tpm_columns + [qval_column]]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
    .astype('double')
)

# Keep only rows with TC_qval below 0.1. These are 'Timecourse Corrected P-Values'
# that are based on variance: the lower the q value, the tighter the spread.
# Once the rows are filtered we are done with the q values, so the column is removed.
df = df.loc[df[qval_column] < 0.1].drop(columns=[qval_column])

## Raw Data Plot

Each gene has expression levels given at 4 different time points. We take each one of those time points and plot them. Since we can only plot in 3 dimensions, the last time point is represented as color. 

In [3]:
# Most of the data points are small. Values at or below this threshold are
# masked out (set to NaN) so that fewer points have to be drawn.
MIN_TPM = 100

# Mask each time point independently, then take log10 to suppress massive values.
# Values may be massive because of large expression levels or misreads.
x, y, z, r = (
    np.log10(df[column].where(df[column] > MIN_TPM))
    for column in ('sum_meanTPM_Ti', 'sum_meanTPM_Tii', 'sum_meanTPM_Tiii', 'sum_meanTPM_Tiv')
)

# Plot the result in its own figure so it never lands on top of an earlier plot.
# The first three time points are the spatial axes; the fourth is the color.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, c=r, cmap='viridis', linewidth=0.5)

# The plotted values are log10-transformed, so the labels say so.
ax.set_xlabel('log10(sum_meanTPM_Ti)')
ax.set_ylabel('log10(sum_meanTPM_Tii)')
ax.set_zlabel('log10(sum_meanTPM_Tiii)')

# Rotate the view through a full turn at a fixed elevation of 30 degrees.
for angle in range(0, 360):
    ax.view_init(30, angle)
    plt.draw()
    plt.pause(.001)

<IPython.core.display.Javascript object>

## Clustering

Discounting color we can see 3 distinct bands. Each one of these represents a group of genes with similar expression curves. 

These bands do not follow the good clustering principle, so it is very difficult to tease them apart, especially when we consider the 4th dimension (color). 

So we use singular value decomposition (SVD) to reduce the dimensionality and create groups that follow the good clustering principle. 

In [4]:
# Fixed seed so the SVD projection and the cluster ids are the same on every
# run; the cells below refer to clusters by number.
SEED = 0
N_CLUSTERS = 8

# Use only the four expression columns as features. A later cell adds a
# 'cluster' column to df; selecting the columns explicitly keeps that label
# out of the features if this cell is re-run afterwards.
tpm_columns = ['sum_meanTPM_Ti', 'sum_meanTPM_Tii', 'sum_meanTPM_Tiii', 'sum_meanTPM_Tiv']

# We normalize the data with 'l2' normalization.
# With the default axis this rescales each row (gene) so that the sum of the
# squares of its four values equals one.
normalized_df = preprocessing.normalize(df[tpm_columns], norm='l2')

# Create an object that can perform the (truncated) singular value decomposition.
svd = TruncatedSVD(n_components=3, random_state=SEED)

# We perform the decomposition.
# This results in a 3 dimensional data set.
reduced = svd.fit_transform(normalized_df)

# Converting the result to a dataframe.
reduced_df = pd.DataFrame(data=reduced, columns=["x", "y", "z"])

# 8 clusters, as suggested by Kartik Krishnan.
pred = KMeans(n_clusters=N_CLUSTERS, random_state=SEED).fit_predict(reduced_df)

# Let's plot it. Points are colored by the cluster they were assigned to.
x = reduced_df["x"]
y = reduced_df["y"]
z = reduced_df["z"]
r = pred

# Draw into a fresh figure so this plot does not overlay the previous one.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, c=r, cmap='viridis', linewidth=0.5)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')

# Rotate the view through a full turn at a fixed elevation of 30 degrees.
for angle in range(0, 360):
    ax.view_init(30, angle)
    plt.draw()
    plt.pause(.001)

## Contents of each cluster. 

Each one of these clusters should represent genes that follow similar expression curves. We should check to make sure that is the case.

In [None]:
# Tidy up the dataframe to make the plots below work: one row per
# (gene, time point) with columns 'tp', 'cluster' and 'mean_TPM'.
# The frame is built in one vectorized step rather than appended to row by
# row, which was quadratic and relied on DataFrame.append (removed in pandas 2.0).

df['cluster'] = pred

# The first four columns hold the expression level at each time point.
tpm = df.iloc[:, 0:4]

# Normalize each gene to the range [0, 1]. We only care about the shape.
# A gene whose four values are all equal has zero range and yields NaN.
gene_min = tpm.min(axis=1)
gene_range = tpm.max(axis=1) - gene_min
scaled = tpm.sub(gene_min, axis=0).div(gene_range, axis=0)

# Flatten row by row so each gene contributes four consecutive rows
# (tp = 0, 1, 2, 3), each carrying that gene's cluster id.
n_timepoints = scaled.shape[1]
tidy_df = pd.DataFrame({
    'tp': np.tile(np.arange(n_timepoints), len(scaled)),
    'cluster': np.repeat(df['cluster'].to_numpy(), n_timepoints),
    'mean_TPM': scaled.to_numpy().ravel(),
})

tidy_df

In [None]:
# Plot each clusters genes.
%matplotlib notebook

# One figure per cluster: the mean normalized expression at each time point,
# with a band showing the standard deviation across the cluster's genes.
for i in range(8):
    fig, ax = plt.subplots()
    cluster_df = tidy_df[tidy_df.cluster == i]
    # Every gene contributes four rows (one per time point).
    gene_count = cluster_df.shape[0] // 4
    sns.lineplot(x="tp", y="mean_TPM", data=cluster_df, ci='sd', ax=ax)
    ax.set_title(f'Cluster {i}. Contains {gene_count} genes.')
    ax.set_xticks([0, 1, 2, 3])

In [None]:
%matplotlib notebook
import scipy.optimize
import matplotlib.pyplot as plt


# Model function handed to curve_fit: it fixes the general shape of the curve.
def func_struct(x, a, b, c, d):
    """Evaluate the cubic polynomial a*x**3 + b*x**2 + c*x + d at x.

    x may be a scalar or an array; a, b, c and d are the coefficients from
    the highest power down to the constant term.
    """
    cubic_term = a * (x ** 3)
    quadratic_term = b * (x ** 2)
    linear_term = c * (x)
    return cubic_term + quadratic_term + linear_term + d


# Evaluation grid from 0 to 3 inclusive at intervals of 0.1.
# np.arange(0.0, 3.0, 0.1) stops at 2.9 and so never reaches the last
# time point (tp = 3); linspace includes both end points.
time = np.linspace(0.0, 3.0, 31)

# Fitted cubic coefficients (a, b, c, d) for each cluster, keyed by cluster id.
functions = {}

for i in range(8):
    cluster_df = tidy_df[tidy_df.cluster == i]

    # Least-squares fit of the cubic to every (tp, mean_TPM) pair in the cluster.
    popt, pcov = scipy.optimize.curve_fit(func_struct, cluster_df.tp, cluster_df.mean_TPM)
    functions[i] = popt

    # Draw the fitted curve in its own figure.
    plt.figure()
    plt.title("Curve for Cluster " + str(i))
    plt.plot(time, func_struct(time, *popt))
    plt.xlabel("tp")
    plt.ylabel("mean_TPM")
    plt.show()
   
    
    

In [None]:
# Display the fitted curve_fit parameter arrays, keyed by cluster index.
functions

In [None]:
import numpy as np
import similaritymeasures
import matplotlib.pyplot as plt
from numpy import *
from matplotlib.pyplot import *


# Sampling grid from 0 to 3 inclusive at intervals of 0.1.
# np.arange(0.0, 3.0, 0.1) would stop at 2.9 and leave out the last time
# point (tp = 3), so the curves would be compared on a truncated range.
time = np.linspace(0.0, 3.0, 31)

# One (len(time), 2) array per cluster: column 0 is time, column 1 is the
# fitted curve evaluated at that time.
datapoint_arrays = []

# There are 8 clusters to get data from.
for i in range(0, 8):
    # func_struct works on arrays, so the whole curve is evaluated in one call.
    y = func_struct(time, *functions[i])
    datapoint_arrays.append(np.column_stack((time, y)))

    

def run_comparisons(c1, c2):
    """Compare the sampled curves of clusters c1 and c2 with five measures.

    Each result is stored at position [c1][c2] of the matching module-level
    matrix (pcm_matrix, frechet_matrix, area_matrix, cl_matrix, dtw_matrix).
    Nothing is returned.
    """
    first_curve = datapoint_arrays[c1]
    second_curve = datapoint_arrays[c2]

    # PCM
    pcm_matrix[c1][c2] = similaritymeasures.pcm(first_curve, second_curve)

    # Discrete Frechet distance
    frechet_matrix[c1][c2] = similaritymeasures.frechet_dist(first_curve, second_curve)

    # Area between the two curves
    area_matrix[c1][c2] = similaritymeasures.area_between_two_curves(first_curve, second_curve)

    # Curve Length based similarity measure
    cl_matrix[c1][c2] = similaritymeasures.curve_length_measure(first_curve, second_curve)

    # Dynamic Time Warping distance; the second returned value is not used.
    dtw_distance, _ = similaritymeasures.dtw(first_curve, second_curve)
    dtw_matrix[c1][c2] = dtw_distance
    

# Results are kept in easily viewable 2d lists, one per similarity measure,
# each initialised to all zeros.
w = h = 8
frechet_matrix = [[0] * w for _ in range(h)]
pcm_matrix = [[0] * w for _ in range(h)]
area_matrix = [[0] * w for _ in range(h)]
cl_matrix = [[0] * w for _ in range(h)]
dtw_matrix = [[0] * w for _ in range(h)]

# Compare each pair of distinct clusters once (first < second), so only the
# entries above the diagonal are filled in; the rest stay at zero.
for first in range(8):
    for second in range(first + 1, 8):
        run_comparisons(first, second)


def print_matrix(title, matrix):
    """Print a titled 2d matrix of numbers as a text table.

    The size is taken from the matrix itself rather than being fixed at 8x8.
    A blank line and the title come first, then a header row of column
    indices, then one line per row (each preceded by a blank line) holding
    the row index followed by the values with five decimal places.

    title  -- heading printed above the table.
    matrix -- sequence of rows, each a sequence of numbers.
    """
    # Column count comes from the first row; an empty matrix gives an empty header.
    n_cols = len(matrix[0]) if len(matrix) > 0 else 0

    print('\n' + title)
    print(' '.join("{:8}".format(col) for col in range(n_cols)))
    for row_idx, row in enumerate(matrix):
        cells = ' '.join("{:8.5f}".format(value) for value in row)
        print('\n' + str(row_idx) + ' ' + cells)
        

# Print every similarity matrix under its own heading, in a fixed order.
for title, matrix in (
    ('PCM', pcm_matrix),
    ('Frechet Distance', frechet_matrix),
    ('Area Difference', area_matrix),
    ('Curve Length Distance', cl_matrix),
    ('Dynamic Time Warping Distance', dtw_matrix),
):
    print_matrix(title, matrix)
  

