In [2]:
import numpy as np
import scipy.sparse as sp
import pandas as pd
import sklearn.preprocessing as prepro
import sys
import os

import sys
sys.path.insert(0, '../')
import utils

# Loading in RNAseq data from TCGA

In [30]:
# Load the batch-corrected TCGA expression table obtained from Ayse (ask her which
# batch correction was applied). Per the recorded output this is a samples x (genes + 1)
# table: one row per sample, one column per gene, plus one leading label column.
# The path is handed straight to pandas so that pandas opens AND closes the file;
# the previous bare open(...) handle was never closed.
rawdata = pd.read_table("../data/rawdata/TCGA/all_TCGA_data_joined_Batch_Corrected.tsv")

# Drop the leading label column before converting to a NumPy array. Slicing the frame
# first means a non-numeric label column (presumably sample IDs -- confirm) cannot force
# the whole array to dtype=object, which np.save would otherwise have to pickle.
data = rawdata.iloc[:, 1:].values
data.shape

(9092, 3949)

In [31]:
# Column headers of the expression table, minus the leading label column.
# Judging by the recorded examples these are gene symbols (one per feature column),
# not cancer types.
labels = list(rawdata.columns[1:])
print(len(labels))
np.save("../data/rawdata/TCGA/labels.npy", labels)
print("%s %s" % ("examples:", labels[:10]))

3949
examples: ['A1CF', 'AAGAB', 'AAK1', 'AASDHPPT', 'ABCA5', 'ABCB1', 'ABCB8', 'ABCC3', 'ABCC4', 'ABCC5']


# Making train/valid/test splits here.

In [32]:
# Tab-separated cancer-type annotations; its "Cancertype" column supplies the class labels.
# NOTE(review): rows are assumed to be in the same order as the expression table -- confirm.
df = pd.read_csv("../data/rawdata/TCGA/TCGA_cancer_types.tsv", sep="\t")

In [33]:
# Build a one-hot encoded truth matrix: one row per sample, one column per cancer type
# (so you would expect a 9092 by 33 matrix).
temp = df["Cancertype"].values

# Sort the distinct cancer types so the column order is deterministic. Iterating a bare
# set yields an arbitrary order that may differ between interpreters/runs, and no cell
# shown here saves the type -> column mapping next to the Y arrays, so an unstable order
# would make the saved columns impossible to decode reliably.
lbs = sorted(set(temp))
lbmap = dict((name, col) for col, name in enumerate(lbs))

y = np.zeros((len(temp), len(lbs)))
# Set exactly one cell per row: the column that belongs to that sample's cancer type.
# The explicit integer dtype keeps the index valid even when there are no samples.
columns = np.array([lbmap[name] for name in temp], dtype=int)
y[np.arange(len(temp)), columns] = 1

# Sanity check: every sample carries exactly one label.
assert y.shape[0] == np.sum(y)

In [34]:
# Shuffle the sample indices and split them: the last N_TEST go to test, the N_VALID
# before those to validation, and everything else (about 7000 samples) to training.
N_TEST = 640
N_VALID = 1280
SPLIT_SEED = 0  # fixed seed so that re-running the notebook reproduces the same split

indexes = np.arange(y.shape[0])
# A dedicated RandomState makes the shuffle reproducible without touching NumPy's
# global random state (the previous unseeded shuffle produced a new split on every run).
np.random.RandomState(SPLIT_SEED).shuffle(indexes)

train = indexes[:-(N_VALID + N_TEST)]
valid = indexes[-(N_VALID + N_TEST):-N_TEST]
test = indexes[-N_TEST:]

# The three subsets must partition the samples: no overlap and nothing left out.
assert np.array_equal(np.sort(np.concatenate([train, valid, test])), np.arange(y.shape[0]))
len(train), len(valid), len(test)

(7172, 1280, 640)

In [35]:
# Persist the expression rows that belong to each split.
for _path, _rows in [
    ("../data/trainX.npy", train),
    ("../data/validX.npy", valid),
    ("../data/testX.npy", test),
]:
    np.save(_path, data[_rows])

In [36]:
# Persist the one-hot label rows that belong to each split.
for _path, _rows in [
    ("../data/trainY.npy", train),
    ("../data/validY.npy", valid),
    ("../data/testY.npy", test),
]:
    np.save(_path, y[_rows])

# Making adjacency matrix

In [3]:
# Open the protein-protein interaction dump for reading (per its file name: BioGRID,
# Homo sapiens, release 3.4.157, MITAB format).
rawppi = open("../data/rawdata/PPI/BIOGRID-ORGANISM-Homo_sapiens-3.4.157.mitab.txt", "r")

In [4]:
# Read every line of the interaction file into memory, then release the handle: no cell
# shown here closes `rawppi`, so it would otherwise stay open for the kernel's lifetime.
# Closing also makes an accidental re-run of this cell fail loudly instead of silently
# replacing `ppi` with an empty list read from an exhausted handle.
try:
    ppi = rawppi.readlines()
finally:
    rawppi.close()

In [5]:
# utils.intx is the class used to hold one protein-protein interaction; build one from
# a single, arbitrarily chosen line of the file and print it as a smoke test.
temp = utils.intx(ppi[123])
print(temp)

PRRC2A_SSR3


In [6]:
# Sanity-check the membership test and the pairwise lookup: per the recorded output the
# lookup gives the same answer for either argument order, and False for a made-up gene.
print("SSR3" in temp)
for _first, _second in [("PRRC2A", "SSR3"), ("SSR3", "PRRC2A"), ("SSR3", "dmy")]:
    print(temp.isInteracting(_first, _second))

True
True
True
False


In [8]:
# Turn every line after the first into an interaction object
# (the first line is skipped -- presumably a header row; confirm).
interactions = list(map(utils.intx, ppi[1:]))
print("%s %s" % ("# of interactions:", len(interactions)))

# of interactions: 410003


In [9]:
# Count the interactions that involve SERF2.
sum(1 for interaction in interactions if "SERF2" in interaction)

15

In [38]:
# Which label indices take part in the first interaction? An empty list means none of
# the entries in `labels` is involved in it.
print(interactions[0])
inds = [idx for idx, gene in enumerate(labels) if gene in interactions[0]]
print(inds)

MAP2K4_FLNC
[]


In [39]:
# Build the symmetric PPI adjacency matrix with one row/column per entry of `labels`,
# i.e. len(labels) x len(labels) (3949 x 3949 per the recorded output, not 20000 x 20000).
# Every interaction is tested against every label, so this may take a while, but it only
# needs to be done once.
ppi_matrix = np.zeros((len(labels), len(labels)))

# Interactions that contribute nothing to the matrix, kept for later inspection.
discarded = []
for interaction in interactions:
    # Indices of every label that participates in this interaction.
    # There are some odd cases where more than two genes take part in one interaction.
    inds = [i for i in range(len(labels)) if labels[i] in interaction]

    # With fewer than two distinct labelled genes there is no off-diagonal cell to fill
    # (either the partner is not among `labels` or it is a self-interaction): discard.
    if len(inds) < 2:
        discarded.append(interaction)
        continue

    # Mark every ordered pair of distinct participants. The double loop visits both
    # (i, j) and (j, i), so the matrix stays symmetric without inserting each pair twice;
    # the diagonal (self-interaction) is deliberately never set.
    for i in inds:
        for j in inds:
            if i != j:
                ppi_matrix[i, j] = 1

In [40]:
# Cache the adjacency matrix on disk so the slow construction above need not be repeated.
np.save(file="../data/ppi2.npy", arr=ppi_matrix)

In [42]:
# Reload the cached adjacency matrix and report its shape together with the fraction of
# cells that are set. NOTE: the printed label says "Sparcity", but the number is the
# share of non-zero cells (i.e. the density) of the square matrix.
ppi_matrix = np.load("../data/ppi2.npy")
n_rows = ppi_matrix.shape[0]
filled_fraction = np.sum(ppi_matrix) / (n_rows ** 2)
print("%s %s" % ("Shape:", ppi_matrix.shape))
print("%s %s" % ("Sparcity:", filled_fraction))

Shape: (3949, 3949)
Sparcity: 0.003454144161816003
