In [50]:
from sklearn import manifold
from scipy import sparse 
import numpy as np
from time import time
from matplotlib import pyplot as plt
from sklearn.svm import SVC
from sklearn import metrics
%matplotlib inline

## Importing sparse data

In [2]:
# Function to load csr matrix of samples and list of colnames from file
def load_csr_from_csv(filename):
    """Load a binary CSR matrix and its column names from a set of CSV files.

    Reads ``<filename>_indices.csv``, ``<filename>_indptr.csv``,
    ``<filename>_shape.csv`` and ``<filename>_col_names.csv``. No data file
    is read: every stored entry of the matrix is set to 1 (int8).

    Parameters
    ----------
    filename : str
        Common path prefix of the CSV files.

    Returns
    -------
    (scipy.sparse.csr_matrix, numpy.ndarray)
        The sparse matrix and the array of column names.
    """
    # The numeric files are parsed as floats (loadtxt's default, which also
    # accepts values written like "3.0") and then cast to integers: CSR index
    # arrays and the matrix shape must be integral, and a float shape is
    # rejected by current SciPy. ndmin=1 keeps single-value files 1-D.
    indices = np.loadtxt(filename+"_indices.csv", delimiter=",", ndmin=1).astype(np.int64)
    indptr = np.loadtxt(filename+"_indptr.csv", delimiter=",", ndmin=1).astype(np.int64)
    shape = tuple(int(dim) for dim in np.loadtxt(filename+"_shape.csv", delimiter=",", ndmin=1))
    col_names = np.loadtxt(filename+"_col_names.csv", delimiter=",", dtype=str, ndmin=1)

    # Presence/absence matrix: one int8 "1" per stored index.
    data = np.ones(indices.shape, dtype=np.int8)

    return sparse.csr_matrix((data, indices, indptr),
                             shape=shape), col_names

In [3]:
# returns the csr matrix of samples and list of colnames
def load_csr_from_csv_with_data(filename):
    """Load a CSR matrix with explicit values, plus its column names.

    Reads ``<filename>_data.csv``, ``<filename>_indices.csv``,
    ``<filename>_indptr.csv``, ``<filename>_shape.csv`` and
    ``<filename>_col_names.csv``. The stored values are read as int8.

    Parameters
    ----------
    filename : str
        Common path prefix of the CSV files.

    Returns
    -------
    (scipy.sparse.csr_matrix, numpy.ndarray)
        The sparse matrix and the array of column names.
    """
    data = np.loadtxt(filename+"_data.csv", delimiter=",", dtype=np.int8, ndmin=1)

    # The index files are parsed as floats (loadtxt's default, which also
    # accepts values written like "3.0") and then cast to integers: CSR index
    # arrays and the matrix shape must be integral, and a float shape is
    # rejected by current SciPy. ndmin=1 keeps single-value files 1-D.
    indices = np.loadtxt(filename+"_indices.csv", delimiter=",", ndmin=1).astype(np.int64)
    indptr = np.loadtxt(filename+"_indptr.csv", delimiter=",", ndmin=1).astype(np.int64)
    shape = tuple(int(dim) for dim in np.loadtxt(filename+"_shape.csv", delimiter=",", ndmin=1))
    col_names = np.loadtxt(filename+"_col_names.csv", delimiter=",", dtype=str, ndmin=1)

    return sparse.csr_matrix((data, indices, indptr),
                             shape=shape), col_names

In [9]:
# Load the target values (and the second column of the file, kept as `std`)
# and create a binary vector with 1 for samples whose value is at least
# `threshold` times the value of the first sample, and 0 otherwise.
# NOTE(review): the first row is used as the reference value - presumably
# the wild-type / baseline sample; confirm against the data file.
y, std = np.loadtxt("target_values.csv", unpack=True, delimiter=",")

threshold = 0.95  # fraction of the reference value required for a positive label

y_bin = np.where(y >= y[0] * threshold, 1, 0)  # binary representation of target values based on threshold

# Class weights as (weight of class 0, weight of class 1): class 0 is scaled
# by the positive/negative count ratio, class 1 keeps weight 1.
n_positive = y_bin.sum()
n_negative = y_bin.shape[0] - n_positive
class_weights = (n_positive * 1. / n_negative, 1.)

print("Total number of mutants: %d" % y.shape[0])
print("Fraction of positive samples: %.2f" % (n_positive * 1. / y_bin.shape[0]))

Total number of mutants: 54025
Fraction of positive samples: 0.34


In [10]:
# Per-sample weights: read the raw values, take log10 of ten times each value,
# then divide by the largest result and flatten to a 1-D vector.
sample_weights = np.log10(np.loadtxt("sample_weights.csv", delimiter=",") * 10)
sample_weights = np.reshape(sample_weights / np.max(sample_weights), -1)

In [11]:
# Load the full mutation list dataset with column names from file
# (rows are samples, columns are the entries named in col_names).
X, col_names = load_csr_from_csv("sparse_csr")
print(X.shape)

(54025, 1879)


In [12]:
# Retrieve all the mutants with a single mutation
# and create the look-up table for these single mutations.
#   single_mutations: mutation name -> {"id": column index,
#                                       "mutants": [row indices with only that mutation]}
#   single_mut_idx:   for each distinct single mutation, the first row index found
single_mutations = {}
single_mut_idx = []

# Number of stored entries in each row, read directly from the CSR index
# pointer; this avoids building a one-row matrix per sample via getrow().
entries_per_row = np.diff(X.indptr)

for i in np.flatnonzero(entries_per_row == 1):
    i = int(i)  # keep plain Python ints in the look-up table
    index = X.indices[X.indptr[i]]  # column of the row's only stored entry
    mutation = col_names[index]
    if mutation not in single_mutations:
        single_mutations[mutation] = {"id": index, "mutants": [i]}
        single_mut_idx.append(i)
    else:
        single_mutations[mutation]["mutants"].append(i)

## MDS

In [None]:
# Embed the samples into 10 dimensions with metric MDS and time the fit.
# MDS works on a dense array, so the sparse matrix is densified first.
t0 = time()
# random_state is fixed so the embedding (and everything trained on it)
# is reproducible from one run to the next.
clf = manifold.MDS(n_components=10, max_iter=100, n_init=1, random_state=0)
Y = clf.fit_transform(X.toarray())
t1 = time()
print("MDS: %.2g sec" % (t1 - t0))

### SVM on reduced data

In [None]:
# Train an RBF-kernel SVM on the MDS-reduced features, using both the
# per-class weights and the per-sample weights computed earlier.
print("Training SVM...")
t1 = time()
# probability=True enables probability estimates, which involve a random
# shuffle of the data; random_state pins it for reproducibility.
clf = SVC(C=100, gamma=0.1,
          class_weight={0: class_weights[0], 1: class_weights[1]},
          probability=True, random_state=0)
clf.fit(Y, y_bin, sample_weight=sample_weights)
print("Time taken: %.f" % (time() - t1))

In [None]:
# Predict labels for the same reduced samples the classifier was fitted on
# (training-set predictions, not a held-out evaluation).
yhat = clf.predict(Y)

In [None]:
# Sample-weighted accuracy of the predictions against the binary targets.
print("Training Accuracy: %.2f" % metrics.accuracy_score(y_bin, yhat, sample_weight=sample_weights))