This notebook tries to predict transmembrane domains from a large set of data.


In [2]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD
from math import log, exp

In [3]:
# A dictionary of parameters for each single amino acid, keyed by one-letter code.
# Every value is a 7-element list in this fixed order:
# 'A': [Hydropathicity, side chain charge, polar, Interface Scale, pKa_sidechain, pI, Octanol Scale]
# parse() sums these seven entries element-wise over all residues of a peptide.
# NOTE(review): 'P' has hydropathicity 2.80, the same value as 'F'. If this column is
# meant to be the Kyte-Doolittle scale, proline is usually listed as -1.6 -- confirm
# against the table these numbers were taken from.
aa_info = {'A': [1.80, 0, 0, 0.17, 0.0, 6.0, 0.5],
           'C': [2.50, 0, 0, -0.24, 8.3, 5.1, 0],
           'D': [-3.5, -1, 1, 1.23, 3.9, 2.8, 3.64],
           'E': [-3.5, -1, 1, 0.0, 4.3, 3.2, 0.11],
           'F': [2.80, 0, 0, -1.13, 0.0, 5.5, -1.71],
           'G': [-0.4, 0, 0, 0, 0.0, 6.0, 1.15],
           'H': [-3.2, 0, -1, 0.17, 6.0, 7.6, 0.11],
           'I': [4.50, 0, 0, -0.31, 0.0, 6.0, -1.12],
           'K': [-3.9, 1, -1, 0.99, 10.5, 9.7, 2.8],
           'L': [3.80, 0, 0, -0.56, 0.0, 6.0, -1.25],
           'M': [1.90, 0, 0, -0.23, 0.0, 5.7, -0.67],
           'N': [-3.5, 0, 1, 0.42, 0.0, 5.4, 0.85],
           'P': [2.80, 0, 0, 0.45, 0.0, 6.3, 0.14],
           'Q': [-3.5, 0, 1, 0.58, 0.0, 5.7, 0.77],
           'R': [-4.5, 1, -1, 0.81, 12.5, 10.8, 1.81],
           'S': [-0.8, 0, 1, 0.13, 0.0, 5.6, 0.46],
           'T': [-0.7, 0, 1, 0.14, 0.0, 5.6, 0.25],
           'V': [4.20, 0, 0, 0.07, 0.0, 6.0, -0.46],
           'W': [-0.9, 0, 0, -1.85, 0.0, 5.9, -2.09],
           'Y': [-1.3, 0, 1, -0.94, 10.7, 5.7, -0.71]}

In [4]:
def parse(line):
    """Convert one line of the raw data file into a LabeledPoint.

    :param line: single line from the input file: a label (0 or 1), whitespace,
                 then a peptide sequence written in one-letter amino acid codes.
                 0 means it is not a transmembrane domain, 1 means it is.
                 The input is described as having 20 residues per line, but the
                 length is not checked here.

    :return: LabeledPoint labeled with 0.0 or 1.0 and carrying nine features.
             The first seven are the sums over all residues of the aa_info entries
             [Hydropathicity, side chain charge, polar, Interface Scale,
              pKa_sidechain, pI, Octanol Scale];
             the eighth is the number of residues in 'VILMFWC' and the ninth is
             the number of residues in 'RK'.

    :raises ValueError: if the line does not split into exactly two fields or the
                        label is not numeric.
    :raises KeyError: if the sequence contains a residue that is not in aa_info.
    """
    label, seq = line.split()
    nParams = 7  # number of per-residue parameters stored in each aa_info entry
    features = [0] * (nParams + 2)
    for aa in seq:
        params = aa_info[aa]
        for i in range(nParams):
            features[i] += params[i]
        # Two extra count features appended after the summed parameters
        if aa in 'VILMFWC':
            features[nParams] += 1
        if aa in 'RK':
            features[nParams + 1] += 1
    # The label arrives as text; convert it explicitly to the numeric class label
    return LabeledPoint(float(label), features)

In [5]:
fileName = 'YeastTM20.dat'
rawData = sc.textFile(fileName, 2).map(parse)
print rawData.take(5)

[LabeledPoint(0.0, [8.9,-1.0,6.0,5.47,33.1,116.8,11.46,5.0,2.0]), LabeledPoint(0.0, [-27.8,-6.0,9.0,2.51,54.2,106.0,8.2,2.0,1.0]), LabeledPoint(0.0, [-46.9,-6.0,14.0,4.93,58.5,102.0,14.89,1.0,1.0]), LabeledPoint(0.0, [-4.4,2.0,4.0,4.72,52.2,128.0,10.57,6.0,4.0]), LabeledPoint(1.0, [33.2,0.0,4.0,-8.75,19.0,116.5,-12.04,12.0,0.0])]


In [6]:
weights = [0.8, 0.1, 0.1]
seed = 1
rawTrainData, rawValidationData, rawTestData = rawData.randomSplit(weights, seed)
rawTrainData.cache()
rawValidationData.cache()
rawTestData.cache()
nTrain = rawTrainData.count()
nVal = rawValidationData.count()
nTest = rawTestData.count()
print nTrain, nVal, nTest, nTrain + nVal + nTest

26490 3326 3302 33118


The log loss should be calculated for a given prediction and label.

In [7]:
def computeLogLoss(p, y):
    """Calculates the value of log loss for a given probability and label.

    Note:
        log(0) is undefined, so when the probability assigned to the true label is
        exactly 0 a small value (epsilon) is added to it, and when it is exactly 1
        the same epsilon is subtracted from it.

    Args:
        p (float): A probability between 0 and 1 (the predicted probability of label 1).
        y (int): A label.  Takes on the values 0 and 1.

    Returns:
        float: The log loss value.

    Raises:
        ValueError: If y is neither 0 nor 1.
    """
    epsilon = 10e-12
    # pp is the probability the prediction assigns to the observed label
    if y == 1:
        pp = p
    elif y == 0:
        pp = 1 - p
    else:
        # Without this branch pp would be unbound and fail with an obscure error
        raise ValueError('label must be 0 or 1, got {0!r}'.format(y))
    # Nudge only the exact endpoints away from 0 and 1
    if pp == 0:
        pp += epsilon
    elif pp == 1:
        pp -= epsilon
    return -log(pp)
    
def getP(x, w, intercept):
    """Return the logistic probability of one observation under a linear model.

    Note:
        The linear score is limited to the range [-20, 20] before the sigmoid is
        applied, for numerical purposes.

    Args:
        x: Feature vector of the observation; it must provide a dot method
            (callers in this file pass LabeledPoint.features).
        w: Vector of model weights (betas), passed to x.dot.
        intercept (float): The model's intercept.

    Returns:
        float: A probability between 0 and 1.
    """
    score = x.dot(w) + intercept

    # Keep the score inside [-20, 20]
    if score > 20:
        score = 20
    elif score < -20:
        score = -20

    # Logistic (sigmoid) function of the bounded score
    return 1 / (1 + exp(-score))

In [8]:
def evaluateResults(model, data):
    """Compute the average log loss of a model over a data set.

    Args:
        model (LogisticRegressionModel): A trained logistic regression model;
            its weights and intercept are used to score each point.
        data (RDD of LabeledPoint): Labels and features for each observation.

    Returns:
        float: Sum of the per-point log losses divided by the number of points.
    """
    def pointLoss(point):
        # Predicted probability for this point, then its loss against the true label
        prob = getP(point.features, model.weights, model.intercept)
        return computeLogLoss(prob, point.label)

    totalLoss = data.map(pointLoss).reduce(lambda a, b: a + b)
    return totalLoss / data.count()

In [10]:
# try fixed hyperparameters
numIters = 500
stepSize = 1
regParam = 1e-6
regType = 'l2'
includeIntercept = True

model0 = LogisticRegressionWithSGD.train(rawTrainData,
                                         iterations=numIters, 
                                         step=stepSize, 
                                         miniBatchFraction=1.0, 
                                         initialWeights=None, 
                                         regParam=regParam, 
                                         regType=regType, 
                                         intercept=includeIntercept)
print model0.weights, model0.intercept

[14.8043966925,0.859896618258,-0.669030917881,-5.88642694337,-0.696606468457,-2.49932420251,-8.7145353456,2.31841152976,-0.926274412398] 0.977904503532


In [11]:
classOneFracTrain = (rawTrainData.map(lambda x: x.label)
                                 .reduce(lambda x, y: x+y))/rawTrainData.count()
print classOneFracTrain

logLossTrBase = (rawTrainData.map(lambda x: x.label)
                             .map(lambda x: computeLogLoss(classOneFracTrain, x))
                             .reduce(lambda x, y: x+y))/rawTrainData.count()
print 'Baseline Train Logloss = {0:.3f}\n'.format(logLossTrBase)

0.284144960362
Baseline Train Logloss = 0.597



In [12]:
# Log loss of the fixed-hyperparameter model (model0) on the training split
logLossTrLR0 = evaluateResults(model0, rawTrainData)
trainReport = 'Logloss:\n\tLogReg = {0:.3f}'.format(logLossTrLR0)
print (trainReport)

Logloss:
	LogReg = 0.857


In [13]:
# Log loss of the fixed-hyperparameter model (model0) on the validation split
logLossVa = evaluateResults(model0, rawValidationData)
validationReport = 'Logloss:\n\tLogReg = {0:.3f}'.format(logLossVa)
print (validationReport)

Logloss:
	LogReg = 0.793


In [15]:
numIters = 100
regType = 'l2'
includeIntercept = True

# Track the model with the lowest validation log loss seen so far
bestModel = None
bestLogLoss = 1e10

# Grid search: every step size is tried with every regularization parameter
stepSizes = [0.01, 0.1, 1, 10]
regParams = [1e-6, 1e-3]
gridPoints = [(s, r) for s in stepSizes for r in regParams]
reportLine = '\tstepSize = {0:.2f}, regParam = {1:.0e}: logloss = {2:.3f}'
for stepSize, regParam in gridPoints:
    model = LogisticRegressionWithSGD.train(rawTrainData,
                                            numIters,
                                            stepSize,
                                            regParam=regParam,
                                            regType=regType,
                                            intercept=includeIntercept)
    logLossVa = evaluateResults(model, rawValidationData)
    print (reportLine.format(stepSize, regParam, logLossVa))
    if not logLossVa < bestLogLoss:
        continue
    bestModel, bestLogLoss = model, logLossVa

# NOTE: the "Baseline" value printed here is logLossTrBase, which was computed
# on the training split, not on the validation split.
summary = 'Validation Logloss:\n\tBaseline = {0:.3f}\n\tLogReg = {1:.3f}'
print (summary.format(logLossTrBase, bestLogLoss))

	stepSize = 0.01, regParam = 1e-06: logloss = 0.124
	stepSize = 0.01, regParam = 1e-03: logloss = 0.124
	stepSize = 0.10, regParam = 1e-06: logloss = 0.578
	stepSize = 0.10, regParam = 1e-03: logloss = 0.577
	stepSize = 1.00, regParam = 1e-06: logloss = 0.937
	stepSize = 1.00, regParam = 1e-03: logloss = 0.935
	stepSize = 10.00, regParam = 1e-06: logloss = 0.964
	stepSize = 10.00, regParam = 1e-03: logloss = 0.947
Validation Logloss:
	Baseline = 0.597
	LogReg = 0.124


In [16]:
# Log loss of the best grid-search model on the held-out test split
logLossTe = evaluateResults(bestModel, rawTestData)
testReport = 'Logloss:\n\tLogReg = {0:.3f}'.format(logLossTe)
print (testReport)

Logloss:
	LogReg = 0.151


In [10]:
# More iteration with optimized parameters
numIters = 5000
stepSize = 0.01
regParam = 1e-6
regType = 'l2'
includeIntercept = True

model1 = LogisticRegressionWithSGD.train(rawTrainData,
                                         iterations=numIters, 
                                         step=stepSize, 
                                         miniBatchFraction=1.0, 
                                         initialWeights=None, 
                                         regParam=regParam, 
                                         regType=regType, 
                                         intercept=includeIntercept)
print model1.weights, model1.intercept

[0.179153018013,0.0283969783774,-0.000504497958365,-0.120192164047,-0.0173080116018,-0.037859079514,-0.131153338053,0.0228327769761,-0.0181199689548] 0.999521602093


In [18]:
# Log loss of the longer-trained model (model1) on the held-out test split
logLossTe1 = evaluateResults(model1, rawTestData)
testReport1 = 'Logloss:\n\tLogReg = {0:.3f}'.format(logLossTe1)
print (testReport1)

Logloss:
	LogReg = 0.119


In [28]:
for x in rawData.take(10):
    print x
    print getP(x.features, model1.weights, model1.intercept)
    print computeLogLoss(getP(x.features, model.weights, model.intercept), x.label)

(0.0,[8.9,-1.0,6.0,5.47,33.1,116.8,11.46,5.0,2.0])
0.0136887058158
0.0134959647309
(0.0,[-27.8,-6.0,9.0,2.51,54.2,106.0,8.2,2.0,1.0])
1.65706592483e-05
8.42738374866e-06
(0.0,[-46.9,-6.0,14.0,4.93,58.5,102.0,14.89,1.0,1.0])
1.42044975535e-07
5.80012745778e-08
(0.0,[-4.4,2.0,4.0,4.72,52.2,128.0,10.57,6.0,4.0])
0.000584096720652
0.000409526109164
(1.0,[33.2,0.0,4.0,-8.75,19.0,116.5,-12.04,12.0,0.0])
0.993286451278
0.00569112093568
(1.0,[24.1,0.0,5.0,-6.57,19.0,116.7,-7.73,10.0,0.0])
0.918990181632
0.0766422312658
(1.0,[19.4,1.0,7.0,-4.07,10.5,118.5,-5.57,10.0,1.0])
0.748384874676
0.251818107502
(1.0,[19.7,0.0,10.0,-5.28,10.7,113.5,-8.48,10.0,0.0])
0.853117547392
0.140893858006
(1.0,[41.7,-1.0,4.0,-6.53,12.6,113.0,-12.85,15.0,0.0])
0.999011015046
0.000698261172206
(1.0,[24.5,0.0,6.0,-1.61,0.0,115.5,-3.63,11.0,0.0])
0.881085817363
0.0904585007629


In [29]:
# Show the weight vector of model0 (without its intercept)
print model0.weights

[0.197843553366,0.0145948882501,-0.0100536691748,-0.0821223158907,-0.0167367335816,-0.0405236595803,-0.118249079876,0.0304857073854,-0.0119070274029]
