In [1]:
#load libraries
import pandas as pd
import numpy as np

# The %matplotlib inline magic renders figures inside the notebook instead of in a pop-out window
%matplotlib inline

#pandas display options
from pandas import set_option
set_option("display.max_rows", 10)

In [2]:
#This is a function that takes a multiparametric data set with the name of the sample "extract_-dilution" as the rows 
#and biological parameters and returns a cleaned cp2 DataFrame indexed by Prefraction and a Dataframe to inform the
#dilution profiles.
def readBioData(file = ''):
    #open a tab delimitted file as a pandas dataframe
    cp1 = pd.read_table(file, sep = '\t')
    
    #This for loop makes two new columns 'Prefraction'and 'Dilution'
    for i in range(0, len(cp1['Features'])):
        name = cp1.get_value(col = 'Features', index = i)
        cp1.set_value(col='Prefraction',index = i, value = str(name.split('_')[0]))
        cp1.set_value(col='Dilution',index = i, value = int(name.split('-')[2]))
    
    #Index cp1 parameters by 'Prefraction' column
    cp1.index = cp1.Prefraction
    cp2 = cp1.iloc[0:,1:-2] #Clean CP Data
    
    #creates a dataframe with information about the diluted CP biological data
    dilutions = cp1['Dilution'][cp1['Dilution']>5]
    
    return dilutions, cp2

In [3]:
#This function takes a Markers Table from Unifi as a CSV File and returns a cleaned dataframe with prefractions 
#as its indices and mz_rt baskets as the columns
def markersTable(file = ''):
    marks = pd.read_csv(file)
    
    #For loop takes the labels from UNIFI and provides a clean 'Prefraction' Column
    for i in range(0, len(marks['PrimaryId'])):
        name = marks.get_value(col = 'PrimaryId', index = i)
        if '_' in str(name):
            marks.set_value(col='Prefraction',index = i, value = str(name).split('_')[1])
        else:
            marks.set_value(col='Prefraction',index=i, value='NaN')
    
    #Redefine indices as 'Prefractions'
    marks.index = marks.Prefraction
    
    return marks.iloc[4:,9:-1] #Slice of the data that only contains ints and floats

In [4]:
#This is a function that takes a cp dataframe with columns of cp parameters and rows indexed by Prefraction and returns
#an nxn matrix. Note only computes each value once xi,yi::yi,xi.
def nbyn(cp2):
    #import necessary libraries
    from scipy.stats import pearsonr
    
    #make nxn DataFrame
    nxn = pd.DataFrame(index = cp2.index, columns = cp2.index, dtype=float)
    
    #compute pearson correlations for lower half of nxn and reflect the other to save time
    for xi in nxn.index:        #loops over all rows
        for yi in nxn.loc[xi:,xi:].columns: #starts at diagonal and computes down a column and across the row
            if xi == yi:
                cos = 1
            else:
                #cos = coscore(cp2.loc[xi].values,cp2.loc[yi].values) 
                cos = pearsonr(cp2.loc[xi].values,cp2.loc[yi].values)[0]
            nxn.set_value(index = xi, col = yi, value = cos)
            nxn.set_value(index = yi, col = xi, value = cos)
    return nxn

#takes in two arrays and computes the cosine score between the two
def coscore(x, y):
	if not len(x) == len(y):
		print ("lengths not same")
		return None
	magx = sum([xi**2 for xi in x]) ** (1.0/2.0)
	magy = sum([yi**2 for yi in y]) ** (1.0/2.0)
	
	dot = 0.0
	for xi, yi in zip(x, y):
		dot += xi * yi
	dot /= float(magx * magy)
	return dot

In [5]:
def scale(dilutionTable, markersTable):
    scaled2 = markersTable.copy()
    for i in dilutionTable.index:
        for column in markersTable.columns:
            markersTable.set_value(col=column, index=i, value=5*scaled2.get_value(col=column, index=i)/dilutionTable[i])

In [6]:
# Load the bioassay table: `diltab` receives readBioData's dilution series
# and `cp` its cleaned parameter frame indexed by Prefraction.
diltab, cp = readBioData('3STDEV_Dilutions_ALL.txt')

In [7]:
# Load the markers table from its CSV export; rows are indexed by Prefraction.
marks = markersTable('Preliminary_MarkerTable.csv')

In [8]:
# Pairwise Pearson correlation matrix between the rows of `cp`.
nxn = nbyn(cp)

In [9]:
# Rescales `marks` in place using the dilutions in `diltab`; scale() returns
# nothing, so re-running this cell scales the already-scaled values again.
scale(diltab, marks)

In [10]:
# Display the marker values stored under the label 'RLPA-1011E'.
marks.loc['RLPA-1011E']

181.98852_0.041    0.000000
158.00325_0.042    0.000000
183.98785_0.042    0.000000
180.99025_0.043    0.000000
213.43665_0.043    0.000000
                     ...   
288.91871_4.481    1.961836
370.24553_4.483    0.000000
167.01323_4.483    0.000000
495.81331_4.484    0.000000
194.11791_4.484    1.049766
Name: RLPA-1011E, dtype: float64

In [55]:
def blankRemove(marks):
    keeper = []
    blank = []
    for c in blanked.columns:
        if "Blank" in blanked[c][blanked[c]>0].index:
            blank.append(c)
        else:
            keeper.append(c)
    return marks.loc[cp.index][keeper]

In [57]:
# Keep only the columns of `marks` that blankRemove does not flag as
# blank-associated, then display the result.
blanked1 = blankRemove(marks)
blanked1

Unnamed: 0_level_0,181.98852_0.041,158.00325_0.042,183.98785_0.042,180.99025_0.043,213.43665_0.043,167.01342_0.044,318.89495_0.045,280.09632_0.046,282.91003_0.046,449.81558_0.046,...,394.79276_4.480,192.92237_4.480,179.01474_4.480,249.98378_4.480,230.95819_4.481,492.81335_4.481,288.91871_4.481,370.24553_4.483,167.01323_4.483,495.81331_4.484
Prefraction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RLPA-1010A,0,0,0,0,0,0.0,0,0,0,0,...,0.00,0.00,0,0.00,0,0,0,0,0,0
RLPA-1010B,0,0,0,0,0,0.0,0,0,0,0,...,0.00,0.00,0,0.00,0,0,0,0,0,0
RLPA-1010C,0,0,0,0,0,0.0,0,0,0,0,...,0.00,0.00,0,198.44,0,0,0,0,0,0
RLPA-1010D,0,0,0,0,0,0.0,0,0,0,0,...,0.00,0.00,0,0.00,0,0,0,0,0,0
RLPA-1010E,0,0,0,0,0,1752.3,0,0,0,0,...,143.91,0.00,0,0.00,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RLPA-2032B,0,0,0,0,0,0.0,0,0,0,0,...,0.00,0.00,0,0.00,0,0,0,0,0,0
RLPA-2032C,0,0,0,0,0,0.0,0,0,0,0,...,0.00,0.00,0,0.00,0,0,0,0,0,0
RLPA-2032D,0,0,0,0,0,0.0,0,0,0,0,...,0.00,405.53,0,0.00,0,0,0,0,0,0
RLPA-2032E,0,0,0,0,0,0.0,0,0,0,0,...,0.00,0.00,0,0.00,0,0,0,0,0,0


In [35]:
# Select the rows labelled 'Blank1' and mask out every entry that is not > 0.
# NOTE(review): `blanked` is not assigned in any cell visible here - confirm
# where it is defined before relying on this cell after a kernel restart.
blanks = blanked.loc['Blank1'][blanked.loc['Blank1']>0]
blanks

Unnamed: 0_level_0,181.98852_0.041,158.00325_0.042,183.98785_0.042,180.99025_0.043,213.43665_0.043,167.01342_0.044,318.89495_0.045,280.09632_0.046,282.91003_0.046,449.81558_0.046,...,192.92237_4.480,179.01474_4.480,249.98378_4.480,230.95819_4.481,492.81335_4.481,288.91871_4.481,370.24553_4.483,167.01323_4.483,495.81331_4.484,194.11791_4.484
Prefraction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Blank1,,,,,,,,,,,...,,,,,,,,,,
Blank1,,,,,,,,,,,...,,,,,,451.71,,,,358.01
Blank1,,,,,,,,,,,...,,,,,,,,,,
Blank1,,,,,,,,,,,...,312.72,,,,,,,,,280.04


In [50]:
# Is the label "Blank" among the rows holding a positive value for this feature?
# NOTE(review): `in` on a pandas Index tests for that exact label, not a substring.
"Blank" in blanked['194.11791_4.484'][blanked['194.11791_4.484']>0].index

True

In [53]:
# Sort every column of `blanked` into `blank` (the label "Blank" is among the
# rows with a positive value) or `keeper` (it is not), echoing each decision.
keeper, blank = [], []
for c in blanked.columns:
    if "Blank" not in blanked[c][blanked[c]>0].index:
        keeper.append(c)
        print('False')
        continue
    blank.append(c)
    print('True')

False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
True
False
False
True
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
True
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
True
False
True
False
False
False
False
False
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
True
False
False
False
False
False
False
Fal

['181.98852_0.041', '158.00325_0.042', '183.98785_0.042', '180.99025_0.043', '213.43665_0.043', '167.01342_0.044', '318.89495_0.045', '280.09632_0.046', '282.91003_0.046', '449.81558_0.046', '304.88874_0.046', '391.82920_0.047', '404.81636_0.047', '275.88788_0.048', '224.12810_0.048', '182.19065_0.048', '270.88639_0.048', '288.91871_0.048', '266.93705_0.049', '306.89147_0.049', '192.92263_0.049', '272.90210_0.049', '214.00368_0.049', '184.98615_0.049', '260.92398_0.049', '483.79814_0.050', '390.81807_0.050', '241.88446_0.051', '223.98855_0.051', '170.98193_0.052', '316.91335_0.052', '186.98149_0.052', '171.99209_0.052', '173.97953_0.052', '570.86372_0.053', '449.81595_0.073', '223.94367_0.086', '195.94889_0.086', '209.96467_0.091', '531.79762_0.091', '310.80729_0.098', '213.43565_0.105', '494.81034_0.116', '497.77624_0.117', '287.88842_0.122', '172.97749_0.124', '280.09613_0.127', '1748.92141_0.127', '392.81430_0.128', '272.90233_0.131', '303.88372_0.132', '270.88617_0.133', '258.96159

In [28]:
# For every row label containing 'Blank', try to add that row's positive
# entries to `Blanks`.
# NOTE(review): this cell raised the ValueError recorded in its output.
# `Blanks` is passed to its own append() as the first argument, and neither
# `Blanks` nor `blanked` is assigned in any cell visible here - confirm intent.
for y in blanked.index:
    if 'Blank' in str(y):
        Blanks.append(Blanks,blanked.loc[y][blanked.loc[y] > 0])

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [115]:
# NOTE(review): `bad` is not assigned in any cell visible here; this cell only
# displays its current value.
bad

[]

In [61]:
#This is going to be a Feature class that describes the distribrution 
#and intensities of each adduct throughout the dataset
class Feature():
    def __init__(self, mz_rt, marks, cp, nxn):
        self.subtable = marks[mz_rt][marks[mz_rt]>0]
        self.runs = marks[mz_rt][marks[mz_rt]>0].index
        self.mz = mz_rt.split('_')[0]
        self.rt = mz_rt.split('_')[1]
        self.subNXN = nxn.loc[self.runs,self.runs]
        self.cScore = Feature.clusterScore(self.runs, nxn)
        self.syntheticFingerprint = cp.loc[self.runs].mean()
        self.aScore = Feature.activityScore(self.syntheticFingerprint)
        #self.synthWeighted 
    
    #function taxes one DataFrame and one array: one NXN of pearson correlations from all runs 
    #and an array of all the runs
    def clusterScore(runs, nxn):
        score = 0
        subNXN = nxn.loc[runs,runs]**3
        return float(subNXN.sum().sum()-len(runs))/(2*len(runs))
    
    #This function takes the cp fingerprint DataFrame and a subtable of runs and peak heights
    def syntheticFingerprint(subtable, cp):
        synth = cp.loc[runs]
    
    #This is a function that takes the synthetic fingerprint and computes the magnitude in order to 
    #estimate the overall activity
    def activityScore(syn):
        aScore = 0
        for i in syn:
            aScore += i**2
        return aScore**(1/2)

In [39]:
# Build the Feature summary for the '489.18891_1.435' column of `marks`.
g = Feature('489.18891_1.435', marks, cp, nxn)

In [15]:
# Restrict the correlation matrix to three prefractions on both rows and columns.
subNXN = nxn.loc[['RLPA-1011E','RLPA-2005E','RLPA-2005D'],['RLPA-1011E','RLPA-2005E','RLPA-2005D']]

In [40]:
# Activity score stored on the Feature `g`.
g.aScore

4.2687645894587734

In [36]:
# Labels of the runs in which the feature `g` has a positive value.
g.runs

Index(['RLPA-1011F', 'RLPA-1011D', 'RLPA-1011E', 'RLPA-2005D', 'RLPA-2005E'], dtype='object', name='Prefraction')

In [37]:
# Correlation matrix restricted to the runs of the feature `g`.
g.subNXN

Prefraction,RLPA-1011F,RLPA-1011D,RLPA-1011E,RLPA-2005D,RLPA-2005E
Prefraction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RLPA-1011F,1.0,0.481253,0.956112,0.704916,0.420261
RLPA-1011D,0.481253,1.0,0.416437,0.433184,0.917915
RLPA-1011E,0.956112,0.416437,1.0,0.669384,0.358097
RLPA-2005D,0.704916,0.433184,0.669384,1.0,0.37239
RLPA-2005E,0.420261,0.917915,0.358097,0.37239,1.0


In [38]:
# Column-wise mean of the `cp` rows belonging to the feature's runs.
# NOTE(review): this rebinds `g` from a Feature to the resulting mean, so the
# cell cannot be re-run without recreating the Feature first (the new `g` is
# not expected to expose `.runs`).
g = cp.loc[g.runs].mean()

In [27]:
# Print every value obtained by iterating over `g`, one per line.
for i in g:
    print(i)

0.0
0.0021704104
0.0012978966
-0.1405889746
0.0400852924
0.0401330424
0.0
-3.304e-05
-1.468e-05
-1.376e-06
0.0
0.0234375
0.01875
0.01875
0.054630681
0.1523426668
0.1637684464
0.2162351376
0.2432429568
0.3650613706
0.171004553
-0.0013517318
0.0061952756
-0.014671587
-0.1859736986
0.1290934766
0.118385695
0.022237499
0.0309176572
0.0305889382
0.0494370608
0.0730008124
0.0773754286
0.0602161354
0.0579452428
0.0579452428
0.0579452428
0.0579452428
0.096379405
0.0967367134
0.1543896148
0.1543896148
-0.011174258
0.014798978
0.0233154612
0.028375647
0.0390744536
0.0011591518
0.0011591518
0.0499196446
-0.0110198386
0.0071147666
0.017813423
0.0452886288
0.0452886288
0.0302047784
0.0302047784
0.0302047784
0.0302047784
0.0299486378
0.097439265
0.0465541748
-0.0831875216
-0.0831875216
-0.060975821
-0.060975821
0.0165094912
0.0165094912
0.0
-0.3019697616
-0.376763782
-0.0019469378
-0.0023002796
-0.1057306948
-0.129026978
-0.043676946
-0.0313615926
-0.0906691702
-0.0734808764
-0.1187734278
-0.0117322

In [77]:
def analyze(marks, cp, nxn):
    adductTable = pd.DataFrame(columns = ['mz','rt','Cluster Score','Activity Score'])
    for c in marks.columns:
        i = Feature(c, marks, cp, nxn)
        add = pd.DataFrame({'mz': i.mz,
                               'rt': i.rt ,
                               'Cluster Score': i.cScore,
                               'Activity Score': i.aScore,
                               },index = [c])
        adductTable.append(add)
    return adductTable

In [78]:
# Score every feature that survived blank removal.
# NOTE(review): the output recorded for this cell includes a ZeroDivisionError.
g = analyze(blanked1, cp, nxn)

ZeroDivisionError: float division by zero

SyntaxError: keyword can't be an expression (<ipython-input-69-fa7aa231318f>, line 1)

Unnamed: 0,mz,rt,Cluster Score,Activity Score
