In [1]:
#For an interesting demo also check out Rhodes' SciPy Pandas demo.
#load libraries
import pandas as pd
import numpy as np

# % makes it in window instead of popout
%matplotlib inline

#pandas display options
from pandas import set_option
set_option("display.max_rows", 10)

In [2]:
dude = pd.read_csv('Preliminary_MarkerTable.csv')

In [3]:
#This function takes a Markers Table from Unifi as a CSV File and returns a cleaned dataframe with prefractions 
#as its indices and mz_rt baskets as the columns
def markersTable(file = ''):
    marks = pd.read_csv(file)
    
    #For loop takes the labels from UNIFI and provides a clean 'Prefraction' Column
    for i in range(0, len(marks['PrimaryId'])):
        name = marks.get_value(col = 'PrimaryId', index = i)
        if '_' in str(name):
            marks.set_value(col='Prefraction',index = i, value = str(name).split('_')[1])
        else:
            marks.set_value(col='Prefraction',index=i, value='NaN')
    
    #Redefine indices as 'Prefractions'
    marks.index = marks.Prefraction
    
    return marks.iloc[4:,9:-1] #Slice of the data that only contains ints and floats

In [4]:
dude = markersTable('Preliminary_MarkerTable.csv')

In [8]:
#Sloppy way to assign values
cp = pd.read_table('3STDEV_Dilutions_ALL.txt', sep = '\t')
for i in range(0, len(cp['Features'])):
    dil = cp.get_value(col = 'Features', index = i)
    cp.set_value(col='Prefraction',index = i, value = str(dil.split('_')[0]))
    cp.set_value(col='Dilution',index = i, value = int(dil.split('-')[2]))

In [9]:
cp2 = cp.copy()

In [6]:
scaled = dude.copy()

In [8]:
#This is going to be a section that takes the dilution value of the Prefraction that was used for screening as indicated
#by the _-# after the Prefraction ID in order to scale the intensity of the peak area in the m/z,rt table
#All values in the m/z,rt table will be scaled for each extract by dividing by the dilution factor/5

In [10]:
#Makes a DataFrame that is a subset of the original cp DataFrame but for only the dilution 
#factor and prefractions that need to be adjusted
toDilute = cp2[cp2['Dilution'] > 5][['Dilution','Prefraction']] #example of the where statement and selection of rows

In [11]:
toDilute

Unnamed: 0,Dilution,Prefraction
3,200,RLPA-1010D
8,20,RLPA-1011C
9,20,RLPA-1011D
10,1280,RLPA-1011E
11,640,RLPA-1011F
...,...,...
158,80,RLPA-2020D
159,400,RLPA-2020E
165,10,RLPA-2021E
170,10,RLPA-2022D


In [12]:
toDil = toDilute.copy()

In [13]:
toDil = toDil.sort_index(axis = 1, ascending = False)
toDil.index = toDil.Prefraction

In [36]:
toDil

Unnamed: 0_level_0,Prefraction,Dilution
Prefraction,Unnamed: 1_level_1,Unnamed: 2_level_1
RLPA-1010D,RLPA-1010D,200
RLPA-1011C,RLPA-1011C,20
RLPA-1011D,RLPA-1011D,20
RLPA-1011E,RLPA-1011E,1280
RLPA-1011F,RLPA-1011F,640
...,...,...
RLPA-2020D,RLPA-2020D,80
RLPA-2020E,RLPA-2020E,400
RLPA-2021E,RLPA-2021E,10
RLPA-2022D,RLPA-2022D,10


In [14]:
scaled2 = scaled.iloc[4:,9:-1] #Slice of the data that only contains ints and floats

In [29]:
scaled3 = scaled.copy()
scaled3

Unnamed: 0_level_0,181.98852_0.041,158.00325_0.042,183.98785_0.042,180.99025_0.043,213.43665_0.043,167.01342_0.044,318.89495_0.045,280.09632_0.046,282.91003_0.046,449.81558_0.046,...,192.92237_4.480,179.01474_4.480,249.98378_4.480,230.95819_4.481,492.81335_4.481,288.91871_4.481,370.24553_4.483,167.01323_4.483,495.81331_4.484,194.11791_4.484
Prefraction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Blank1,0,0,0,0,0,0,0,0,0,0,...,0.00,0,0,0.00,0,0.00,0,0,0,0.00
Blank2,0,0,0,0,0,0,0,0,0,0,...,0.00,0,0,0.00,0,0.00,0,0,0,0.00
Blank3,0,0,0,0,0,0,0,0,0,0,...,0.00,0,0,0.00,0,356.88,0,0,0,211.39
Blank4,0,0,0,0,0,0,0,0,0,0,...,0.00,0,0,0.00,0,0.00,0,0,0,216.59
Blank5,0,0,0,0,0,0,0,0,0,0,...,0.00,0,0,113.29,0,0.00,0,0,0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Blank19,0,0,0,0,0,0,0,0,0,0,...,0.00,0,0,0.00,0,0.00,0,0,0,241.80
Blank18,0,0,0,0,0,0,0,0,0,0,...,0.00,0,0,0.00,0,0.00,0,0,0,258.00
RLPA-2032F,0,0,0,0,0,0,0,0,0,0,...,0.00,0,0,0.00,0,0.00,0,0,0,0.00
RLPA-2032E,0,0,0,0,0,0,0,0,0,0,...,0.00,0,0,0.00,0,0.00,0,0,0,0.00


In [16]:
for i in toDil.index:
    for column in scaled3.columns:
        scaled3.set_value(col=column, index=i, value=5*scaled2.get_value(col=column, index=i)/toDil.get_value(col='Dilution', index=i))

In [42]:
for i in toDil.index:
    scaled3.loc[i]/

In [43]:
scaled3.loc['RLPA-1011E'].values

array([   0.  ,    0.  ,    0.  , ...,    0.  ,    0.  ,  268.74])

In [17]:
scaled2['RLPA-1011E':'RLPA-1011E']

Unnamed: 0_level_0,181.98852_0.041,158.00325_0.042,183.98785_0.042,180.99025_0.043,213.43665_0.043,167.01342_0.044,318.89495_0.045,280.09632_0.046,282.91003_0.046,449.81558_0.046,...,192.92237_4.480,179.01474_4.480,249.98378_4.480,230.95819_4.481,492.81335_4.481,288.91871_4.481,370.24553_4.483,167.01323_4.483,495.81331_4.484,194.11791_4.484
Prefraction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RLPA-1011E,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,502.23,0,0,0,268.74


In [18]:
scaled3['RLPA-1011E':'RLPA-1011E']

Unnamed: 0_level_0,181.98852_0.041,158.00325_0.042,183.98785_0.042,180.99025_0.043,213.43665_0.043,167.01342_0.044,318.89495_0.045,280.09632_0.046,282.91003_0.046,449.81558_0.046,...,192.92237_4.480,179.01474_4.480,249.98378_4.480,230.95819_4.481,492.81335_4.481,288.91871_4.481,370.24553_4.483,167.01323_4.483,495.81331_4.484,194.11791_4.484
Prefraction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RLPA-1011E,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1.961836,0,0,0,1.049766


In [19]:
sampled = scaled3['RLPA-1011E':'RLPA-2005E']

In [20]:
sampled1 = sampled.copy()

In [21]:
#find staurosporine as an example
for c in sampled1.columns:
    if '489.1' in c:
        print(c)

489.15413_1.236
489.18891_1.435
489.10431_2.003
489.18939_3.475


In [22]:
sampled1['489.18891_1.435'][sampled1['489.18891_1.435']>0]

Prefraction
RLPA-1011E      6.923945
RLPA-2005D    800.150000
RLPA-2005E     81.626875
Name: 489.18891_1.435, dtype: float64

In [23]:
#This is going to be a Feature class that describes the distribrution 
#and intensities of each adduct throughout the dataset
class Feature():
    def __init__(self, mz_rt):
        self.table = sampled2[mz_rt][sampled2[mz_rt]>0]
        self.mz = mz_rt.split('_')[0]
        self.rt = mz_rt.split('_')[1]

In [24]:
#make an even smaller test set
#find boundaries
for c in sampled1.columns:
    if '_1.43' in c:
        print(c)

530.25759_1.430
636.37477_1.430
505.33397_1.430
401.23838_1.430
343.17944_1.430
557.32553_1.430
634.44672_1.430
653.41930_1.430
576.81973_1.430
342.67802_1.430
316.22286_1.430
556.82421_1.430
630.42936_1.430
631.43841_1.430
1112.63640_1.430
316.72433_1.430
670.27230_1.430
508.27480_1.431
223.80062_1.431
610.35403_1.431
335.19340_1.431
670.77596_1.431
751.32896_1.431
683.34171_1.431
652.41129_1.431
744.32103_1.431
1152.63281_1.431
544.23625_1.431
483.79592_1.431
224.02972_1.431
303.05026_1.431
263.05730_1.431
855.38475_1.432
777.42115_1.432
222.03234_1.432
319.23372_1.432
193.05311_1.432
235.06339_1.432
522.25405_1.432
281.06884_1.432
577.32122_1.432
283.06561_1.432
670.34498_1.432
338.36727_1.433
614.38868_1.433
338.27559_1.433
698.36447_1.433
525.26266_1.433
519.28113_1.433
467.59809_1.433
643.43483_1.433
541.26258_1.433
1503.52955_1.433
467.55069_1.433
1185.47958_1.433
609.29800_1.433
1483.56063_1.433
1185.97863_1.434
1501.53183_1.434
1037.33034_1.434
577.74185_1.434
467.43020_1.434


In [25]:
sampled2 = sampled1.iloc[0:,10900:10999]

In [26]:
sampled2

Unnamed: 0_level_0,467.55069_1.433,1185.47958_1.433,609.29800_1.433,1483.56063_1.433,1185.97863_1.434,1501.53183_1.434,1037.33034_1.434,577.74185_1.434,467.43020_1.434,295.16629_1.434,...,336.11223_1.439,556.29811_1.439,409.14102_1.440,803.21806_1.440,1048.79456_1.440,307.11146_1.440,717.30951_1.440,253.14130_1.440,680.30591_1.441,602.25551_1.441
Prefraction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RLPA-1011E,0,0,0,0,0,0,0,0.000,0,0,...,2.072969,0,0.796562,0,0,0.693594,0,0,0,0
Blank10,0,0,0,0,0,0,0,0.000,0,0,...,0.000000,0,0.000000,0,0,0.000000,0,0,0,0
RLPA-1021E,0,0,0,0,0,0,0,0.000,0,0,...,0.000000,0,0.000000,0,0,0.000000,0,0,0,0
RLPA-1014A,0,0,0,0,0,0,0,0.000,0,0,...,0.000000,0,0.000000,0,0,0.000000,0,0,0,0
RLPA-1012A,0,0,0,0,0,0,0,0.000,0,0,...,0.000000,0,0.000000,0,0,0.000000,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
RLPA-1022B,0,0,0,0,0,0,0,0.000,0,0,...,0.000000,0,0.000000,0,0,0.000000,0,0,0,0
RLPA-1010B,0,0,0,0,0,0,0,0.000,0,0,...,0.000000,0,0.000000,0,0,0.000000,0,0,0,0
RLPA-2005F,0,0,0,0,0,0,0,0.000,0,0,...,0.000000,0,0.000000,0,0,0.000000,0,0,0,0
RLPA-2004D,0,0,0,0,0,0,0,175.915,0,0,...,0.000000,0,0.000000,0,0,0.000000,0,0,0,0


In [27]:
g = Feature('489.18891_1.435')

In [84]:
g.table

Prefraction
RLPA-1011E      6.923945
RLPA-2005D    800.150000
RLPA-2005E     81.626875
Name: 489.18891_1.435, dtype: float64

In [42]:
cp2 = cp.copy()
cp2.index = cp.Prefraction

In [33]:
nxn = np.empty([len(cp['Prefraction']),len(cp['Prefraction'])], dtype = float)

In [41]:
nxn = pd.DataFrame(index = cp2.index, columns = cp2.index)

In [70]:
cp3 = cp2.iloc[0:,1:-2].get_values()
cp4 = cp2.iloc[0:,1:-2]

In [77]:
cp4.index = cp2.Prefraction
cp4[1]

KeyError: 1

In [67]:
def coscore(x, y):
	if not len(x) == len(y):
		print ("lengths not same")
		return None
	magx = sum([xi**2 for xi in x]) ** (1.0/2.0)
	magy = sum([yi**2 for yi in y]) ** (1.0/2.0)
	
	dot = 0.0
	for xi, yi in zip(x, y):
		dot += xi * yi
	dot /= float(magx * magy)
	return dot

In [69]:
coscore(cp3[0],cp3[0])

0.99999999999999989