### Notebook Author: Jonathan McNutt
#### Date: 04/16/2018

This code was completed as part of a Harvard graduate student faculty aide research project during the Spring 2018 semester. It supported a presentation on high-dimensional visualization techniques given by Professor Peter Henstock at the Bio-IT World 2018 Conference in Boston, MA.


In [45]:
import pandas as pd
import numpy as np

In [46]:
# Path to the input CSV. Later cells read its "ECFP6_fp" column, whose cells hold
#  comma-delimited integer values (one cell per row).
input_csv_file_name = "ugiSmiles100000aECFP.csv"

In [47]:
# Path of the text file that the final cells write the binarized matrix to
#  (space-delimited integers, intended as LargeVis input)
output_txt_file_name = "test_ECFP_a.txt"

In [48]:
# Load the whole CSV into a DataFrame; the extraction cells below read from `df`
df = pd.read_csv(input_csv_file_name)

In [49]:
# Display the loaded DataFrame to confirm the expected columns are present
df

Unnamed: 0,PCAT_CMPD_ID,smiles,ECFP6_fp
0,pvh13,CCCCNC(=O)C(C1CCC=CC1)N(C1CCCC1)C(=O)C1(CC1)c1...,"115,200,281,323,378,438,481,528,534,602,697,71..."
1,pvh15,Cc1cccc(C)c1NC(=O)C(C1CCC=CC1)N(C1CCCC1)C(=O)C...,"115,200,281,323,378,438,481,528,534,611,697,71..."
2,pvh19,CC(C)(C)NC(=O)C(c1n(C)ccc1)N(C1CCCC1)C(=O)C1(C...,"115,323,345,378,649,711,772,788,792,808,859,86..."
3,pvh29,CC(C)(CC(C)(C)C)NC(=O)C(c1c(Cl)cccc1)N(C1CCCC1...,"115,221,323,345,378,395,506,521,711,772,788,79..."
4,pvh42,Cc1cccc(C)c1NC(=O)C(c1c(OC)cccc1)N(C1CCCC1)C(=...,"6,115,323,345,378,611,711,772,788,792,808,874,..."
5,pvh57,c1ccccc1CNC(=O)C(c1c(C(F)(F)F)cccc1)N(C1CCCC1)...,"81,115,170,199,249,323,345,346,378,429,711,772..."
6,pvh74,CC(C)(CC(C)(C)C)NC(=O)C(c1c(C)c(C)c(cc1)OC)N(C...,"115,245,323,345,378,395,506,521,676,703,711,77..."
7,pvh79,CC(C)NC(=O)C(c1c(C)c(C)c(cc1)OC)N(C1CCCC1)C(=O...,"115,245,323,345,378,628,655,676,703,711,772,78..."
8,pvh96,Cc1cccc(C)c1NC(=O)C(c1cc(OC)c(OC)c(c1)OC)N(C1C...,"115,323,345,378,400,474,611,711,728,763,772,78..."
9,pvh100,CC(C)(C)NC(=O)C(N(C1CCCC1)C(=O)C1(CC1)c1ccc(Cl...,"81,115,301,323,345,378,429,579,649,711,728,763..."


In [50]:
# Start with an empty container; the extraction cell below populates it with the
#  parsed ECFP property values
property_array = []

In [51]:
# Extract, split and combine the ECFP property values of the first `n_compounds` rows
#  into one flat integer array, "property_array".
n_compounds = 25000  # number of leading CSV rows to process

# .iloc slicing is positional and clips at the end of the frame, so a CSV with fewer
#  than n_compounds rows is simply processed in full
fingerprint_cells = df.ECFP6_fp.iloc[:n_compounds]

# split each "cell" on the comma delimiter and parse the pieces as integers
#  (the builtin `int` is used as dtype because the `np.int` alias was removed in NumPy 1.24)
per_compound_values = [np.array(cell.split(','), dtype=int) for cell in fingerprint_cells]

# combine everything with a single concatenate: calling np.append inside a loop copies
#  the whole accumulated array on every iteration. Assigning (rather than appending to the
#  previous value) also makes this cell safe to re-run without duplicating the data.
if per_compound_values:
    property_array = np.concatenate(per_compound_values)
else:
    property_array = np.array([], dtype=int)

In [52]:
# Check the total number of ECFP values collected across all processed rows
#  (values that occur in several rows are counted each time)
print(property_array.size)

2142765


In [53]:
# Reduce property_array to its distinct values, then cast them to integers and keep
#  the result as unique_properties
distinct_values = np.unique(property_array)
unique_properties = distinct_values.astype(int)

In [54]:
# Check how many distinct property values occur in property_array
print(unique_properties.size)

4027


In [55]:
# Print unique_properties to inspect its smallest and largest values
print(unique_properties)

[   1    2    3 ..., 4093 4094 4095]


In [56]:
# Keep the last element of unique_properties in a variable so it can be used later
#  as the array length when initializing an array of zeros.
#  (np.unique returns its result sorted, so the last element is the largest value.)
last_unique_property_value = unique_properties[-1]

In [57]:
# Check the last value of unique_properties (the highest integer-valued property among
#  the rows that were processed)
print(last_unique_property_value)

4095


In [58]:
# Get the first compound's numpy.ndarray class representation of the ECFP property values.
#  The DataFrame loaded by this notebook is named `df`; the previously referenced name
#  `ugiSmiles_a_ECFP_df` is never defined here and raises NameError on a fresh kernel.
ECFP_properties = df[0:1].ECFP6_fp.values

In [59]:
# Split the numpy.ndarray "cell" on the comma delimiter and parse the pieces as integers.
#  The result is 2-D: one row per cell in ECFP_properties.
#  (dtype is the builtin `int`; the `np.int` alias was removed in NumPy 1.24.)
split_properties = np.array([x.split(',') for x in ECFP_properties], dtype=int)

In [60]:
# Create the placeholder row of integer zeros. Its length equals the value stored in
#  "last_unique_property_value", i.e. one column per possible property value.
binarize_properties = np.zeros(last_unique_property_value, dtype=int)

In [61]:
# Check that the zeroed array has exactly "last_unique_property_value" elements
binarize_properties.size

4095

In [62]:
# Binarize the properties from the first compound: property value v switches on
#  position v-1. All positions are set in one indexed assignment.
binarize_properties[split_properties.ravel() - 1] = 1

In [63]:
# Print split_properties to see the property values parsed for the first compound
print(split_properties)

[[ 115  200  281  323  378  438  481  528  534  602  697  711  741  772
   788  792  808  874  908  984  998 1036 1086 1288 1322 1365 1382 1407
  1408 1417 1576 1609 1695 1718 1733 1849 1879 1924 1998 2003 2096 2119
  2166 2184 2237 2258 2283 2299 2318 2348 2381 2460 2723 2728 2762 2831
  2939 2942 2955 2961 2987 3016 3021 3247 3265 3451 3465 3475 3522 3569
  3647 3657 3662 3670 3694 3890 3919 3953]]


In [64]:
# Spot-check the binarization: look up the first compound's first property value,
#  shift it by one to get its position, and read that position of binarize_properties.
#  A result of '1' means the bit was set.
first_property_value = split_properties[0, 0]
binarize_properties[first_property_value - 1]

1

In [65]:
# Now that binarize_properties holds the first fully binarized compound, binarize the
#  remaining compounds (rows 1 .. n_compounds-1) and stack them underneath it.
n_compounds = 25000  # total number of leading CSV rows to include, counting row 0

# .iloc slicing is positional and clips at the end of the frame, so a CSV with fewer
#  rows is processed in full instead of being padded with all-zero rows
remaining_cells = df.ECFP6_fp.iloc[1:n_compounds]

# Preallocate one zeroed row per remaining compound, with as many columns as the value of
#  "last_unique_property_value". Filling a preallocated matrix avoids calling np.vstack
#  inside the loop, which re-copies the whole growing matrix on every iteration.
remaining_rows = np.zeros((len(remaining_cells), last_unique_property_value), dtype=int)

for row_idx, cell in enumerate(remaining_cells):
    # split the "cell" on the comma delimiter and parse the pieces as integers
    #  (builtin `int` as dtype: the `np.int` alias was removed in NumPy 1.24)
    property_values = np.array(cell.split(','), dtype=int)

    # property value v switches on column v-1
    # NOTE(review): a property value of 0 would index column -1 (the last column);
    #  the unique values printed earlier start at 1 -- confirm for other inputs
    remaining_rows[row_idx, property_values - 1] = 1

# one vstack puts the first compound's row on top of all the others
binarize_properties = np.vstack((binarize_properties, remaining_rows))

In [66]:
# Check the dimensions of the binarize_properties array
#  The number of rows should equal the number of compounds processed, and the number of
#   columns should equal the value of last_unique_property_value
print(binarize_properties.shape)

(25000, 4095)


In [67]:
# Save the binarize_properties array into a txt file delimited by spaces and containing only integers
#  Use the previously defined variable "output_txt_file_name" as the txt file name here
#  NOTE: this file has no header line; the following cell reopens the same path in
#   write mode and replaces its contents
np.savetxt(output_txt_file_name, binarize_properties, delimiter=" ", fmt='%i')

In [68]:
# Write the final LargeVis input file. Opening in 'w' mode truncates whatever is already
#  at output_txt_file_name, so the file is rebuilt from scratch: first the header line
#  "<rows> <columns>", then one space-delimited line per row of binarize_properties.
header_line = "{} {}\n".format(*binarize_properties.shape)
with open(output_txt_file_name,'w') as out:
    out.write(header_line)
    out.writelines(' '.join(row.astype(str))+'\n' for row in binarize_properties)