## Structural keys - CB1 ligands

In [10]:
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
import csv
import pandas as pd
import seaborn as sns
from rdkit.Chem import MACCSkeys


In [4]:
# Load the CB1 ligand table; the file uses ';' as its field separator
df_can = pd.read_csv('cannabinoid.csv', sep=';')

In [5]:
# Clean data: discard every row whose 'Smiles' entry is missing,
# since those rows cannot be turned into molecules later on
df1 = df_can.dropna(axis='index', how='any', subset=['Smiles'])

In [6]:
# Build RDKit molecule instances from the ligand SMILES strings.
# Chem.MolFromSmiles returns None for a SMILES it cannot parse; such entries
# are dropped here (mirroring how invalid DrugBank records are filtered) so
# that later substructure searches never receive None.
parsed_ligands = (Chem.MolFromSmiles(smiles) for smiles in df1["Smiles"])
ligandm_database = [mol for mol in parsed_ligands if mol is not None]

In [7]:
# Load the DrugBank structures from the SDF file and keep only the truthy
# entries (presumably the supplier yields None for records it cannot parse)
drugbank = Chem.SDMolSupplier('../data/drugbank.sdf')
drugs = list(filter(None, drugbank))

In [8]:
# Patterns - structural keys, compiled from SMARTS in this order:
#   carboxyl_pattern : C(=O) attached to an O carrying one hydrogen
#   amino_pattern    : three-connected N bearing one or two hydrogens
#   ketone_pattern   : C=O flanked by a carbon on both sides
#   phl_pattern      : OH attached directly to a benzene ring
carboxyl_pattern, amino_pattern, ketone_pattern, phl_pattern = map(
    Chem.MolFromSmarts,
    (
        'C(=O)[O;h1]',
        '[NX3;H2,H1]',
        '[#6][CX3](=O)[#6]',
        '[OH]c1ccccc1',
    ),
)

In [9]:
# Search for each pattern in my dataset: one row of booleans per ligand,
# one column per structural key (carboxyl, amino, ketone, phenol)
custom_key = [carboxyl_pattern, amino_pattern, ketone_pattern, phl_pattern]
can_keys = [
    [mol.HasSubstructMatch(pattern) for pattern in custom_key]
    for mol in ligandm_database
]
len(can_keys), can_keys

(2998,
 [[False, True, False, False],
  [False, False, False, False],
  [False, True, False, False],
  [False, True, False, False],
  [False, True, False, False],
  [False, False, False, False],
  [False, True, False, False],
  [False, True, False, False],
  [False, True, False, False],
  [False, False, False, False],
  [False, False, False, False],
  [False, True, True, False],
  [False, True, False, True],
  [False, True, False, False],
  [False, False, False, False],
  [False, False, False, False],
  [False, True, False, False],
  [False, False, False, False],
  [False, False, False, False],
  [False, True, False, False],
  [False, False, False, False],
  [False, True, False, False],
  [False, True, False, False],
  [False, False, False, False],
  [False, True, False, False],
  [False, False, False, False],
  [False, True, False, False],
  [False, True, False, False],
  [False, True, False, False],
  [False, True, False, False],
  [True, True, False, False],
  [False, True, False, F

In [11]:
# Search for each pattern in DrugBank: one row of booleans per drug,
# one column per structural key (carboxyl, amino, ketone, phenol)
custom_key = [carboxyl_pattern, amino_pattern, ketone_pattern, phl_pattern]
drug_keys = [
    [mol.HasSubstructMatch(pattern) for pattern in custom_key]
    for mol in drugs
]
len(drug_keys), drug_keys

(7113,
 [[True, True, False, True],
  [False, True, False, True],
  [False, True, False, True],
  [False, True, False, True],
  [False, True, False, False],
  [False, True, False, False],
  [False, True, False, False],
  [False, False, False, False],
  [False, True, False, False],
  [True, True, False, False],
  [True, True, False, False],
  [True, True, False, False],
  [True, False, True, False],
  [True, True, False, False],
  [True, True, False, False],
  [False, False, False, False],
  [True, True, False, False],
  [True, True, False, False],
  [False, False, False, False],
  [False, True, False, False],
  [True, True, False, False],
  [True, True, False, False],
  [True, True, False, False],
  [False, True, False, False],
  [True, False, False, False],
  [True, True, False, False],
  [True, True, False, False],
  [True, True, False, True],
  [False, False, False, False],
  [False, False, False, False],
  [True, True, False, False],
  [True, False, False, False],
  [False, False, 

In [13]:
# MACCS keys: compute one fingerprint per molecule for both data sets,
# then display the first ligand fingerprint object
can_maccs = list(map(MACCSkeys.GenMACCSKeys, ligandm_database))
drug_maccs = list(map(MACCSkeys.GenMACCSKeys, drugs))
can_maccs[0]

<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x29e59615f80>

In [21]:
# Size of the vector: total number of bits in the MACCS fingerprint
# of the ligand at index 20
can_maccs[20].GetNumBits()

167

In [18]:
# Indices of the bits that are set (True) in the fingerprint of ligand 20
[*can_maccs[20].GetOnBits()]

[38,
 42,
 62,
 65,
 66,
 74,
 77,
 80,
 87,
 98,
 101,
 103,
 105,
 107,
 112,
 120,
 121,
 125,
 134,
 137,
 141,
 142,
 144,
 145,
 149,
 150,
 156,
 160,
 161,
 162,
 163,
 165]

In [22]:
# Individual bits: look up bits 20 and 26 of the first ligand fingerprint
tuple(can_maccs[0].GetBit(bit) for bit in (20, 26))

(False, False)

In [24]:
# Bit values: the first ligand fingerprint rendered as a string of
# '0'/'1' characters, one character per bit
can_maccs[0].ToBitString()

'00000000000000000000000000000000000000000010000000000010000000000100000000000001001101110011100110001100110100110010111001100100000101110110011011011111011110101111110'

In [25]:
# Per-bit occurrence counts: for every MACCS bit, how many ligands have it set.
# The counter list gets one slot per fingerprint bit.
can_ligands_maccs_sums = [0 for _ in range(can_maccs[0].GetNumBits())]
for fingerprint in can_maccs:
    for bit_index in fingerprint.GetOnBits():
        can_ligands_maccs_sums[bit_index] += 1
can_ligands_maccs_sums

[0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 27,
 0,
 1,
 66,
 1,
 4,
 5,
 1,
 7,
 56,
 0,
 106,
 0,
 4,
 149,
 52,
 52,
 113,
 75,
 18,
 44,
 11,
 13,
 0,
 409,
 412,
 22,
 4,
 368,
 434,
 577,
 52,
 71,
 277,
 566,
 365,
 7,
 41,
 76,
 324,
 62,
 79,
 175,
 500,
 643,
 315,
 421,
 485,
 31,
 560,
 486,
 473,
 485,
 505,
 1598,
 36,
 456,
 1668,
 774,
 513,
 20,
 487,
 79,
 94,
 461,
 506,
 665,
 1443,
 193,
 1103,
 219,
 1328,
 1281,
 777,
 662,
 1538,
 323,
 1241,
 1021,
 1587,
 871,
 609,
 1231,
 1172,
 1694,
 735,
 1029,
 1408,
 1638,
 1461,
 1663,
 274,
 1624,
 1247,
 594,
 1418,
 1183,
 1288,
 1156,
 1732,
 559,
 653,
 2103,
 1684,
 1319,
 1094,
 456,
 669,
 741,
 2298,
 1649,
 253,
 1744,
 2159,
 1659,
 486,
 1118,
 2419,
 861,
 1362,
 1366,
 1501,
 718,
 1306,
 1024,
 1682,
 1767,
 1405,
 1083,
 2465,
 1208,
 795,
 631,
 693,
 2383,
 1362,
 1837,
 2608,
 1207,
 1698,
 1879,
 1205,
 2000,
 2090,
 1317,
 1942,
 2412,
 1680,
 2652,
 1567,
 2606,
 1978,
 1971,
 2736,
 2796,
 2900,
 2862,
 293

In [27]:
# Per-bit occurrence counts: for every MACCS bit, how many DrugBank
# molecules have it set. The counter list gets one slot per fingerprint bit.
drugbank_maccs_sums = [0 for _ in range(drug_maccs[0].GetNumBits())]
for fingerprint in drug_maccs:
    for bit_index in fingerprint.GetOnBits():
        drugbank_maccs_sums[bit_index] += 1
drugbank_maccs_sums

[0,
 0,
 0,
 30,
 0,
 1,
 11,
 24,
 99,
 46,
 22,
 117,
 30,
 60,
 33,
 11,
 52,
 77,
 68,
 282,
 7,
 21,
 181,
 212,
 435,
 564,
 224,
 96,
 149,
 726,
 126,
 21,
 431,
 481,
 153,
 44,
 625,
 521,
 1312,
 153,
 167,
 166,
 698,
 1198,
 266,
 246,
 194,
 455,
 869,
 713,
 646,
 601,
 553,
 1895,
 2246,
 663,
 178,
 1633,
 669,
 604,
 685,
 695,
 1753,
 207,
 624,
 2507,
 1118,
 782,
 133,
 1551,
 295,
 526,
 2196,
 755,
 1184,
 2092,
 797,
 2245,
 703,
 2263,
 2296,
 1228,
 2080,
 2781,
 2109,
 2498,
 1685,
 1310,
 1596,
 2474,
 3533,
 3342,
 2825,
 1650,
 1481,
 3191,
 3098,
 2783,
 3056,
 1145,
 2845,
 2775,
 1934,
 786,
 3210,
 2939,
 2665,
 1534,
 1732,
 2372,
 3113,
 3377,
 2725,
 2292,
 1097,
 1828,
 1896,
 3354,
 3347,
 924,
 3223,
 3762,
 2973,
 2791,
 2463,
 3448,
 2547,
 3821,
 2689,
 2850,
 1791,
 4638,
 3554,
 2717,
 1637,
 2198,
 3264,
 4492,
 2581,
 4291,
 3262,
 1352,
 4169,
 3821,
 2802,
 3698,
 4375,
 3396,
 3793,
 2507,
 3939,
 4407,
 3963,
 4550,
 4670,
 4851,
 5254

In [28]:
# Divide the raw incidence counts by the total set size so each entry
# becomes the fraction of ligands in which that bit is set
n_can_ligands = len(can_maccs)
can_ligands_maccs_scaled = [count / n_can_ligands for count in can_ligands_maccs_sums]
can_ligands_maccs_scaled

[0.0,
 0.0,
 0.0,
 0.00066711140760507,
 0.0,
 0.0,
 0.0,
 0.0,
 0.009006004002668445,
 0.0,
 0.000333555703802535,
 0.022014676450967312,
 0.000333555703802535,
 0.00133422281521014,
 0.001667778519012675,
 0.000333555703802535,
 0.0023348899266177454,
 0.018679119412941963,
 0.0,
 0.035356904603068715,
 0.0,
 0.00133422281521014,
 0.049699799866577715,
 0.01734489659773182,
 0.01734489659773182,
 0.037691794529686455,
 0.025016677785190126,
 0.00600400266844563,
 0.01467645096731154,
 0.003669112741827885,
 0.004336224149432955,
 0.0,
 0.13642428285523683,
 0.13742494996664442,
 0.00733822548365577,
 0.00133422281521014,
 0.12274849899933289,
 0.1447631754503002,
 0.19246164109406272,
 0.01734489659773182,
 0.023682454969979987,
 0.0923949299533022,
 0.1887925283522348,
 0.12174783188792529,
 0.0023348899266177454,
 0.013675783855903937,
 0.025350233488992662,
 0.10807204803202135,
 0.02068045363575717,
 0.026350900600400268,
 0.058372248165443626,
 0.1667778519012675,
 0.21447631754

In [30]:
# Divide the raw incidence counts by the total set size so each entry
# becomes the fraction of DrugBank molecules in which that bit is set
n_drugs = len(drug_maccs)
drugbank_maccs_scaled = [count / n_drugs for count in drugbank_maccs_sums]
drugbank_maccs_scaled

[0.0,
 0.0,
 0.0,
 0.004217629692113032,
 0.0,
 0.00014058765640376775,
 0.0015464642204414453,
 0.003374103753690426,
 0.013918177983973008,
 0.006467032194573317,
 0.0030929284408828905,
 0.016448755799240825,
 0.004217629692113032,
 0.008435259384226065,
 0.004639392661324336,
 0.0015464642204414453,
 0.007310558132995923,
 0.010825249543090117,
 0.009559960635456208,
 0.0396457191058625,
 0.0009841135948263741,
 0.0029523407844791226,
 0.025446365809081963,
 0.029804583157598763,
 0.06115563053563897,
 0.079291438211725,
 0.03149163503444397,
 0.013496415014761703,
 0.020947560804161394,
 0.10206663854913539,
 0.017714044706874738,
 0.0029523407844791226,
 0.0605932799100239,
 0.0676226627302123,
 0.021509911429776464,
 0.006185856881765781,
 0.08786728525235485,
 0.07324616898636299,
 0.1844510052017433,
 0.021509911429776464,
 0.023478138619429215,
 0.023337550963025446,
 0.0981301841698299,
 0.16842401237171375,
 0.03739631660340222,
 0.03458456347532687,
 0.027274005342330942,


In [32]:
# The differences: for every bit index, ligand fraction minus DrugBank fraction
# (positive = more frequent in the ligand set, negative = more frequent in DrugBank)
can_drugbank_differences = [
    (bit_index, can_fraction - drug_fraction)
    for bit_index, (can_fraction, drug_fraction) in enumerate(
        zip(can_ligands_maccs_scaled, drugbank_maccs_scaled)
    )
]
can_drugbank_differences

[(0, 0.0),
 (1, 0.0),
 (2, 0.0),
 (3, -0.0035505182845079622),
 (4, 0.0),
 (5, -0.00014058765640376775),
 (6, -0.0015464642204414453),
 (7, -0.003374103753690426),
 (8, -0.0049121739813045625),
 (9, -0.006467032194573317),
 (10, -0.0027593727370803555),
 (11, 0.005565920651726487),
 (12, -0.0038840739883104973),
 (13, -0.0071010365690159245),
 (14, -0.0029716141423116607),
 (15, -0.0012129085166389102),
 (16, -0.0049756682063781775),
 (17, 0.007853869869851846),
 (18, -0.009559960635456208),
 (19, -0.004288814502793788),
 (20, -0.0009841135948263741),
 (21, -0.0016181179692689826),
 (22, 0.024253434057495752),
 (23, -0.012459686559866942),
 (24, -0.04381073393790715),
 (25, -0.04159964368203855),
 (26, -0.006474957249253847),
 (27, -0.007492412346316073),
 (28, -0.006271109836849854),
 (29, -0.09839752580730751),
 (30, -0.013377820557441782),
 (31, -0.0029523407844791226),
 (32, 0.07583100294521293),
 (33, 0.06980228723643213),
 (34, -0.014171685946120694),
 (35, -0.004851634066555641)

In [34]:
# Sort the differences in place, ascending by the difference value, so the
# bits most under-represented in the ligand set come first
can_drugbank_differences.sort(key=lambda bit_and_diff: bit_and_diff[1])
can_drugbank_differences

[(139, -0.3380848491055521),
 (140, -0.24812328608969078),
 (123, -0.23027207697488378),
 (131, -0.21642180123456406),
 (157, -0.21582517123041967),
 (146, -0.21246926227682417),
 (84, -0.18876087502732736),
 (54, -0.1753329249819951),
 (53, -0.16134356218734136),
 (132, -0.15808749016519474),
 (72, -0.15496131400970536),
 (89, -0.14467843832717758),
 (159, -0.13806176796996772),
 (155, -0.12161713882641845),
 (152, -0.117856020420193),
 (109, -0.11566204640668173),
 (48, -0.101490219779117),
 (29, -0.09839752580730751),
 (136, -0.09763728328375249),
 (90, -0.08608911869359082),
 (127, -0.08288256653974385),
 (143, -0.08288256653974385),
 (91, -0.0789166628448208),
 (49, -0.07388809841548613),
 (102, -0.07376443942618105),
 (82, -0.07160844940255873),
 (126, -0.07088529988641379),
 (99, -0.06957860374041948),
 (108, -0.05704018246570866),
 (104, -0.05668997945769555),
 (69, -0.05560982733040923),
 (76, -0.04767211131991364),
 (43, -0.046676180483788465),
 (119, -0.04551340145504004),
 

### Conclusion

The least prevalent MACCS bits in my dataset compared to DrugBank are 139, 140, 123, 131, 157, 146 and 84, which correspond to OH, OCO, C-O and NH2. The most frequently occurring MACCS bits in my dataset compared to DrugBank are 103, 107, 134, 145, 87 and 125, which correspond to Cl, XA(A)A (X = halogen), X (halogen) and X!A$A.
