In [3]:
import pandas as pd

In [4]:
molecules_entities_data = pd.read_csv("fdb_molecules_entities.csv")

In [5]:
entities_data = pd.read_csv("fdb_entities.csv")

In [6]:
molecules_data = pd.read_csv("fdb_molecules.csv")
molecule_of_the_day_data = pd.read_csv("fdb_moleculeoftheday.csv")
receptors_data = pd.read_csv("fdb_receptors.csv")
admet_data = pd.read_csv("fdb_admet.csv")

In [7]:
molecules_data["pubchem_id"].values[0]

4

In [8]:
len(molecules_data)

25595

In [9]:
molecules_admet_data = pd.merge(molecules_data, admet_data, on='pubchem_id', how='inner')

In [10]:
molecules_admet_data.columns

Index(['pubchem_id', 'iupac_name', 'common_name', 'smile', 'molecular_weight',
       'hbd_count', 'hba_count', 'num_rotatablebonds', 'complexity',
       'topological_polor_surfacearea', 'monoisotopic_mass', 'exact_mass',
       'xlogp', 'charge', 'heavy_atom_count', 'atom_stereo_count',
       'defined_atom_stereocenter_count', 'undefined_atom_stereocenter_count',
       'bond_stereo_count', 'defined_bond_stereocenter_count',
       'undefined_bond_stereocenter_count', 'isotope_atom_count',
       'covalently_bonded_unit_count', 'cas_id', 'fema_number',
       'fema_flavor_profile', 'odor', 'taste', 'functional_groups', 'inchi',
       'volume3d', 'fooddb_flavor_profile', 'super_sweet', 'bitter',
       'supersweetdb_id', 'bitterdb_id', 'fooddb_id', 'flavornet_id',
       'fenoroli_and_os', 'natural', 'unknown_natural', 'synthetic',
       'flavor_profile', 'admet_solubility', 'admet_solubility_level',
       'admet_bbb', 'admet_bbb_level', 'admet_ext_hepatotoxic',
       'admet_ext_

In [11]:
import numpy as np
def split_functional_groups(functional_groups):
    if pd.isna(functional_groups):
        return []
    return functional_groups.split("@")
molecules_admet_data["functional_groups"] = molecules_admet_data["functional_groups"].apply(split_functional_groups)

In [12]:
molecules_admet_data["flavor_profile"] = molecules_admet_data["flavor_profile"].apply(split_functional_groups)

In [13]:
unique_flavors = []
for flavors_list in molecules_admet_data["flavor_profile"]:
    for flavor in flavors_list:
        if flavor not in unique_flavors:
            unique_flavors.append(flavor)

In [14]:
molecules_admet_data["functional_groups"]

0        [hydroxy compound, alcohol, secondary alcohol,...
1        [hemiacetal, hydroxy compound, alcohol, primar...
2        [carbonyl compound, ketone, carboxylic acid de...
3        [carbonyl compound, ketone, carboxylic acid de...
4        [carbonyl compound, ketone, carboxylic acid de...
                               ...                        
25590    [acetal, hydroxy compound, alcohol, primary al...
25591    [acetal, hydroxy compound, alcohol, primary al...
25592    [acetal, hydroxy compound, alcohol, primary al...
25593    [hydroxy compound, alcohol, primary alcohol, s...
25594    [acetal, hydroxy compound, alcohol, primary al...
Name: functional_groups, Length: 25595, dtype: object

In [15]:
molecules_admet_data.columns

Index(['pubchem_id', 'iupac_name', 'common_name', 'smile', 'molecular_weight',
       'hbd_count', 'hba_count', 'num_rotatablebonds', 'complexity',
       'topological_polor_surfacearea', 'monoisotopic_mass', 'exact_mass',
       'xlogp', 'charge', 'heavy_atom_count', 'atom_stereo_count',
       'defined_atom_stereocenter_count', 'undefined_atom_stereocenter_count',
       'bond_stereo_count', 'defined_bond_stereocenter_count',
       'undefined_bond_stereocenter_count', 'isotope_atom_count',
       'covalently_bonded_unit_count', 'cas_id', 'fema_number',
       'fema_flavor_profile', 'odor', 'taste', 'functional_groups', 'inchi',
       'volume3d', 'fooddb_flavor_profile', 'super_sweet', 'bitter',
       'supersweetdb_id', 'bitterdb_id', 'fooddb_id', 'flavornet_id',
       'fenoroli_and_os', 'natural', 'unknown_natural', 'synthetic',
       'flavor_profile', 'admet_solubility', 'admet_solubility_level',
       'admet_bbb', 'admet_bbb_level', 'admet_ext_hepatotoxic',
       'admet_ext_

In [16]:
properties_data = pd.read_csv("fdb_fn_properties.csv")
more_properties_data = pd.read_csv("fdb_more_properties.csv")

In [17]:
more_properties_data.columns

Index(['pubchem_id', 'number_of_atoms', 'molecular_formula',
       'molecular_composition', 'molecular_weight', 'molecular_mass', 'energy',
       'alogp', 'logd', 'molecular_solubilty', 'pka',
       'number_of_aromatic_bonds', 'number_of_aromatic_rings', 'hba_count',
       'hbd_count', 'number_of_h_acceptor', 'number_of_h_acceptor_lipinski',
       'number_of_h_donor', 'number_of_h_donor_lipinski', 'numb_of_h_bonds',
       'num_rings', 'num_rotatablebonds', 'surface_area', 'molecular_sasa',
       'radius_of_gyration', 'molecular_3d_sasa'],
      dtype='object')

In [18]:
molecules_properties_admet_data = pd.merge(molecules_admet_data, more_properties_data, on='pubchem_id', how='inner')

In [21]:
import pandas as pd
import plotly.express as px

# Assuming 'molecules_properties_admet_data' DataFrame exists and contains columns 'number_of_aromatic_rings' and 'surface_area'

# Create a DataFrame with values from 1 to 10
x_values = pd.DataFrame({'number_of_aromatic_rings': range(1, 6)})

# Group by 'number_of_aromatic_rings' and calculate mean surface area for each group
mean_radius_gyration = molecules_properties_admet_data.groupby('number_of_aromatic_rings')['radius_of_gyration'].mean().reset_index()

# Merge the DataFrame with all values and the calculated mean surface area DataFrame
merged_data = pd.merge(x_values, mean_radius_gyration, on='number_of_aromatic_rings', how='left')

# Fill NaN values with 0 to prepare for forward fill
merged_data.fillna(0, inplace=True)

# Forward fill the surface area values
merged_data['radius_of_gyration'] = merged_data['radius_of_gyration'].replace(0, method='ffill')

# Create line chart
fig = px.line(merged_data, x='number_of_aromatic_rings', y='radius_of_gyration', 
              title='Mean Surface Area by Number of Aromatic Rings (1 to 5)',
              labels={'number_of_aromatic_rings': 'Number of Aromatic Rings', 'radius_of_gyration': 'Mean radius_of_gyration'})

# Show the plot
fig.show()

In [26]:
import pandas as pd
import plotly.express as px

# Assuming 'molecules_properties_admet_data' DataFrame exists and contains columns 'number_of_aromatic_rings' and 'radius_of_gyration'

# Filter DataFrame to include only number of aromatic rings from 0 to 6
filtered_data = molecules_properties_admet_data[molecules_properties_admet_data['number_of_aromatic_rings'].isin(range(7))]

# Create box plot
fig = px.box(filtered_data, x='number_of_aromatic_rings', y='radius_of_gyration',
             title='Radius of Gyration by Number of Aromatic Rings (0 to 6)',
             labels={'number_of_aromatic_rings': 'Number of Aromatic Rings', 'radius_of_gyration': 'Radius of Gyration'})

# Show the plot
fig.show()


In [136]:
import pandas as pd
import plotly.express as px

# Assuming 'molecules_properties_admet_data' DataFrame exists and contains columns 'number_of_aromatic_rings'

# Filter out compounds with 0 aromatic rings
filtered_data = molecules_properties_admet_data[(molecules_properties_admet_data['number_of_aromatic_rings'] > 0) & (molecules_properties_admet_data['number_of_aromatic_rings'] < 5) ]

# Count the number of compounds for each number of aromatic rings
count_by_aromatic_rings = filtered_data['number_of_aromatic_rings'].value_counts().reset_index()
count_by_aromatic_rings.columns = ['number_of_aromatic_rings', 'count']

# # Add buffer value to slices for bottom 8 aromatic rings
# buffer_value = 10
# count_by_aromatic_rings.loc[count_by_aromatic_rings['number_of_aromatic_rings'] <= 8, 'count'] += buffer_value

# Create pie chart
fig = px.pie(count_by_aromatic_rings, names='number_of_aromatic_rings', values='count', 
             title='Distribution of Compounds by Number of Aromatic Rings')

# Show the plot
fig.show()


In [28]:
import pandas as pd
import plotly.express as px

# Assuming 'molecules_properties_admet_data' DataFrame exists and contains columns 'number_of_aromatic_rings' and 'natural'

# Filter DataFrame to include only compounds with aromatic rings from 0 to 6
filtered_data = molecules_properties_admet_data[molecules_properties_admet_data['number_of_aromatic_rings'].isin(range(7))]

# Group by 'number_of_aromatic_rings' and 'natural', and count the occurrences
count_by_aromatic_rings_natural = filtered_data.groupby(['number_of_aromatic_rings', 'natural']).size().reset_index(name='count')

# Calculate percentage for each category of aromatic rings
count_by_aromatic_rings_natural['percentage'] = count_by_aromatic_rings_natural.groupby('number_of_aromatic_rings')['count'].apply(lambda x: 100 * x / float(x.sum()))

# Filter DataFrame to include only natural compounds (natural = 1)
natural_compounds = count_by_aromatic_rings_natural[count_by_aromatic_rings_natural['natural'] == 0]

# Create pie chart
fig = px.pie(natural_compounds, names='number_of_aromatic_rings', values='percentage', 
             title='Percentage of Synthetic Compounds by Number of Aromatic Rings (0 to 6)',
             labels={'number_of_aromatic_rings': 'Number of Aromatic Rings'})

# Show the plot
fig.show()



Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)

