In [5]:
import requests
import os
import pandas as pd

## Total number of mol files:

In [47]:
import os

def count_mol_files(root_folder_path):
    """Count the ``.mol`` files found anywhere beneath a folder.

    The whole directory tree is walked, and the extension comparison is
    case-insensitive (``a.MOL`` counts as well as ``a.mol``).

    Args:
        root_folder_path: Path of the folder to scan recursively.

    Returns:
        The number of matching files. When ``root_folder_path`` is not an
        existing directory, an error message is printed and ``0`` is returned.
    """
    if os.path.isdir(root_folder_path):
        # Lower-case each name so the extension check ignores letter case.
        return sum(
            1
            for _dirpath, _dirnames, names in os.walk(root_folder_path)
            for name in names
            if name.lower().endswith('.mol')
        )

    print(f"Greška: Putanja '{root_folder_path}' nije validan direktorijum.")
    return 0

In [48]:
# Folder to scan; it has the same value as the output folder that the
# "Retrieve mol files from original csv files" cell writes into.
main_data_folder = 'data/mols_from_original_csv' 

# Count every .mol file under that folder (sub-folders included) and report it.
num_mol_files = count_mol_files(main_data_folder)
print(f"Ukupan broj .mol fajlova u '{main_data_folder}' i njegovim podfolderima je: {num_mol_files}")

Ukupan broj .mol fajlova u 'data/mols_from_original_csv' i njegovim podfolderima je: 19453


In [31]:
def get_file(base_output_directory, csv_filename_for_folder, p_name, dtxsid):
    """Download the MOL file for one DTXSID and save it under a per-CSV folder.

    The file is fetched from the ``comptox.epa.gov`` dashboard API and written
    to ``<base_output_directory>/<CSV name without extension>/<name>.mol``.
    Request and I/O failures during the download are reported with ``print``
    and are not raised, so a caller looping over many chemicals keeps going.

    Args:
        base_output_directory: Root folder for all downloaded MOL files.
        csv_filename_for_folder: Name of the CSV the chemical came from; its
            stem (extension removed) is used as the sub-folder name.
        p_name: Name of the chemical, used for the output file name.
            Characters that are path separators or invalid in file names on
            common platforms (``/ \\ : * ? " < > |``) are replaced with ``_``.
        dtxsid: DTXSID identifier inserted into the download URL.

    Returns:
        None.
    """
    mol_url = f"https://comptox.epa.gov/dashboard-api/ccdapp1/chemical-files/mol/by-dtxsid/{dtxsid}"

    # Sub-folder named after the CSV file (extension stripped).
    folder_name = os.path.splitext(csv_filename_for_folder)[0]
    specific_output_directory = os.path.join(base_output_directory, folder_name)

    # A raw name containing '/' would point into a non-existent sub-directory,
    # so unsafe characters are mapped to '_' (same set as
    # get_mol_file_for_dtxsid). str() also copes with non-string names.
    unsafe_chars = '/\\:*?"<>|'
    safe_name = str(p_name).translate(str.maketrans(unsafe_chars, '_' * len(unsafe_chars)))

    # Output MOL file name (e.g. 'Chlorhexidine dihydrochloride.mol').
    output_filename = f"{safe_name}.mol"
    output_file_path = os.path.join(specific_output_directory, output_filename)
    # The download goes to a temporary sibling first, so an interrupted
    # transfer never leaves a truncated '.mol' file behind.
    partial_file_path = output_file_path + ".part"

    os.makedirs(specific_output_directory, exist_ok=True)

    print(f"  Pokušavam da preuzmem Mol fajl za DTXSID: {dtxsid} ({p_name})")
    print(f"  Sačuvaću u: {os.path.abspath(output_file_path)}")

    response = None
    try:
        response = requests.get(mol_url, stream=True, timeout=60)
        response.raise_for_status()  # Turn HTTP error statuses into HTTPError.

        with open(partial_file_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

        # Publish the finished download under its final name in one step.
        os.replace(partial_file_path, output_file_path)
        print(f"  Uspešno preuzeto i sačuvano: {output_filename}")

    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 404:
            print(f"  Greška 404: Mol fajl nije pronađen za DTXSID '{dtxsid}' na URL-u. Proverite da li je DTXSID ispravan.")
        else:
            print(f"  HTTP greška prilikom preuzimanja za {dtxsid}: {e.response.status_code} - {e.response.text}")
    except requests.exceptions.Timeout:
        # Checked before ConnectionError: ConnectTimeout derives from both and
        # should be reported as a timeout.
        print(f"  Vremensko ograničenje zahteva je isteklo za {dtxsid}.")
    except requests.exceptions.ConnectionError:
        print(f"  Greška u povezivanju za {dtxsid}: Proverite internet vezu.")
    except requests.exceptions.RequestException as e:
        print(f"  Opšta greška zahteva za {dtxsid}: {e}")
    except Exception as e:
        print(f"  Došlo je do neočekivane greške za {dtxsid}: {e}")
    finally:
        # A streamed response keeps its connection until it is closed. This
        # runs after the handlers above, so e.response is still readable there.
        if response is not None:
            response.close()
        # Best-effort removal of a leftover partial download.
        if os.path.exists(partial_file_path):
            try:
                os.remove(partial_file_path)
            except OSError:
                pass
    print("-" * 50)  # Separator between chemicals.

In [32]:
source_sdf_directory = 'data/sdf_data/binders_csv'
base_mol_output_directory = 'data/sdf_data/mol_binders_data'
for filename in os.listdir(source_sdf_directory):
    # os.listdir returns every entry (e.g. a '.ipynb_checkpoints' folder);
    # only CSV files can be handed to pd.read_csv.
    if not filename.lower().endswith('.csv'):
        continue
    print(filename)
    df = pd.read_csv(os.path.join(source_sdf_directory, filename))
    # One download per row; files land in a sub-folder named after the CSV.
    for p_name, dtxsid in zip(df['PREFERRED_NAME'], df['DTXSID']):
        get_file(base_mol_output_directory, filename, p_name, dtxsid)

OT_ER_ERbERb_1440-ESR2.csv
  Pokušavam da preuzmem Mol fajl za DTXSID: DTXSID4021717 (4-Chloro-3-methylphenol)
  Sačuvaću u: /home/ivana-milutinovic/Documents/Doktorske/BIORad/bio-inf-paper/data-processing/data/sdf_data/mol_binders_data/OT_ER_ERbERb_1440-ESR2/4-Chloro-3-methylphenol.mol
  Uspešno preuzeto i sačuvano: 4-Chloro-3-methylphenol.mol
--------------------------------------------------
  Pokušavam da preuzmem Mol fajl za DTXSID: DTXSID3037094 (4-Hydroxytamoxifen)
  Sačuvaću u: /home/ivana-milutinovic/Documents/Doktorske/BIORad/bio-inf-paper/data-processing/data/sdf_data/mol_binders_data/OT_ER_ERbERb_1440-ESR2/4-Hydroxytamoxifen.mol
  Uspešno preuzeto i sačuvano: 4-Hydroxytamoxifen.mol
--------------------------------------------------
  Pokušavam da preuzmem Mol fajl za DTXSID: DTXSID3021984 (1-Dodecanamine)
  Sačuvaću u: /home/ivana-milutinovic/Documents/Doktorske/BIORad/bio-inf-paper/data-processing/data/sdf_data/mol_binders_data/OT_ER_ERbERb_1440-ESR2/1-Dodecanamine.mol
  U

## Retrieve mol files from the original CSV files

In [43]:
import requests
import os
import pandas as pd

def get_mol_file_for_dtxsid(base_output_directory, csv_filename_for_folder, p_name, dtxsid):
    """Download the MOL file for one DTXSID and save it under a per-CSV folder.

    The file is fetched from the ``comptox.epa.gov`` dashboard API and written
    to ``<base_output_directory>/<CSV name without extension>/<name>.mol``.
    Request and I/O failures during the download are reported with ``print``
    and are not raised, so a caller looping over many chemicals keeps going.

    Args:
        base_output_directory: Root folder for all downloaded MOL files.
        csv_filename_for_folder: Name of the CSV the chemical came from; its
            stem (extension removed) is used as the sub-folder name.
        p_name: Name of the chemical, used for the output file name.
            Characters that are path separators or invalid in file names on
            common platforms (``/ \\ : * ? " < > |``) are replaced with ``_``.
        dtxsid: DTXSID identifier inserted into the download URL.

    Returns:
        None.
    """
    mol_url = f"https://comptox.epa.gov/dashboard-api/ccdapp1/chemical-files/mol/by-dtxsid/{dtxsid}"
    folder_name = os.path.splitext(csv_filename_for_folder)[0]
    specific_output_directory = os.path.join(base_output_directory, folder_name)

    # One translation table replaces the chain of .replace() calls; str()
    # keeps this working when the name is not a string (e.g. a NaN cell).
    unsafe_chars = '/\\:*?"<>|'
    sanitized_p_name = str(p_name).translate(str.maketrans(unsafe_chars, '_' * len(unsafe_chars)))
    output_filename = f"{sanitized_p_name}.mol"
    output_file_path = os.path.join(specific_output_directory, output_filename)
    # The download goes to a temporary sibling first, so an interrupted
    # transfer never leaves a truncated '.mol' file behind.
    partial_file_path = output_file_path + ".part"
    os.makedirs(specific_output_directory, exist_ok=True)

    print(f"Pokušavam da preuzmem Mol fajl za DTXSID: {dtxsid} ({p_name})")
    print(f"Sačuvaću u: {os.path.abspath(output_file_path)}")

    response = None
    try:
        response = requests.get(mol_url, stream=True, timeout=60)
        response.raise_for_status()  # Turn HTTP error statuses into HTTPError.

        with open(partial_file_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

        # Publish the finished download under its final name in one step.
        os.replace(partial_file_path, output_file_path)
        print(f"  Uspešno preuzeto i sačuvano: {output_filename}")

    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 404:
            print(f"  Greška 404: Mol fajl nije pronađen za DTXSID '{dtxsid}' na URL-u. Proverite da li je DTXSID ispravan.")
        else:
            print(f"  HTTP greška prilikom preuzimanja za {dtxsid}: {e.response.status_code} - {e.response.text}")
    except requests.exceptions.Timeout:
        # Checked before ConnectionError: ConnectTimeout derives from both and
        # should be reported as a timeout.
        print(f"  Vremensko ograničenje zahteva je isteklo za {dtxsid}.")
    except requests.exceptions.ConnectionError:
        print(f"  Greška u povezivanju za {dtxsid}: Proverite internet vezu.")
    except requests.exceptions.RequestException as e:
        print(f"  Opšta greška zahteva za {dtxsid}: {e}")
    except Exception as e:
        print(f"  Došlo je do neočekivane greške za {dtxsid}: {e}")
    finally:
        # A streamed response keeps its connection until it is closed. This
        # runs after the handlers above, so e.response is still readable there.
        if response is not None:
            response.close()
        # Best-effort removal of a leftover partial download.
        if os.path.exists(partial_file_path):
            try:
                os.remove(partial_file_path)
            except OSError:
                pass
    print("-" * 50)

In [46]:
source_csv_directory = 'data/aa_csv' 
base_mol_output_directory = 'data/mols_from_original_csv' 

os.makedirs(base_mol_output_directory, exist_ok=True)
print(f"Bazni direktorijum za Mol fajlove: {os.path.abspath(base_mol_output_directory)}\n")

if not os.path.isdir(source_csv_directory):
    print(f"Greška: Direktorijum '{source_csv_directory}' ne postoji.")
else:
    print(f"Čitam CSV fajlove iz direktorijuma: {os.path.abspath(source_csv_directory)}\n")

    for filename in os.listdir(source_csv_directory):
        # Case-insensitive extension check, so '.CSV' files are processed too.
        if not filename.lower().endswith(".csv"):
            continue

        csv_file_path = os.path.join(source_csv_directory, filename)

        print(f"Procesiram CSV fajl: {filename}")

        try:
            df = pd.read_csv(csv_file_path)

            required_columns = ['DTXSID', 'PREFERRED NAME']
            if not all(col in df.columns for col in required_columns):
                print(f"  Upozorenje: Fajl '{filename}' ne sadrži sve potrebne kolone ({required_columns}). Preskačem.")
                continue

            for index, raw_dtxsid, raw_name in zip(df.index, df['DTXSID'], df['PREFERRED NAME']):
                # The identifier is taken as the last '/'-separated segment of
                # the cell. A missing cell (NaN) is skipped on its own:
                # calling .split on it would raise and, through the handler
                # below, drop every remaining row of this CSV.
                if pd.isna(raw_dtxsid):
                    dtxsid = ''
                else:
                    dtxsid = str(raw_dtxsid).strip().rstrip('/').rsplit('/', 1)[-1]

                if not dtxsid:
                    print(f"  Upozorenje: Red {index} u fajlu '{filename}' nema DTXSID. Preskačem.")
                    continue

                p_name = str(raw_name)

                get_mol_file_for_dtxsid(base_mol_output_directory, filename, p_name, dtxsid)

        except pd.errors.EmptyDataError:
            print(f"  Upozorenje: Fajl '{filename}' je prazan. Preskačem.")
        except Exception as e:
            print(f"  Greška pri čitanju ili obradi '{filename}': {e}")

        print("=" * 70)

print("\nProces preuzimanja Mol fajlova završen.")


Bazni direktorijum za Mol fajlove: /home/ivana-milutinovic/Documents/Doktorske/BIORad/bio-inf-paper/data-processing/data/mols_from_original_csv

Čitam CSV fajlove iz direktorijuma: /home/ivana-milutinovic/Documents/Doktorske/BIORad/bio-inf-paper/data-processing/data/aa_csv

Procesiram CSV fajl: TOX21_ERa_BLA_Agonist_ratio-ESR1.csv
Pokušavam da preuzmem Mol fajl za DTXSID: DTXSID0020319 (Chlorothalonil)
Sačuvaću u: /home/ivana-milutinovic/Documents/Doktorske/BIORad/bio-inf-paper/data-processing/data/mols_from_original_csv/TOX21_ERa_BLA_Agonist_ratio-ESR1/Chlorothalonil.mol
  Uspešno preuzeto i sačuvano: Chlorothalonil.mol
--------------------------------------------------
Pokušavam da preuzmem Mol fajl za DTXSID: DTXSID0020573 (17beta-Estradiol)
Sačuvaću u: /home/ivana-milutinovic/Documents/Doktorske/BIORad/bio-inf-paper/data-processing/data/mols_from_original_csv/TOX21_ERa_BLA_Agonist_ratio-ESR1/17beta-Estradiol.mol
  Uspešno preuzeto i sačuvano: 17beta-Estradiol.mol
------------------