# Scrape drug list

1. Scrape links for all drugs

In [None]:
import requests
from bs4 import BeautifulSoup

base_url = "https://www.drugs.com"

start_url = "https://www.drugs.com/drug_information.html"
response = requests.get(start_url)
soup = BeautifulSoup(response.content, "html.parser")

nav = soup.find("nav", class_="ddc-paging")
letter_links = []

for li in nav.find_all("li"):
    a_tag = li.find("a")
    if a_tag and a_tag["href"]:
        full_link = base_url + a_tag["href"]
        letter_links.append(full_link)

print(f"Found {len(letter_links)} letter pages")
print(letter_links)

all_drug_links = []

for letter_url in letter_links:
    print(f"Scraping drugs from: {letter_url}")
    r = requests.get(letter_url)
    letter_soup = BeautifulSoup(r.content, "html.parser")
    ul = letter_soup.find("ul", class_="ddc-list-column-2")
    if ul:
        for li in ul.find_all("li"):
            a_tag = li.find("a")
            if a_tag and a_tag["href"]:
                drug_url = base_url + a_tag["href"]
                all_drug_links.append(drug_url)

print(f"\nTotal drug links found: {len(all_drug_links)}")
print(all_drug_links[:5])


Found 27 letter pages
['https://www.drugs.com/alpha/a.html', 'https://www.drugs.com/alpha/b.html', 'https://www.drugs.com/alpha/c.html', 'https://www.drugs.com/alpha/d.html', 'https://www.drugs.com/alpha/e.html', 'https://www.drugs.com/alpha/f.html', 'https://www.drugs.com/alpha/g.html', 'https://www.drugs.com/alpha/h.html', 'https://www.drugs.com/alpha/i.html', 'https://www.drugs.com/alpha/j.html', 'https://www.drugs.com/alpha/k.html', 'https://www.drugs.com/alpha/l.html', 'https://www.drugs.com/alpha/m.html', 'https://www.drugs.com/alpha/n.html', 'https://www.drugs.com/alpha/o.html', 'https://www.drugs.com/alpha/p.html', 'https://www.drugs.com/alpha/q.html', 'https://www.drugs.com/alpha/r.html', 'https://www.drugs.com/alpha/s.html', 'https://www.drugs.com/alpha/t.html', 'https://www.drugs.com/alpha/u.html', 'https://www.drugs.com/alpha/v.html', 'https://www.drugs.com/alpha/w.html', 'https://www.drugs.com/alpha/x.html', 'https://www.drugs.com/alpha/y.html', 'https://www.drugs.com/alph

2. Scrape drug info

In [None]:
import csv
import time

data_rows = []
for idx, drug_url in enumerate(all_drug_links):
    print(f"Scraping {idx+1}/{len(all_drug_links)}: {drug_url}")

    try:
        r = requests.get(drug_url, timeout=10)
        soup = BeautifulSoup(r.content, 'html.parser')

        # Drug Name
        name = ""
        try:
            name = soup.find("h1").get_text(strip=True)
        except:
            pass

        # Drug Info
        drug_info = ""
        try:
            drug_info = soup.find("p", class_="drug-subtitle").get_text()
        except:
            pass

        # Usage
        usage = ""
        try:
            uses_h2 = soup.find("h2", id="uses")
            usage_paragraphs = uses_h2.find_all_next("p", limit=2)
            usage = " ".join([p.get_text(strip=True) for p in usage_paragraphs])
        except:
            pass

        # Warnings
        warnings = ""
        try:
            warn_h2 = soup.find("h2", id="warnings")
            warn_paragraphs = warn_h2.find_all_next("p", limit=2)
            warnings = " ".join([p.get_text(strip=True) for p in warn_paragraphs])
        except:
            pass

        # Side Effects
        side_effects = ""
        try:
            side_h2 = soup.find("h2", id="side-effects")
            ul = side_h2.find_next("ul")
            if ul:
                side_effects = "; ".join([li.get_text(strip=True) for li in ul.find_all("li")])
        except:
            pass

        # Drug Interactions link
        interactions_link = ""
        try:
            all_links = soup.find_all("a", class_="ddc-related-link")
            for a_link in all_links:
              href = a_link.get("href", "")
              if "drug-interactions" in href:
                interactions_link = base_url + href
                break
        except:
            pass

        data_rows.append([
            name,
            drug_info,
            usage,
            warnings,
            side_effects,
            interactions_link
        ])

        time.sleep(0.2)

    except Exception as e:
        print(f"Error scraping {drug_url}: {e}")
        continue


csv_filename = "drugs_data.csv"

with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow([
        'Drug Name',
        'Drug Info',
        'Usage',
        'Warnings',
        'Side Effects',
        'Drug Interactions Link'
    ])
    writer.writerows(data_rows)

print(f"\nDone! Saved {len(data_rows)} rows to {csv_filename}")

Scraping 1/1281: https://www.drugs.com/abecma.html
Scraping 2/1281: https://www.drugs.com/abilify.html
Scraping 3/1281: https://www.drugs.com/abilify-asimtufii.html
Scraping 4/1281: https://www.drugs.com/abilify-maintena.html
Scraping 5/1281: https://www.drugs.com/abiraterone.html
Scraping 6/1281: https://www.drugs.com/acetaminophen.html
Scraping 7/1281: https://www.drugs.com/mtm/acetylcysteine.html
Scraping 8/1281: https://www.drugs.com/actemra.html
Scraping 9/1281: https://www.drugs.com/actos.html
Scraping 10/1281: https://www.drugs.com/acyclovir.html
Scraping 11/1281: https://www.drugs.com/adderall.html
Scraping 12/1281: https://www.drugs.com/mtm/adderall-xr.html
Scraping 13/1281: https://www.drugs.com/mtm/advair-diskus.html
Scraping 14/1281: https://www.drugs.com/advil.html
Scraping 15/1281: https://www.drugs.com/afinitor.html
Scraping 16/1281: https://www.drugs.com/agamree.html
Scraping 17/1281: https://www.drugs.com/aimovig.html
Scraping 18/1281: https://www.drugs.com/ajovy.html


# Analyze and clean data

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Diplomska/drugs_data.csv')

In [None]:
df.head()

Unnamed: 0,Drug Name,Drug Info,Usage,Warnings,Side Effects,Drug Interactions Link
0,Abecma,\nPronunciation: uh-BEK-muh\nGeneric name: id...,Abecma (idecabtagene vicleucel) is a CAR T-cel...,Cytokine release syndrome or CRS. Abecma can c...,"cytokine release syndrome- CRS (confusion, tro...",https://www.drugs.com/drug-interactions/idecab...
1,Abilify,\nPronunciation: a BIL ĭ fī\nGeneric name: ari...,Abilify (aripiprazole) is an antipsychotic med...,Abilify is not approved for use in older adult...,blurred vision;; increased saliva ordrooling;;...,https://www.drugs.com/drug-interactions/aripip...
2,Abilify Asimtufii,\nPronunciation: a-BIL-i-fy AH-SIM-TUH-FYE\nGe...,Abilify Asimtufii is anatypical antipsychoticu...,Abilify Asimtufii is not approved for use in o...,Increased weight (17%); Akathisiamovement diso...,https://www.drugs.com/drug-interactions/aripip...
3,Abilify Maintena,\nGeneric name: aripiprazole lauroxil (injecti...,Abilify Maintena (aripiprazole) extended-relea...,Abilify Maintena is not approved for use in ol...,Increase in weight17%; An inability to remain ...,https://www.drugs.com/drug-interactions/aripip...
4,Abiraterone,\nGeneric name: abiraterone acetate [ A-bir-A-...,Abirateroneworks by reducing androgen producti...,Abiraterone tablets should not be handled by a...,"swelling in your ankles or feet, pain in your ...",https://www.drugs.com/drug-interactions/abirat...


In [None]:
print(df['Drug Name'].isnull().sum())

0


In [None]:
print(df['Drug Info'].isnull().sum())

9


In [None]:
print(df['Usage'].isnull().sum())

83


In [None]:
print(df['Warnings'].isnull().sum())

234


In [None]:
print(df['Side Effects'].isnull().sum())

98


In [None]:
print(df['Drug Interactions Link'].isnull().sum())

178


1. Drop missing side effects

In [None]:
df = df.dropna(subset=['Side Effects'])

In [None]:
print(df['Side Effects'].isnull().sum())

0


In [None]:
df.to_csv('/content/drive/MyDrive/Colab Notebooks/Diplomska/drugs_data_cleaned.csv', index=False)

2. Manually check data and drop not useful information

In [None]:
df_cleaned = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Diplomska/drugs_data_cleaned.csv')

In [None]:
len(df_cleaned)

1183

In [None]:
to_delete = [
    'Ambroxol Hydrochloride',
    'Avsola',
    'Biotin',
    'Canasa',
    'Cyanocobalamin',
    'Dapagliflozin',
    'Delzicol',
    'Evusheld',
    'Eysuvis',
    'Hemgenix',
    'Inflectra',
    'Jujube',
    'Kratom',
    'Kynmobi',
    'Olezarsen',
    'Oxaliplatin',
    'Paragard',
    'Prolia',
    'Qelbree',
    'Qlosi',
    'Quassia',
    'Quviviq',
    'Renflexis',
    'Rimegepant',
    'Semaglutide',
    'Simethicone',
    'Sogroya',
    'Tyrvaya',
    'Ubiquinone',
    'Ubrelvy',
    'Ubrogepant',
    'Uva Ursi',
    'Vivitrol',
    'Willow Bark',
    'Witch Hazel',
    'Witch hazel topical',
    'Wormwood',
    'XyliMelts',
    'Xylitol',
    'Yew',
    'Zanubrutinib',
    'Zinc oxide topical',

]

In [None]:
len(to_delete)

42

In [None]:
df_cleaned = df_cleaned[~df_cleaned['Drug Name'].isin(to_delete)]

In [None]:
len(df_cleaned)

1142

In [None]:
df_cleaned.to_csv('/content/drive/MyDrive/Colab Notebooks/Diplomska/drugs_data_cleaned.csv', index=False)

3. Drug Info text parsing

In [None]:
import re

def extract_generic_name(info_text):
    match = re.search(r"Generic name:\s*(.*?)\s*(Brand|Dosage|Drug class|$)", info_text, re.IGNORECASE | re.DOTALL)
    if match:
        result = match.group(1).strip()

        # Remove [ ... ] brackets
        result = re.sub(r"\[.*?\]", "", result).strip()

        # Remove "Other" at the end
        result = re.sub(r"\bOther\b$", "", result).strip()

        return result

    return ""

def extract_drug_class(info_text):
    match = re.search(r"Drug class(?:es)?:\s*(.*)", info_text, re.IGNORECASE | re.DOTALL)
    if match:
        return match.group(1).strip()
    return ""


In [None]:
df_cleaned['Generic Name'] = df_cleaned['Drug Info'].apply(extract_generic_name)
df_cleaned['Drug Class'] = df_cleaned['Drug Info'].apply(extract_drug_class)

In [None]:
df_cleaned.head()

Unnamed: 0,Drug Name,Drug Info,Usage,Warnings,Side Effects,Drug Interactions Link,Generic Name,Drug Class
0,Abecma,\nPronunciation: uh-BEK-muh\nGeneric name: id...,Abecma (idecabtagene vicleucel) is a CAR T-cel...,Cytokine release syndrome or CRS. Abecma can c...,"cytokine release syndrome- CRS (confusion, tro...",https://www.drugs.com/drug-interactions/idecab...,idecabtagene vicleucel,Miscellaneous antineoplastics
1,Abilify,\nPronunciation: a BIL ĭ fī\nGeneric name: ari...,Abilify (aripiprazole) is an antipsychotic med...,Abilify is not approved for use in older adult...,blurred vision;; increased saliva ordrooling;;...,https://www.drugs.com/drug-interactions/aripip...,aripiprazole,Atypical antipsychotics
2,Abilify Asimtufii,\nPronunciation: a-BIL-i-fy AH-SIM-TUH-FYE\nGe...,Abilify Asimtufii is anatypical antipsychoticu...,Abilify Asimtufii is not approved for use in o...,Increased weight (17%); Akathisiamovement diso...,https://www.drugs.com/drug-interactions/aripip...,aripiprazole,Atypical antipsychotics
3,Abilify Maintena,\nGeneric name: aripiprazole lauroxil (injecti...,Abilify Maintena (aripiprazole) extended-relea...,Abilify Maintena is not approved for use in ol...,Increase in weight17%; An inability to remain ...,https://www.drugs.com/drug-interactions/aripip...,aripiprazole lauroxil (injection),Atypical antipsychotics
4,Abiraterone,\nGeneric name: abiraterone acetate [ A-bir-A-...,Abirateroneworks by reducing androgen producti...,Abiraterone tablets should not be handled by a...,"swelling in your ankles or feet, pain in your ...",https://www.drugs.com/drug-interactions/abirat...,abiraterone acetate,Miscellaneous antineoplastics


In [None]:
df_cleaned.drop(columns=['Drug Info'], inplace=True)

In [None]:
df_cleaned.insert(0, 'Drug ID', range(1, len(df_cleaned) + 1))

In [None]:
df_cleaned.to_csv('/content/drive/MyDrive/Colab Notebooks/Diplomska/drugs_data_cleaned.csv', index=False)

In [None]:
df_cleaned.head()

Unnamed: 0,Drug ID,Drug Name,Usage,Warnings,Side Effects,Drug Interactions Link,Generic Name,Drug Class
0,1,Abecma,Abecma (idecabtagene vicleucel) is a CAR T-cel...,Cytokine release syndrome or CRS. Abecma can c...,"cytokine release syndrome- CRS (confusion, tro...",https://www.drugs.com/drug-interactions/idecab...,idecabtagene vicleucel,Miscellaneous antineoplastics
1,2,Abilify,Abilify (aripiprazole) is an antipsychotic med...,Abilify is not approved for use in older adult...,blurred vision;; increased saliva ordrooling;;...,https://www.drugs.com/drug-interactions/aripip...,aripiprazole,Atypical antipsychotics
2,3,Abilify Asimtufii,Abilify Asimtufii is anatypical antipsychoticu...,Abilify Asimtufii is not approved for use in o...,Increased weight (17%); Akathisiamovement diso...,https://www.drugs.com/drug-interactions/aripip...,aripiprazole,Atypical antipsychotics
3,4,Abilify Maintena,Abilify Maintena (aripiprazole) extended-relea...,Abilify Maintena is not approved for use in ol...,Increase in weight17%; An inability to remain ...,https://www.drugs.com/drug-interactions/aripip...,aripiprazole lauroxil (injection),Atypical antipsychotics
4,5,Abiraterone,Abirateroneworks by reducing androgen producti...,Abiraterone tablets should not be handled by a...,"swelling in your ankles or feet, pain in your ...",https://www.drugs.com/drug-interactions/abirat...,abiraterone acetate,Miscellaneous antineoplastics


# Scrape drug interactions

1. For each drug get the url of the page with the drug interactions list

In [None]:
import csv
import requests
from bs4 import BeautifulSoup
import time

base_url = "https://www.drugs.com/drug-interactions/"

import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Diplomska/drugs_data_cleaned.csv')

data_rows = []

for idx, row in df.iterrows():
    drug_id = row['Drug ID']
    drug_name = row['Drug Name']
    interactions_url = row['Drug Interactions Link']

    if pd.isna(interactions_url) or interactions_url.strip() == "":
        print(f"Skipping {drug_name}: No interactions link")
        continue

    print(f"Checking {drug_name} - {interactions_url}")

    try:
        r = requests.get(interactions_url, timeout=10)
        soup = BeautifulSoup(r.content, 'html.parser')

        if soup.find('h2', string=lambda text: text and 'Medications known to interact with' in text):
            # Already on full page
            final_url = interactions_url
        elif soup.find('h2', string=lambda text: text and 'Most frequently checked interactions' in text):
            # Partial page -> look for 'View all'
            final_url = ""
            a_tags = soup.find_all('a', class_='ddc-link-no-visited')
            for a in a_tags:
                if 'View all' in a.get_text():
                    href = a.get('href')
                    if href:
                        final_url = base_url + href
                        break
            if not final_url:
                print(f"Warning: No 'View all' link found for {drug_name}")
        else:
            final_url = ""

        data_rows.append([drug_id, drug_name, final_url])

        time.sleep(0.1)

    except Exception as e:
        print(f"Error checking {drug_name}: {e}")
        data_rows.append([drug_id, drug_name, ""])
        continue

with open('drug_interactions_urls.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Drug ID', 'Drug Name', 'Drug Interactions URL'])
    writer.writerows(data_rows)

print(f"\nDone! Saved {len(data_rows)} rows to drug_interactions_urls.csv")


Checking Abecma - https://www.drugs.com/drug-interactions/idecabtagene-vicleucel,abecma.html
Checking Abilify - https://www.drugs.com/drug-interactions/aripiprazole,abilify.html
Checking Abilify Asimtufii - https://www.drugs.com/drug-interactions/aripiprazole,abilify-asimtufii.html
Checking Abilify Maintena - https://www.drugs.com/drug-interactions/aripiprazole,abilify-maintena.html
Checking Abiraterone - https://www.drugs.com/drug-interactions/abiraterone.html
Checking Acetaminophen - https://www.drugs.com/drug-interactions/acetaminophen.html
Checking Acetylcysteine - https://www.drugs.com/drug-interactions/acetylcysteine.html
Checking Actemra - https://www.drugs.com/drug-interactions/tocilizumab,actemra.html
Checking Actos - https://www.drugs.com/drug-interactions/pioglitazone,actos.html
Checking Acyclovir - https://www.drugs.com/drug-interactions/acyclovir.html
Checking Adderall - https://www.drugs.com/drug-interactions/amphetamine-dextroamphetamine,adderall.html
Checking Adderall X

In [None]:
urls = pd.read_csv('/content/drug_interactions_urls.csv')

In [None]:
urls

Unnamed: 0,Drug ID,Drug Name,Drug Interactions URL
0,1,Abecma,https://www.drugs.com/drug-interactions/idecab...
1,2,Abilify,https://www.drugs.com/drug-interactions/aripip...
2,3,Abilify Asimtufii,https://www.drugs.com/drug-interactions/aripip...
3,4,Abilify Maintena,https://www.drugs.com/drug-interactions/aripip...
4,5,Abiraterone,https://www.drugs.com/drug-interactions/abirat...
...,...,...,...
1059,1138,5-HTP,https://www.drugs.com/drug-interactions/5-hydr...
1060,1139,5-HTP Mood and Stress,https://www.drugs.com/drug-interactions/5-hydr...
1061,1140,5-hydroxytryptophan,https://www.drugs.com/drug-interactions/5-hydr...
1062,1141,"5-hydroxytryptophan, melatonin, and pyridoxine",https://www.drugs.com/drug-interactions/5-hydr...


2. For each url, extract the drug-to-drug links.

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import random

df_links = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Diplomska/drug_interactions_urls.csv')

base_url = 'https://www.drugs.com'

interaction_rows = []

for idx, row in df_links.iterrows():
    drug_id = row['Drug ID']
    drug_name = row['Drug Name']
    interactions_url = row['Drug Interactions URL']

    if pd.isna(interactions_url) or interactions_url == '':
        continue

    print(f"\nScraping interactions for {drug_name} ({interactions_url})")

    try:
        response = requests.get(interactions_url, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')

        severity_buckets = {'int_3': [], 'int_2': [], 'int_1': []}

        for div in soup.find_all('div', class_='col-list-az'):
            ul = div.find_next_sibling('ul', class_='interactions ddc-list-column-2')
            if ul:
                for li in ul.find_all('li'):
                    interaction_class = li.get('class')
                    if interaction_class:
                        interaction_class = interaction_class[0].strip()
                        if interaction_class in severity_buckets:
                            a = li.find('a')
                            if a:
                                interacts_with = a.get_text(strip=True)
                                link = a.get('href')
                                if link and not link.startswith('http'):
                                    link = base_url + link
                                severity_buckets[interaction_class].append([
                                    drug_id,
                                    drug_name,
                                    interacts_with,
                                    link
                                ])

        for div in soup.find_all('div', class_='ddc-injection-ignore-inside'):
            ul = div.find('ul', class_='interactions ddc-mgt-0 ddc-list-unstyled')
            if ul:
                for li in ul.find_all('li'):
                    interaction_class = li.get('class')
                    if interaction_class:
                        interaction_class = interaction_class[0].strip()
                        if interaction_class in severity_buckets:
                            a = li.find('a')
                            if a:
                                interacts_with = a.get_text(strip=True)
                                link = a.get('href')
                                if link and not link.startswith('http'):
                                    link = base_url + link
                                severity_buckets[interaction_class].append([
                                    drug_id,
                                    drug_name,
                                    interacts_with,
                                    link
                                ])

        total_kept = 0
        max_per_drug = 25

        for severity in ['int_3', 'int_2', 'int_1']:
            bucket = severity_buckets[severity]
            if not bucket:
                continue

            random.shuffle(bucket)
            for row in bucket:
                if total_kept < max_per_drug:
                    interaction_rows.append(row)
                    total_kept += 1
                else:
                    break
            if total_kept >= max_per_drug:
                break

        print(f"Kept {total_kept} interactions for {drug_name}")

        time.sleep(0.1)

    except Exception as e:
        print(f"Error scraping {interactions_url}: {e}")
        continue

output_file = 'drug_to_drug_interactions.csv'
df_out = pd.DataFrame(interaction_rows, columns=[
    'Drug ID',
    'Drug Name',
    'Interacts With Generic Name',
    'Interaction Link'
])

df_out.to_csv(output_file, index=False, encoding='utf-8')
print(f"\nSaved {len(df_out)} rows to {output_file}")



Scraping interactions for Abecma (https://www.drugs.com/drug-interactions/idecabtagene-vicleucel,abecma.html)
Kept 25 interactions for Abecma

Scraping interactions for Abilify (https://www.drugs.com/drug-interactions/aripiprazole,abilify-index.html)
Kept 25 interactions for Abilify

Scraping interactions for Abilify Asimtufii (https://www.drugs.com/drug-interactions/aripiprazole,abilify-asimtufii-index.html)
Kept 25 interactions for Abilify Asimtufii

Scraping interactions for Abilify Maintena (https://www.drugs.com/drug-interactions/aripiprazole,abilify-maintena-index.html)
Kept 25 interactions for Abilify Maintena

Scraping interactions for Abiraterone (https://www.drugs.com/drug-interactions/abiraterone-index.html)
Kept 25 interactions for Abiraterone

Scraping interactions for Acetaminophen (https://www.drugs.com/drug-interactions/acetaminophen-index.html)
Kept 25 interactions for Acetaminophen

Scraping interactions for Acetylcysteine (https://www.drugs.com/drug-interactions/ace

In [None]:
interactions = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Diplomska/drug_to_drug_interactions.csv')

In [None]:
interactions

Unnamed: 0,Drug ID,Drug Name,Interacts With Generic Name,Interaction Link
0,1,Abecma,adalimumab,https://www.drugs.com/drug-interactions/abecma...
1,1,Abecma,"cholera vaccine, live",https://www.drugs.com/drug-interactions/abecma...
2,1,Abecma,triamcinolone,https://www.drugs.com/drug-interactions/abecma...
3,1,Abecma,"poliovirus vaccine, live, trivalent",https://www.drugs.com/drug-interactions/abecma...
4,1,Abecma,fluticasone,https://www.drugs.com/drug-interactions/abecma...
...,...,...,...,...
25505,1142,8-Mop,tetracycline topical,https://www.drugs.com/drug-interactions/8-mop-...
25506,1142,8-Mop,trioxsalen,https://www.drugs.com/drug-interactions/8-mop-...
25507,1142,8-Mop,moxifloxacin,https://www.drugs.com/drug-interactions/8-mop-...
25508,1142,8-Mop,gefitinib,https://www.drugs.com/drug-interactions/8-mop-...


3. Extract each interaction text

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Diplomska/drug_to_drug_interactions.csv')

interaction_texts = []

for index, row in df.iterrows():
    link = row['Interaction Link']
    print(f"Scraping {index+1}/{len(df)}: {link}")

    try:
        response = requests.get(link, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')

        ref_header_div = soup.find('div', class_='interactions-reference-header')

        if ref_header_div:
            next_p = ref_header_div.find_next_sibling('p')
            if next_p:
                interaction_text = next_p.get_text(strip=True)
            else:
                interaction_text = ""
        else:
            interaction_text = ""

    except Exception as e:
        print(f"Error scraping {link}: {e}")
        interaction_text = ""

    interaction_texts.append(interaction_text)

    time.sleep(0.1)

df['Interaction'] = interaction_texts

df.to_csv('drug_to_drug_interactions_final.csv', index=False)
print("\nDone! Full dataset saved as 'drug_to_drug_interactions_final.csv'")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Scraping 20513/25510: https://www.drugs.com/drug-interactions/amikacin-with-torsemide-153-0-2219-0.html
Scraping 20514/25510: https://www.drugs.com/drug-interactions/dolasetron-with-torsemide-926-0-2219-0.html
Scraping 20515/25510: https://www.drugs.com/drug-interactions/plazomicin-with-torsemide-3920-0-2219-0.html
Scraping 20516/25510: https://www.drugs.com/drug-interactions/levomethadyl-acetate-with-torsemide-1459-0-2219-0.html
Scraping 20517/25510: https://www.drugs.com/drug-interactions/droperidol-with-torsemide-944-0-2219-0.html
Scraping 20518/25510: https://www.drugs.com/drug-interactions/dofetilide-with-torsemide-925-0-2219-0.html
Scraping 20519/25510: https://www.drugs.com/drug-interactions/tobramycin-with-torsemide-2206-0-2219-0.html
Scraping 20520/25510: https://www.drugs.com/drug-interactions/pimozide-with-torsemide-1872-0-2219-0.html
Scraping 20521/25510: https://www.drugs.com/drug-interactions/sodium-nitrite-

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Diplomska/drug_to_drug_interactions_final.csv')

In [None]:
df

Unnamed: 0,Drug ID,Drug Name,Interacts With Generic Name,Interaction Link,Interaction
0,1,Abecma,adalimumab,https://www.drugs.com/drug-interactions/abecma...,Using adalimumab together with idecabtagene vi...
1,1,Abecma,"cholera vaccine, live",https://www.drugs.com/drug-interactions/abecma...,If you are currently being treated or have rec...
2,1,Abecma,triamcinolone,https://www.drugs.com/drug-interactions/abecma...,Using triamcinolone together with idecabtagene...
3,1,Abecma,"poliovirus vaccine, live, trivalent",https://www.drugs.com/drug-interactions/abecma...,If you are currently being treated or have rec...
4,1,Abecma,fluticasone,https://www.drugs.com/drug-interactions/abecma...,Using fluticasone together with idecabtagene v...
...,...,...,...,...,...
24574,1142,8-Mop,tetracycline topical,https://www.drugs.com/drug-interactions/8-mop-...,"Methoxsalen sensitizes your skin to sunlight, ..."
24575,1142,8-Mop,trioxsalen,https://www.drugs.com/drug-interactions/8-mop-...,"Methoxsalen sensitizes your skin to sunlight, ..."
24576,1142,8-Mop,moxifloxacin,https://www.drugs.com/drug-interactions/8-mop-...,"Methoxsalen sensitizes your skin to sunlight, ..."
24577,1142,8-Mop,gefitinib,https://www.drugs.com/drug-interactions/8-mop-...,"Methoxsalen sensitizes your skin to sunlight, ..."


In [None]:
unwanted_messages = [
    "Consumer information for this interaction is not currently available.",
    "Information for this minor interaction is available on theprofessional version."
]

In [None]:
df = df[~df['Interaction'].isin(unwanted_messages)].reset_index(drop=True)

In [None]:
print(f"Remaining rows after cleaning: {len(df)}")

Remaining rows after cleaning: 24579


In [None]:
df = df.drop(columns=['Interaction Link'])

In [None]:
df.head()

Unnamed: 0,Drug ID,Drug Name,Interacts With Generic Name,Interaction
0,1,Abecma,adalimumab,Using adalimumab together with idecabtagene vi...
1,1,Abecma,"cholera vaccine, live",If you are currently being treated or have rec...
2,1,Abecma,triamcinolone,Using triamcinolone together with idecabtagene...
3,1,Abecma,"poliovirus vaccine, live, trivalent",If you are currently being treated or have rec...
4,1,Abecma,fluticasone,Using fluticasone together with idecabtagene v...


In [None]:
df.to_csv('/content/drive/MyDrive/Colab Notebooks/Diplomska/drug_to_drug_interactions_final.csv', index=False)

# Clean final drugs csv

In [3]:
df_cleaned = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Diplomska/drugs_data_cleaned.csv')

In [None]:
df_cleaned.head()

Unnamed: 0,Drug ID,Drug Name,Usage,Warnings,Side Effects,Drug Interactions Link,Generic Name,Drug Class
0,1,Abecma,Abecma (idecabtagene vicleucel) is a CAR T-cel...,Cytokine release syndrome or CRS. Abecma can c...,"cytokine release syndrome- CRS (confusion, tro...",https://www.drugs.com/drug-interactions/idecab...,idecabtagene vicleucel,Miscellaneous antineoplastics
1,2,Abilify,Abilify (aripiprazole) is an antipsychotic med...,Abilify is not approved for use in older adult...,blurred vision;; increased saliva ordrooling;;...,https://www.drugs.com/drug-interactions/aripip...,aripiprazole,Atypical antipsychotics
2,3,Abilify Asimtufii,Abilify Asimtufii is anatypical antipsychoticu...,Abilify Asimtufii is not approved for use in o...,Increased weight (17%); Akathisiamovement diso...,https://www.drugs.com/drug-interactions/aripip...,aripiprazole,Atypical antipsychotics
3,4,Abilify Maintena,Abilify Maintena (aripiprazole) extended-relea...,Abilify Maintena is not approved for use in ol...,Increase in weight17%; An inability to remain ...,https://www.drugs.com/drug-interactions/aripip...,aripiprazole lauroxil (injection),Atypical antipsychotics
4,5,Abiraterone,Abirateroneworks by reducing androgen producti...,Abiraterone tablets should not be handled by a...,"swelling in your ankles or feet, pain in your ...",https://www.drugs.com/drug-interactions/abirat...,abiraterone acetate,Miscellaneous antineoplastics


In [None]:
df_cleaned = df_cleaned.drop(columns=['Drug Interactions Link'])

In [None]:
df_cleaned.head()

Unnamed: 0,Drug ID,Drug Name,Usage,Warnings,Side Effects,Generic Name,Drug Class
0,1,Abecma,Abecma (idecabtagene vicleucel) is a CAR T-cel...,Cytokine release syndrome or CRS. Abecma can c...,"cytokine release syndrome- CRS (confusion, tro...",idecabtagene vicleucel,Miscellaneous antineoplastics
1,2,Abilify,Abilify (aripiprazole) is an antipsychotic med...,Abilify is not approved for use in older adult...,blurred vision;; increased saliva ordrooling;;...,aripiprazole,Atypical antipsychotics
2,3,Abilify Asimtufii,Abilify Asimtufii is anatypical antipsychoticu...,Abilify Asimtufii is not approved for use in o...,Increased weight (17%); Akathisiamovement diso...,aripiprazole,Atypical antipsychotics
3,4,Abilify Maintena,Abilify Maintena (aripiprazole) extended-relea...,Abilify Maintena is not approved for use in ol...,Increase in weight17%; An inability to remain ...,aripiprazole lauroxil (injection),Atypical antipsychotics
4,5,Abiraterone,Abirateroneworks by reducing androgen producti...,Abiraterone tablets should not be handled by a...,"swelling in your ankles or feet, pain in your ...",abiraterone acetate,Miscellaneous antineoplastics


In [None]:
df_cleaned.to_csv('/content/drive/MyDrive/Colab Notebooks/Diplomska/drugs_data_final.csv', index=False)

In [4]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Diplomska/drugs_data_final.csv')

In [6]:
df['Side Effects'] = df['Side Effects'].str.replace(';;', ';', regex=False)

In [7]:
df.head()

Unnamed: 0,Drug ID,Drug Name,Usage,Warnings,Side Effects,Generic Name,Drug Class
0,1,Abecma,Abecma (idecabtagene vicleucel) is a CAR T-cel...,Cytokine release syndrome or CRS. Abecma can c...,"cytokine release syndrome- CRS (confusion, tro...",idecabtagene vicleucel,Miscellaneous antineoplastics
1,2,Abilify,Abilify (aripiprazole) is an antipsychotic med...,Abilify is not approved for use in older adult...,blurred vision; increased saliva ordrooling; m...,aripiprazole,Atypical antipsychotics
2,3,Abilify Asimtufii,Abilify Asimtufii is anatypical antipsychoticu...,Abilify Asimtufii is not approved for use in o...,Increased weight (17%); Akathisiamovement diso...,aripiprazole,Atypical antipsychotics
3,4,Abilify Maintena,Abilify Maintena (aripiprazole) extended-relea...,Abilify Maintena is not approved for use in ol...,Increase in weight17%; An inability to remain ...,aripiprazole lauroxil (injection),Atypical antipsychotics
4,5,Abiraterone,Abirateroneworks by reducing androgen producti...,Abiraterone tablets should not be handled by a...,"swelling in your ankles or feet, pain in your ...",abiraterone acetate,Miscellaneous antineoplastics


In [8]:
df.to_csv('/content/drive/MyDrive/Colab Notebooks/Diplomska/drugs_data_final.csv', index=False)