In [8]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import time
import random
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from unidecode import unidecode
import xml.etree.ElementTree as ET
from cachecontrol import CacheControl
from requests.cookies import RequestsCookieJar
from googletrans import Translator
from medscraper.tools import tipo_cambio
from medscraper.tools import navegador
from medscraper.tools import tidy
import pkg_resources
from datetime import datetime

In [9]:
options=webdriver.ChromeOptions()
options.add_argument('--incognito')
options.add_argument('--disable-blink-features=AutomationControlled')
options.add_argument('--disable-cache')
options.add_argument('--disable-cookies')
options.add_argument('--headless')
        
#Desabilitar cache
no_cookies = RequestsCookieJar()
session=CacheControl(requests.session())
session.cookies=no_cookies

In [10]:
#Meds a buscar
meds=pd.read_excel("correcciones_list_canada.xlsx")
meds

Unnamed: 0,med,principio,strength
0,acarbose,acarbosa,25+mg
1,acarbose,acarbosa,50+mg
2,acarbose,acarbosa,100+mg
3,adenosine,adenosina,/?type=generic&src=drug-suggest#prices
4,adenosine,adenosina,/?type=brand&src=drug-suggest#prices
...,...,...,...
190,pioglitazone,pioglitazona,30+mg
191,pioglitazone,pioglitazona,45+mg
192,actos,pioglitazona,15+mg
193,actos,pioglitazona,30+mg


In [11]:
#crear una muestra filtrando primera concurrencia de "clonidine", "hydrochlorothiazide" y "invokana"
muestra=meds[meds["med"].isin(["clonidine", "adenosine", "irinotecan+hcl"])].drop_duplicates(subset=["med"])
#Sustituir "0.025" por "0.3" en "clonidine"
muestra.loc[muestra["med"]=="clonidine", "strength"]="0.3+mg"
muestra

Unnamed: 0,med,principio,strength
3,adenosine,adenosina,/?type=generic&src=drug-suggest#prices
29,clonidine,clonidina,0.3+mg
126,irinotecan+hcl,irinotecan,40+mg%252f2+ml


In [12]:
url_basica="https://www.pharmacychecker.com/"
fin="/#prices"
driver=webdriver.Chrome(service=Service("C:/chromedriver.exe"), options=options)

In [13]:
def precios_pharmacychecker():
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    medida=soup.find("div", {"id": "drug-strength"}).text
    results=soup.find_all("li", {"class": "drug-pricing-item mt-2"})
    return results, medida

def data_pcheck(results,medida):
    df = pd.DataFrame(columns=['precio', 'farmacia', 'producto', 'quantity', 'medida', 'ship'])
    for result in results:
        price = result.find('p', {'class': 'desktop-price'}).text
        pharmacy = result.find('a', {'class': 'cpc-drug-listings-link-image'}).get("data-pharm")
        producto = result.find('a', {'class': 'cpc-drug-listings-link-image'}).get("data-drug")
        cantidad = result.find('div', {'class': 'col-3 col-lg-3 drug-pricing-details hide-on-mobile'}).text
        ship = result.find('span', {'class': 'drug-pricing-shipping-info'}).text
        df = pd.concat([df, pd.DataFrame({'producto': producto,'precio_loc': price, 'farmacia': pharmacy, 
                                           'quantity': cantidad, 
                                          'medida': medida, 'ship': ship}, index=[0])], ignore_index=True)
    return df

#Diccionario de farmacias
canada_pharmacies = ['liferxpharmacy', 'canadianprescriptiondrugstore', 'medsengage', 
                     'offshorecheapmeds', 'pricepropharmacy', 'canadianpharmacystore', 
                     'affordablerxmeds',
                     'discountcanadadrugs', 'spfpharmacy', '1800rxonline.com', 'buylowdrugs']

In [14]:
master_df = pd.DataFrame()
for index, row in meds.iterrows():
    med = row["med"]
    strength = row["strength"]
    principio_value = row["principio"]
    print(f"Buscando el medicamento: {med} {strength}")
    # sleep_duration = random.uniform(2, 4)
    # time.sleep(sleep_duration)
    try:
        if med == "adenosine" or med == "dactinomycin" or med== "saxagliptin":
            driver.get(url_basica + med + strength)
        else:
            driver.get(url_basica + med + "/" + strength + fin)
        
        html = driver.page_source
        soup = BeautifulSoup(html,'html.parser')
        resultados, medida = precios_pharmacychecker()
        df = data_pcheck(resultados, medida)
        df["principio"] = principio_value

        # Pegar la info de la muestra
        master_df = pd.concat([master_df, df], ignore_index=True)
        
        # Limpiar
        # pais
        master_df["pais"] = master_df["farmacia"].apply(lambda x: "Canada" if any(pharmacy in x for pharmacy in canada_pharmacies) else "Estados Unidos")
        master_df['producto'] = master_df['producto'].apply(tidy.clean_text)
        
        # Traducir del inglés al español
        translator = Translator()
        master_df['producto'] = master_df['producto'].apply(lambda x: translator.translate(x, src='en', dest='es').text)
        master_df['producto'] = master_df['producto'].apply(tidy.clean_text)
        master_df["product_name"]=master_df["producto"]
        master_df["precio_loc"] = master_df["precio_loc"].apply(tidy.clean_text)
        master_df["tipo_cambio"] = 16.7667
        master_df['ship'] = master_df['ship'].astype(str)
        master_df["ship"] = master_df["ship"].str.replace("\n","")
        master_df["ship"] = master_df["ship"].str.split("Shipping").str[0]
    
        
        # Extraer números con decimales
        master_df['ship'] = master_df['ship'].str.findall(r'(\d+\.\d+)').apply(lambda x: x[0] if len(x)>0 else None)
        
        # Sustituir None por 0
        master_df['ship'] = master_df['ship'].fillna(0)
        
        # Convertir a float
        master_df['ship'] = master_df['ship'].astype(float)

        #Incorporar farmacia_corto
        master_df["farmacia_corto"] = "pchecker"

        #Obtener presentación
        
        master_df['presentacion'] = master_df['quantity'].str.findall(r'([a-zA-Z]+)').apply(lambda x: x[0] if len(x)>0 else None)
        for key, values in tidy.presentaciones.items():
            for value in values:
                mask = master_df['presentacion'].str.contains(value, case=False, na=False)
                master_df.loc[mask, 'presentacion'] = key
        
        master_df['num_prod'] = master_df['quantity'].str.findall(r'(\d+)').apply(lambda x: x[0] if len(x)>0 else None)
        #Transformar a float
        master_df['num_prod'] = master_df['num_prod'].astype(int, errors='ignore')
        #Incorporar hora de consulta en formato dia-mes-año y hora
        master_df["hora_consulta"] = datetime.now().strftime("%d/%m/%Y %H:%M:%S")

    except Exception as e:
        print(f"Error en la búsqueda de: {med} {strength}. Error details: {e}")
        continue


master_df["precio_loc"] = master_df["precio_loc"].astype(float)
#Restar a precio_loc el shipping
master_df["precio_loc"] = master_df["precio_loc"] - master_df["ship"]
#Obtener el precio en pesos
master_df["precio"] = (master_df["precio_loc"] * master_df["tipo_cambio"]).round(2)
#Eliminar quantity
master_df.drop('quantity', axis=1, inplace=True)
#Extraer unidad de medida
master_df['unit'] = master_df['medida'].str.findall(r'([a-zA-Z]+)').apply(lambda x: x[0] if len(x)>0 else None)
        #Extraer cantidad de medida
master_df['quantity'] = master_df['medida'].str.findall(r'(\d+(\.\d+)?)').apply(lambda x: x[0][0] if len(x)>0 else None)
master_df['quantity'] = master_df['quantity'].astype(float, errors='ignore')
        #Eliminar medida
master_df.drop('medida', axis=1, inplace=True)
#Renombrar columna principio por busqueda
master_df.rename(columns={"principio":"busqueda"}, inplace=True)
#Eliminar columna ship
master_df.drop('ship', axis=1, inplace=True)
#Sustitur "QTY" por "tableta" en presentacion, si no, dejar igual
master_df["presentacion"] = np.where(master_df["presentacion"]=="QTY", "tableta", master_df["presentacion"])
#Eliminar registros de Estados Unidos
master_df = master_df[master_df["pais"] == "Canada"]

Buscando el medicamento: acarbose 25+mg
Buscando el medicamento: acarbose 50+mg
Buscando el medicamento: acarbose 100+mg
Buscando el medicamento: adenosine /?type=generic&src=drug-suggest#prices
Buscando el medicamento: adenosine /?type=brand&src=drug-suggest#prices
Buscando el medicamento: nesina 6.25+mg
Buscando el medicamento: nesina 12.5+mg
Buscando el medicamento: nesina 25+mg
Buscando el medicamento: axitinib 1+mg
Buscando el medicamento: axitinib 5+mg
Buscando el medicamento: inlyta 1+mg
Buscando el medicamento: inlyta 5+mg
Buscando el medicamento: bendamustine 25+mg
Buscando el medicamento: bendamustine 100+mg
Buscando el medicamento: bezafibrate 400+mg
Buscando el medicamento: velcade 3.5+mg
Buscando el medicamento: bosutinib 400+mg
Buscando el medicamento: bosutinib 500+mg
Buscando el medicamento: bosulif 100+mg
Buscando el medicamento: bosulif 500+mg
Buscando el medicamento: bumetanide 1+mg
Buscando el medicamento: bumetanide 5+mg
Buscando el medicamento: bumex 1+mg
Buscando

KeyboardInterrupt: 

In [15]:
master_df

Unnamed: 0,precio,farmacia,producto,quantity,medida,ship,precio_loc,principio,pais,product_name,tipo_cambio,farmacia_corto,presentacion,num_prod,hora_consulta
0,,affordablerxmeds.com,acarbosa,QTY: 120,25 mg,18.0,78.00,acarbosa,Canada,acarbosa,16.7667,pchecker,QTY,120.0,03/10/2023 08:59:00
1,,liferxpharmacy.com/,acarbosa,60 tablets,25 mg,9.0,26.68,acarbosa,Canada,acarbosa,16.7667,pchecker,tableta,60.0,03/10/2023 08:59:00
2,,liferxpharmacy.com/,acarbosa,90 tablets,25 mg,9.0,31.10,acarbosa,Canada,acarbosa,16.7667,pchecker,tableta,90.0,03/10/2023 08:59:00
3,,medsengage.com,acarbosa,60 tablets,25 mg,9.95,33.23,acarbosa,Canada,acarbosa,16.7667,pchecker,tableta,60.0,03/10/2023 08:59:00
4,,medsengage.com,acarbosa,90 tablets,25 mg,9.95,39.05,acarbosa,Canada,acarbosa,16.7667,pchecker,tableta,90.0,03/10/2023 08:59:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
478,,medsengage.com,sprycel,90 tablets,140 mg,\n \n ...,"$17,745.14",dasatinib,Canada,,,,,,
479,,offshorecheapmeds.com,sprycel,30 tablets,140 mg,\n \n ...,"$6,413.29",dasatinib,Canada,,,,,,
480,,offshorecheapmeds.com,sprycel,90 tablets,140 mg,\n \n ...,"$17,298.97",dasatinib,Canada,,,,,,
481,,pricepropharmacy.com/,sprycel,30 tablets,140 mg,\n \n ...,"$12,379.95",dasatinib,Canada,,,,,,


In [16]:
#salvar como csv
master_df.to_csv("lev_canada_listados.csv", index=False, encoding="latin-1")