In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import time
import random
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from unidecode import unidecode
import xml.etree.ElementTree as ET
from cachecontrol import CacheControl
from requests.cookies import RequestsCookieJar
from googletrans import Translator
from medscraper.tools import tipo_cambio
from medscraper.tools import navegador
from medscraper.tools import tidy
import pkg_resources
from datetime import datetime

In [2]:
# Chrome launch flags used for every scraping session in this notebook.
options = webdriver.ChromeOptions()
for chrome_flag in (
    '--incognito',
    '--disable-blink-features=AutomationControlled',
    '--disable-cache',
    '--disable-cookies',
    '--headless',
):
    options.add_argument(chrome_flag)

# requests session wrapped by CacheControl and given a fresh, empty cookie jar.
# NOTE(review): the original comment said "disable cache", but CacheControl
# presumably adds HTTP caching to the session -- confirm the intent.
no_cookies = RequestsCookieJar()
session = CacheControl(requests.session())
session.cookies = no_cookies

In [3]:
# Medicines to look up: each row supplies the "med", "principio" and "strength"
# values that the scraping loop reads to build one query.
meds=pd.read_excel("correcciones_list_canada.xlsx")
meds

Unnamed: 0,med,principio,strength
0,acarbose,acarbosa,25+mg
1,acarbose,acarbosa,50+mg
2,acarbose,acarbosa,100+mg
3,adenosine,adenosina,/?type=generic&src=drug-suggest#prices
4,adenosine,adenosina,/?type=brand&src=drug-suggest#prices
...,...,...,...
190,pioglitazone,pioglitazona,30+mg
191,pioglitazone,pioglitazona,45+mg
192,actos,pioglitazona,15+mg
193,actos,pioglitazona,30+mg


In [4]:
# Build a small sample with the first occurrence of "clonidine", "adenosine"
# and "irinotecan+hcl". The explicit .copy() makes the sample an independent
# frame, so the assignment below never targets a slice of `meds`.
muestra = (
    meds[meds["med"].isin(["clonidine", "adenosine", "irinotecan+hcl"])]
    .drop_duplicates(subset=["med"])
    .copy()
)
# Overwrite the strength of the sampled "clonidine" row with "0.3+mg".
muestra.loc[muestra["med"] == "clonidine", "strength"] = "0.3+mg"
muestra

Unnamed: 0,med,principio,strength
3,adenosine,adenosina,/?type=generic&src=drug-suggest#prices
29,clonidine,clonidina,0.3+mg
126,irinotecan+hcl,irinotecan,40+mg%252f2+ml


In [5]:
# Site root and the suffix appended to "<med>/<strength>" price pages.
url_basica = "https://www.pharmacychecker.com/"
fin = "/#prices"

# Start Chrome with the chromedriver chosen by medscraper's helper and the
# options configured above.
chromedriver_service = Service(navegador.select_chromedriver())
driver = webdriver.Chrome(service=chromedriver_service, options=options)

In [6]:
def precios_pharmacychecker():
    """Parse the page currently loaded in the module-level ``driver``.

    Returns:
        tuple: ``(results, medida)`` where ``results`` is the list of
        ``<li class="drug-pricing-item mt-2">`` tags found on the page and
        ``medida`` is the text of the ``<div id="drug-strength">`` element.

    Raises:
        AttributeError: when the ``drug-strength`` div is not on the page
            (the lookup yields ``None`` and ``.text`` fails).
    """
    pagina = BeautifulSoup(driver.page_source, 'html.parser')
    medida = pagina.find("div", attrs={"id": "drug-strength"}).text
    results = pagina.find_all("li", attrs={"class": "drug-pricing-item mt-2"})
    return results, medida

def data_pcheck(results, medida):
    """Convert scraped price listings into a DataFrame, one row per listing.

    Args:
        results: iterable of listing elements (the ``<li>`` tags returned by
            ``precios_pharmacychecker``); each must support
            ``find(tag, attrs)``.
        medida: strength text of the queried drug, copied onto every row.

    Returns:
        pandas.DataFrame with columns ``precio``, ``farmacia``, ``producto``,
        ``quantity``, ``medida``, ``ship`` and ``precio_loc``. ``precio`` is
        an empty placeholder kept so the column order seen by existing callers
        does not change; the scraped price text is stored in ``precio_loc``.
        The schema is the same whether or not ``results`` is empty.

    Raises:
        AttributeError: if a listing lacks one of the expected elements.
    """
    columnas = ['precio', 'farmacia', 'producto', 'quantity', 'medida', 'ship', 'precio_loc']
    registros = []
    for result in results:
        # The same anchor carries both the pharmacy and the product name.
        enlace = result.find('a', {'class': 'cpc-drug-listings-link-image'})
        registros.append({
            'producto': enlace.get("data-drug"),
            'precio_loc': result.find('p', {'class': 'desktop-price'}).text,
            'farmacia': enlace.get("data-pharm"),
            'quantity': result.find('div', {'class': 'col-3 col-lg-3 drug-pricing-details hide-on-mobile'}).text,
            'medida': medida,
            'ship': result.find('span', {'class': 'drug-pricing-shipping-info'}).text,
        })
    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(registros, columns=columnas)

# List (not a dictionary) of Canadian pharmacy name fragments. The scraping
# loop labels a listing "Canada" when any of these substrings occurs in its
# 'farmacia' value, and "Estados Unidos" otherwise.
canada_pharmacies = ['liferxpharmacy', 'canadianprescriptiondrugstore', 'medsengage', 
                     'offshorecheapmeds', 'pricepropharmacy', 'canadianpharmacystore', 
                     'affordablerxmeds',
                     'discountcanadadrugs', 'spfpharmacy', '1800rxonline.com', 'buylowdrugs']

In [7]:
# Medicines whose "strength" cell already holds the full URL suffix (path and
# query string), so it is appended to the drug name without "/" or `fin`.
DIRECT_URL_MEDS = {"adenosine", "dactinomycin", "saxagliptin"}

# Factor applied to precio_loc to obtain the price in MXN.
# NOTE(review): hard-coded; medscraper.tools.tipo_cambio is imported but not
# used here -- confirm whether the rate should come from it.
TIPO_CAMBIO_MXN = 16.766700


def _scrape_query(med, strength, principio):
    """Load one price page and return its raw listings as a DataFrame.

    The frame returned by ``data_pcheck`` is tagged with the searched active
    ingredient ("busqueda") and the time of the query ("hora_consulta").
    """
    if med in DIRECT_URL_MEDS:
        driver.get(url_basica + med + strength)
    else:
        driver.get(url_basica + med + "/" + strength + fin)
    resultados, medida = precios_pharmacychecker()
    df = data_pcheck(resultados, medida)
    df["busqueda"] = principio
    df["hora_consulta"] = time.strftime("%H:%M:%S %d/%m/%Y")
    return df


def _first_match(matches):
    """Return the first regex match of a ``str.findall`` result, or None."""
    return matches[0] if len(matches) > 0 else None


def _clean_listings(raw):
    """Clean the concatenated raw listings and return the final table.

    Applies the notebook's cleaning steps exactly once: keep Canadian
    pharmacies, translate product names, strip the shipping amount from the
    price, convert to MXN, and split quantity / strength text into columns.
    """
    df = raw.copy()

    # Country: "Canada" when the pharmacy name contains a known fragment.
    df["pais"] = df["farmacia"].apply(
        lambda x: "Canada"
        if isinstance(x, str) and any(pharmacy in x for pharmacy in canada_pharmacies)
        else "Estados Unidos"
    )
    # Drop US listings first so they are never sent to the translator.
    df = df[df["pais"] == "Canada"].copy()

    # Product name: clean, translate English -> Spanish, clean again.
    # Each distinct name is translated once and mapped back onto the rows.
    df["producto"] = df["producto"].apply(tidy.clean_text)
    translator = Translator()
    traducciones = {
        nombre: translator.translate(nombre, src='en', dest='es').text
        for nombre in df["producto"].unique()
    }
    df["producto"] = df["producto"].map(traducciones).apply(tidy.clean_text)

    # Local price as float.
    df["precio_loc"] = df["precio_loc"].astype(str).apply(tidy.clean_text).astype(float)

    # Shipping: first decimal number before the word "Shipping", 0 if none.
    envio = (
        df["ship"].astype(str)
        .str.replace("\n", "", regex=False)
        .str.split("Shipping").str[0]
        .str.findall(r'(\d+\.\d+)')
        .apply(_first_match)
        .fillna(0)
        .astype(float)
    )
    df["precio_loc"] = df["precio_loc"] - envio
    df = df.drop(columns=["ship"])

    # Price in MXN.
    df["tipo_cambio"] = TIPO_CAMBIO_MXN
    df["precio"] = (df["precio_loc"] * df["tipo_cambio"]).round(2)
    df["farmaca_corto"] = "pchecker"
    df["product_name"] = df["producto"]

    # Number of units: first integer in the quantity text.
    df["quantity"] = df["quantity"].astype(str)
    df["num_prod"] = df["quantity"].str.findall(r'(\d+)').apply(_first_match)
    df["num_prod"] = df["num_prod"].astype(int, errors="ignore")

    # Undo known mistranslations of brand names. As in the original notebook,
    # "calcuencia" is only corrected in product_name, not in producto.
    df["producto"] = df["producto"].str.replace("rybelso", "rybelsus", regex=False)
    df["product_name"] = df["product_name"].str.replace("rybelso", "rybelsus", regex=False)
    df["product_name"] = df["product_name"].str.replace("calcuencia", "calquence", regex=False)

    # Presentation: first alphabetic word of the quantity text, normalised
    # through tidy.presentaciones (key <- any of its listed patterns).
    df["presentacion"] = df["quantity"].str.findall(r'([a-zA-Z]+)').apply(_first_match)
    for key, values in tidy.presentaciones.items():
        for value in values:
            mask = df["presentacion"].str.contains(value, case=False, na=False)
            df.loc[mask, "presentacion"] = key
    df["presentacion"] = df["presentacion"].str.replace("QTY", "tableta", regex=False)
    df["presentacion"] = df["presentacion"].str.replace("patches", "parches", regex=False)
    df["presentacion"] = df["presentacion"].str.replace("injections", "inyeccion", regex=False)

    # Strength text -> unit (first word) and numeric amount (first number).
    df["medida"] = df["medida"].astype(str)
    df["unit"] = df["medida"].str.findall(r'([a-zA-Z]+)').apply(_first_match)
    df["quantity_x"] = df["medida"].str.findall(r'(\d+(\.\d+)?)').apply(
        lambda x: x[0][0] if len(x) > 0 else None
    )
    df["quantity_x"] = df["quantity_x"].astype(float, errors='ignore')

    # Drop the raw text columns; the parsed strength amount becomes "quantity".
    df = df.drop(columns=["medida", "quantity"]).rename(columns={"quantity_x": "quantity"})
    return df.reset_index(drop=True)


# Scrape every (med, strength) query. A failing query is reported and skipped
# so that one bad page does not abort the run.
raw_frames = []
for index, row in meds.iterrows():
    med = row["med"]
    strength = row["strength"]
    principio_value = row["principio"]
    print(f"Med: {med} Strength {strength}")
    try:
        df = _scrape_query(med, strength, principio_value)
    except Exception as e:
        print(f"Error en la búsqueda de: {med} {strength}. Error details: {e}")
        continue
    if not df.empty:
        raw_frames.append(df)

# Concatenate once and clean once, instead of re-cleaning (and re-translating)
# the whole accumulated table after every query.
if raw_frames:
    master_df = _clean_listings(pd.concat(raw_frames, ignore_index=True))
else:
    # Nothing was scraped: keep an empty frame so later cells still run.
    master_df = pd.DataFrame()

Med: acarbose Strength 25+mg
Med: acarbose Strength 50+mg
Med: acarbose Strength 100+mg
Med: adenosine Strength /?type=generic&src=drug-suggest#prices
Med: adenosine Strength /?type=brand&src=drug-suggest#prices
Med: nesina Strength 6.25+mg
Med: nesina Strength 12.5+mg
Med: nesina Strength 25+mg
Med: axitinib Strength 1+mg
Med: axitinib Strength 5+mg
Med: inlyta Strength 1+mg
Med: inlyta Strength 5+mg
Med: bendamustine Strength 25+mg
Med: bendamustine Strength 100+mg
Med: bezafibrate Strength 400+mg
Med: velcade Strength 3.5+mg
Med: bosutinib Strength 400+mg
Med: bosutinib Strength 500+mg
Med: bosulif Strength 100+mg
Med: bosulif Strength 500+mg
Med: bumetanide Strength 1+mg
Med: bumetanide Strength 5+mg
Med: bumex Strength 1+mg
Med: bumex Strength 5+mg
Med: myleran Strength 2+mg
Med: invokana Strength 100+mg
Med: invokana Strength 300+mg
Med: zykadia Strength 150+mg
Med: mavenclad Strength 10+mg
Med: clonidine Strength 0.025+mg
Med: clonidine Strength 0.1+mg
Med: clonidine Strength 0.

In [8]:
# Preview the consolidated listings table.
master_df

Unnamed: 0,precio,farmacia,producto,precio_loc,busqueda,hora_consulta,pais,tipo_cambio,farmaca_corto,product_name,num_prod,presentacion,unit,quantity
0,1006.00,affordablerxmeds.com,acarbosa,60.00,acarbosa,14:00:37 03/10/2023,Canada,16.7667,pchecker,acarbosa,120,tableta,mg,25.0
1,296.44,liferxpharmacy.com/,acarbosa,17.68,acarbosa,14:00:37 03/10/2023,Canada,16.7667,pchecker,acarbosa,60,tableta,mg,25.0
2,370.54,liferxpharmacy.com/,acarbosa,22.10,acarbosa,14:00:37 03/10/2023,Canada,16.7667,pchecker,acarbosa,90,tableta,mg,25.0
3,390.33,medsengage.com,acarbosa,23.28,acarbosa,14:00:37 03/10/2023,Canada,16.7667,pchecker,acarbosa,60,tableta,mg,25.0
4,487.91,medsengage.com,acarbosa,29.10,acarbosa,14:00:37 03/10/2023,Canada,16.7667,pchecker,acarbosa,90,tableta,mg,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1639,1512.52,medsengage.com,actos,90.21,pioglitazona,17:58:04 03/10/2023,Canada,16.7667,pchecker,actos,84,tableta,mg,45.0
1640,1070.89,offshorecheapmeds.com,actos,63.87,pioglitazona,17:58:04 03/10/2023,Canada,16.7667,pchecker,actos,56,tableta,mg,45.0
1641,1460.38,offshorecheapmeds.com,actos,87.10,pioglitazona,17:58:04 03/10/2023,Canada,16.7667,pchecker,actos,84,tableta,mg,45.0
1642,1173.67,pricepropharmacy.com/,actos,70.00,pioglitazona,17:58:04 03/10/2023,Canada,16.7667,pchecker,actos,28,tableta,mg,45.0


In [9]:
# Save as CSV, without the index column, encoded as Latin-1.
master_df.to_csv("lev_canada_listados.csv", index=False, encoding="latin-1")