In [49]:
import os
import random
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from googletrans import Translator
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By


In [50]:
# Translate the drug name from Spanish to English, then swap spaces for '+'
# so the result can be embedded in a URL.
traductor = Translator()
traduccion = traductor.translate('sorafenib', src='es', dest='en')
med = traduccion.text.replace(' ', '+')
med

'sorafenib'

In [51]:
# Price-listing page for the translated drug name on pharmacychecker.com.
url = ''.join(('https://www.pharmacychecker.com/', med, '/?src=drug-suggest#prices'))
url

'https://www.pharmacychecker.com/sorafenib/?src=drug-suggest#prices'

In [52]:
# Command-line switches handed to Chrome when the driver starts.
# NOTE(review): '--disable-cache' and '--disable-cookies' may not be switches
# Chrome recognises; confirm they have the intended effect.
options = webdriver.ChromeOptions()
for chrome_flag in (
    '--incognito',
    '--disable-blink-features=AutomationControlled',
    '--disable-cache',
    '--disable-cookies',
):
    options.add_argument(chrome_flag)

In [53]:
# Start Chrome and load the listing page. The browser is only closed by the
# `finally` further down, so if navigation or the page-source read fails here
# we close it ourselves before re-raising; otherwise the Chrome process leaks.
driver = webdriver.Chrome(service=Service("C:/chromedriver.exe"), options=options)
try:
    driver.get(url)
    html = driver.page_source
except BaseException:
    driver.quit()
    raise

# Parse the rendered page so the listing type can be detected below.
soup = BeautifulSoup(html, 'html.parser')

# Detect which listing type the page is showing before scraping prices.
def parse_tipo(raw_text):
    """Extract the listing type from the text of the type-selector element.

    Newlines are removed and the text is stripped. The value is whatever
    follows the marker 'Currently Viewing ' (stripped). If that marker is
    absent, whatever follows 'Currently ViewingAlso See' is returned as-is.
    Returns None when neither marker is present.
    """
    flat = raw_text.replace('\n', '').strip()

    after_primary = flat.split('Currently Viewing ')
    if len(after_primary) > 1:
        return after_primary[1].strip()

    # Fallback marker: the label is immediately followed by 'Also See' once
    # the newlines have been removed.
    after_fallback = flat.split('Currently ViewingAlso See')
    if len(after_fallback) > 1:
        return after_fallback[1]

    return None


tipo = None
try:
    div = soup.find('div', class_='input-group draggable-btn-input-group')
    if div is not None:
        tipo = parse_tipo(div.text)
except Exception:
    # Best effort: any failure while inspecting the page means "type unknown".
    tipo = None

# Helper that reads the price listings and strength from the current page.
def precios_pharmacychecker():
    """Scrape the page currently loaded in the global `driver`.

    Returns:
        (results, medida): `results` is the list of
        ``<li class="drug-pricing-item mt-2">`` elements found on the page;
        `medida` is the text of the ``<div id="drug-strength">`` element, or
        None when that element is not present (previously this case raised
        AttributeError).
    """
    page = BeautifulSoup(driver.page_source, 'html.parser')

    strength_div = page.find("div", {"id": "drug-strength"})
    medida = strength_div.text if strength_div is not None else None

    results = page.find_all("li", {"class": "drug-pricing-item mt-2"})
    return results, medida

# Collect listings according to `tipo`; the browser is always closed afterwards.
try:
    if tipo == 'Brand':
        # Only brand listings are shown: nothing to collect for generics.
        results_patents, medida_patents = precios_pharmacychecker()
        results_generics, medida_generics = [], []

    elif tipo is None:
        # Listing type could not be determined: collect nothing.
        results_patents, medida_patents = [], []
        results_generics, medida_generics = [], []

    else:
        # Any other type: scrape the current page as generics, then follow
        # the 'drug-type-link' element and scrape again as brand listings.
        results_generics, medida_generics = precios_pharmacychecker()

        try:
            element = driver.find_element(By.CLASS_NAME, 'drug-type-link')
            driver.execute_script("arguments[0].click();", element)
            # NOTE(review): the page is re-read immediately after the click with
            # no explicit wait; confirm the new listing has rendered by then.
            results_patents, medida_patents = precios_pharmacychecker()
        except Exception:
            # Best effort: no link, or scraping failed, means no brand rows.
            # Both names are set so later cells never hit an undefined variable.
            results_patents, medida_patents = [], []
finally:
    driver.quit()


In [54]:
# Report how many listing elements were scraped for each type.
for etiqueta, elementos in (
    ("elementos genéricos: ", results_generics),
    ("elementos patente: ", results_patents),
):
    print(etiqueta, len(elementos))

elementos genéricos:  11
elementos patente:  12


In [68]:
#Genéricos
def data_pcheck(results, medida):
    """Turn scraped listing elements into a DataFrame, one row per listing.

    Args:
        results: iterable of listing elements exposing ``find(tag, attrs)``;
            each is expected to contain the price, pharmacy link, quantity
            and shipping nodes looked up below.
        medida: strength value copied into every row.

    Returns:
        DataFrame with columns precio, farmacia, producto, cantidad, medida
        and ship. With no listings the frame is empty but keeps the columns.

    Raises:
        AttributeError: if a listing lacks one of the expected nodes.
    """
    columnas = ['precio', 'farmacia', 'producto', 'cantidad', 'medida', 'ship']

    # Collect plain records and build the frame once; concatenating inside the
    # loop copies the whole frame on every iteration.
    registros = []
    for result in results:
        # The same anchor carries both the pharmacy and the product name.
        enlace = result.find('a', {'class': 'cpc-drug-listings-link-image'})
        registros.append({
            'precio': result.find('p', {'class': 'desktop-price'}).text,
            'farmacia': enlace.get("data-pharm"),
            'producto': enlace.get("data-drug"),
            'cantidad': result.find('div', {'class': 'col-3 col-lg-3 drug-pricing-details hide-on-mobile'}).text,
            'medida': medida,
            'ship': result.find('span', {'class': 'drug-pricing-shipping-info'}).text,
        })

    return pd.DataFrame(registros, columns=columnas)

# One frame per listing type; a type with no scraped rows gets an empty frame.
df_generics = data_pcheck(results_generics, medida_generics) if results_generics else pd.DataFrame()
df_patents = data_pcheck(results_patents, medida_patents) if results_patents else pd.DataFrame()

# Stack both frames into a single table.
df = pd.concat([df_generics, df_patents], ignore_index=True)

# An empty table is left untouched; otherwise add the country and clean `ship`.
if not df.empty:
    canada_pharmacies = ['liferxpharmacy', 'canadianprescriptiondrugstore', 'medsengage',
                         'offshorecheapmeds', 'pricepropharmacy', 'canadianpharmacystore',
                         'discountcanadadrugs', 'spfpharmacy']
    # A pharmacy whose name contains any of the entries above is tagged as
    # Canadian; everything else is tagged as United States.
    df["pais"] = df["farmacia"].apply(
        lambda x: "Canada" if any(pharmacy in x for pharmacy in canada_pharmacies) else "Estados Unidos"
    )

    # Shipping cost: drop newlines, keep the text before the word "Shipping",
    # take the first decimal number in it, and use 0 when there is none.
    df['ship'] = (
        df['ship']
        .str.replace('\n', '')
        .str.split('Shipping').str[0]
        .str.extract(r'(\d+\.\d+)', expand=False)
        .astype(float)
        .fillna(0.0)
    )
df

Unnamed: 0,precio,farmacia,producto,cantidad,medida,ship,pais
0,$378.00,affordablerxmeds.com,Sorafenib,QTY: 120,200 mg,18.0,Estados Unidos
1,$443.00,affordablerxmeds.com,Sorafenib,QTY: 150,200 mg,18.0,Estados Unidos
2,$433.95,canadianprescriptiondrugstore.com/,Sorafenib,QTY: 150,200 mg,14.95,Canada
3,$223.34,liferxpharmacy.com/,Sorafenib,60 tablets,200 mg,9.0,Canada
4,$301.27,liferxpharmacy.com/,Sorafenib,90 tablets,200 mg,9.0,Canada
5,$232.23,medsengage.com,Sorafenib,60 tablets,200 mg,9.95,Canada
6,$313.06,medsengage.com,Sorafenib,90 tablets,200 mg,9.95,Canada
7,$319.59,offshorecheapmeds.com,Sorafenib,60 tablets,200 mg,9.95,Canada
8,$367.23,offshorecheapmeds.com,Sorafenib,90 tablets,200 mg,9.95,Canada
9,$999.95,pricepropharmacy.com/,Sorafenib,90 tablets,200 mg,9.95,Canada


In [65]:
# Print the dtype and non-null count of every column in `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   price     23 non-null     object 
 1   pharmacy  23 non-null     object 
 2   producto  23 non-null     object 
 3   cantidad  23 non-null     object 
 4   medida    23 non-null     object 
 5   ship      23 non-null     float64
 6   pais      23 non-null     object 
dtypes: float64(1), object(6)
memory usage: 1.4+ KB


In [57]:
# if not results:
#     df=pd.DataFrame()  
# else:
#     data=[]
#     for result in results:
#         info={}
#         info["precio"]=result.find("p",{"class":"desktop-price"}).text
#         info["farmacia"] = result.find("a", {"class": "cpc-drug-listings-link-image"}).get("data-pharm")
#         info["producto"] = result.find("a", {"class": "cpc-drug-listings-link-image"}).get("data-drug")
#         info["cantidad"] = result.find("div", {"class": "col-3 col-lg-3 drug-pricing-details hide-on-mobile"}).text
#         #Añadir país
#         data.append(info)
#     df=pd.DataFrame(data)
#     #Añadir pais
#     canada_pharmacies = ['liferxpharmacy', 'canadianprescriptiondrugstore', 'medsengage', 'offshorecheapmeds', 'pricepropharmacy']
#     df["pais"] = df["farmacia"].apply(lambda x: "Canada" if any([pharmacy in x for pharmacy in canada_pharmacies]) else "Estados Unidos")

In [58]:
# Rebinds `url` (previously the pharmacychecker listing URL) to the Google home page.
url="https://www.google.com/"

In [59]:
# driver = webdriver.Chrome(service=Service("C:/chromedriver.exe"),options=options)

# driver.get(url)
# #Escribir búsqueda
# search_box = driver.find_element(By.CLASS_NAME, 'gLFyf')
# search_box.send_keys('aysearch.com')
# search_box.submit()
# html = driver.page_source
# sopa = BeautifulSoup(html, 'html.parser')

In [60]:
import requests
import json

def search(query, api_key, cx, timeout=None):
    """Query the Google Custom Search JSON API and return the decoded response.

    Args:
        query: search terms. Sent through ``params`` so that spaces, '&', '#'
            and similar characters are URL-encoded instead of corrupting the
            request URL.
        api_key: API key, sent as the ``key`` parameter.
        cx: search engine identifier, sent as the ``cx`` parameter.
        timeout: optional timeout in seconds passed to ``requests.get``;
            None (the default) waits indefinitely.

    Returns:
        The JSON body of the response, decoded by ``response.json()``.
    """
    response = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params={"key": api_key, "cx": cx, "q": query},
        timeout=timeout,
    )
    return response.json()

# The API key is read from the environment instead of being committed in the
# notebook; the previously hardcoded key should be treated as leaked.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this cell.")
cx = "01098e64b067c4b4a"
query = "Google IO"
results = search(query, api_key, cx)

# Keep only the title and snippet of each hit. "items" is read with a default
# so a response without that key gives an empty list instead of a KeyError.
resultados = [[item["title"], item["snippet"]] for item in results.get("items", [])]
resultados


[['Google I/O 2023',
  'Tune in to watch the latest news and innovations from Google. Join I/O for livestreamed keynotes and helpful product updates on demand.'],
 ['Google I/O - Wikipedia',
  'Google I/O (or simply I/O) is an annual developer conference held by Google in Mountain View, California. The name "I/O" is taken from the number googol,\xa0...'],
 ['Google I/O 2022',
  "I/O '22 recap. From announcements and technology deep-dives to bringing the developer community together, I/O is a unique experience for everyone who attends."],
 ['Google I/O 2023: Pixel Fold, Pixel 7a, and everything else announced',
  '8 days ago ... Google I/O 2023 was held on May 10. While many of the talks and presentations at I/O are developer focused, the event started, as always,\xa0...'],
 ['About Google I/O 2023',
  "I/O is Google's flagship event featuring the latest announcements and updates in technology. Developers can tune in online for the live streamed keynotes, watch\xa0..."],
 ['At Google I/

In [61]:
#resultados

In [62]:
# def search(query, api_key, cx, start=1):
#     url = f"https://www.googleapis.com/customsearch/v1"
#     params = {
#         'key': api_key,
#         'cx': cx,
#         'q': query,
#         'start': start
#     }
#     response = requests.get(url, params=params)
#     return response.json()

# api_key = os.environ["GOOGLE_API_KEY"]  # credential redacted; load it from the environment
# cx = "01098e64b067c4b4a"
# query = "aysearch.com"

# # Specify the number of pages of results you want
# num_pages = 5

# for page in range(0, num_pages):
#     start = page * 10 + 1  # start must be 1 for the first page, 11 for the second page, etc.
#     results = search(query, api_key, cx, start)
#     for result in results["items"]:
#         print(result["title"], result["link"])


In [63]:
# Display the raw result entries stored under "items" in the API response.
results["items"]

[{'kind': 'customsearch#result',
  'title': 'Google I/O 2023',
  'htmlTitle': '<b>Google I/O</b> 2023',
  'link': 'https://io.google/',
  'displayLink': 'io.google',
  'snippet': 'Tune in to watch the latest news and innovations from Google. Join I/O for livestreamed keynotes and helpful product updates on demand.',
  'htmlSnippet': 'Tune in to watch the latest news and innovations from <b>Google</b>. Join <b>I/O</b> for livestreamed keynotes and helpful product updates on demand.',
  'cacheId': 'cBm3ZQiMsTAJ',
  'formattedUrl': 'https://io.google/',
  'htmlFormattedUrl': 'https://<b>io</b>.<b>google</b>/',
  'pagemap': {'cse_thumbnail': [{'src': 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRypYPDYqgv4j9sWzIFcM0ptNzjXDRgRR5qVHjvWWJ-ap34X06joMQhHmc',
     'width': '310',
     'height': '163'}],
   'metatags': [{'og:image': 'https://io.google/2023/app/images/og-image.jpg',
     'og:type': 'website',
     'viewport': 'width=device-width, initial-scale=1.0',
     'og:title': 'Goo