### En esta notebook...

- Mining del texto de la cartilla médica de Medicus en formato .pdf
- Extracción de los nombres por especialidad y zona
- Consulta de los nombres en Doctoralia
- Búsqueda de coincidencias _fuzzy_
- Scraping de la info 
- Inserción en DataFrame
- Selección del médico mejor puntuado

In [218]:
import os
import re
import pandas as pd
import numpy as np
from time import sleep
from collections import defaultdict
from lxml import html
pd.options.display.float_format = '{:,.2f}'.format
pd.options.display.max_colwidth = 200

In [2]:
#!pip install fuzzywuzzy[speedup]
#!pip install fuzzysearch

In [4]:
from fuzzysearch import find_near_matches
from fuzzywuzzy import fuzz
import textract
from utils import init_chrome

In [7]:
def parse_med_data(html_):
    """Parse a Doctoralia search-results page into a DataFrame.

    Parameters
    ----------
    html_
        Markup of the search page; it is handed directly to
        ``html.fromstring``.

    Returns
    -------
    pandas.DataFrame
        One row per doctor card with the columns ``name``, ``address``,
        ``score``, ``n`` and ``profile_url``. ``address`` is ``''`` when the
        card has no street element; ``score`` and ``n`` are both ``0`` when
        either of them cannot be read from the card.

    Raises
    ------
    IndexError
        If a card does not contain a "Doctor Name" link.
    """
    # Failures that simply mean "this optional piece of markup is absent or
    # not numeric": empty xpath result, ``None`` text, no digits, bad int.
    # Catching only these (instead of a bare ``except``) lets real problems
    # such as KeyboardInterrupt propagate.
    missing_markup = (IndexError, AttributeError, TypeError, ValueError)

    dom = html.fromstring(html_)
    data = []
    for e in dom.xpath('//div[@class="dp-doctor-card dp-doctor-card-md"]/parent::div'):
        # Look the name link up once; it provides both the name and the URL.
        name_link = e.xpath('.//a[@data-ga-label="Doctor Name"]')[0]
        name = name_link.text.strip()
        # Drop any '#fragment' from the profile link.
        profile_url = name_link.get('href').partition('#')[0]

        try:
            address = e.xpath('.//div[@class="address-name"]//span[@class="street"]')[0].text
        except IndexError:
            address = ''

        try:
            score = int(e.xpath('.//a[@data-ga-label="Reviews"]//@data-score')[0])
            # First run of digits in the text next to the score.
            n = int(re.search(r'\d+', e.xpath('.//span[@data-score]/span')[0].text).group(0))
        except missing_markup:
            # Score and count are reset together, even if only one failed.
            score, n = 0, 0

        data.append((name, address, score, n, profile_url))

    return pd.DataFrame(data, columns=['name', 'address', 'score', 'n', 'profile_url'])

In [6]:
# Machine-specific absolute path to the directory PDF.
path = '/home/matias/Matias/Programas Varios/corporate_area_metropolitana.pdf'
text = textract.process(path).decode('utf8')

# All patterns are raw strings so that regex escapes (\d, \(, \.) are not
# treated as (invalid) Python string escapes; the regex engine itself
# understands \n and \x0c.

# Page footer: title, date, 'Página N de 213' and a form feed.
PAGE_FOOTER_RE = re.compile(r'CARTILLA CORPORATE\n31/07/2019\n\nPágina \d+? de 213\n\n\x0c')
# Separator between the first fields of a page (one or two newlines).
FIELD_SEP_RE = re.compile(r'\n\n?')
# A blank line followed by a line made only of capitals and spaces starts a new part.
HEADING_SPLIT_RE = re.compile(r'\n\n(?=[A-Z ]+?\n)')
# An entry starts on its own line with one of the listed prefixes and may
# continue over following lines of capitals and/or a '(digits)' marker.
ENTRY_RE = re.compile(
    r'(?<=\n)'
    r'('
    r'(?:DRA?\.|CENTRO|CONS|CLI|SANA|HOSP|ALTER)'
    r'[A-Z, ]+'
    r'(?:\n?(?:[A-Z]+?)|\n?(?:\(\d+\))?)*'
    r')'
    r'(?=\n|$)'
)
# Newlines and '(digits)' markers are replaced by a space inside an entry.
ENTRY_CLEAN_RE = re.compile(r'\n|\(\d+\)')

pages = PAGE_FOOTER_RE.split(text)

# cartilla[zona][heading] -> list of cleaned entries
cartilla = defaultdict(lambda: defaultdict(list))
cuerpo_medico = ''
# The first and last chunks produced by the split are not processed.
for page in pages[1:-1]:
    # Raises ValueError if a page has fewer than three fields.
    tipo, zona, txt = FIELD_SEP_RE.split(page, maxsplit=2)

    if tipo != 'CUERPO MEDICO':
        continue

    cuerpo_medico += txt

    parts = HEADING_SPLIT_RE.split(txt.strip('\n'))
    # Key: first line of each part. If a heading repeats within one page,
    # the later part replaces the earlier one (plain dict semantics).
    med = {
        part.split('\n')[0]: [
            ENTRY_CLEAN_RE.sub(' ', entry).strip()
            for entry in ENTRY_RE.findall(part)
        ]
        for part in parts
    }
    for heading, entries in med.items():
        cartilla[zona][heading] += entries

In [7]:
# Character counts: full extracted text vs. text accumulated from 'CUERPO MEDICO' pages.
len(text), len(cuerpo_medico)

(605018, 394452)

In [8]:
# Headings collected under the 'CAPITAL FEDERAL' zone.
cartilla['CAPITAL FEDERAL'].keys()

dict_keys(['ADOLESCENCIA', 'ALERGIA E INMUNOLOGIA', 'ANATOMIA PATOLOGICA', 'ANATOMIA PATOLOGICA INFANTIL', 'ANATOMIA PATOLOGICA PULMONAR', 'ANDROLOGIA', 'CARDIOLOGIA', 'ERGOMETRIA', 'HOLTER', 'CARDIOLOGIA INFANTIL', 'ECOCARDIOGRAMA', 'ELECTROCARDIOGRAMA', 'CIRUGIA BARIATRICA', 'CIRUGIA CARDIOVASCULAR', 'CIRUGIA DE CABEZA Y CUELLO', 'CIRUGIA DE CADERA', 'CIRUGIA DE COLON Y RECTO', 'CIRUGIA DE COLUMNA', 'CIRUGIA DE MANO', 'CIRUGIA DEL QUEMADO', 'CIRUGIA GENERAL', 'CIRUGIA INFANTIL', 'CIRUGIA PERCUTANEA', 'CIRUGIA PLASTICA Y REPARADORA', 'CIRUGIA PLASTICA Y REPARADORA INFANTIL', 'CIRUGIA TORACICA', 'CIRUGIA VASCULAR PERIFERICA', 'TRATAMIENTO FLEBOESCLEROSANTE', 'CITODIAGNOSTICO', 'CLINICA MEDICA', 'DERMATOLOGIA', 'DERMATOLOGIA INFANTIL', 'DERMATOPATOLOGIA', 'DIABETOLOGIA Y NUTRICION', 'DIABETOLOGIA Y NUTRICION INFANTOJUVENIL', 'ENDOCRINOLOGIA Y METABOLISMO', 'ENDOCRINOLOGIA Y METABOLISMO INFANTIL', 'FONOAUDIOLOGIA', 'CENTRO MEDICUS AZCUENAGA', 'GASTROENTEROLOGIA', 'ECOGRAFIAS ABDOMINALES'

In [30]:
# Entries stored under 'CARDIOLOGIA' for 'CAPITAL FEDERAL'.
# `cartilla` is a nested defaultdict, so looking up a missing key creates an empty list.
cartilla['CAPITAL FEDERAL']['CARDIOLOGIA']

In [30]:
# Regex alternation: a heading is selected when any alternative occurs in it.
# NOTE(review): 'URO' is a substring match, so it would also select headings
# such as 'NEUROLOGIA' if present — confirm that this is intended.
q = 'URO|DERMA|CLINICA MEDICA|CARDIO'
capital = cartilla['CAPITAL FEDERAL']
# Union of the entries of every matching heading, stored under the pattern
# itself. The combined key is skipped (it contains 'URO', so it would match
# on a re-run) and the result is sorted so that it does not depend on set
# iteration order, which changes between kernel restarts.
capital[q] = sorted({
    m
    for k, entries in capital.items()
    if k != q and re.search(q, k)
    for m in entries
})

In [8]:
# Pandas

In [16]:
# NOTE(review): `df` is not created in any visible cell — presumably it holds
# the scraped search results; confirm before running on a fresh kernel.
# Keep the first row per profile URL, renumber the rows, then order by `n`
# descending (the index keeps the pre-sort numbering).
first_per_url = ~df['profile_url'].duplicated()
df = (
    df[first_per_url]
    .reset_index(drop=True)
    .sort_values(['n'], ascending=False)
)

In [33]:
# (rows, columns) of the current `df`.
df.shape

(751, 8)

In [18]:
def custom_ratio(str1, str2):
    """Return ``fuzz.token_sort_ratio`` for the two strings.

    Idea noted for later (not implemented): the directory writes names as
    "DR Surname, Name, Name" while the site writes "DR name surname"; a
    title-stripping ``re.sub`` was sketched but is not applied.
    """
    similarity = fuzz.token_sort_ratio(str1, str2)
    return similarity

In [19]:
def _row_ratio(row):
    """Similarity between the scraped name and the searched name of one row."""
    return custom_ratio(row['name'], row['search_name'])


df['dist'] = df.apply(_row_ratio, axis=1)

In [26]:
# A row counts as a match when its similarity reaches this value.
min_ratio = 80
df['match'] = df['dist'] >= min_ratio

In [29]:
# Matched rows only, restricted to the columns of interest, ascending by `n`.
shown_cols = ['name', 'search_name', 'profile_url', 'n', 'dist', 'score']
df.loc[df['match'], shown_cols].sort_values('n')

Unnamed: 0,name,search_name,profile_url,n,dist,score
586,Dra. Julieta Finkelstein,"Dra. Finkelstein, Julieta",https://www.doctoraliar.com/julieta-finkelstein/medico-clinico-nutricionista/capital-federal,0,100,0
620,Liliana Noemí Banin,"Dra. Banin, Liliana",https://www.doctoraliar.com/liliana-noemi-banin/medico-clinico/capital-federal,0,74,0
600,Dr. Narciso J. Bolomo,"Dr. Bolomo, Narciso Julio",https://www.doctoraliar.com/narciso-j-bolomo/medico-clinico/capital-federal,0,90,0
655,Dr. Martín J. Solanet,"Dra. Mato, Marta J",https://www.doctoraliar.com/martin-j-solanet/medico-clinico/capital-federal,0,71,0
641,Horacio Alejandro Argente,"Dr. Alleva, Alejandro Horacio",https://www.doctoraliar.com/horacio-alejandro-argente/medico-clinico/capital-federal,0,77,0
642,Jorge Alberto Scala,"Dr. Pascual, Jorge Alberto",https://www.doctoraliar.com/jorge-alberto-scala/medico-clinico-traumatologo/coronel-vidal,0,84,0
559,Dr. Carlos M. Grasso Fontan,"Dr. Grasso Fontan, Carlos Miguel",https://www.doctoraliar.com/carlos-m-grasso-fontan/medico-clinico/capital-federal,0,91,0
174,Sandra Braun,"Dra. Braun, Sandra Noemi",https://www.doctoraliar.com/sandra-braun/medico-clinico/capital-federal,0,71,0
142,Dr. Mariano Taverna,"Dr. Pages, Martin Mariano",https://www.doctoraliar.com/mariano-taverna/medico-clinico/capital-federal,0,73,0
182,Dra. María Fernanda Moschione,"Dra. Sanchez, Maria Fernanda",https://www.doctoraliar.com/maria-fernanda-moschione/medico-clinico/capital-federal,0,83,0


In [None]:
#df['check'] = df.apply(lambda x: d(' '.join(x['address'].split()), x['search_name'].lower(), cuerpo_medico.lower()))

In [705]:
# Rows whose profile URL contains 'cardio'.
is_cardio = df['profile_url'].str.contains('cardio')
cardios = df[is_cardio]
# NOTE(review): `especialidad` is not defined in any visible cell, and `zona`
# is whatever the PDF-parsing loop left behind — confirm both before relying
# on this comparison.
(cardios.shape, len(cartilla[zona][especialidad]))

((0, 6), 252)

In [697]:
import webbrowser

In [698]:
# Open the first three profile URLs of `df` in browser tabs. `head(3)` simply
# yields fewer rows when `df` is shorter, instead of raising IndexError.
for profile_url in df['profile_url'].head(3):
    webbrowser.open_new_tab(profile_url)

In [202]:
# Unused draft: Google search URL built from a `query`.
#url = f"https://www.google.com/search?q={query}&ie=utf-8&oe=utf-8"
# Open the URL currently held by `driver` in a new browser tab.
# NOTE(review): `driver` is not defined in any visible cell — presumably it
# was created with `init_chrome`; confirm.
webbrowser.open_new_tab(driver.current_url)
# Unused draft: show up to 100 characters of context around `address` in the directory text.
#re.search('.{,100}'+address+'.{,100}', cuerpo_medico, flags=re.DOTALL|re.IGNORECASE).group(0)