In [1]:
from coveopush import CoveoPush
from coveopush import Document
from coveopush import CoveoPermissions
from coveopush import CoveoConstants
from bs4 import BeautifulSoup
import requests
import json
from config import sourceId, orgId, apiKey


In [2]:
# Push API client. Source/organization ids and the API key come from config.py;
# the endpoint is the SDK's QA push URL constant.
push = CoveoPush.Push(
    p_Endpoint=CoveoConstants.Constants.PushApiEndpoint.QA_PUSH_API_URL,
    p_ApiKey=apiKey,
    p_OrganizationId=orgId,
    p_SourceId=sourceId,
)

# Identity handed to SetAllowedAndDeniedPermissions for every pushed document.
# NOTE(review): the empty second argument is presumably the security provider
# name -- confirm against the coveopush SDK.
user_email = "jmeng@coveo.com"
my_permissions = CoveoPermissions.PermissionIdentity(
    CoveoConstants.Constants.PermissionIdentityType.User,
    "",
    user_email,
)

# Passed as the third argument of SetAllowedAndDeniedPermissions in pushData.
allowAnonymous = True

In [3]:
# SCRAPE PokemonDB
def requestAndParse(URL, timeout=30):
    """Download a page and parse its body as HTML.

    Args:
        URL: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the notebook
            indefinitely.

    Returns:
        A BeautifulSoup tree built from the response body with 'html.parser'.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout,
            or a 4xx/5xx response.
    """
    page = requests.get(URL, timeout=timeout)
    # Fail here on an HTTP error rather than parsing an error page and
    # crashing later on a missing element.
    page.raise_for_status()
    return BeautifulSoup(page.content, 'html.parser')

NATIONAL_DEX_URL = "https://pokemondb.net/pokedex/national"

# Fetch and parse the national Pokédex listing page.
soup = requestAndParse(NATIONAL_DEX_URL)

# Every <div class="infocard-list"> on the page; processGenerations() treats
# each one, in page order, as one generation (numbered from 1).
generations = soup.find_all('div', class_="infocard-list")

In [4]:
def processGenerations():
    """Extract Pokémon records for every entry of the module-level `generations`.

    Returns:
        A list with one element per entry of `generations`, each being the
        result of processGeneration() for that entry. Entries are numbered
        from 1 in the order they appear.
    """
    print('processing gnerations')
    return [
        processGeneration(generationBlock, number)
        for number, generationBlock in enumerate(generations, start=1)
    ]

def processGeneration(generationData, generationNumber):
    """Extract records for every info card found inside one generation block.

    Args:
        generationData: Parsed element searched for <div class="infocard">.
        generationNumber: Number passed through to each extracted record.

    Returns:
        Whatever extractAllPokemonData() returns for the cards found.
    """
    return extractAllPokemonData(
        generationData.find_all('div', class_="infocard"),
        generationNumber,
    )

In [5]:
def extractAllPokemonData(infoCards, generationNumber):
    """Turn a sequence of info cards into a list of Pokémon records.

    Args:
        infoCards: Iterable of card elements, each handed to
            extractDataFromCard().
        generationNumber: Number passed through to each record.

    Returns:
        A list with one extractDataFromCard() result per card, in input order.
        The list is also printed before being returned.
    """
    records = [extractDataFromCard(card, generationNumber) for card in infoCards]
    print(records)
    return records

In [6]:
def getWeightAndDescription(pokemonURL):
    """Scrape the weight and first Pokédex entry from a Pokémon's detail page.

    Args:
        pokemonURL: Absolute URL of the detail page to fetch.

    Returns:
        A dict with two keys:
            "pokemonweight": first whitespace-separated token of the <td>
                that follows the <th> whose string is "Weight".
            "pokemondescription": text of the first <td> inside the <div>
                that follows the <h2> whose string is "Pokédex entries".

    Raises:
        ValueError: If the page does not contain the expected weight row or
            Pokédex entries table. The message names the URL so a failing
            page can be identified during a long scrape.
    """
    infoPage = requestAndParse(pokemonURL)

    # `string=` is the current name of the filter formerly spelled `text=`.
    weightHeader = infoPage.find("th", string="Weight")
    weightCell = None
    if weightHeader is not None:
        weightCell = weightHeader.find_next_sibling("td")
    weightTokens = weightCell.text.split() if weightCell is not None else []
    if not weightTokens:
        raise ValueError(f"No weight value found on {pokemonURL}")

    entriesHeading = infoPage.find("h2", string="Pokédex entries")
    entriesBlock = None
    if entriesHeading is not None:
        entriesBlock = entriesHeading.find_next_sibling("div")
    descriptionCell = entriesBlock.find("td") if entriesBlock is not None else None
    if descriptionCell is None:
        raise ValueError(f"No Pokédex entry found on {pokemonURL}")

    return {
        "pokemonweight": weightTokens[0],
        "pokemondescription": descriptionCell.text,
    }

In [7]:
def extractDataFromCard(infoCard, generationNumber):
    """Build one Pokémon record from a listing card plus its detail page.

    Args:
        infoCard: Parsed card element from the listing page.
        generationNumber: Stored unchanged under the "generation" key.

    Returns:
        A dict with keys "name", "type" (list of type names), "generation",
        "pictureURL", "pokemonURL", "pokemonNumber", "pokemonweight" and
        "pokemondescription".
    """
    nameLink = infoCard.find('a', class_="ent-name")
    name = nameLink.text
    detailURL = "https://pokemondb.net" + nameLink.get("href")
    spriteURL = infoCard.find('span', class_="img-fixed").get("data-src")

    # Network round trip: weight and description only exist on the detail page.
    details = getWeightAndDescription(detailURL)

    # Drop the first character of the <small> text (presumably a leading '#').
    dexNumber = infoCard.find('small').text[1:]
    typeNames = [typeLink.text for typeLink in infoCard.find_all('a', class_='itype')]

    return {
        "name": name,
        "type": typeNames,
        "generation": generationNumber,
        "pictureURL": spriteURL,
        "pokemonURL": detailURL,
        "pokemonNumber": dexNumber,
        "pokemonweight": details["pokemonweight"],
        "pokemondescription": details["pokemondescription"],
    }

In [8]:
def pushData(generationData, limit=None):
    """Queue one Coveo document per Pokémon record on the module-level `push`.

    Args:
        generationData: Iterable of generations, each an iterable of record
            dicts as produced by extractDataFromCard().
        limit: Optional maximum number of documents to queue, handy for a
            quick trial run. None (the default) queues every record; a value
            of 0 or less queues nothing.

    Returns:
        None. Each record is printed and then added to `push`.
    """
    # (metadata key on the document, key in the record dict)
    metadataFields = (
        ("type", "type"),
        ("generation", "generation"),
        ("pictureurl", "pictureURL"),
        ("pokemonurl", "pokemonURL"),
        ("pokemonnumber", "pokemonNumber"),
        ("pokemonweight", "pokemonweight"),
        ("pokemondescription", "pokemondescription"),
    )

    pushedCount = 0
    for generation in generationData:
        for pokemon in generation:
            # Stop before touching the record once the cap is reached.
            if limit is not None and pushedCount >= limit:
                return
            print(pokemon)
            pokemonName = pokemon["name"]
            mydoc = Document(f"file://folder/pokemon/{pokemonName}")
            mydoc.FileExtension = ".html"
            mydoc.Title = pokemonName
            mydoc.AddMetadata("connectortype", "CSV")
            mydoc.SetData(pokemonName)
            for metadataKey, recordKey in metadataFields:
                mydoc.AddMetadata(metadataKey, pokemon[recordKey])
            mydoc.SetAllowedAndDeniedPermissions([my_permissions], [], allowAnonymous)
            push.Add(mydoc)
            pushedCount += 1

In [None]:
# Upper bound handed to the push client for a single request: 150 MiB.
MAX_REQUEST_BYTES = 150 * 1024 * 1024

# Scrape everything first, then open the push session, queue the documents,
# and close the session.
# NOTE(review): the (True, True) flags on Start/End are presumably
# update-status / delete-older -- confirm against the coveopush SDK.
extractedData = processGenerations()

push.Start(True, True)
push.SetSizeMaxRequest(MAX_REQUEST_BYTES)
pushData(extractedData)
push.End(True, True)