# Pipeline Challenge

In [None]:
import tqdm
import pandas as pd
import requests 
from bs4 import BeautifulSoup
import re
import numpy as np
from selenium import webdriver
import time

# Example of Web Scraping Pipeline

The following is a sequence of steps used to obtain some information from the web. Try to understand the code and then refactor it into a pipeline-like structure. This is not a simple task and there are hundreds of ways to do it. 

## Extract links for the page

In [None]:
import os

url = 'https://www.opendota.com/matches/pro'

# Path to the chromedriver binary. Set the CHROMEDRIVER_PATH environment variable
# to run the notebook on another machine; the fallback is the original local path.
chromedriver_path = os.environ.get(
    'CHROMEDRIVER_PATH',
    'C:/Users/andreaguiar/Desktop/usr/dist/chromedriver.exe',
)

# we need to use selenium since the page loads dynamic html
driver = webdriver.Chrome(executable_path=chromedriver_path)

# navigate to page
driver.get(url)
# fixed pause so the dynamically generated html has time to load
time.sleep(2)

# find all table rows
rows = driver.find_elements_by_tag_name('tr')

# Collect the href of the link in the first cell of every row (skipping the first row).
# find_elements (plural) returns an empty list instead of raising, so a row without
# a <td> or without a link is skipped rather than aborting the whole extraction.
links = []
for row in rows[1:]:
    cells = row.find_elements_by_tag_name('td')
    if not cells:
        continue
    anchors = cells[0].find_elements_by_tag_name('a')
    if anchors:
        links.append(anchors[0].get_attribute('href'))

In [None]:
# Display the match links collected by the extraction cell.
links

## Enter links to get info

### Let's first create the code for 1 link

In [None]:
# take the first link as the worked example
inside_url = links[0]
print(inside_url)

# get only the numerical part of the link, which is the match id
# (raw string so that \d is a regex digit class rather than an invalid string escape)
match_id = re.findall(r'\d+', inside_url)[0]
print(match_id)

In [None]:
# navigate to page
driver.get(inside_url)

# we need to give some time for the browser to reach the page.
time.sleep(2)

# get tables: query the DOM once so both elements come from the same page state
tables = driver.find_elements_by_tag_name('table')
table1 = tables[0]
table2 = tables[1]

## Download table1 to a dataframe and start its cleaning.

In [None]:
# get table1 (index=0)
df1 = pd.read_html(driver.page_source)[0]

# throw away some columns and rows (you know there should only be 5 rows == 5 players for each team)
df1 = df1.drop(columns=['/', '/.1', 'Jogador', 'Buffs']).rename(columns={'Unnamed: 16': 'jungle_itens'})
# .loc[:4] is label-based and inclusive, i.e. rows 0..4.
# .copy() makes df1 an independent frame so that assigning new columns to it
# later does not raise SettingWithCopyWarning on a slice of the parsed table.
df1 = df1.loc[:4, :].copy()

# rename the columns so as to obtain a readable dataframe
# ('Buffs' was already dropped above, so its entry in this mapping has no effect)
df1 = df1.rename(columns={'K':'kills','M':'deaths','A':'assistences','LVL':'lvl','FN':'creep_kill','N':'creep_deny', 'OPM':'gold_per_minute', 
                        'XPM':'exp_per_minute', 'Itens':'itens','O':'total_gold','DH':'hero_damage','DT':'tower_damage','CH':'hero_heal', 'Buffs':'tomes'})

# get names, links and ids of each user (the anchors are queried once and reused)
anchors1 = table1.find_elements_by_tag_name('a')
names1 = [anchor.text for anchor in anchors1]
name_links1 = [anchor.get_attribute('href') for anchor in anchors1]
# the user id is the first run of digits in the profile link
user_ids1 = [re.findall(r'\d+', link)[0] for link in name_links1]

## Clean item names

This part of the code finds the names of the items each player has bought. Don't worry about the details.

In [None]:
# The item name is the part of the icon URL between 'items/' and '_lg.png'.
# Raw string: \. must reach the regex engine as an escaped dot, not be read as a string escape.
item_name_pattern = r'items/(.*)_lg\.png'

# First element with class 'sc-uJMKN' of every cell that has one.
# Each cell is queried once; find_elements returns matches in document order,
# so found[0] is the same element find_element_by_class_name would return.
# NOTE(review): 'sc-uJMKN' looks like an auto-generated CSS class name - confirm it is stable.
itens1 = []
for cell in table1.find_elements_by_tag_name('td'):
    found = cell.find_elements_by_class_name('sc-uJMKN')
    if found:
        itens1.append(found[0])

name_itens1 = [re.findall(item_name_pattern, item.find_element_by_tag_name('object').get_attribute('data'))[0]
               for item in itens1]

# Clean jungle item names (same procedure, using the 'neutral' class)
jungle_itens1 = []
for cell in table1.find_elements_by_tag_name('td'):
    found = cell.find_elements_by_class_name('neutral')
    if found:
        jungle_itens1.append(found[0])

name_jungle_itens1 = [re.findall(item_name_pattern, item.find_element_by_tag_name('object').get_attribute('data'))[0]
                      for item in jungle_itens1]


## Input values into table 1

In [None]:
# Attach the scraped per-player values to the team 1 frame, one column at a time
# (dict order is the order in which the columns are added).
team1_columns = {
    'jungle_itens': name_jungle_itens1,
    'itens': name_itens1,
    'name': names1,
    'links': name_links1,
    'user_ids': user_ids1,
    'match_id': 5 * [match_id],
}
for column_name, column_values in team1_columns.items():
    df1[column_name] = column_values

## This is your final dataframe for team 1

In [None]:
# Display the finished dataframe for team 1.
df1

# Let's repeat the process for team 2

In [None]:
# get table2 (index=1)
df2 = pd.read_html(driver.page_source)[1]

df2 = df2.drop(columns=['/', '/.1', 'Jogador', 'Buffs']).rename(columns={'Unnamed: 16': 'jungle_itens'})
# rows 0..4 (label-based, inclusive); .copy() so the column assignments at the end of
# this cell work on an independent frame and do not raise SettingWithCopyWarning
df2 = df2.loc[:4, :].copy()

# ('Buffs' was already dropped above, so its entry in this mapping has no effect)
df2 = df2.rename(columns={'K':'kills','M':'deaths','A':'assistences','LVL':'lvl','FN':'creep_kill','N':'creep_deny', 'OPM':'gold_per_minute', 
                        'XPM':'exp_per_minute', 'Itens':'itens','O':'total_gold','DH':'hero_damage','DT':'tower_damage','CH':'hero_heal', 'Buffs':'tomes'})

# names, links and ids of each user (the anchors are queried once and reused)
anchors2 = table2.find_elements_by_tag_name('a')
names2 = [anchor.text for anchor in anchors2]
name_links2 = [anchor.get_attribute('href') for anchor in anchors2]
# the user id is the first run of digits in the profile link
user_ids2 = [re.findall(r'\d+', link)[0] for link in name_links2]

# The item name is the part of the icon URL between 'items/' and '_lg.png'
# (raw string so that \. is an escaped dot for the regex engine).
item_name_pattern = r'items/(.*)_lg\.png'

# first 'sc-uJMKN' element of every cell of table2 that has one
itens2 = []
for cell in table2.find_elements_by_tag_name('td'):
    found = cell.find_elements_by_class_name('sc-uJMKN')
    if found:
        itens2.append(found[0])

name_itens2 = [re.findall(item_name_pattern, item.find_element_by_tag_name('object').get_attribute('data'))[0]
               for item in itens2]

# Team 2's jungle items must be read from table2; reading table1 here would
# silently copy team 1's jungle items into team 2's dataframe.
jungle_itens2 = []
for cell in table2.find_elements_by_tag_name('td'):
    found = cell.find_elements_by_class_name('neutral')
    if found:
        jungle_itens2.append(found[0])

name_jungle_itens2 = [re.findall(item_name_pattern, item.find_element_by_tag_name('object').get_attribute('data'))[0]
                      for item in jungle_itens2]

df2['jungle_itens'] = name_jungle_itens2
df2['itens'] = name_itens2
df2['name'] = names2
df2['links'] = name_links2
df2['user_ids'] = user_ids2
df2['match_id'] = 5 * [match_id]

In [None]:
# Display the finished dataframe for team 2.
df2

------

# First let's clean up the code!

**This code was written in a rush. Let's clean up the mess of df1 and table1 stuff and create a function.**

Create a function that receives `table` like so:

```python
def clean_table(table, index=0):
    # organize the processes above ...
    
    return df
```

This function will receive the table object from the selenium driver element and it should return a cleaned dataframe with the above processes. If you specify `index=0`, it should return the table for the first team, if you specify `index=1` it should return the table for the second team.

In [None]:
def _first_item_names(table, class_name, size=5):
    """Return the item name from each cell of ``table`` that holds a ``class_name`` element.

    Only the first matching element of a cell is used. The name is the part of
    the icon URL (the ``data`` attribute of that element's ``<object>`` tag)
    between ``items/`` and ``_lg.png``. The result is padded with ``''`` or
    truncated so that it always has exactly ``size`` entries.
    """
    item_names = []
    for cell in table.find_elements_by_tag_name('td'):
        # one query per cell; find_elements returns [] when the cell has no such element
        matches = cell.find_elements_by_class_name(class_name)
        if not matches:
            continue
        icon_url = matches[0].find_element_by_tag_name('object').get_attribute('data')
        item_names.append(re.findall(r'items/(.*)_lg\.png', icon_url)[0])
    return (item_names + size * [''])[:size]


def clean_table(table, index=0):
    """Build the cleaned per-player dataframe for one team of the match page currently open.

    Parameters
    ----------
    table : selenium WebElement
        The team's ``<table>`` element. Player names, profile links and item
        icons are read from it.
    index : int, default 0
        Position of the team's table in ``pd.read_html(driver.page_source)``:
        0 for the first team, 1 for the second. It should point at the same
        table as ``table``.

    Returns
    -------
    pandas.DataFrame
        Rows 0..4 of the parsed table with readable column names, plus the
        columns ``jungle_itens``, ``itens``, ``name``, ``links``, ``user_ids``
        and ``match_id``.

    Raises
    ------
    ValueError
        If the number of links found in ``table`` differs from the number of
        rows kept (pandas rejects the column assignment).
    IndexError
        If a profile link or the current URL contains no digits.

    Notes
    -----
    Reads the notebook-level ``driver``: the numeric columns come from
    ``driver.page_source`` and the match id from ``driver.current_url``, so the
    browser must already be on the match page.
    """
    n_players = 5

    df = (
        pd.read_html(driver.page_source)[index]
        .drop(columns=['/', '/.1', 'Jogador', 'Buffs'])
        .rename(columns={'Unnamed: 16': 'jungle_itens'})
    )
    # .loc is label-based and inclusive, so this keeps rows 0..4.
    # .copy() gives an independent frame, so the column assignments below do not
    # raise SettingWithCopyWarning on a slice of the parsed table.
    df = df.loc[:n_players - 1, :].copy()

    # ('Buffs' was already dropped above, so its entry in this mapping has no effect)
    df = df.rename(columns={'K':'kills','M':'deaths','A':'assistences','LVL':'lvl','FN':'creep_kill','N':'creep_deny', 'OPM':'gold_per_minute', 
                            'XPM':'exp_per_minute', 'Itens':'itens','O':'total_gold','DH':'hero_damage','DT':'tower_damage','CH':'hero_heal', 'Buffs':'tomes'})

    # player names, profile links and ids (the anchors are queried once and reused)
    anchors = table.find_elements_by_tag_name('a')
    names = [anchor.text for anchor in anchors]
    name_links = [anchor.get_attribute('href') for anchor in anchors]
    # the user id is the first run of digits in the profile link
    user_ids = [re.findall(r'\d+', link)[0] for link in name_links]
    # the match id is the first run of digits in the URL of the page being scraped
    match_id = re.findall(r'\d+', driver.current_url)[0]

    # get item names
    # NOTE(review): 'sc-uJMKN' looks like an auto-generated CSS class name - confirm it is stable.
    name_itens = _first_item_names(table, 'sc-uJMKN', n_players)
    name_jungle_itens = _first_item_names(table, 'neutral', n_players)

    df['jungle_itens'] = name_jungle_itens
    df['itens'] = name_itens
    df['name'] = names
    df['links'] = name_links
    df['user_ids'] = user_ids
    # a scalar is broadcast to every row
    df['match_id'] = match_id

    return df

## In such a way that the following code returns the dataframe of the first team

In [None]:
from datetime import datetime

# first team: table 0 of the page currently open
table = driver.find_elements_by_tag_name('table')[0]
team_a = clean_table(table, index=0)

# second team: table 1 of the same page
table = driver.find_elements_by_tag_name('table')[1]
team_b = clean_table(table, index=1)

# Stack both teams with a fresh 0..n-1 index, then label each row with its
# team ('a' for the first five rows, 'b' for the next five) and the current timestamp.
teams = (
    pd.concat([team_a, team_b])
    .reset_index(drop=True)
    .assign(team=['a'] * 5 + ['b'] * 5, date=datetime.today())
)

In [None]:
# Display the combined dataframe holding both teams.
teams

# store in database

Create the `dota` database on your pgAdmin and substitute `admin` by your own password.

In [None]:
import os

from sqlalchemy import create_engine

# Connection URL for the `dota` database. Set the DOTA_DATABASE_URL environment
# variable to supply your own credentials without writing a password into the
# notebook; the fallback is the original local connection string.
database_url = os.environ.get(
    'DOTA_DATABASE_URL',
    'postgresql+psycopg2://postgres:admin@localhost/dota',
)

engine = create_engine(database_url)
# connection reused by the to_sql calls in the cells below
conn = engine.connect()

# Let's get all the tables from every match page and append them to a table in the database

## Remember to log

In [None]:
# NOTE: this rebinds the name `tqdm` from the module imported in the first cell
# to the progress-bar callable, which is the form the scraping loop calls.
from tqdm.auto import tqdm
import logging
# emit INFO-level messages and above
logging.basicConfig(level=logging.INFO)

# named logger used by the scraping loop to report progress and failures
logger = logging.getLogger('dota')

In [None]:
# Write the combined frame to the 'pro_matches' table, replacing the table if it already exists.
teams.to_sql('pro_matches', conn, index=False, if_exists='replace')

In [None]:
# Display the dataframe of the second team.
team_b

In [None]:
# Start from an empty table: every link is scraped and stored again on each run.
engine.execute('DROP TABLE IF EXISTS pro_matches')

# One cleaned frame per stored match; they are combined into `df` once, after the loop.
stored_frames = []

for link in tqdm(links):
    # navigate to link
    driver.get(link)

    # give time for browser to reach page
    time.sleep(1)

    print(link)
    # Add check to only perform the code below if the value is not in the database (add this condition on the pipeline)
    match_id = re.findall(r'\d+', link)[0]

    # if the match_id was not found in the database:
    logger.info(f'Match {match_id} not yet found in database. Storing it.')

    try:
        tables = driver.find_elements_by_tag_name('table')

        team_a = clean_table(tables[0], index=0)
        # index=1 is required here: with the default index=0 the second team's
        # stats would be parsed from the first team's table.
        team_b = clean_table(tables[1], index=1)

        teams = pd.concat([team_a, team_b])
        logger.info('Appending results')

        # Store every value as text. A cell that is just the '-' placeholder becomes '0';
        # replacing every '-' character instead would also alter hyphens inside
        # other values such as player names.
        teams = (
            teams.convert_dtypes()
            .applymap(lambda value: '0' if str(value) == '-' else str(value))
            .astype(str)
        )
        teams.to_sql('pro_matches', conn, if_exists='append', index=False)

        stored_frames.append(teams)

    except Exception:
        # Best effort: keep going with the next link, but log the traceback so the
        # reason for the failure is visible. KeyboardInterrupt is not caught.
        logger.warning(f'{link} could not be stored.', exc_info=True)

# All stored matches in one frame (empty if nothing could be stored).
df = pd.concat(stored_frames, ignore_index=True) if stored_frames else pd.DataFrame()

# Missions: restructure this process into a pipeline.

## <u>COOKIECUTTER</u>: Use cookiecutter to create your new structure of files

After installing cookiecutter, run:
`cookiecutter https://github.com/aguiarandre/etl-pipelines`

This will create your pipeline's folder structure.

## <u>ORGANIZATION - USING .PY FILES</u>: Transform the above steps into a structured .py pipeline

Remember to separate the parameters on their own separate file. The connection on another.

## <u>DOCUMENTATION</u>: Document each function of your pipeline. Then use sphinx to create your code's documentation

Go into the `your_project/docs` folder and run `./make.bat html` or `./make html` (don't forget to run `pip install -r requirements.txt` first).

## <u>IDEMPOTENCY PRINCIPLE</u>: Avoid duplication in your database. Only perform the storage step if today's date is not there.

## BONUS: <u>MORE BENEFITS OF .PY FILES</u>: Create a scheduler to perform this task once a day.

Use **crontab** if you're a Mac user.
After allowing cron to have Full Disk Access on `Security & Privacy`, write in your crontab: 

> `0 0 * * * full/path/to/your/python/executable full/path/to/pipeline.py`
>
> (`0 0 * * *` runs the job once a day, at midnight; `* * * * *` would run it every minute.)

Use **task-scheduler** if you are on Windows. Create a `run.bat` script in the same folder as your `pipeline.py`. Write inside: 

> `python.exe pipeline.py`

Then go to task-scheduler (Agendar Tarefas) and create a new task. Give it a name, add a new trigger specifying the times and a new action specifying the path/to/your/run.bat, and fill in 'Start at' with the path to your /project/src folder, where your pipeline lives.