# 0. Import Libraries and Initialize Local Dask Executor

In [5]:
%load_ext autoreload
%autoreload 2


from prefect import Flow, task
from prefect.tasks.shell import ShellTask
from prefect.schedules.clocks import IntervalClock
from prefect.schedules import Schedule
from dask.distributed import Client
from prefect.executors import LocalDaskExecutor

from datetime import datetime, timedelta
import pendulum
import os
import sys
import pandas as pd

# Locate the project root so that `src` is importable.
# If the current directory already contains the `src` package (e.g. this cell
# is re-run after the chdir below), use it; otherwise fall back to the parent
# directory, which is the root when the kernel starts in jupyter_notebooks/.
# Resolving it this way keeps the cell idempotent: a re-run no longer appends
# the parent of the project root to sys.path.
module_path = os.path.abspath(os.curdir if os.path.isdir("src") else os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)
from src.data import io

# Run the rest of the notebook from the project root, since later cells use
# paths relative to it (e.g. "data/events/", "scrapy_ufcstats").
# basename() is used instead of split("/") so the check does not depend on
# the platform's path separator.
cwd = os.path.basename(os.getcwd())
if cwd == "jupyter_notebooks":
    os.chdir(os.pardir)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Shared executor instance, passed to the Flows below via `executor=executor`.
executor = LocalDaskExecutor()

# 1. Scrape ufcstats.com

## 1.1 Scrape Events Flow

In [4]:
# Sets a Schedule for the Flow to run every Sunday at 6:00 am
# (America/Los_Angeles). The anchor date 2020-12-13 is a Sunday and the clock
# fires every 7 days from it.
event_schedule = Schedule(
    [
        IntervalClock(
            start_date=pendulum.datetime(2020, 12, 13, 6, tz="America/Los_Angeles"),
            interval=timedelta(days=7),
        )
    ]
)

# Instantiates a ShellTask. The helper script changes into the Scrapy project
# directory, so the command's output path is written relative to it ("../data").
shell_task = ShellTask(helper_script="cd scrapy_ufcstats")


@task
def build_scrapy_crawl_events_cmd():
    """Build the `scrapy crawl events` shell command for the current Flow run.

    This is a task on purpose: code placed directly inside the `with Flow(...)`
    block runs once, when the Flow is built/registered. Computing the date
    there would freeze the registration date into the command, so every
    scheduled run would overwrite the same CSV. As a task, the date is
    evaluated each time the Flow runs.

    Returns:
        str: The scrapy command, writing (-O, overwrite) to
        ../data/events/event_urls_<YYYY-MM-DD>.csv for today's local date.
    """
    date_today = datetime.now().date()
    return f"scrapy crawl events -O ../data/events/event_urls_{date_today}.csv"


with Flow("Scrape Events", executor=executor, schedule=event_schedule) as f:
    # Scrapy crawl of ufcstats.com's event URLs; the command (including
    # today's date) is produced at run time by the task above.
    scrapy_crawl_events_cmd = build_scrapy_crawl_events_cmd()
    shell_task_command = shell_task(command=scrapy_crawl_events_cmd)

    # Extracts the file names of the .csv files located in the path, then gets
    # the most recent date. `upstream_tasks` forces this step to wait for the
    # crawl to finish even though no data is passed between the two tasks.
    # NOTE(review): the io.* helpers are defined in src.data.io and are assumed
    # to be Prefect tasks (get_max_date presumably logs the result) — confirm.
    csv_file_names = io.get_filenames(
        path="data/events/", extension="csv", upstream_tasks=[shell_task_command]
    )
    csv_file_names_arr = io.arr_to_series(csv_file_names)
    string_dates_arr = io.get_string_date(csv_file_names_arr)
    io.get_max_date(string_dates_arr)

# Registers the Flow to the Prefect project.
f.register("scrape-ufcstats")

Result check: OK
Flow URL: https://cloud.prefect.io/jasonminsookim-gmail-com-s-account/flow/ca3bd7bb-4682-46d9-b4aa-6dab42a2cccd
 └── ID: 39a50dc2-dee4-469f-8894-d30d9b77f421
 └── Project: scrape-ufcstats
 └── Labels: ['Mac-mini.local']


'39a50dc2-dee4-469f-8894-d30d9b77f421'

## 1.2 Scrape (work in progress: PostgreSQL connection test)

In [8]:
# Ad-hoc "Test" Flow that contains only io.connect_postgres(), so that step
# can be run by itself.
with Flow("Test", executor=executor) as f:
    io.connect_postgres()
# Registration to the "scrape-ufcstats" project is commented out for this
# Flow; it is only executed locally with f.run().
# f.register("scrape-ufcstats")
f.run()

[2021-01-15 21:27:45-0800] INFO - prefect.FlowRunner | Beginning Flow run for 'Test'
[2021-01-15 21:27:45-0800] INFO - prefect.TaskRunner | Task 'connect_postgres': Starting task run...
[2021-01-15 21:27:45-0800] INFO - prefect.connect_postgres | Config file loaded successfully.
[2021-01-15 21:27:45-0800] INFO - prefect.connect_postgres | Connecting to the PostgreSQL database fightdata ...
[2021-01-15 21:27:45-0800] INFO - prefect.connect_postgres | Connection to db successful.
[2021-01-15 21:27:45-0800] INFO - prefect.TaskRunner | Task 'connect_postgres': Finished task run for task with final state: 'Success'
[2021-01-15 21:27:45-0800] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded


<Success: "All reference tasks succeeded.">