# Import Libraries

In [1]:
%load_ext autoreload
%autoreload 2


from prefect import Flow, task
from prefect.tasks.shell import ShellTask
from prefect.schedules import Schedule
from datetime import timedelta
from dask.distributed import Client
from prefect.executors import LocalDaskExecutor
from prefect.schedules.clocks import IntervalClock
import pendulum

import os
import sys
import pandas as pd

# Put the parent of the current working directory on sys.path so that the
# project-local `src` package can be imported from this notebook.
# NOTE: the path is resolved against the working directory at the time this
# line runs, i.e. before any later os.chdir call in this cell.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from src.data import io

# When the kernel starts inside the notebooks folder, move up one level so the
# relative paths used further down (e.g. "data/...") resolve from the parent
# directory. os.path.basename is used instead of splitting on "/" so the check
# also works where the path separator is not "/" (e.g. Windows).
cwd = os.path.basename(os.getcwd())
if cwd == "jupyter_notebooks":
    os.chdir(os.pardir)

In [2]:
# Executor instance handed to the flow below via Flow(..., executor=executor).
executor = LocalDaskExecutor()

# Scrape Events Flow

In [3]:
# Location of the scraped event-URL CSV, relative to the working directory.
# Defined once so the Scrapy output path and the CSV read cannot drift apart.
EVENT_URLS_CSV = "data/events/event_urls.csv"

# Weekly schedule: the clock starts on 2020-12-13 at 06:00 America/Los_Angeles
# and repeats every 7 days.
event_schedule = Schedule(
    [
        IntervalClock(
            start_date=pendulum.datetime(2020, 12, 13, 6, tz="America/Los_Angeles"),
            interval=timedelta(days=7),
        )
    ]
)

# The helper script is supplied to ShellTask together with each command;
# presumably it switches into the Scrapy project directory before the command
# runs -- TODO confirm against the ShellTask `helper_script` documentation.
shell_task = ShellTask(helper_script="cd scrapy_ufcstats")

with Flow("Scrape Events", executor=executor, schedule=event_schedule) as f:
    # Run the `events` spider and write its output to the CSV. The "../" prefix
    # is needed because the path is given relative to the directory the helper
    # script changes into (assumed to be one level below the working directory).
    scrapy_crawl_events_cmd = f"scrapy crawl events -O ../{EVENT_URLS_CSV}"
    scrapy_shell_task = shell_task(command=scrapy_crawl_events_cmd)

    # Read the scraped event URLs; declaring the crawl as an upstream task
    # makes the read wait for the crawl.
    event_urls_df = io.read_csv(EVENT_URLS_CSV, upstream_tasks=[scrapy_shell_task])

    # Copy the freshly scraped event URLs into the "event_urls" table.
    io.df_to_table(df=event_urls_df, table_name="event_urls")


# Register the flow under the "scrape-ufcstats" project. This stays the last
# expression of the cell so the value it returns is shown as the cell output.
f.register("scrape-ufcstats")

Result check: OK
Flow URL: https://cloud.prefect.io/jasonminsookim-gmail-com-s-account/flow/ca3bd7bb-4682-46d9-b4aa-6dab42a2cccd
 └── ID: 47da1385-c4d0-4066-8069-3bfc6031b224
 └── Project: scrape-ufcstats
 └── Labels: ['Mac-mini.local']


'47da1385-c4d0-4066-8069-3bfc6031b224'