In [None]:
%pip install pandas
%pip install requests
%pip install bs4
%pip install lxml

In [None]:
import time

from bs4 import BeautifulSoup
import lxml
import pandas
import requests

In [None]:
# Page that is downloaded and scraped for its table.
URL = "https://openphish.com"
# File the scraped rows are appended to (relative to the working directory).
CSV_NAME = "openphish.csv"

In [None]:
def get_response_data(timeout=30):
    """Download the page at ``URL`` and return its raw body.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever, which would stall the
            whole polling loop.

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(URL, timeout=timeout)
    # Fail loudly on error pages instead of handing their HTML to the parser.
    response.raise_for_status()
    return response.content

In [None]:
def get_soup(site_content):
    """Parse raw HTML into a ``BeautifulSoup`` tree.

    The parser is named explicitly so the result does not depend on which
    parsers happen to be installed and so bs4 does not emit its
    "no parser was explicitly specified" warning. ``lxml`` is the parser this
    notebook installs and imports.

    Args:
        site_content: HTML markup as ``bytes`` or ``str``.

    Returns:
        The parsed ``BeautifulSoup`` document.
    """
    return BeautifulSoup(site_content, "lxml")

In [None]:
def get_table(soup):
    """Find the first ``<table>`` carrying the expected CSS classes.

    Args:
        soup: Parsed document exposing a ``find`` method.

    Returns:
        Whatever ``soup.find`` returns for the table lookup.
    """
    table_attrs = {"class": "pure-table pure-table-striped"}
    return soup.find("table", attrs=table_attrs)

In [None]:
def get_rows(table):
    """Return every ``<tr>`` element found inside the table's ``<tbody>``.

    Args:
        table: Table tag exposing a ``tbody`` attribute.

    Returns:
        The result of ``find_all("tr")`` on the table body.
    """
    body = table.tbody
    return body.find_all("tr")

In [None]:
def get_cells(rows):
    """Turn table rows into ``first;second;third`` strings.

    Only the text of the first three ``<td>`` cells of each row is used.
    Rows with fewer than three cells are skipped rather than raising
    ``IndexError``, so one malformed row cannot abort a whole scrape.

    Args:
        rows: Iterable of row tags exposing ``find_all("td")``.

    Returns:
        A list with one ``;``-joined string per usable row, in input order.
    """
    cells_per_row = (row.find_all("td") for row in rows)
    return [
        f"{cells[0].text};{cells[1].text};{cells[2].text}"
        for cells in cells_per_row
        if len(cells) >= 3
    ]

In [None]:
def write_to_csv(data):
    """Append every item of ``data`` to ``CSV_NAME``, one item per line.

    The file is opened in append mode, so it is created when missing and
    existing content is kept.

    Args:
        data: Iterable of rows; each one is formatted with ``str()`` and
            terminated with a newline.
    """
    # Build the full payload first so the file receives it in one write.
    payload = "".join(f"{row}\n" for row in data)
    with open(CSV_NAME, "a", encoding="UTF-8") as csv_file:
        csv_file.write(payload)

In [None]:
def read_from_csv():
    """Read ``CSV_NAME`` and return its lines without newline characters.

    Returns:
        A list with one string per line of the file.

    Raises:
        FileNotFoundError: If ``CSV_NAME`` does not exist yet.
    """
    with open(CSV_NAME, "r", encoding="UTF-8") as csv_file:
        return [line.replace("\n", "") for line in csv_file]

In [None]:
def get_sleep_secs(mins):
    """Convert a duration given in minutes to seconds."""
    seconds_per_minute = 60
    return mins * seconds_per_minute

In [None]:
def get_sleeps_amount(total, sleep):
    """Return one more than the number of whole ``sleep`` spans in ``total``.

    Args:
        total: Overall duration.
        sleep: Length of one interval, in the same unit as ``total``.

    Returns:
        ``total // sleep + 1``.
    """
    whole_intervals = total // sleep
    return whole_intervals + 1

In [None]:
def get_delta_data(old_data, new_data):
    """Return the items of ``new_data`` that do not occur in ``old_data``.

    Membership is checked against a set, so the cost stays linear as the
    stored history grows instead of rescanning the full list for every new
    row. The order of ``new_data`` is preserved, and an item repeated in
    ``new_data`` is returned as often as it appears there.

    Args:
        old_data: Already known rows (hashable items, e.g. strings).
        new_data: Freshly collected rows.

    Returns:
        A new list holding the unseen rows.
    """
    known = set(old_data)
    return [el for el in new_data if el not in known]

In [None]:
def get_new_data():
    """Scrape the page once and return its rows in reversed page order.

    Runs the full pipeline: download, parse, locate the table, extract the
    rows and format the cells.

    Returns:
        A list of ``;``-joined row strings, last table row first.
    """
    page = get_response_data()
    rows = get_rows(get_table(get_soup(page)))
    cells = get_cells(rows)
    # Presumably the page lists the newest entries first, so reversing makes
    # the appended CSV chronological -- confirm against the site.
    return cells[::-1]

In [None]:
# Polling window and interval. DELTA is handed to get_sleep_secs() as minutes;
# TOTAL is presumably in minutes as well, since it is divided by DELTA.
TOTAL = 60
DELTA = 5

# Seconds to pause between two polls.
SLEEP = get_sleep_secs(DELTA)
# Number of iterations the polling loop performs.
AMOUNT = get_sleeps_amount(TOTAL, DELTA)

In [None]:
def sleep(secs):
    """Pause execution for ``secs`` seconds (thin wrapper over ``time.sleep``)."""
    return time.sleep(secs)

In [None]:
def processing():
    """Run one polling cycle and append the unseen rows to the CSV file.

    The stored rows are read first (a missing file counts as "nothing stored
    yet"), then the page is scraped and only rows that are not stored yet are
    written.
    """
    try:
        known_rows = read_from_csv()
    except FileNotFoundError:
        # First run: the CSV does not exist yet.
        known_rows = []
    unseen_rows = get_delta_data(known_rows, get_new_data())
    write_to_csv(unseen_rows)

In [None]:
# Poll the site AMOUNT times, pausing SLEEP seconds between polls.
print("Started:", time.asctime(time.gmtime()))

for poll_index in range(AMOUNT):
    try:
        processing()
    except requests.RequestException as error:
        # A transient network failure should not abort the whole run; the
        # next cycle picks up whatever rows were missed.
        print("Poll failed, retrying on the next cycle:", error)
    # No pause after the final poll: otherwise the run lasts one interval
    # longer than intended and "Finished" is reported late.
    if poll_index < AMOUNT - 1:
        sleep(SLEEP)

print("Finished:", time.asctime(time.gmtime()))

In [None]:
def get_unique_urls():
    """Count the distinct URLs stored in the CSV file.

    Each stored line holds three ``;``-separated fields with the URL first.
    Lines are split from the right so that a URL which itself contains ``;``
    stays intact instead of being cut at its first semicolon. Lines that do
    not contain three fields (for example blank lines) are ignored.

    Returns:
        The number of different URLs found.
    """
    urls = set()
    for line in read_from_csv():
        fields = line.rsplit(";", 2)
        if len(fields) == 3:
            urls.add(fields[0])
    return len(urls)

In [None]:
from collections import Counter

def get_top_companies(limit=4):
    """Return the most frequent values of the second CSV field.

    Each stored line holds three ``;``-separated fields with the company in
    the middle. Lines are split from the right so that a ``;`` inside the
    leading URL cannot shift a URL fragment into the company position. Lines
    without three fields (for example blank lines) are ignored.

    Args:
        limit: How many entries to return. The default of 4 keeps the
            previous output.
            NOTE(review): the original local was named ``top_3`` although it
            requested 4 entries -- confirm which count is intended.

    Returns:
        A list of ``"<company> - <count>"`` strings, most frequent first.
    """
    companies = []
    for line in read_from_csv():
        fields = line.rsplit(";", 2)
        if len(fields) == 3:
            companies.append(fields[1])
    ranking = Counter(companies).most_common(limit)
    return [f"{name} - {count}" for name, count in ranking]

In [None]:
# Report how many distinct URLs were collected.
print("Unique urls: ", get_unique_urls())

In [None]:
# Show the most frequent companies as "<company> - <count>" strings.
print(get_top_companies())