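# In [220]:
# web page: https://rekt.news/
# tools: requests + BeautifulSoup (with Selenium-driven headless Chrome) for scraping;
#        pandas is imported for data processing, but the visible code writes its output with the csv module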


# url request and download & save web page as files
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import os.path
import csv
import time

## Configure the headless Chrome session used to render listing pages
# User-agent string handed to Chrome through its command-line options
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'

chrome_options = Options()
for _flag in (
    "--headless",  # Ensure GUI is off
    "--no-sandbox",
    "--window-size=1920,1080",
    f'user-agent={user_agent}',
):
    chrome_options.add_argument(_flag)

# Location of the chromedriver binary; adjust to match your configuration
homedir = os.path.expanduser("~")  # not referenced by the driver setup itself
webdriver_service = Service("/usr/local/bin/chromedriver")

# Start Chrome with the service and options assembled above
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)

# getting max page

def maxPage(soup):
    """Return the last page index advertised by the pagination element.

    The element matching ``.page-number`` is expected to contain text shaped
    like ``"page 0 of <max_page>"``; the fourth whitespace-separated token is
    converted to an integer and returned.

    Args:
        soup: Parsed document exposing ``select_one`` (e.g. a BeautifulSoup
            object).

    Returns:
        int: The value of the fourth token of the pagination text.

    Raises:
        ValueError: If the ``.page-number`` element is missing, its text has
            fewer than four tokens, or the fourth token is not an integer.
    """
    page_info = soup.select_one('.page-number')
    if page_info is None:
        # Fail with a descriptive error instead of an AttributeError on None.
        raise ValueError("pagination element '.page-number' not found")

    text = page_info.get_text()
    tokens = text.split()
    if len(tokens) < 4:
        raise ValueError(f"unexpected pagination text: {text!r}")

    # Expected layout: "page 0 of max_page" -> index 3 holds max_page.
    return int(tokens[3])


#  extract articles from the page. parse and store into a list.
#  save html to file

def parseArticles(article_paths):
    """Download, parse and archive every article listed in *article_paths*.

    For each path the article page is fetched from https://rekt.news/, then:

    * ``[title, date, content]`` is appended to the module-level ``post_data``
      list, where ``content`` is the text of each child of ``.post-content``
      joined with newlines;
    * the raw HTML is written to ``<path with slashes removed>.html`` inside
      the module-level ``download_path`` directory.

    Args:
        article_paths: Iterable of article hrefs, relative to the site root.

    Raises:
        requests.RequestException: If a request times out or returns an
            error status.
        AttributeError: If an expected element (``main.content``,
            ``.post-header``, ``.post-title``, ``.post-meta time`` or
            ``.post-content``) is absent from the page.
    """
    for article_path in article_paths:
        # urljoin avoids a doubled slash when the href already starts with "/".
        url = urljoin("https://rekt.news/", article_path)

        # A timeout keeps one stalled connection from hanging the whole run;
        # raise_for_status stops error pages from being parsed and archived.
        article_data = requests.get(url, timeout=30)
        article_data.raise_for_status()

        soup = BeautifulSoup(article_data.text, 'html.parser')
        main = soup.select_one('main.content')
        post_header = main.select_one('.post-header')

        title = post_header.select_one('.post-title').get_text()
        date = post_header.select_one('.post-meta time').get_text()
        content = "\n".join(
            child.get_text()
            for child in main.select_one(".post-content").children
        )

        post_data.append([title, date, content])

        # create a file and save html; the name is the href without slashes
        path = article_path.replace('/', '')
        file = os.path.join(download_path, f'{path}.html')

        # Explicit UTF-8 so non-ASCII page text cannot fail on the locale's
        # default encoding.
        with open(file, 'w', encoding='utf-8') as f:
            f.write(article_data.text)

# create a csv file and write data on it

def createCSV(data, filename="rekted_news.csv"):
    """Write *data* to a CSV file under a ``title, date, content`` header.

    Args:
        data: Iterable of rows; each row is an iterable of field values
            matching the header order.
        filename: Destination path. Defaults to ``rekted_news.csv`` in the
            current working directory.
    """
    fields = ["title", "date", "content"]
    # newline='' hands line-ending control to the csv module, as its docs
    # require: otherwise Windows gets "\r\r\n" row endings and newlines
    # embedded in quoted fields are translated. UTF-8 is set explicitly so
    # non-ASCII text does not depend on the locale's default encoding.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        # header first, then the data rows
        csvwriter.writerow(fields)
        csvwriter.writerows(data)
            


# Driver: walk every listing page of rekt.news, parse each linked article,
# then export the collected rows to CSV.
url = 'https://rekt.news/'

try:
    # The landing page is fetched once with requests to read the pagination
    # element. The timeout and status check turn a stalled or failed request
    # into an explicit error.
    data = requests.get(url, timeout=30)
    data.raise_for_status()
    soup = BeautifulSoup(data.text, 'html.parser')

    # find max page; range() excludes its end, so +1 makes the loop below
    # reach page index max_page itself
    max_page = maxPage(soup) + 1

    # create path where parseArticles stores each article's raw HTML
    homedir = os.path.expanduser("~")
    download_path = f'{homedir}/Desktop/Certik'
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(download_path, exist_ok=True)
    file = os.path.join(download_path, "rektnews.html")  # not used by the loop below

    # rows of [title, date, content], appended to by parseArticles
    post_data = []

    for i in range(max_page):
        # create page url and render it in the headless browser
        page_url = f"{url}?page={i}"
        browser.get(page_url)
        soup = BeautifulSoup(browser.page_source, 'html.parser')

        # collect the href of the first link of each article in the grid
        my_urls = []
        for article in soup.select('#grid article'):
            a = article.select_one('a')
            href = a.get('href') if a is not None else None
            if href:
                # articles without a usable link are skipped rather than
                # aborting the whole run
                my_urls.append(href)

        # extract articles from the page; parse and store into post_data
        parseArticles(my_urls)

    # create a csv file and write data
    createCSV(post_data)
finally:
    # Always shut the browser down, even when scraping fails part-way,
    # so no Chrome/chromedriver process is left running.
    browser.quit()

