In [8]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys 
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import pandas as pd
import time

# Initialize the WebDriver: a single module-level Chrome session that
# scrape_page() accesses through the global name `driver`.
# NOTE(review): no visible cell calls driver.quit(), so the browser session
# appears to stay open after scraping -- confirm whether that is intended.
driver = webdriver.Chrome()

# Function to scrape a single page
def scrape_page(url):
    """Load one results page in the shared browser and extract its work listings.

    Relies on the module-level ``driver`` (a Selenium WebDriver) to fetch the
    page; the rendered HTML is then parsed with BeautifulSoup.

    Args:
        url: Address of the results page to load.

    Returns:
        A list with one entry per ``<li class="work">`` element found, each a
        list of strings ``[title, author, summary, tags, pub_date, kudos]``.
        Any field whose element is missing from a listing is reported as
        ``'N/A'`` instead of raising ``AttributeError``; ``tags`` is an empty
        string when a listing has no tag items.
    """
    driver.get(url)
    time.sleep(3)  # Wait for the page to load
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    works = []

    def _text_or_na(node):
        # Stripped text of a parsed element, or 'N/A' when the lookup found nothing.
        return node.text.strip() if node is not None else 'N/A'

    # Find work listings
    for work in soup.find_all('li', class_='work'):
        heading = work.find('h4', class_='heading')
        title = _text_or_na(heading.a if heading is not None else None)
        author = _text_or_na(work.find('a', rel='author'))
        # Not every listing carries a summary block, so this lookup can be None.
        summary = _text_or_na(work.find('blockquote', class_='userstuff'))

        # Collect each <li> inside the tag list separately; taking the text of
        # the whole <ul> would run every tag together into one blob.
        tag_names = []
        for tag_list in work.find_all('ul', class_='tags'):
            for item in tag_list.find_all('li'):
                name = item.text.strip()
                if name:
                    tag_names.append(name)
        tags = ', '.join(tag_names)

        pub_date = _text_or_na(work.find('p', class_='datetime'))

        kudos_node = work.find('a', rel='kudos')
        if kudos_node is None:
            # NOTE(review): the kudos count appears to live in the stats cell
            # <dd class="kudos"> rather than in a rel="kudos" link -- confirm
            # against the live page markup.
            kudos_node = work.find('dd', class_='kudos')
        kudos = _text_or_na(kudos_node)

        works.append([title, author, summary, tags, pub_date, kudos])

    return works


# Base search URL (without a page number): English One Piece works sorted by
# kudos count, descending, as encoded in the query string below.
url = 'https://archiveofourown.org/works/search?work_search%5Bquery%5D=&work_search%5Btitle%5D=&work_search%5Bcreators%5D=&work_search%5Brevised_at%5D=&work_search%5Bcomplete%5D=&work_search%5Bcrossover%5D=F&work_search%5Bsingle_chapter%5D=0&work_search%5Bword_count%5D=&work_search%5Blanguage_id%5D=en&work_search%5Bfandom_names%5D=One+Piece+%28Anime+%26+Manga%29&work_search%5Brating_ids%5D=&work_search%5Bcharacter_names%5D=&work_search%5Brelationship_names%5D=&work_search%5Bfreeform_names%5D=&work_search%5Bhits%5D=&work_search%5Bkudos_count%5D=&work_search%5Bcomments_count%5D=&work_search%5Bbookmarks_count%5D=&work_search%5Bsort_column%5D=kudos_count&work_search%5Bsort_direction%5D=desc&commit=Search'
data = []

# Scrape multiple pages
for i in range(1, 11):  # Adjust the range for the number of pages you want to scrape
    # The page number has to be its own query parameter. Appending the bare
    # number to the URL turned the trailing `commit=Search` into
    # `commit=Search1`, so no page was ever selected and every iteration
    # fetched the same first page of results.
    page_url = f'{url}&page={i}'
    data.extend(scrape_page(page_url))
    time.sleep(2)  # Be respectful and avoid hitting the server too hard

# Convert data to DataFrame
df = pd.DataFrame(data, columns=['Title', 'Author', 'Summary', 'Tags', 'Publication Date', 'Kudos'])



In [10]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()
print(df.columns)
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Title             200 non-null    object
 1   Author            200 non-null    object
 2   Summary           200 non-null    object
 3   Tags              200 non-null    object
 4   Publication Date  200 non-null    object
 5   Kudos             200 non-null    object
dtypes: object(6)
memory usage: 9.5+ KB
None
Index(['Title', 'Author', 'Summary', 'Tags', 'Publication Date', 'Kudos'], dtype='object')
                                               Title           Author  \
0                             unintended consequence     itsmylifekay   
1            come on, come on (turn a little faster)  donutsandcoffee   
2                                             Facade     Hazel_Athena   
3  Fuck, Marry, Kill (or, how Usopp becomes the b...          adietxt   
4                          

In [11]:
import plotly.graph_objects as go
import plotly.io as pio

# Render figures in a web browser tab instead of inline in the notebook.
pio.renderers.default = "browser"

# Drive both the header and the cell values from the same column list so they
# cannot drift apart; previously the header listed all six columns while the
# cells supplied only five, leaving the Kudos column empty.
table_columns = list(df.columns)

fig = go.Figure(data=[go.Table(
    header=dict(values=table_columns,
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=[df[column] for column in table_columns],
               fill_color='lavender',
               align='left'))
])

fig.show()