In [152]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as bs

In [2]:
def get_content(url, parser=None):
    """Fetch a URL and return its HTML parsed with BeautifulSoup.

    Args:
        url (str): address of the page to download.
        parser (str | None): parser name forwarded to BeautifulSoup as
            ``features`` (e.g. ``'html.parser'``). ``None`` (the default)
            keeps the previous behaviour of letting BeautifulSoup choose.

    Returns:
        The BeautifulSoup object built from the response body.
    """
    # A 'User-Agent' header is always sent with the request.
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # Read inside a context manager so the HTTP response is always closed,
    # even if reading raises.
    with urlopen(req) as response:
        html = response.read()
    return bs(html, features=parser)

In [3]:
def get_chapter_links(main_page):
    """Collect the chapter links listed on a fiction's main page.

    Args:
        main_page (str): full url of the fiction's main page.

    Returns:
        list(str): one absolute url per table row that contains a link,
        in page order. Rows without an anchor (or whose anchor has no
        ``href``) are skipped instead of raising or producing a bogus url.
    """
    content = get_content(main_page)
    table_of_contents = content.find_all('tr')
    links = []
    # The first <tr> is skipped, as in the original implementation
    # (presumably the table header row -- confirm against the page markup).
    for row in table_of_contents[1:]:
        anchor = row.find('a')
        abb_link = anchor.get('href') if anchor is not None else None
        if not abb_link:
            continue
        links.append(f'http://www.royalroad.com{abb_link}')
    return links

In [136]:
def get_chapter_content(chapter_url, formatting=True):
    """Return the text of a chapter's story paragraphs as one string.

    Every ``<p>`` element of the page is collected, then the first one and
    the last three are dropped before the text is joined.

    Args:
        chapter_url (str): the full url for the chapter
        formatting (bool): when true, each paragraph is prefixed with a tab
            and terminated with a newline; otherwise each paragraph is
            followed by a single space.

    Returns:
        content (str): the concatenated paragraph text.
    """
    paragraphs = get_content(chapter_url).find_all('p')
    # Drop the first paragraph and the trailing three.
    story_paragraphs = paragraphs[1:-3]

    if formatting:
        prefix, suffix = '\t', '\n'
    else:
        prefix, suffix = '', ' '

    return ''.join(prefix + p.get_text() + suffix for p in story_paragraphs)

In [156]:
def _leading_number(text):
    """Return the first whitespace-separated token of ``text`` without ',' separators.

    Used to pull the numeric part out of a stat label (number followed by a
    word). Taking the first token instead of slicing off a fixed-length
    suffix keeps working when the label length varies (e.g. a singular
    label).
    """
    return text.split()[0].replace(',', '')


def get_top_stories(limit=20, category='best rated', readable_date_format=True, int_cast=True, return_df=False,
                    genres_as_string=False, include_links=False):

    """Gets information on the top stories in the specified category

    Pages of the category listing are downloaded with ``get_content`` and
    each story entry is parsed until ``limit`` stories have been collected.

    Args:
        limit (int): how many stories have information scraped. Values above
            MAX_LIMIT are clamped to MAX_LIMIT; values below 0 are treated as 0.
        category (string): the RR category that stories are taken from
        readable_date_format (bool): if true the visible text of the update
            field is used; otherwise the ``datetime`` attribute of its
            ``<time>`` tag is used
        int_cast (bool): casts relevant return values as integers or floats;
            when false the raw label text is returned instead
        return_df (bool): returns data as a pandas.DataFrame (1-based index)
        genres_as_string (bool): if true each story's genres are a single
            ', '-joined string instead of a list
        include_links (bool): if true a 'link' entry with each story's main
            page url is added to the result

    Constants:
        MAX_LIMIT (int): max number of stories that can be pulled
        PAGE_SIZE (int): assumed number of stories per listing page

    Returns:
        dict of equal-length lists (or a DataFrame when ``return_df``) keyed by:
        'title' (str): Story titles
        'followers' (int): Number of followers
        'ratings' (float): Story rating, read from the rating element's title attribute
        'pages' (int): Number of pages in full story
        'views' (int): Number of total views for the story
        'chapters' (int): Total number of chapters
        'genres' (list(str) or str): Genres associated with the story
        'last update' (str): Date of last update
        'link' (str): Link to story's main page (only with ``include_links``)

    Raises:
        KeyError: if ``category`` is not one of the known categories.
    """

    MAX_LIMIT = 100
    PAGE_SIZE = 20                               # Assumes 20 entries per page

    possible_categories = {'best rated': 'https://www.royalroad.com/fictions/best-rated',
                           'active only': 'https://www.royalroad.com/fictions/active-popular',
                           'complete': 'https://www.royalroad.com/fictions/complete',
                           'this week': 'https://www.royalroad.com/fictions/weekly-popular',
                           'latest update': 'https://www.royalroad.com/fictions/latest-updates',
                           'new releases': 'https://www.royalroad.com/fictions/new-releases',
                           'trending': 'https://www.royalroad.com/fictions/trending'}
    # Fail fast, before any network access, with a message naming the valid options.
    if category not in possible_categories:
        raise KeyError(f'Unknown category {category!r}. '
                       f'Valid categories include: {list(possible_categories.keys())}')
    base_url = possible_categories[category]

    # Clamp into [0, MAX_LIMIT] so oversized or negative requests are well defined.
    limit = max(0, min(limit, MAX_LIMIT))
    num_pages = int(-(-limit // PAGE_SIZE))      # ceiling division

    titles = []
    links = []
    followers = []
    rating = []
    pages = []
    views = []
    chapters = []
    genres = []
    last_update = []

    for page in range(1, num_pages + 1):
        # Stop requesting pages as soon as enough stories were collected.
        if len(titles) >= limit:
            break
        main_page = get_content(f'{base_url}?page={page}')
        for story in main_page.find_all('div', class_='fiction-list-item row'):
            if len(titles) >= limit:
                break

            info = story.find_all('span')
            row_stats = story.find('div', class_='row stats').find_all('span')
            title_link = story.find('a', class_='font-red-sunglo bold')

            # The second <span> of an entry holds one nested <span> per genre.
            genre_names = [g.get_text().strip() for g in info[1].find_all('span')]

            titles.append(title_link.get_text())
            if include_links:
                links.append('http://www.royalroad.com' + title_link.get('href'))
            if int_cast:
                followers.append(int(_leading_number(row_stats[0].get_text())))
                rating.append(float(row_stats[1].get('title')))
                pages.append(int(_leading_number(row_stats[2].get_text())))
                views.append(int(_leading_number(row_stats[3].get_text())))
                chapters.append(int(_leading_number(row_stats[4].get_text())))
            else:
                followers.append(row_stats[0].get_text())
                rating.append(row_stats[1].get('title'))
                pages.append(row_stats[2].get_text())
                views.append(row_stats[3].get_text())
                chapters.append(row_stats[4].get_text())
            genres.append(', '.join(genre_names) if genres_as_string else genre_names)
            if readable_date_format:
                last_update.append(row_stats[5].get_text())
            else:
                last_update.append(row_stats[5].find('time').get('datetime'))

    return_object = {'title': titles, 'followers': followers, 'ratings': rating,
                     'pages': pages, 'views': views, 'chapters': chapters,
                     'genres': genres, 'last update': last_update}
    if include_links:
        return_object['link'] = links

    if return_df:
        return_object = pd.DataFrame(return_object)
        # Number rows from 1 so the index reads as a ranking position.
        return_object.index = np.arange(1, len(return_object) + 1)

    return return_object
    

In [157]:
# Scrape up to 100 stories from the default 'best rated' category into a DataFrame.
# This downloads several listing pages, so the cell needs network access.
top = get_top_stories(limit=100, return_df=True)

In [158]:
# Bare last expression: the notebook renders the DataFrame.
top

Unnamed: 0,title,followers,ratings,pages,views,chapters,genres,last update
1,Mother of Learning,9640,4.84,2930,3463853,108,"[Adventure, Fantasy, Mystery, Magic]","Feb 10, 2020"
2,Super Minion,7185,4.79,685,1383235,44,"[Action, Sci-fi, Non-Human lead, Secret Identi...","Feb 06, 2020"
3,Vainqueur the Dragon,7315,4.70,703,1470178,68,"[Adventure, Comedy, Fantasy, Satire, Anti-Hero...","Feb 27, 2020"
4,RE: Trailer Trash,5001,4.68,436,547551,26,"[Comedy, Contemporary, Psychological, Satire, ...","Feb 02, 2020"
5,Arrogant Young Master Template A Variation 4,6038,4.70,701,1256068,68,"[Adventure, Comedy, Fantasy, LitRPG, Male Lead...","Feb 25, 2020"
...,...,...,...,...,...,...,...,...
96,Body and Soul,2763,4.58,1081,2867975,137,"[Adventure, Fantasy, Romance, Anti-Hero Lead, ...","Oct 22, 2017"
97,Until death? (Old Version),2706,4.59,792,5463764,113,"[Action, Comedy, Fantasy, Romance, Anti-Hero L...","Jun 06, 2015"
98,The Planar Archivist (Old),642,4.68,350,72522,21,"[Action, Adventure, Fantasy, Romance, High Fan...","Jul 13, 2019"
99,Beach Bum,2881,4.56,266,287121,34,"[Adventure, Comedy, Fantasy, GameLit, Magic, M...","Feb 24, 2020"
