In [170]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as bs
import os

In [2]:
def get_content(url):
    
    """Get's HTML content for any provided url"""
    
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    content = bs(urlopen(req).read())
    return content

In [3]:
def get_chapter_links(main_page):
    
    """Get's chapter links from the mainpage of a fiction"""
    
    content = get_content(main_page)
    table_of_contents = content.find_all('tr')
    links = []
    for row in range(1, len(table_of_contents)):
        abb_link = table_of_contents[row].find_all('a')[0].get('href')
        links.append(f'http://www.royalroad.com{abb_link}')
    return links

In [236]:
def get_chapter_content(chapter_url, formatting=True):
    
    """Get's only the story content from a chapter.
    
    Args:
        chapter_url (str): the full url for the chapter
        formatting (bool): determines if tabs and new lines are included im output
        
    Returns:
        content (str): Content formatted as a string.
        title (str): Title of the chapter.
    """
    
    ch_content = get_content(chapter_url)
    ch_content_cleaned = ch_content.find_all('p')
    ch_tltle = ch_content_cleaned[0].get_text()
    ch_content_cleaned = ch_content_cleaned[1:len(ch_content_cleaned)-3]
    content_str = ''
    if formatting:
        for x in ch_content_cleaned:
            content_str += '\t' + x.get_text() + '\n'
    else:
        for x in ch_content_cleaned:
            content_str += x.get_text() + ' '
    return content_str, ch_tltle;

In [229]:
def get_top_stories(limit=20, category='best rated', readable_date_format=True, int_cast=True, return_df=False):
    
    """Get's information on top stories in whatever specified category
    
    Args:
        limit (int): how many stories have information scraped
        category (string): the RR category that stories are taken from
        readable_date_format (bool): decides between RFC 3339 and MMM dd, yyyy formats
        int_cast (bool): casts relevant return values as integers or floats
        return_df (bool): returns data as a pandas.Dataframe
    
    Constants:
        MAX_LIMIT (int): max number of stories that can be pulled
    
    Returns:
        titles (str): Story titles
        links (str href): Link to story's main page
        followers (int): Number of followers
        rating (int): Stories rating out of 5 stars
        pages (int): Number of pages in full story
        views (int): Number of total views for the story
        chapters (int): Total number of chapters
        genres (list(str)): List of genres associated with the story
        last_update (str): Date of last update. Can be formatted in RFC 3339 or MMM dd, yyyy
    """
    
    MAX_LIMIT = 1000
    GENRE_AS_STRING = True
    
    if limit > MAX_LIMIT: limit = LIMIT
    
    possible_categories = {'best rated': 'https://www.royalroad.com/fictions/best-rated',
                           'active only': 'https://www.royalroad.com/fictions/active-popular',
                           'complete': 'https://www.royalroad.com/fictions/complete',
                           'this week': 'https://www.royalroad.com/fictions/weekly-popular',
                           'latest update': 'https://www.royalroad.com/fictions/latest-updates',
                           'new releases': 'https://www.royalroad.com/fictions/new-releases',
                           'trending': 'https://www.royalroad.com/fictions/trending'}
    if category not in possible_categories:
        print('Valid categories include:')
        print(list(possible_categories.keys()))
    
    front_page = get_content(possible_categories[category])
    stories = front_page.find_all('div', class_='fiction-list-item row')
    
    # Assumes 20 entries per page
    if limit % 20 == 0:
        num_pages = int(limit / 20)
    else:
        num_pages = int(limit / 20) + 1
    
    titles = []
    links = []
    followers = []
    rating = []
    pages = []
    views = []
    chapters = []
    genres = []
    last_update = []
    description = []
    
    story_count = 0                              # ya ya I know this should be a while loop since I'm breaking on this
    for page in range(1, num_pages+1):           # I wrote this before I remembered about the limit
        main_page = get_content(f'{possible_categories[category]}?page={page}')
        for story in main_page.find_all('div', class_ = 'fiction-list-item row'):
            info = story.find_all('span')
            row_stats = story.find('div', class_ = 'row stats').find_all('span')
            if GENRE_AS_STRING:
                temp_genres = ''
                for g in info[1].find_all('span'):
                    temp_genres += (g.get_text().strip()) + ', '
                temp_genres = temp_genres[0:-2]
            else:                                # Old list format. Doesn't look good in Dataframe
                temp_genres = []
                for g in info[1].find_all('span'):
                    temp_genres.append(g.get_text().strip())
            
            titles.append(story.find('a', class_ = 'font-red-sunglo bold').get_text())
            links.append('http://www.royalroad.com' + story.find('a', class_ = 'font-red-sunglo bold').get('href'))
            if int_cast:
                followers.append(int(row_stats[0].get_text()[0:-10].replace(',', '')))
                rating.append(float(row_stats[1].get('title')))
                pages.append(int(row_stats[2].get_text()[0:-6].replace(',', '')))
                views.append(int(row_stats[3].get_text()[0:-6].replace(',', '')))
                chapters.append(int(row_stats[4].get_text()[0:-9].replace(',', '')))
            else:
                followers.append(row_stats[0].get_text())
                rating.append(row_stats[1].get('title'))
                pages.append(row_stats[2].get_text())
                views.append(row_stats[3].get_text())
                chapters.append(row_stats[4].get_text())
            genres.append(temp_genres)
            if readable_date_format:
                last_update.append(row_stats[5].get_text())
            else:
                last_update.append(row_stats[5].find('time').get('datetime'))
            desc = story.find_all('div')[-1].get_text().replace('\n', ' ').replace('\xa0', '').replace('*', '').strip()
            description.append(desc)
            
            story_count += 1
            if story_count == limit:
                break;
    
    return_object = {'title': titles, 'link': links, 'followers': followers,
                     'ratings': rating, 'pages': pages, 'views': views,
                     'chapters': chapters, 'genres': genres,
                     'last update': last_update, 'description': description}
    
    if return_df:
        return_object = pd.DataFrame(return_object)
        return_object.index = np.arange(1, len(return_object)+1)
    
    return return_object

In [248]:
def save_chapter_content(chapter_url, format_='txt', overwrite=False, directory=None):
    
    """Saves chapter content with designate format.
    Creates directory for story if not present
    
    Args:
        chapter_url (str href): link to chapter page
        format (str): format of the file the content is saved as
        overwrite (bool): overwrites chapter file if present
    """
    
    MY_DIRECTORY = '/home/coolio5462/pCloudDrive/Documents/RRStories'
    if directory == 'john':
        directory = MY_DIRECTORY
    elif directory == None:
        directory = '.'
    
    story_title = chapter_url.split('/')[-4]
    
    story_dir = f'{directory}/{story_title}'
    if not os.path.exists(story_dir):
        os.makedirs(story_dir)
    
    chapter_content, ch_title = get_chapter_content(chapter_url)
    
    file_name = f'{story_dir}/{ch_title}.{format_}'
    if not os.path.exists(file_name):
        print('it doesnt exist...?')
        with open(file_name, 'w+') as f:
            print(f.name)
            f.write(chapter_content)
    else:
        print(file_name)
    
    return True

In [256]:
top['link'].iloc[0]

'http://www.royalroad.com/fiction/21220/mother-of-learning'

In [258]:
for link in get_chapter_links(top['link'].iloc[0]):
    save_chapter_content(link, format_='txt', directory='john')

/home/coolio5462/pCloudDrive/Documents/RRStories/mother-of-learning/Chapter 001Good Morning Brother.txt
it doesnt exist...?
/home/coolio5462/pCloudDrive/Documents/RRStories/mother-of-learning/Chapter 002Life’s Little Problems.txt
it doesnt exist...?
/home/coolio5462/pCloudDrive/Documents/RRStories/mother-of-learning/Chapter 003The Bitter Truth.txt
it doesnt exist...?
/home/coolio5462/pCloudDrive/Documents/RRStories/mother-of-learning/Chapter 004Stars Fell.txt
it doesnt exist...?
/home/coolio5462/pCloudDrive/Documents/RRStories/mother-of-learning/Chapter 005Start Over.txt
it doesnt exist...?
/home/coolio5462/pCloudDrive/Documents/RRStories/mother-of-learning/Chapter 006Concentrate and Try Again.txt
it doesnt exist...?
/home/coolio5462/pCloudDrive/Documents/RRStories/mother-of-learning/Chapter 007Of Gaps And Pretending.txt
it doesnt exist...?
/home/coolio5462/pCloudDrive/Documents/RRStories/mother-of-learning/Chapter 008Perspective.txt
it doesnt exist...?
/home/coolio5462/pCloudDrive/Doc

In [259]:
top = get_top_stories(limit=1000, return_df=True)

In [260]:
top

Unnamed: 0,title,link,followers,ratings,pages,views,chapters,genres,last update,description
1,Mother of Learning,http://www.royalroad.com/fiction/21220/mother-...,9640,4.84,2930,3468512,108,"Adventure, Fantasy, Mystery, Magic","Feb 10, 2020",Zorian is a teenage mage of humble birth and s...
2,Super Minion,http://www.royalroad.com/fiction/21410/super-m...,7189,4.79,685,1384400,44,"Action, Sci-fi, Non-Human lead, Secret Identit...","Feb 06, 2020","Fortress City has Super Villains, who have evi..."
3,Vainqueur the Dragon,http://www.royalroad.com/fiction/26534/vainque...,7319,4.70,703,1473117,68,"Adventure, Comedy, Fantasy, Satire, Anti-Hero ...","Feb 27, 2020",Vainqueur Knightsbane is your average dragon: ...
4,RE: Trailer Trash,http://www.royalroad.com/fiction/21322/re-trai...,5004,4.68,436,547999,26,"Comedy, Contemporary, Psychological, Satire, F...","Feb 02, 2020","In the year 2045, an MRI mishap transmits Tabi..."
5,Arrogant Young Master Template A Variation 4,http://www.royalroad.com/fiction/28601/arrogan...,6051,4.70,701,1261084,68,"Adventure, Comedy, Fantasy, LitRPG, Male Lead,...","Feb 25, 2020","What would a Xianxia MC do?Seriously, what wou..."
...,...,...,...,...,...,...,...,...,...,...
996,The Nothing Mage,http://www.royalroad.com/fiction/27904/the-not...,2743,4.07,589,425103,59,"Adventure, Fantasy, High Fantasy, Magic, Male ...","Feb 29, 2020","Beware, oh friend, the Nothing Mage, The man h..."
997,How to get lost: a wanderers guide,http://www.royalroad.com/fiction/15294/how-to-...,175,4.42,582,252871,155,"Adventure, Comedy, Fantasy, Magic, Male Lead, ...","Apr 12, 2019",Most people have a really hard time getting lo...
998,Druidification,http://www.royalroad.com/fiction/29070/druidif...,431,4.37,293,85308,74,"Adventure, Fantasy, Psychological, Tragedy, Fe...","Feb 29, 2020",People have spent centuries perfecting the ind...
999,Superhuman Princess,http://www.royalroad.com/fiction/23561/superhu...,475,4.29,1681,76853,51,"Adventure, Fantasy, Romance, Sci-fi, Female Le...","Jan 16, 2020","Since World War 2, superhumans and supernatura..."


In [261]:
csv = top.to_csv()

In [264]:
with open('/home/coolio5462/pCloudDrive/Documents/RRStories/top1000.csv', 'w+') as f:
    f.write(csv)