In [1]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

In [2]:
import os
# Create the output folder that later cells write into.
# exist_ok=True makes this cell safe to re-run when the folder already exists.
os.makedirs("data", exist_ok=True)

# Fish Table

Goal: read the "Fish Types" table from https://stardewvalleywiki.com/Fish into a Pandas dataframe

Request the Fish page; an output of `<Response [200]>` indicates a successful request.

In [3]:
# Fetch the Fish wiki page. The timeout stops the cell from hanging forever
# if the server never answers (requests waits indefinitely by default).
fish_page = requests.get("https://stardewvalleywiki.com/Fish", timeout=30)
fish_page

ConnectionError: HTTPSConnectionPool(host='stardewvalleywiki.com', port=443): Max retries exceeded with url: /Fish (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f9438a43df0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))

In [None]:
# Parse the HTML body of the Fish page response into a searchable tree.
soup = BeautifulSoup(fish_page.content, 'html.parser')

In [None]:
# Gather every <table> carrying the "wikitable" CSS class, then keep the first one.
tables = soup.find_all("table", class_="wikitable")
table1 = tables[0]

Compile the contents of the table into a list of lists, and convert into a dataframe.

In [None]:
# Collect the text of every <td> found in each <tr> of the table.
# A <td> that itself wraps a nested <table> is skipped; because find_all()
# searches recursively, the <td>s inside that nested table are still visited
# on their own.
rows = [
    [cell.get_text() for cell in row.find_all('td') if not cell.find('table')]
    for row in table1.find_all('tr')
]

# Keep only rows with more than 5 cells (this also drops rows with no <td> at all).
data = [cells for cells in rows if len(cells) > 5]

In [None]:
# Positions of the columns to keep out of the 35 scraped cells in each row.
desired_cols = [1, 2] + list(range(27, 35))

# Build the full 35-column frame first, then select the desired columns from it.
all_cells_df = pd.DataFrame(data, columns=list(range(35)))
fish_df = all_cells_df[desired_cols]

fish_df.head()

## Data Cleaning

Remove the '\n' newline characters from every string in the table:

In [None]:
def remove_newline(text):
    """Return ``text`` with every line-feed character removed."""
    # Splitting on the line feed and re-joining with nothing drops each occurrence.
    pieces = text.split('\n')
    return ''.join(pieces)
    
# Clean every cell, one column at a time. Series.map is used because
# DataFrame.applymap is deprecated in recent pandas releases.
fish_df = fish_df.apply(lambda column: column.map(remove_newline))
fish_df.head()

Rename the columns from their default numbers to the actual column names on the website, and convert the 'Base XP' column from strings to numbers:

In [None]:
# Column labels, listed in the same order as the positions in desired_cols.
site_cols = [
    "Name",
    "Description",
    "Location",
    "Time",
    "Seasons",
    "Weather",
    "Size (inches)",
    "Difficult and Behavior",
    "Base XP",
    "Used In",
]

# Map each numeric column position to its label.
col_map = {position: label for position, label in zip(desired_cols, site_cols)}

# Rename, index by fish name, and parse "Base XP" as numbers in a single chain;
# values that cannot be parsed become NaN because of errors='coerce'.
fish_df = (
    fish_df
    .rename(columns=col_map)
    .set_index("Name")
    .assign(**{"Base XP": lambda frame: pd.to_numeric(frame["Base XP"], errors='coerce')})
)

fish_df.head()

In [None]:
# Write the cleaned fish table to a CSV file inside the data folder.
fish_df.to_csv(path_or_buf="data/fish.csv")

# Villager Images

Goal: download the thumbnail images of all the "villagers" on https://stardewvalleywiki.com/Villagers

In [None]:
# Fetch and parse the Villagers page. The timeout keeps the cell from
# hanging indefinitely if the server does not respond.
villager_page = requests.get("https://stardewvalleywiki.com/Villagers", timeout=30)
soup2 = BeautifulSoup(villager_page.content, 'html.parser')

Download the images in all instances of the "thumb" class, which contain the thumbnail image of the villager and a link to its character page. 

In [None]:
thumbs = soup2.find_all(class_ = 'thumb')

for thumb in thumbs:
    img_html = thumb.find('img')
    # Skip "thumb" elements that have no <img> tag, or whose tag lacks a src,
    # instead of crashing the whole loop on them.
    if img_html is None or not img_html.get('src'):
        continue

    file_name = img_html.get('src')
    # The saved file name is whatever follows the last '/' of the src path.
    img_name = file_name[file_name.rfind('/') + 1 :]
    url = 'http://www.stardewvalleywiki.com' + file_name

    # Download before opening the file so a failed request does not leave an
    # empty file behind; the timeout prevents an indefinite hang.
    image_bytes = requests.get(url, timeout=30).content

    # `with` guarantees the file is closed even if the write raises.
    with open("data/" + img_name, 'wb') as f:
        f.write(image_bytes)

### Some scraped Villager images:
![title](data/Abigail.png)
![title](data/Alex.png)
![title](data/Bouncer.png)