In [1]:
import pandas as pd # library for data analysis
import requests # library to handle requests
import logging # library to handle logging
from bs4 import BeautifulSoup # library to parse HTML documents
import requests # library to handle requests
from tqdm import tqdm # library to display progress bar
from requests.exceptions import Timeout # library to handle timeout exception

In [None]:
# Set up logging: INFO and above is written to app.log, which is truncated on every run.
logging.basicConfig(filename='app.log', filemode='w', format='%(name)s - %(levelname)s - %(message)s', level=logging.INFO)

# Download the page that lists every vanilla NetHack tile.
logging.info('Making request to website')
url = "https://nethackwiki.com/wiki/List_of_vanilla_NetHack_tiles"
try:
    # Without a timeout, requests waits forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently scraping an error page.
    response.raise_for_status()
except Timeout:
    logging.error('Request to %s timed out', url)
    raise

# Parse the HTML content
logging.info('Parsing HTML content')
soup = BeautifulSoup(response.text, 'html.parser')

# Find all img tags
logging.info('Finding img tags')
img_tags = soup.find_all('img')

# Parallel lists, one entry per PNG image found on the page.
titles = []
hrefs = []
srcs = []

# Keep only images whose src mentions 'png'; record the alt text, the href of
# the enclosing element (None when the parent carries no href) and the src.
logging.info('Extracting data from img tags')
for img in img_tags:
    src = img.get('src')
    # .get() instead of img['src']: an <img> without a src attribute must be
    # skipped, not abort the whole scrape with a KeyError.
    if src is None or 'png' not in src:
        continue
    titles.append(img.get('alt'))
    hrefs.append(img.parent.get('href'))
    srcs.append(src)

# Create a DataFrame from the data
logging.info('Creating DataFrame')
tileset = pd.DataFrame({
    'Title': titles,
    'Wiki_link': hrefs,
    'Image_link': srcs
})

# Strip the literal '.png' from the titles.
# regex=False: as a regex, '.png' would also remove e.g. 'xpng' because '.'
# matches any character. The result is assigned back rather than using
# inplace=True on a column selection, which is a chained operation that does
# not update the frame under pandas Copy-on-Write.
logging.info('Replacing .png in Title column')
tileset['Title'] = tileset['Title'].str.replace('.png', '', regex=False)

# Save to csv file
logging.info('Saving DataFrame to CSV file')
tileset.to_csv('tileset_vanilla.csv', index=False)

logging.info('Done')

In [None]:
url = "https://nethackwiki.com/wiki/Tileset"

# Timeout so a stalled connection cannot hang the notebook; raise_for_status so
# an HTTP error page is never parsed as if it were the tileset page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Find the clickable image map that overlays the tileset picture.
# NOTE(review): the map name is hard-coded and looks generated (an earlier
# revision of this notebook referenced a different number), so check for a
# miss explicitly instead of failing later with an opaque AttributeError.
image_map_name = 'ImageMap_1_2102875891'
image_map = soup.find('map', {'name': image_map_name})
if image_map is None:
    raise RuntimeError(f"No <map name='{image_map_name}'> found on {url}; the map name may have changed")

# Find all areas in the map
areas = image_map.find_all('area')

# Extract the href, alt, and title attributes of each area
area_data = []
for area in areas:
    Tile_description = area.get('alt')
    Tile_name = area.get('title')
    description_link = area.get('href')
    area_data.append({'Tile_name': Tile_name, 'Tile_description': Tile_description, 'description_link': description_link})

# Convert the data to a DataFrame. Columns are given explicitly so the frame
# has the expected schema even when the map contains no areas.
tileset_from_map = pd.DataFrame(area_data, columns=['Tile_name', 'Tile_description', 'description_link'])

# Drop every row whose Tile_name contains 'statue'.
# A boolean mask replaces dropping rows while iterating over the same frame;
# na=False keeps rows whose title attribute is missing instead of raising.
is_statue = tileset_from_map['Tile_name'].str.contains('statue', regex=False, na=False)
tileset_from_map = tileset_from_map[~is_statue].copy()

# Turn the site-relative hrefs into absolute URLs; missing links stay None.
# pd.notna also covers NaN, which `is not None` would turn into '...comnan'.
tileset_from_map['description_link'] = tileset_from_map['description_link'].apply(lambda x: f"https://nethackwiki.com{x}" if pd.notna(x) else None)

tileset_from_map.to_csv('tileset_from_map.csv', index=False)


# Reload the saved table and normalise the names: spaces become underscores
# and the literal '_(monster)' suffix is removed.
# Both steps use .str.replace with regex=False. The original second call was
# Series.replace, which only matches whole cell values and so never stripped
# the suffix; as a regex the parentheses would also be read as a group.
tileset_with_links = pd.read_csv('tileset_from_map.csv')
tileset_with_links['Tile_name'] = (
    tileset_with_links['Tile_name']
    .str.replace(' ', '_', regex=False)
    .str.replace('_(monster)', '', regex=False)
)

# PNG image sources already scraped, keyed by page URL. find_image is called
# once per tile with the same URL, so caching avoids re-downloading and
# re-parsing an identical page for every row.
IMG_SRC_CACHE = {}


def _png_srcs(url):
    """Return the src of every <img> on the page at `url` whose src contains 'png'.

    The page is fetched and parsed only on the first call for a given URL;
    failed fetches are not cached, so a later call retries.
    """
    if url not in IMG_SRC_CACHE:
        # Timeout so one stalled request cannot hang the whole run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # img.get('src') rather than img['src']: tags without a src attribute
        # are skipped instead of raising KeyError and aborting the search.
        candidates = (img.get('src') for img in soup.find_all('img'))
        IMG_SRC_CACHE[url] = [src for src in candidates if src and 'png' in src]
    return IMG_SRC_CACHE[url]


def find_image(tile_name, url):
    """Find the first PNG image on a page whose src contains `tile_name`.

    Args:
        tile_name: Substring to look for in the image src. Anything that is
            not a string (e.g. None or NaN from a DataFrame) yields None.
        url: Address of the page to search.

    Returns:
        The matching src attribute, or None when nothing matches or the page
        could not be downloaded (the error is printed, not raised).
    """
    # A missing name can never match; bail out before touching the network.
    if not isinstance(tile_name, str):
        return None
    try:
        for src in _png_srcs(url):
            if tile_name in src:
                return src
    except requests.RequestException as e:
        # Best-effort lookup: report network/HTTP failures and fall through.
        print(f"An error occurred: {e}")
    return None


# Normalise the tile names so they can be matched against image file names:
# strip the literal '_(monster)' suffix (regex=False, otherwise the
# parentheses are read as a regex group and the suffix is not matched), then
# upper-case the first letter and lower-case the rest.
tileset_with_links['Tile_name'] = tileset_with_links['Tile_name'].str.replace('_(monster)', '', regex=False)
tileset_with_links['Tile_name'] = tileset_with_links['Tile_name'].str.capitalize()

# The table loaded from tileset_from_map.csv has no 'image_link' column, so
# reading row['image_link'] would raise KeyError. Create it empty; an existing
# column (e.g. from a partially filled table) is left as is.
if 'image_link' not in tileset_with_links.columns:
    tileset_with_links['image_link'] = None

# Every tile is looked up on the same page.
url = 'https://nethackwiki.com/wiki/List_of_vanilla_NetHack_tiles'

# Fill in the image link of every row that does not have one yet.
# The progress bar is used as a context manager so it is closed even if the
# loop raises.
with tqdm(total=tileset_with_links.shape[0]) as pbar:
    for i, row in tileset_with_links.iterrows():
        tile_name = row['Tile_name']
        # Rows without a name cannot be matched, so skip the lookup for them.
        if pd.isnull(row['image_link']) and not pd.isnull(tile_name):
            img_src = find_image(tile_name, url)
            if img_src is None:
                print(f"No image found for tile_name: {tile_name}")
            else:
                tileset_with_links.at[i, 'image_link'] = img_src
        pbar.update()
