# Web Scraping NSAF Database

Data scraped from [FBI's National Stolen Art File](https://artcrimes.fbi.gov/nsaf-view?searchText=&crimeCategory=).

In [1]:
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np

import os

In [2]:
# get file names and read them in
files = os.listdir('../fbi_nsaf_pages')
files.sort(key=lambda f: int(''.join(filter(str.isdigit, f))))

pages = [open(os.path.join('../fbi_nsaf_pages/', f)).read() for f in files]

len(pages)

46

In [3]:
def scrape(html_page):
	"""
	Takes in the html source code of a page and returns a dataframe of the scraped content.
    """
	# intialize soup and find all items
	soup = BeautifulSoup(html_page, "html.parser")
	grid_items = soup.find_all("li", "grid-item")
  
	# setup collection lists
	title = []
	category = []
	ref_num = []
	artist = []
	materials = []
	measurements = []
	time_period = []
	add_info = []
	image = []

	# parse through contents of each item
	for item in grid_items:
		# included for every item
		title.append(item.find("h3").text)

		if item.find("img") is not None:
			image.append(item.find("img")["src"])
		else:
			image.append(None)

		# isolate and create attribute dictionary
		attributes = item.find_all("li")
		attributes = {a.text.strip().split(':')[0]:a.text.strip().split(': ')[1] for a in attributes}

		# check for missing attributes and add to collection
		if "Category" in attributes:
			category.append(attributes["Category"])      
		else: 
			category.append(None)
		
		if "Reference Number" in attributes:
			ref_num.append(attributes["Reference Number"])      
		else: 
			ref_num.append(None)

		if "Maker/Artist" in attributes:
			artist.append(attributes["Maker/Artist"])      
		else: 
			artist.append(None)

		if "Materials" in attributes:
			materials.append(attributes["Materials"])      
		else: 
			materials.append(None)

		if "Measurements" in attributes:
			measurements.append(attributes["Measurements"])      
		else: 
			measurements.append(None)
		
		if "Time Period" in attributes:
			time_period.append(attributes["Time Period"])      
		else: 
			time_period.append(None)
		
		if "Additional Information" in attributes:
			add_info.append(attributes["Additional Information"])      
		else: 
			add_info.append(None)
	
	# store data in dataframe format
	data = pd.DataFrame({
		"Title": title,
		"Category": category, 
		"Reference Number": ref_num, 
		"Maker/Artist": artist, 
		"Materials": materials, 
		"Measurements": measurements, 
		"Time Period": time_period,
		"Additional Information": add_info,
		"Image Link":image
	})

	df = pd.DataFrame(data)
	return df


In [4]:
# scrape all files and combine data
dataframes = []
for page in pages:
    dataframes.append(scrape(page))

nsaf_df = pd.concat(dataframes, ignore_index=True)

print(nsaf_df.shape)
nsaf_df.head()

(4523, 9)


Unnamed: 0,Title,Category,Reference Number,Maker/Artist,Materials,Measurements,Time Period,Additional Information,Image Link
0,Kapow,Paintings,747,Nicole Charbonnet,Mixed media on canvas,"72"" x 60""",,,https://artcrimes.fbi.gov/nsaf/kapow/00747_kap...
1,Acorn,lamp,967,Tiffany,Metal; copper; stained glass,24 in,1906.0,"lamp; green; Signed ""LCT"" on the base; signed ...",https://artcrimes.fbi.gov/nsaf/acorn/Acorn.gif...
2,Quatre Etats du Saut,Print,800,Vladimir Velickovic,Silkscreen on paper,31.5 in x 47.25 in,1977.0,signed and dated; edition 30/99,https://artcrimes.fbi.gov/nsaf/quatre-etats-du...
3,llex aquifolium,Paintings,1056,Mang Hang Ho,Watercolor and pencil on paper,Height,1984.0,signed; Ho Mang Hang,https://artcrimes.fbi.gov/nsaf/llex-aquifolium...
4,Homage to Chagall,Print,451,Marc Chagall,Lithograph,,,print; Signed; # 9/75,https://artcrimes.fbi.gov/nsaf/homage-to-chaga...


In [5]:
# save output as file
nsaf_df.to_csv('../data/nsaf_data.csv', index=False)