In [1]:
import requests # for making standard html requests
from bs4 import BeautifulSoup # magical tool for parsing html data
import json # for parsing data
from pandas import DataFrame as df 

In [2]:
# Download the Idaho store index page and parse it.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
page = requests.get("https://locations.familydollar.com/id/", timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')

In [3]:
# Collect every element whose CSS class is 'itemlist'.
# NOTE(review): the variable says "dollar_tree" but the URL scraped above is
# familydollar.com; the name is kept because later cells refer to it.
dollar_tree_list = soup.find_all(class_ = 'itemlist')

In [4]:
# How many 'itemlist' elements were matched on the index page.
len(dollar_tree_list)

48

In [5]:
# Print the first two matched elements to inspect their markup.
for item_div in dollar_tree_list[:2]:
    print(item_div)

<div class="itemlist"><a dta-linktrack="City index page - Aberdeen" href="https://locations.familydollar.com/id/aberdeen/">Aberdeen</a></div>
<div class="itemlist"><a dta-linktrack="City index page - American Falls" href="https://locations.familydollar.com/id/american-falls/">American Falls</a></div>


In [12]:
# Pick one entry and take its first child node to see what it holds.
example = dollar_tree_list[2] # Arco, ID (single representative example)
example_content = example.contents[0]  # first child of the matched element
print(example_content)

<a dta-linktrack="City index page - Arco" href="https://locations.familydollar.com/id/arco/">Arco</a>


In [13]:
# Read the 'href' attribute of the example node with dict-style indexing.
example_href = example_content['href']
example_href

'https://locations.familydollar.com/id/arco/'

In [14]:
# Collect the city-page URL from every index entry.
# The <a> tag is looked up by name instead of assuming it is the first child
# node, so a text node placed before the link cannot break the 'href' lookup.
city_hrefs = [
    link['href']
    for link in (item.find('a', href=True) for item in dollar_tree_list)
    if link is not None  # skip entries that carry no link
]

#  check to be sure all went well
for href in city_hrefs[:2]:
    print(href)

https://locations.familydollar.com/id/aberdeen/
https://locations.familydollar.com/id/american-falls/


In [18]:
# Download and parse a single city page as a worked example.
# The timeout avoids an indefinite hang; raise_for_status() surfaces HTTP errors
# here rather than letting an error page be parsed.
page2 = requests.get(city_hrefs[2], timeout=30) # representative example
page2.raise_for_status()
soup2 = BeautifulSoup(page2.text, 'html.parser')

In [19]:
# Collect the elements on the city page whose type attribute is "application/ld+json".
arco = soup2.find_all(type="application/ld+json")

In [20]:
# Show the raw matches to see which one holds the store record.
print(arco)

[<script type="application/ld+json">
    {
      "@context": "https://schema.org",
      "@type": "BreadcrumbList",
      "itemListElement": [{
        "@type": "ListItem",
        "position": 1,
        "item": {
          "@id": "https://locations.familydollar.com/",
          "name": "Index"
        }
      },{
        "@type": "ListItem",
        "position": 2,
        "item": {
          "@id": "https://locations.familydollar.com/id/",
          "name": "ID"
        }
      }]
    }
</script>, <script type="application/ld+json">
	{
	  "@context":"https://schema.org",
	  "@type":"Schema Business Type",
	  "name": "Family Dollar #9143",
	  "address":{
	    "@type":"PostalAddress",
	    "streetAddress":"157 W Grand Avenue",
	    "addressLocality":"Arco",
	    "addressRegion":"ID",
	    "postalCode":"83213",
	    "addressCountry":"US"
	  },
	  "containedIn":"",  
	  "branchOf": {
	    "name":"Family Dollar",
	    "url": "https://www.familydollar.com/"
	  },
	  "url":"https://locations

In [21]:
# In the output printed above, the first match (index 0) is the breadcrumb list
# and the second (index 1) is the store record, so take the text of the second.
arco_contents = arco[1].contents[0]

In [22]:
# Show the text taken from the second ld+json element.
print(arco_contents)


	{
	  "@context":"https://schema.org",
	  "@type":"Schema Business Type",
	  "name": "Family Dollar #9143",
	  "address":{
	    "@type":"PostalAddress",
	    "streetAddress":"157 W Grand Avenue",
	    "addressLocality":"Arco",
	    "addressRegion":"ID",
	    "postalCode":"83213",
	    "addressCountry":"US"
	  },
	  "containedIn":"",  
	  "branchOf": {
	    "name":"Family Dollar",
	    "url": "https://www.familydollar.com/"
	  },
	  "url":"https://locations.familydollar.com/id/arco/29143/",
	  "telephone":"208-881-5738",
	  "image": "//hosted.where2getit.com/familydollarstore/images/storefront.png"
	}			
	


In [23]:
# Parse the JSON text into a Python object.
arco_json =  json.loads(arco_contents)

In [24]:
# Check what kind of Python object the parsed JSON became.
type(arco_json)

dict

In [25]:
# Show the parsed store record.
print(arco_json)

{'@context': 'https://schema.org', '@type': 'Schema Business Type', 'name': 'Family Dollar #9143', 'address': {'@type': 'PostalAddress', 'streetAddress': '157 W Grand Avenue', 'addressLocality': 'Arco', 'addressRegion': 'ID', 'postalCode': '83213', 'addressCountry': 'US'}, 'containedIn': '', 'branchOf': {'name': 'Family Dollar', 'url': 'https://www.familydollar.com/'}, 'url': 'https://locations.familydollar.com/id/arco/29143/', 'telephone': '208-881-5738', 'image': '//hosted.where2getit.com/familydollarstore/images/storefront.png'}


In [26]:
# Pull out the value stored under the 'address' key of the parsed record.
arco_address = arco_json['address']

In [27]:
# Show the extracted address value.
print(arco_address)

{'@type': 'PostalAddress', 'streetAddress': '157 W Grand Avenue', 'addressLocality': 'Arco', 'addressRegion': 'ID', 'postalCode': '83213', 'addressCountry': 'US'}


In [28]:
# Visit every city page and collect one address record per page.
# Despite its name, locs_dict is a list of address dicts; the name is kept
# because the cell that builds the DataFrame reads it.
locs_dict = []

# A single Session reuses the connection across the per-city requests.
with requests.Session() as session:
    for link in city_hrefs:
        locpage = session.get(link, timeout=30)   # request page info
        locpage.raise_for_status()  # stop on an HTTP error instead of parsing an error page
        locsoup = BeautifulSoup(locpage.text, 'html.parser')

        # Take the first JSON-LD block that actually carries an 'address',
        # rather than assuming the store record always sits at index 1.
        # NOTE(review): a city page may describe more than one store; as before,
        # only one address per page is kept. TODO confirm whether all are wanted.
        locaddr = None
        for script in locsoup.find_all(type="application/ld+json"):
            if not script.string:
                continue  # empty <script> element, nothing to parse
            try:
                locjson = json.loads(script.string)
            except json.JSONDecodeError:
                continue  # not valid JSON; try the next block
            if isinstance(locjson, dict) and 'address' in locjson:
                locaddr = locjson['address']
                break

        if locaddr is None:
            # Make the gap visible instead of failing with an IndexError/KeyError.
            print("No address found, skipping:", link)
            continue
        locs_dict.append(locaddr)

In [29]:
# Build the table from the collected address records, leaving out the
# '@type' and 'addressCountry' columns, then preview the first five rows.
locs_df = df.from_records(locs_dict).drop(columns=['@type', 'addressCountry'])
locs_df.head(5)

Unnamed: 0,streetAddress,addressLocality,addressRegion,postalCode
0,111 N Main Street,Aberdeen,ID,83210
1,253 Harrison St,American Falls,ID,83211
2,157 W Grand Avenue,Arco,ID,83213
3,177 Main Street,Ashton,ID,83420
4,747 N. Main St.,Bellevue,ID,83313
