This notebook contains the code in data scraping.

In [1]:
# Import the dependencies
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
import json
import time
from tqdm import tqdm

from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
from config import gkey

In [2]:
# Setting the url
url = "https://dojos.info/NewJersey/Counties.aspx"

# Creating the response
response = requests.get(url)

# Using beautiful soup
soup = bs(response.text,"html.parser")

# Code to create a list of counties.

county_tag = []
countylist = []

state = soup.find('ul', class_ = "olist")
county = state.find_all("a")
for i in range(len(county)):
    county_tag.append(county[i].text)

for j in county_tag:
    counties = j.replace(" ","")
    county_url = f"https://dojos.info/NewJersey/{counties}"
    countylist.append(county_url)

In [3]:
# Launches browser to search
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)
dojo_list = []

# Visits the url
url = "https://dojos.info/NewJersey/Counties.aspx"
browser.visit(url)
html = browser.html
soup = bs(html, "html.parser")

# In case the computer is a little slow.
# time.sleep(2)

# This will begin the scrape
for i in tqdm(range(len(countylist))):
    
    # This will visit the county url
    browser.visit(countylist[i])
    
    # Keep track of the county
    county = county_tag[i]
    
    # This will scrape the browser to find all of the dojos in the county
    response = requests.get(browser.url)
    soup = bs(response.text,"html.parser")
    
    # Creates a list to hold all of the dojo urls
    places = []
    
    # A nested for loop to find the href and appends them to the places list.
    for ulist in soup.find_all('ul', class_ = 'dojolist'):
        for h3 in ulist.find_all('h3'):
            for a in h3.find_all('a'):
                places.append(a['href'])
    dojo = []
    
    for k in places:
        dojo_url = f"https://dojos.info{k}"
        dojo.append(dojo_url)
    
    for x in range(len(dojo)):
        
        # Visits the url for the specific dojo.
        browser.visit(dojo[x])

        # Finds the address for the dojo.
        address = browser.find_by_tag('address')

        # This will pull the address text
        p_address = address.text

        # This will split the text string into usable pieces.
        recreate = p_address.replace('\n'," | ").split("|")

#       Some addresses are missing a street or have a proper title.
#       If there is a proper title to the address
        if len(recreate) > 3:
            recreate.pop(0)

#       If the address is missing a street name.
        elif len(recreate) < 3:
        
#       To ensure a clean scrape insert a dummy address and report it.
            recreate.insert(0,"Missing Street Address")
            print('There is a missing piece of the address.')

        # This is the street address
        street = recreate[0].split()
        streets = " ".join(street)
        split = recreate[1].split(',')

        # The city address
        city = split[0]

        # Further splits the string into the state and zipcode parts.
        split2 = split[1].split(" ")
        state = split2[1]
        zipcode = split2[2]

        # This will pull the name of the dojo.
        dojo_name = browser.find_by_css('#pageTop')
        dojo_proper = dojo_name.find_by_tag("h1").text

        # This will find the styles that the dojo teaches
        response = requests.get(browser.url)
        soup2 = bs(response.text , "html")

#       Creating a list to hold all the styles.
        style = []

#       Code will dig through using beautifulsoup and pull the styles.
        styles = soup2.find('ul', class_ = 'styles')
        for arts in styles.find_all('li'):
            style.append(arts.text)

        # Finding the phone number.
        phone = browser.find_by_css('.phone')
        dojo_phone = phone.text.split(" ")[1]

#       Creates a dictionary for the dojos
        dojo_dict = {'Name': dojo_proper,
                    'Phone': dojo_phone,
                    'Zipcode': zipcode,
                    'Street': streets,
                    'City': city,
                    'State': state,
                    'Style': style,
                    'County': county}

#       Appends the dictionary to a list to create a list of dictionaries
        dojo_list.append(dojo_dict)
        
#         If your computer is running a little slow
#         Recommend the sleep function below.
#         time.sleep(1)

# Quits out of the browser.
browser.quit()

# Shows the full list of dojos
dojo_list

[WDM] - Current google-chrome version is 87.0.4280
[WDM] - Get LATEST driver version for 87.0.4280
[WDM] - Driver [C:\Users\Kevin\.wdm\drivers\chromedriver\win32\87.0.4280.88\chromedriver.exe] found in cache


 


  0%|                                                                                           | 0/21 [05:51<?, ?it/s]


WebDriverException: Message: chrome not reachable
  (Session info: chrome=87.0.4280.141)


In [22]:
call_list = []
for rows in range(len(dojo_list)):
    strt = dojo_list[rows]['Street']
    city = dojo_list[rows]['City']
    state = dojo_list[rows]['State']
    key = 'dank'
    if strt == "Missing Street Address":
        api_url = f'https://maps.googleapis.com/maps/api/geocode/json?address={city},+{state}&key={gkey}'
    else:
        api_url = f'https://maps.googleapis.com/maps/api/geocode/json?address={strt},{city},+{state}&key={gkey}'

    call_list.append(api_url.replace(" ","+"))

In [26]:
geo_data = requests.get(call_list[0]).json()
print(json.dumps(geo_data, indent=4, sort_keys=True))

{
    "results": [
        {
            "address_components": [
                {
                    "long_name": "Unit 13 A",
                    "short_name": "Unit 13 A",
                    "types": [
                        "subpremise"
                    ]
                },
                {
                    "long_name": "2511",
                    "short_name": "2511",
                    "types": [
                        "street_number"
                    ]
                },
                {
                    "long_name": "Fire Road",
                    "short_name": "Fire Rd",
                    "types": [
                        "route"
                    ]
                },
                {
                    "long_name": "Egg Harbor Township",
                    "short_name": "Egg Harbor Township",
                    "types": [
                        "locality",
                        "political"
                    ]
                },
              

In [35]:
for i in range(len(call_list)):
    geo_data = requests.get(call_list[0]).json()
    lat = geo_data['results'][0]['geometry']['location']['lat']
    lng = geo_data['results'][0]['geometry']['location']['lng']
    dojo_list[i]['lat'] = lat
    dojo_list[i]['lng'] = lng

In [36]:
dojo_list[0]

{'Name': "Cardo Urso's HABU PIT",
 'Phone': '609-415-5014',
 'Zipcode': '08234',
 'Street': '2511 Fire Road, Unit 13 A',
 'City': ' Egg Harbor Township',
 'State': 'NJ',
 'Style': ['Boxing',
  'Jiu-Jitsu (Jujutsu/Jujitsu)',
  'Judo',
  'Mixed Martial Arts (MMA)',
  'Muay Thai',
  'Sambo'],
 'lat': 39.4209654,
 'lng': -74.5275233}

In [39]:
df = pd.DataFrame(test_dojo_list)
df

Unnamed: 0,Name,Phone,Zipcode,Street,City,State,Style,lat,lng
0,Cardo Urso's HABU PIT,609-415-5014,08234,"2511 Fire Road, Unit 13 A",Egg Harbor Township,NJ,"[Boxing, Jiu-Jitsu (Jujutsu/Jujitsu), Judo, Mi...",39.420965,-74.527523
1,Lightning Lyall MMA,609-474-3055,08406,Ventnor Ave,Ventnor City,NJ,"[Karate, Kick Boxing, Mixed Martial Arts (MMA)...",39.420965,-74.527523
2,Lightning MMA,609-474-3055,08406,Newport Ave,Ventnor City,NJ,"[Kick Boxing, Mixed Martial Arts (MMA), Muay T...",39.420965,-74.527523
3,South Jersey Wing Chun Kuen Do,609-432-6156,08232,801 N Main Street,Pleasantville,NJ,"[Kung Fu, Wing Chun]",39.420965,-74.527523
4,Ultimate Martial Arts Academy,609-625-0880,08330,4450 E Black Horse Pike,Hamilton Township,NJ,"[Hapkido, Karate, Tae Kwon Do]",39.420965,-74.527523
...,...,...,...,...,...,...,...,...,...
126,Bai's Tae Kwon DO,201-307-8922,07656,168 Kinderkamack Rd,Park Ridge,NJ,[Tae Kwon Do],39.420965,-74.527523
127,Power Martial Arts,201-224-5789,07024,1282 Palisade Ave,Fort Lee,NJ,"[Hapkido, Judo, Karate, Mixed Martial Arts (MM...",39.420965,-74.527523
128,Kokushi Dojo,201-575-0669,07481,Wyckoff family YMCA,Wyckoff,NJ,"[Aiki Jitsu (Aikijutsu), Aikido, Brazilian Jiu...",39.420965,-74.527523
129,Northern Valley Martial Arts,201-784-2411,07648,"55 Walnut Street , Suite 103",Norwood,NJ,[Mixed Martial Arts (MMA)],39.420965,-74.527523


In [40]:
df.to_json(r'Test_data\test_dojo.json',orient='table')

In [25]:
new = dojo_list[0]
new['lat'] = "-100"
new

{'Name': "Cardo Urso's HABU PIT",
 'Phone': '609-415-5014',
 'Zipcode': '08234',
 'Street': '2511 Fire Road, Unit 13 A',
 'City': ' Egg Harbor Township',
 'State': 'NJ',
 'Style': ['Boxing',
  'Jiu-Jitsu (Jujutsu/Jujitsu)',
  'Judo',
  'Mixed Martial Arts (MMA)',
  'Muay Thai',
  'Sambo'],
 'lat': '-100'}

In [34]:
dojo_list[0]

{'Name': "Cardo Urso's HABU PIT",
 'Phone': '609-415-5014',
 'Zipcode': '08234',
 'Street': '2511 Fire Road, Unit 13 A',
 'City': ' Egg Harbor Township',
 'State': 'NJ',
 'Style': ['Boxing',
  'Jiu-Jitsu (Jujutsu/Jujitsu)',
  'Judo',
  'Mixed Martial Arts (MMA)',
  'Muay Thai',
  'Sambo'],
 'lat': '-100'}

In [None]:
dojo_list = []

# Visits the url
url = "https://dojos.info/NewJersey/Counties.aspx"
browser.visit(url)
html = browser.html
soup = bs(html, "html.parser")
time.sleep(2)


# Searches the website for the link to the counties
browser.links.find_by_partial_text("County").click()
time.sleep(1)

if browser.url == "https://dojos.info/NewJersey/Counties.aspx#google_vignette":
    browser.visit(url)
    browser.links.find_by_partial_text("County").click()

response = requests.get(browser.url)
soup1 = bs(response.text,'html.parser')

uList = soup1.find('ul',class_ = 'dojolist')
dojolist = uList.find_all('h3')
dojolen = len(dojolist)


# Selects the dojo list to click.
ulist = browser.find_by_css('.dojolist')

h3 = ulist.find_by_tag('h3')

# Click the the link to move to the dojo information.
a_tags = h3.find_by_tag('a')

a_tags.click()

for x in range(1,dojolen + 1):

    # Finds the address for the dojo.
    address = browser.find_by_tag('address')

    # This will pull the address text
    p_address = address.text

    # This will split the text string into usable pieces.
    recreate = p_address.replace('\n'," | ").split("|")

    if len(recreate) > 3:
        recreate.pop(0)

    # This is the street address
    street = recreate[0].split()
    streets = " ".join(street)
    split = recreate[1].split(',')

    # The city address
    city = split[0]

    # Further splits the string into the state and zipcode parts.
    split2 = split[1].split(" ")
    state = split2[1]
    zipcode = split2[2]

    # This will pull the name of the dojo.
    dojo_name = browser.find_by_css('#pageTop')
    dojo = dojo_name.find_by_tag("h1").text

    # This will find the styles that the dojo teaches
    response = requests.get(browser.url)
    soup2 = bs(response.text , "html")

    style = []

    styles = soup2.find('ul', class_ = 'styles')
    for dank in styles.find_all('li'):
        style.append(dank.text)

    # Finding the phone number.
    phone = browser.find_by_css('.phone')
    dojo_phone = phone.text.split(" ")[1]

    dojo_dict = {'Name': dojo,
                'Phone': dojo_phone,
                'Zipcode': zipcode,
                'Street': streets,
                'City': city,
                'State': state,
                'Style': style}

    dojo_list.append(dojo_dict)
    browser.back()
    time.sleep(2)

    try:
        ulist = browser.find_by_css('.dojolist')
        h3 = ulist.find_by_tag('h3')[x]
        h3.find_by_tag("a").click()

    except:
        print('done')
        
# Quits out of the browser.
browser.quit()