## This notebook contains the code in data scraping.

In [80]:
# Import the dependencies
import pandas as pd
import numpy as np

import pymongo
from bs4 import BeautifulSoup as bs
import requests
import json
import time
from tqdm import tqdm

from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
from config import gkey

In [2]:
# Setting the url
url = "https://dojos.info/NewJersey/Counties.aspx"

# Creating the response
response = requests.get(url)

# Using beautiful soup
soup = bs(response.text,"html.parser")

# Code to create a list of counties.

county_tag = []
countylist = []

state = soup.find('ul', class_ = "olist")
county = state.find_all("a")
for i in range(len(county)):
    county_tag.append(county[i].text)

for j in county_tag:
    counties = j.replace(" ","")
    county_url = f"https://dojos.info/NewJersey/{counties}"
    countylist.append(county_url)

In [3]:
# Launches browser to search
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)
dojo_list = []

# Visits the url
url = "https://dojos.info/NewJersey/Counties.aspx"
browser.visit(url)
html = browser.html
soup = bs(html, "html.parser")

# In case the computer is a little slow.
# time.sleep(2)

# This will begin the scrape
for i in tqdm(range(len(countylist))):
    
    # This will visit the county url
    browser.visit(countylist[i])
    
    # Keep track of the county
    county = county_tag[i]
    
    # This will scrape the browser to find all of the dojos in the county
    response = requests.get(browser.url)
    soup = bs(response.text,"html.parser")
    
    # Creates a list to hold all of the dojo urls
    places = []
    
    # A nested for loop to find the href and appends them to the places list.
    for ulist in soup.find_all('ul', class_ = 'dojolist'):
        for h3 in ulist.find_all('h3'):
            for a in h3.find_all('a'):
                places.append(a['href'])
    dojo = []
    
    for k in places:
        dojo_url = f"https://dojos.info{k}"
        dojo.append(dojo_url)
    
    for x in range(len(dojo)):
        
        # Visits the url for the specific dojo.
        browser.visit(dojo[x])

        # Finds the address for the dojo.
        address = browser.find_by_tag('address')

        # This will pull the address text
        p_address = address.text

        # This will split the text string into usable pieces.
        recreate = p_address.replace('\n'," | ").split("|")

#       Some addresses are missing a street or have a proper title.
#       If there is a proper title to the address
        if len(recreate) == 4:
            recreate.pop(0)

#       If the address is missing a street name.
        elif (len(recreate) < 3) | (len(recreate) > 4):
#       To ensure a clean scrape insert a dummy address and report it.
            continue
            print('There is a missing piece of the address.')

        # This is the street address
        street = recreate[0].split()
        streets = " ".join(street)
        split = recreate[1].split(',')

        # The city address
        city = split[0]

        # Further splits the string into the state and zipcode parts.
        split2 = split[1].split(" ")
        state = split2[1]
        zipcode = split2[2]

        # This will pull the name of the dojo.
        dojo_name = browser.find_by_css('#pageTop')
        dojo_proper = dojo_name.find_by_tag("h1").text

        # This will find the styles that the dojo teaches
        response = requests.get(browser.url)
        soup2 = bs(response.text , "html")

#       Creating a list to hold all the styles.
        style = []

#       Code will dig through using beautifulsoup and pull the styles.
        styles = soup2.find('ul', class_ = 'styles')
        for arts in styles.find_all('li'):
            style.append(arts.text)

        # Finding the phone number.
        phone = browser.find_by_css('.phone')
        dojo_phone = phone.text.split(" ")[1]

#       Creates a dictionary for the dojos
        dojo_dict = {'Name': dojo_proper,
                    'Phone': dojo_phone,
                    'Zipcode': zipcode,
                    'Street': streets,
                    'City': city,
                    'State': state,
                    'Style': style,
                    'County': county}

#       Appends the dictionary to a list to create a list of dictionaries
        dojo_list.append(dojo_dict)
        
#         If your computer is running a little slow
#         Recommend the sleep function below.
#         time.sleep(1)
    if len(dojo_list)%15 == 0:
        time.sleep(10)

# Quits out of the browser.
browser.quit()

# Shows the full list of dojos
dojo_list

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324
[WDM] - There is no [win32] chromedriver for browser 88.0.4324 in cache


 


[WDM] - Get LATEST driver version for 88.0.4324
[WDM] - Trying to download new driver from http://chromedriver.storage.googleapis.com/88.0.4324.96/chromedriver_win32.zip
[WDM] - Driver has been saved in cache [C:\Users\Kevin\.wdm\drivers\chromedriver\win32\88.0.4324.96]
100%|█████████████████████████████████████████████████████████████████████████████████| 21/21 [56:41<00:00, 161.99s/it]


[{'Name': "Cardo Urso's HABU PIT",
  'Phone': '609-415-5014',
  'Zipcode': '08234',
  'Street': '2511 Fire Road, Unit 13 A',
  'City': ' Egg Harbor Township',
  'State': 'NJ',
  'Style': ['Boxing',
   'Jiu-Jitsu (Jujutsu/Jujitsu)',
   'Judo',
   'Mixed Martial Arts (MMA)',
   'Muay Thai',
   'Sambo'],
  'County': 'Atlantic County'},
 {'Name': 'Lightning Lyall MMA',
  'Phone': '609-474-3055',
  'Zipcode': '08406',
  'Street': 'Ventnor Ave',
  'City': ' Ventnor City',
  'State': 'NJ',
  'Style': ['Karate',
   'Kick Boxing',
   'Mixed Martial Arts (MMA)',
   'Muay Thai',
   'Privates',
   'Group'],
  'County': 'Atlantic County'},
 {'Name': 'Lightning MMA',
  'Phone': '609-474-3055',
  'Zipcode': '08406',
  'Street': 'Newport Ave',
  'City': ' Ventnor City',
  'State': 'NJ',
  'Style': ['Kick Boxing', 'Mixed Martial Arts (MMA)', 'Muay Thai'],
  'County': 'Atlantic County'},
 {'Name': 'South Jersey Wing Chun Kuen Do',
  'Phone': '609-432-6156',
  'Zipcode': '08232',
  'Street': '801 N Main 

In [5]:
# Saves the dojo listed as a dataframe.
dojo_df = pd.DataFrame(dojo_list)
dojo_df

Unnamed: 0,Name,Phone,Zipcode,Street,City,State,Style,County
0,Cardo Urso's HABU PIT,609-415-5014,08234,"2511 Fire Road, Unit 13 A",Egg Harbor Township,NJ,"[Boxing, Jiu-Jitsu (Jujutsu/Jujitsu), Judo, Mi...",Atlantic County
1,Lightning Lyall MMA,609-474-3055,08406,Ventnor Ave,Ventnor City,NJ,"[Karate, Kick Boxing, Mixed Martial Arts (MMA)...",Atlantic County
2,Lightning MMA,609-474-3055,08406,Newport Ave,Ventnor City,NJ,"[Kick Boxing, Mixed Martial Arts (MMA), Muay T...",Atlantic County
3,South Jersey Wing Chun Kuen Do,609-432-6156,08232,801 N Main Street,Pleasantville,NJ,"[Kung Fu, Wing Chun]",Atlantic County
4,Ultimate Martial Arts Academy,609-625-0880,08330,4450 E Black Horse Pike,Hamilton Township,NJ,"[Hapkido, Karate, Tae Kwon Do]",Atlantic County
...,...,...,...,...,...,...,...,...
874,US Family Martial Arts Center,908-213-9477,08865,101 Foch Blvd,Phillipsburg,NJ,"[Jiu-Jitsu (Jujutsu/Jujitsu), Kick Boxing, Tae...",Warren County
875,Twisted Dragon,610-417-0128,08865,405B Thomas Street,Phillipsburg,NJ,[Shorin Ryu Karate],Warren County
876,White Dragon Mixed Martial arts Academy,908-454-4525,08865,"463 SO Main ST, Suite # 2",Phillipsburg,NJ,"[Arnis/Kali/Escrima/FMA, Brazilian Jiu-jitsu, ...",Warren County
877,"Golden Dragon Karate School, LLC",908-387-1100,08865,1100 East Blvd,Alpha,NJ,"[Jiu-Jitsu (Jujutsu/Jujitsu), Kobudo, Shorin R...",Warren County


In [6]:
# Saves the dataframe as a json file. This will be a back up before being uploaded to mongo.
dojo_df.to_json(r'Test_data\dojo.json',orient='table')

In [43]:
# Cleaning the dataframe for any irregularities.
# Any needed fixes to the dataset.
dojo_df.loc[11].replace('(Behind Atlanticare)','120 White Horse Pike',inplace = True)
dojo_df.loc[48].replace('Lower Level','101 Rt. 46',inplace = True)
dojo_df.loc[135].replace('','08505',inplace = True)
dojo_df.loc[182].replace('','08043',inplace = True)
dojo_df.loc[195].replace('','476 Centennial Blvd',inplace = True)
dojo_df.loc[314].replace('ACME Shopping Center','Center Square Road',inplace = True)
dojo_df.loc[316].replace('ACME Shopping Plaza','Center Square Road',inplace = True)
dojo_df.loc[342].replace('Superfitness','553 Beckett rd',inplace = True)
dojo_df.loc[357].replace('Store Front','484 West Side Ave',inplace = True)
dojo_df.loc[581].replace('56 Union Ave. - Rear',"56 Union Ave.",inplace = True)
clean_dojo_df = dojo_df.drop([59,134,136,137,161,176,201,233,239,250,272,285,297,307,328,351,360,371,384,385,391,407,410,411,412,413,
             433,445,447,452,482,511,512,514,516,520,523,569,600, 662,665,706,725,745,750,752,754,759,762,765,783,
             861])
# Shows the cleaned dataframe before google api.
clean_dojo_df

Unnamed: 0,Name,Phone,Zipcode,Street,City,State,Style,County
0,Cardo Urso's HABU PIT,609-415-5014,08234,"2511 Fire Road, Unit 13 A",Egg Harbor Township,NJ,"[Boxing, Jiu-Jitsu (Jujutsu/Jujitsu), Judo, Mi...",Atlantic County
1,Lightning Lyall MMA,609-474-3055,08406,Ventnor Ave,Ventnor City,NJ,"[Karate, Kick Boxing, Mixed Martial Arts (MMA)...",Atlantic County
2,Lightning MMA,609-474-3055,08406,Newport Ave,Ventnor City,NJ,"[Kick Boxing, Mixed Martial Arts (MMA), Muay T...",Atlantic County
3,South Jersey Wing Chun Kuen Do,609-432-6156,08232,801 N Main Street,Pleasantville,NJ,"[Kung Fu, Wing Chun]",Atlantic County
4,Ultimate Martial Arts Academy,609-625-0880,08330,4450 E Black Horse Pike,Hamilton Township,NJ,"[Hapkido, Karate, Tae Kwon Do]",Atlantic County
...,...,...,...,...,...,...,...,...
874,US Family Martial Arts Center,908-213-9477,08865,101 Foch Blvd,Phillipsburg,NJ,"[Jiu-Jitsu (Jujutsu/Jujitsu), Kick Boxing, Tae...",Warren County
875,Twisted Dragon,610-417-0128,08865,405B Thomas Street,Phillipsburg,NJ,[Shorin Ryu Karate],Warren County
876,White Dragon Mixed Martial arts Academy,908-454-4525,08865,"463 SO Main ST, Suite # 2",Phillipsburg,NJ,"[Arnis/Kali/Escrima/FMA, Brazilian Jiu-jitsu, ...",Warren County
877,"Golden Dragon Karate School, LLC",908-387-1100,08865,1100 East Blvd,Alpha,NJ,"[Jiu-Jitsu (Jujutsu/Jujitsu), Kobudo, Shorin R...",Warren County


In [44]:
# Ensuring the file is saved.
clean_dojo_df.to_json(r'Test_data\clean_dojo.json',orient='table')

In [50]:
# Creates a list of dictionaries
dojo_list = clean_dojo_df.to_dict('records')

# Creates a list of urls for the api
call_list = []

# This will build the api.
for rows in range(len(dojo_list)):
    strt = dojo_list[rows]['Street']
    city = dojo_list[rows]['City']
    state = dojo_list[rows]['State']
    if strt == "Missing Street Address":
        api_url = f'https://maps.googleapis.com/maps/api/geocode/json?address={city},+{state}&key={gkey}'
    else:
        api_url = f'https://maps.googleapis.com/maps/api/geocode/json?address={strt},{city},+{state}&key={gkey}'
#   Make the call list api friendly.
    call_list.append(api_url.replace(" ","+"))

In [66]:
# Loops through the call list and will send a request to google api geocode.
count = 0
batch = 1
for i in range(len(call_list)):
    try:
        geo_data = requests.get(call_list[i]).json()

        # From geodata, pull the latitude and longitude coordinates.
        lat = geo_data['results'][0]['geometry']['location']['lat']
        lng = geo_data['results'][0]['geometry']['location']['lng']

        # Adds the latitude and longitude to the dictionary.
        dojo_list[i]['lat'] = lat
        dojo_list[i]['lng'] = lng

        count += 1
        
        # To not receive a query limit error
        if count % 49 == 0:
            print(count)
            print(f'Batch {batch}: Complete')
            time.sleep(1)
            batch += 1
            
    # Catches any exceptions        
    except IndexError:
        print(f'No coordinates: {count}')
        dojo_list[i]['lat'] = ''
        dojo_list[i]['lng'] = ''

# Shows when the file is done.        
print('done')

No coordinates: 33
49
Batch 1: Complete
98
Batch 2: Complete
No coordinates: 104
No coordinates: 119
147
Batch 3: Complete
No coordinates: 153
No coordinates: 173
No coordinates: 176
196
Batch 4: Complete
No coordinates: 202
No coordinates: 217
No coordinates: 227
No coordinates: 242
No coordinates: 242
245
Batch 5: Complete
294
Batch 6: Complete
No coordinates: 298
No coordinates: 322
343
Batch 7: Complete
No coordinates: 366
392
Batch 8: Complete
No coordinates: 398
No coordinates: 434
441
Batch 9: Complete
No coordinates: 448
No coordinates: 469
No coordinates: 484
No coordinates: 487
No coordinates: 488
490
Batch 10: Complete
No coordinates: 491
539
Batch 11: Complete
No coordinates: 539
No coordinates: 548
No coordinates: 551
No coordinates: 553
588
Batch 12: Complete
No coordinates: 594
637
Batch 13: Complete
No coordinates: 637
No coordinates: 663
686
Batch 14: Complete
No coordinates: 688
No coordinates: 691
No coordinates: 691
No coordinates: 696
No coordinates: 719
735
Batch 

In [79]:
#Creates a json with the final database.

final_df = pd.DataFrame(dojo_list)
final_df.replace('',np.nan,inplace = True)
final_df.dropna(inplace = True)
final_df.to_json(r'Test_data\final_dojo.json',orient='table')

In [81]:
# Uploads the dataframe to mongodb
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# Declare the database
db = client.dojo_db

# Declare the collection
dojos = db.dojos

dojos.insert_many(dojo_list)

<pymongo.results.InsertManyResult at 0x14e63302848>

# Below is a test set. Is not used in the final json.

In [26]:
geo_data = requests.get(call_list[0]).json()
print(json.dumps(geo_data, indent=4, sort_keys=True))

{
    "results": [
        {
            "address_components": [
                {
                    "long_name": "Unit 13 A",
                    "short_name": "Unit 13 A",
                    "types": [
                        "subpremise"
                    ]
                },
                {
                    "long_name": "2511",
                    "short_name": "2511",
                    "types": [
                        "street_number"
                    ]
                },
                {
                    "long_name": "Fire Road",
                    "short_name": "Fire Rd",
                    "types": [
                        "route"
                    ]
                },
                {
                    "long_name": "Egg Harbor Township",
                    "short_name": "Egg Harbor Township",
                    "types": [
                        "locality",
                        "political"
                    ]
                },
              

In [39]:
df = pd.DataFrame(test_dojo_list)
df

Unnamed: 0,Name,Phone,Zipcode,Street,City,State,Style,lat,lng
0,Cardo Urso's HABU PIT,609-415-5014,08234,"2511 Fire Road, Unit 13 A",Egg Harbor Township,NJ,"[Boxing, Jiu-Jitsu (Jujutsu/Jujitsu), Judo, Mi...",39.420965,-74.527523
1,Lightning Lyall MMA,609-474-3055,08406,Ventnor Ave,Ventnor City,NJ,"[Karate, Kick Boxing, Mixed Martial Arts (MMA)...",39.420965,-74.527523
2,Lightning MMA,609-474-3055,08406,Newport Ave,Ventnor City,NJ,"[Kick Boxing, Mixed Martial Arts (MMA), Muay T...",39.420965,-74.527523
3,South Jersey Wing Chun Kuen Do,609-432-6156,08232,801 N Main Street,Pleasantville,NJ,"[Kung Fu, Wing Chun]",39.420965,-74.527523
4,Ultimate Martial Arts Academy,609-625-0880,08330,4450 E Black Horse Pike,Hamilton Township,NJ,"[Hapkido, Karate, Tae Kwon Do]",39.420965,-74.527523
...,...,...,...,...,...,...,...,...,...
126,Bai's Tae Kwon DO,201-307-8922,07656,168 Kinderkamack Rd,Park Ridge,NJ,[Tae Kwon Do],39.420965,-74.527523
127,Power Martial Arts,201-224-5789,07024,1282 Palisade Ave,Fort Lee,NJ,"[Hapkido, Judo, Karate, Mixed Martial Arts (MM...",39.420965,-74.527523
128,Kokushi Dojo,201-575-0669,07481,Wyckoff family YMCA,Wyckoff,NJ,"[Aiki Jitsu (Aikijutsu), Aikido, Brazilian Jiu...",39.420965,-74.527523
129,Northern Valley Martial Arts,201-784-2411,07648,"55 Walnut Street , Suite 103",Norwood,NJ,[Mixed Martial Arts (MMA)],39.420965,-74.527523


In [40]:
# Original creation of a test set.
df.to_json(r'Test_data\test_dojo.json',orient='table')

## Below is a similar scraping code. But Uses clicks.

In [None]:
dojo_list = []

# Visits the url
url = "https://dojos.info/NewJersey/Counties.aspx"
browser.visit(url)
html = browser.html
soup = bs(html, "html.parser")
time.sleep(2)


# Searches the website for the link to the counties
browser.links.find_by_partial_text("County").click()
time.sleep(1)

if browser.url == "https://dojos.info/NewJersey/Counties.aspx#google_vignette":
    browser.visit(url)
    browser.links.find_by_partial_text("County").click()

response = requests.get(browser.url)
soup1 = bs(response.text,'html.parser')

uList = soup1.find('ul',class_ = 'dojolist')
dojolist = uList.find_all('h3')
dojolen = len(dojolist)


# Selects the dojo list to click.
ulist = browser.find_by_css('.dojolist')

h3 = ulist.find_by_tag('h3')

# Click the the link to move to the dojo information.
a_tags = h3.find_by_tag('a')

a_tags.click()

for x in range(1,dojolen + 1):

    # Finds the address for the dojo.
    address = browser.find_by_tag('address')

    # This will pull the address text
    p_address = address.text

    # This will split the text string into usable pieces.
    recreate = p_address.replace('\n'," | ").split("|")

    if len(recreate) > 3:
        recreate.pop(0)

    # This is the street address
    street = recreate[0].split()
    streets = " ".join(street)
    split = recreate[1].split(',')

    # The city address
    city = split[0]

    # Further splits the string into the state and zipcode parts.
    split2 = split[1].split(" ")
    state = split2[1]
    zipcode = split2[2]

    # This will pull the name of the dojo.
    dojo_name = browser.find_by_css('#pageTop')
    dojo = dojo_name.find_by_tag("h1").text

    # This will find the styles that the dojo teaches
    response = requests.get(browser.url)
    soup2 = bs(response.text , "html")

    style = []

    styles = soup2.find('ul', class_ = 'styles')
    for dank in styles.find_all('li'):
        style.append(dank.text)

    # Finding the phone number.
    phone = browser.find_by_css('.phone')
    dojo_phone = phone.text.split(" ")[1]

    dojo_dict = {'Name': dojo,
                'Phone': dojo_phone,
                'Zipcode': zipcode,
                'Street': streets,
                'City': city,
                'State': state,
                'Style': style}

    dojo_list.append(dojo_dict)
    browser.back()
    time.sleep(2)

    try:
        ulist = browser.find_by_css('.dojolist')
        h3 = ulist.find_by_tag('h3')[x]
        h3.find_by_tag("a").click()

    except:
        print('done')
        
# Quits out of the browser.
browser.quit()