# Texas Tow Trucks

We're going to scrape some [tow trucks in Texas](https://www.tdlr.texas.gov/tools_search/).

## Import your imports

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
import time
import pandas as pd

## Search for the TLDR Number `006179570C`, and scrape the information on that company

Using [license information system](https://www.tdlr.texas.gov/tools_search/), find information about the tow truck number above, displaying the

- The business name
- Owner/operator
- Phone number
- License status (Active, Expired, Etc)
- Physical address

If you can't figure a 'nice' way to locate something, your two last options might be:

- **Find a "parent" element, then dig inside**
- **Find all of a type of element** (like we did with `td` before) and get the `[0]`, `[1]`, `[2]`, etc
- **XPath** (inspect an element, Copy > Copy XPath)

These kinds of techniques tend to break when you're on other result pages, but... maybe not! You won't know until you try.

> - *TIP: When you use xpath, you CANNOT use double quotes or Python will get confused. Use single quotes.*
> - *TIP: You can clean your data up if you want to, or leave it dirty to clean later*
> - *TIP: The address part can be tough, but you have a few options. You can use a combination of `.split` and list slicing to clean it now, or clean it later in the dataframe with regular expressions. Or other options, too, probably*

In [None]:
driver = webdriver.Chrome()
url = "https://www.tdlr.texas.gov/tools_search/mccs_display.asp"
driver.get(url)

In [None]:
tldr_button = driver.find_element_by_id("mcrbutton")
tldr_button.click()

In [None]:
tldr = driver.find_element_by_id("mcrdata")
tldr.send_keys("006179570C")

submit_button = driver.find_element_by_xpath('//*[@id="submit3"]')
submit_button.click()

In [None]:
business_info = driver.find_elements_by_tag_name("table")[3]

In [None]:
business_name = business_info.find_elements_by_tag_name("tr")[1].text
business_name

In [None]:
business_owner = business_info.find_elements_by_tag_name("tr")[2].find_element_by_tag_name("td").text
business_owner

In [None]:
business_phone = business_info.find_elements_by_tag_name("tr")[3].find_element_by_tag_name("td").text
business_phone

In [None]:
business_status = driver.find_elements_by_tag_name("table")[4].find_elements_by_tag_name('tr')
status_check = business_status[0].find_elements_by_tag_name('td')[1].text
status_check

In [None]:
business_status = driver.find_elements_by_tag_name("table")[4].find_elements_by_tag_name('tr')
business_address = business_status[1].find_elements_by_tag_name('td')[1]
business_address.text


# Adapt this to work inside of a single cell

Double-check that it works. You want it to print out all of the details.

In [None]:
driver = webdriver.Chrome()

In [None]:
url = "https://www.tdlr.texas.gov/tools_search/mccs_display.asp"
driver.get(url)
tldr_button = driver.find_element_by_id("mcrbutton")
tldr_button.click()
tldr = driver.find_element_by_id("mcrdata")
tldr.send_keys("006179570C")

submit_button = driver.find_element_by_xpath('//*[@id="submit3"]')
submit_button.click()
business_info = driver.find_elements_by_tag_name("table")[3]
business_owner = business_info.find_elements_by_tag_name("tr")[2].find_element_by_tag_name("td").text
print(business_owner)
business_phone = business_info.find_elements_by_tag_name("tr")[3].find_element_by_tag_name("td").text
print(business_phone)
business_status = driver.find_elements_by_tag_name("table")[4].find_elements_by_tag_name('tr')
status_check = business_status[0].find_elements_by_tag_name('td')[1].text
print(status_check)
business_status = driver.find_elements_by_tag_name("table")[4].find_elements_by_tag_name('tr')
business_address = business_status[1].find_elements_by_tag_name('td')[1]
print(business_address.text.split('\n')[-2:])

In [None]:
url = "https://www.tdlr.texas.gov/tools_search/mccs_display.asp"
driver.get(url)
tldr_button = driver.find_element_by_id("mcrbutton")
tldr_button.click()
tldr = driver.find_element_by_id("mcrdata")
tldr.send_keys("006179570C")

submit_button = driver.find_element_by_xpath('//*[@id="submit3"]')
submit_button.click()
business_info = driver.find_elements_by_tag_name("table")[3]
business_owner = business_info.find_elements_by_tag_name("tr")[2].find_element_by_tag_name("td").text
print(business_owner.replace("Owner/Officer:   ", "").replace(" / OWNER", ""))
business_phone = business_info.find_elements_by_tag_name("tr")[3].find_element_by_tag_name("td").text
print(business_phone#.replace("Phone:   ", "")
business_status = driver.find_elements_by_tag_name("table")[4].find_elements_by_tag_name('tr')
status_check = business_status[0].find_elements_by_tag_name('td')[1].text
print(status_check.replace("Status:  ", ""))
business_status = driver.find_elements_by_tag_name("table")[4].find_elements_by_tag_name('tr')
business_address = business_status[1].find_elements_by_tag_name('td')[1]
print(business_address.text.split('\n')[-2:])



In [None]:
url = "https://www.tdlr.texas.gov/tools_search/mccs_display.asp"
driver.get(url)
tldr_button = driver.find_element_by_id("mcrbutton")
tldr_button.click()
tldr = driver.find_element_by_id("mcrdata")
tldr.send_keys("006448786C")

submit_button = driver.find_element_by_xpath('//*[@id="submit3"]')
submit_button.click()
business_info = driver.find_elements_by_tag_name("table")[3]
business_owner_one = business_info.find_elements_by_tag_name("tr")[2].find_element_by_tag_name("td").text
business_owner_two = business_info.find_elements_by_tag_name("tr")[3].find_element_by_tag_name("td").text
print(business_owner_one.replace("Owner/Officer:   ", ""))
print(business_owner_two.replace("Owner/Officer:   ", ""))
business_phone = business_info.find_elements_by_tag_name("tr")[4].find_element_by_tag_name("td").text
print(business_phone.replace("Phone:   ", ""))
business_status = driver.find_elements_by_tag_name("table")[4].find_elements_by_tag_name('tr')
status_check = business_status[0].find_elements_by_tag_name('td')[1].text
print(status_check.replace("Status:  ", ""))
business_status = driver.find_elements_by_tag_name("table")[4].find_elements_by_tag_name('tr')
business_address = business_status[1].find_elements_by_tag_name('td')[1]
print(business_address.text.split('\n')[-2:])

# Using .apply to find data about SEVERAL tow truck companies

The file `trucks-subset.csv` has information about the trucks, we'll use it to find the pages to scrape.

### Open up `trucks-subset.csv` and save it into a dataframe

In [2]:
df = pd.read_csv("trucks-subset.csv")
df.head(3)

Unnamed: 0,TDLR Number
0,006507931C
1,006179570C
2,006502097C


## Go through each row of the dataset, displaying the URL you will need to scrape for the information on that row

You don't have to actually use the search form for each of these - look at the URL you're on, it has the number in it!

For example, one URL might look like `https://www.tdlr.texas.gov/tools_search/mccs_display.asp?mcrnumber=006495492C`.

- *TIP: Use .apply and a function*
- *TIP: Unlike the Yelp example, you'll need to build this URL from pieces*
- *TIP: You probably don't want to `print` unless you're going to fix it for the next question 
- *TIP: pandas won't showing you the entire url! Run `pd.set_option('display.max_colwidth', -1)` to display aaaalll of the text in a cell*

In [3]:
def get_tdlr_url(row):
    built_url = "https://www.tdlr.texas.gov/tools_search/mccs_display.asp?mcrnumber=" + row["TDLR Number"]
    return built_url

### Save this URL into a new column of your dataframe, called `url`

- *TIP: Use a function and `.apply`*
- *TIP: Be sure to use `return`*

In [4]:
pd.set_option('display.max_colwidth', -1)
df['url'] = df.apply(get_tdlr_url, axis=1)
df.head()

Unnamed: 0,TDLR Number,url
0,006507931C,https://www.tdlr.texas.gov/tools_search/mccs_display.asp?mcrnumber=006507931C
1,006179570C,https://www.tdlr.texas.gov/tools_search/mccs_display.asp?mcrnumber=006179570C
2,006502097C,https://www.tdlr.texas.gov/tools_search/mccs_display.asp?mcrnumber=006502097C


## Go through each row of the dataset, printing out information about each tow truck company.

Now will be **scraping** inside of your function.

- The business name
- Owner/operator
- Phone number
- License status (Active, Expired, Etc)
- Physical address

Just print it out for now.

- *TIP: use .apply*
- *TIP: You'll be using the code you wrote before, but converted into a function*
- *TIP: Remember how the TDLR Number is in the URL? You don't need to do the form submission if you don't want!*
- *TIP: Make sure you adjust any variables so you don't scrape the same page again and again*

In [None]:
driver = webdriver.Chrome()

def get_tdlr_info(row):
    driver.get(row)
    driver.get(row)
    
    business_info = driver.find_elements_by_tag_name("table")[3]

    business_owner = business_info.find_elements_by_tag_name("tr")[2].find_element_by_tag_name("td").text
    business_owner_clean = business_owner.replace("Owner/Officer:   ", "").replace(" / OWNER", "")
    
    business_phone = business_info.find_elements_by_tag_name("tr")[3].find_element_by_tag_name("td").text
    business_phone_clean = business_phone.replace("Phone:   ", "")
    
    business_status = driver.find_elements_by_tag_name("table")[4].find_elements_by_tag_name('tr')
    status_check = business_status[0].find_elements_by_tag_name('td')[1].text
    status_check_clean = status_check.replace("Status:  ", "")
                         
    business_address = business_status[1].find_elements_by_tag_name('td')[1]
    business_address_clean = str(business_address.text.split('\n')[-2:])

    return pd.Series({
      "Owner": business_owner_clean,
      "Phone": business_phone_clean,
      "Status": status_check_clean,
      "address": business_address_clean
    })


In [6]:
driver = webdriver.Chrome()

def get_tdlr_info(row):
    driver.get(row)
    driver.get(row)
    
    business_info = driver.find_elements_by_tag_name("table")[3]
    
    try:
        if business_info.find_elements_by_tag_name("tr")[3].find_element_by_tag_name("td").str.contains("Owner/Officer"):
            business_owner_one = business_info.find_elements_by_tag_name("tr")[2].find_element_by_tag_name("td").text
            business_owner_one_clean = business_owner_one.replace("Owner/Officer:   ", "")
               
            business_owner_two = business_info.find_elements_by_tag_name("tr")[3].find_element_by_tag_name("td").text
            business_owner_two_clean = business_owner_two.replace("Owner/Officer:   ", "")
                
            business_phone = business_info.find_elements_by_tag_name("tr")[4].find_element_by_tag_name("td").text
            business_phone_clean = business_phone.replace("Phone:   ", "")
    
    except:
        business_owner = business_info.find_elements_by_tag_name("tr")[2].find_element_by_tag_name("td").text
        business_owner_clean = business_owner.replace("Owner/Officer:   ", "")
            
        business_phone = business_info.find_elements_by_tag_name("tr")[3].find_element_by_tag_name("td").text
        business_phone_clean = business_phone.replace("Phone:   ", "")
               
    business_status = driver.find_elements_by_tag_name("table")[4].find_elements_by_tag_name('tr')
    status_check = business_status[0].find_elements_by_tag_name('td')[1].text
    status_check_clean = status_check.replace("Status:  ", "")
                         
    business_address = business_status[1].find_elements_by_tag_name('td')[1]
    business_address_clean = str(business_address.text.split('\n')[-2:])

    return pd.Series({
      "Owner": business_owner_clean,
      "Phone": business_phone_clean,
      "Status": status_check_clean,
      "address": business_address_clean
    })


In [7]:
df['url'].apply(get_tdlr_info).join(df)

Unnamed: 0,Owner,Phone,Status,address,TDLR Number,url
0,AUGUSTUS EUGENE SMITH / OWNER,9032276464,Active,"['103 N MAIN ST', 'BONHAM, TX. 75418']",006507931C,https://www.tdlr.texas.gov/tools_search/mccs_display.asp?mcrnumber=006507931C
1,BRANDT SMITH / OWNER,8173330706,Expired,"['13619 BRETT JACKSON RD.', 'FORT WORTH, TX. 76179']",006179570C,https://www.tdlr.texas.gov/tools_search/mccs_display.asp?mcrnumber=006179570C
2,BARRY MICHAEL SMITH / OWNER,8066544404,Active,"['4501 W CEMETERY RD', 'CANYON, TX. 79015']",006502097C,https://www.tdlr.texas.gov/tools_search/mccs_display.asp?mcrnumber=006502097C


## Scrape the following information for each row of the dataset, and save it into new columns in your dataframe.

- The business name
- Owner/operator
- Phone number
- License status (Active, Expired, Etc)
- Physical address

It's basically what we did before, but using the function a little differently.

- *TIP: Same as above, but you'll be returning a `pd.Series` and the `.apply` line is going to be a lot longer*
- *TIP: Save it to a new dataframe!*
- *TIP: Make sure you change your `df` variable names correctly if you're cutting and pasting - there are a few so it can get tricky*

In [8]:
new_df = df['url'].apply(get_tdlr_info).join(df)
new_df.head()

Unnamed: 0,Owner,Phone,Status,address,TDLR Number,url
0,AUGUSTUS EUGENE SMITH / OWNER,9032276464,Active,"['103 N MAIN ST', 'BONHAM, TX. 75418']",006507931C,https://www.tdlr.texas.gov/tools_search/mccs_display.asp?mcrnumber=006507931C
1,BRANDT SMITH / OWNER,8173330706,Expired,"['13619 BRETT JACKSON RD.', 'FORT WORTH, TX. 76179']",006179570C,https://www.tdlr.texas.gov/tools_search/mccs_display.asp?mcrnumber=006179570C
2,BARRY MICHAEL SMITH / OWNER,8066544404,Active,"['4501 W CEMETERY RD', 'CANYON, TX. 79015']",006502097C,https://www.tdlr.texas.gov/tools_search/mccs_display.asp?mcrnumber=006502097C


### Save your dataframe as a CSV named `tow-trucks-extended.csv`

In [9]:
new_df.to_csv("tow-trucks-extended.csv", index=False)

### Re-open your dataframe to confirm you didn't save any extra weird columns

In [10]:
new_df = pd.read_csv("tow-trucks-extended.csv")
new_df.head()

Unnamed: 0,Owner,Phone,Status,address,TDLR Number,url
0,AUGUSTUS EUGENE SMITH / OWNER,9032276464,Active,"['103 N MAIN ST', 'BONHAM, TX. 75418']",006507931C,https://www.tdlr.texas.gov/tools_search/mccs_display.asp?mcrnumber=006507931C
1,BRANDT SMITH / OWNER,8173330706,Expired,"['13619 BRETT JACKSON RD.', 'FORT WORTH, TX. 76179']",006179570C,https://www.tdlr.texas.gov/tools_search/mccs_display.asp?mcrnumber=006179570C
2,BARRY MICHAEL SMITH / OWNER,8066544404,Active,"['4501 W CEMETERY RD', 'CANYON, TX. 79015']",006502097C,https://www.tdlr.texas.gov/tools_search/mccs_display.asp?mcrnumber=006502097C


## Process the entire `tow-trucks.csv` file

We just did it on a short subset so far. Now try it on all of the tow trucks. **Save as the same filename as before**

In [11]:
entire_df = pd.read_csv("tow-trucks.csv")
entire_df.head()

Unnamed: 0,TDLR Number
0,006507931C
1,006179570C
2,006502097C
3,006494912C
4,0649468VSF


In [12]:
entire_df['url'] = entire_df.apply(get_tdlr_url, axis=1)
entire_df.head()

Unnamed: 0,TDLR Number,url
0,006507931C,https://www.tdlr.texas.gov/tools_search/mccs_display.asp?mcrnumber=006507931C
1,006179570C,https://www.tdlr.texas.gov/tools_search/mccs_display.asp?mcrnumber=006179570C
2,006502097C,https://www.tdlr.texas.gov/tools_search/mccs_display.asp?mcrnumber=006502097C
3,006494912C,https://www.tdlr.texas.gov/tools_search/mccs_display.asp?mcrnumber=006494912C
4,0649468VSF,https://www.tdlr.texas.gov/tools_search/mccs_display.asp?mcrnumber=0649468VSF


In [24]:
driver = webdriver.Chrome()

In [31]:
def get_tdlr_info(row):
    driver.get(row)
    driver.get(row)
    
    business_info = driver.find_elements_by_tag_name("table")[3]
    
               
    try:
        if 'Phone' in business_info.find_elements_by_tag_name("tr")[3].find_element_by_tag_name("td").text:
            business_owner = business_info.find_elements_by_tag_name("tr")[2].find_element_by_tag_name("td").text
            business_owner_clean = business_owner.replace("Owner/Officer:   ", "")
            
            business_phone = business_info.find_elements_by_tag_name("tr")[3].find_element_by_tag_name("td").text
            business_phone_clean = business_phone.replace("Phone:   ", "")

    except:
        business_owner = business_info.find_elements_by_tag_name("tr")[2].find_element_by_tag_name("td").text + business_info.find_elements_by_tag_name("tr")[3].find_element_by_tag_name("td").text
        business_owner_clean = business_owner.replace("Owner/Officer:   ", "")
            
        business_phone = business_info.find_elements_by_tag_name("tr")[4].find_element_by_tag_name("td").text
        business_phone_clean = business_phone.replace("Phone:   ", "")
    
               
    business_status = driver.find_elements_by_tag_name("table")[4].find_elements_by_tag_name('tr')
    status_check = business_status[0].find_elements_by_tag_name('td')[1].text
    status_check_clean = status_check.replace("Status:  ", "")
                         
    business_address = business_status[1].find_elements_by_tag_name('td')[1]
    business_address_clean = str(business_address.text.split('\n')[-2:])

    return pd.Series({
      "Owner": business_owner_clean,
      "Phone": business_phone_clean,
      "Status": status_check_clean,
      "address": business_address_clean
    })

In [32]:
entire_df['url'].apply(get_tdlr_info).join(df)

IndexError: list index out of range

In [27]:
def get_tdlr_info(row):
    driver.get(row)
    driver.get(row)
    
    business_info = driver.find_elements_by_tag_name("table")[3]
    
    try:
        if business_info.find_elements_by_tag_name("tr")[3].find_element_by_tag_name("td").text.str.contains("Owner/Officer"):
            business_owner = business_info.find_elements_by_tag_name("tr")[2].find_element_by_tag_name("td").text + business_info.find_elements_by_tag_name("tr")[3].find_element_by_tag_name("td").text
            business_owner_clean = business_owner.replace("Owner/Officer:   ", "")
               
            #business_owner_two = business_info.find_elements_by_tag_name("tr")[3].find_element_by_tag_name("td").text
            #business_owner_two_clean = business_owner_two.replace("Owner/Officer:   ", "")
                
            business_phone = business_info.find_elements_by_tag_name("tr")[4].find_element_by_tag_name("td").text
            business_phone_clean = business_phone.replace("Phone:   ", "")
    
    except:
        business_owner = business_info.find_elements_by_tag_name("tr")[2].find_element_by_tag_name("td").text
        business_owner_clean = business_owner.replace("Owner/Officer:   ", "")
            
        business_phone = business_info.find_elements_by_tag_name("tr")[3].find_element_by_tag_name("td").text
        business_phone_clean = business_phone.replace("Phone:   ", "")
               
    business_status = driver.find_elements_by_tag_name("table")[4].find_elements_by_tag_name('tr')
    status_check = business_status[0].find_elements_by_tag_name('td')[1].text
    status_check_clean = status_check.replace("Status:  ", "")
                         
    business_address = business_status[1].find_elements_by_tag_name('td')[1]
    business_address_clean = str(business_address.text.split('\n')[-2:])

    return pd.Series({
      "Owner": business_owner_clean,
      "Phone": business_phone_clean,
      "Status": status_check_clean,
      "address": business_address_clean
    })


In [28]:
entire_df['url'].apply(get_tdlr_info).join(df)

Unnamed: 0,Owner,Phone,Status,address,TDLR Number,url
0,AUGUSTUS EUGENE SMITH / OWNER,9032276464,Active,"['103 N MAIN ST', 'BONHAM, TX. 75418']",006507931C,https://www.tdlr.texas.gov/tools_search/mccs_display.asp?mcrnumber=006507931C
1,BRANDT SMITH / OWNER,8173330706,Expired,"['13619 BRETT JACKSON RD.', 'FORT WORTH, TX. 76179']",006179570C,https://www.tdlr.texas.gov/tools_search/mccs_display.asp?mcrnumber=006179570C
2,BARRY MICHAEL SMITH / OWNER,8066544404,Active,"['4501 W CEMETERY RD', 'CANYON, TX. 79015']",006502097C,https://www.tdlr.texas.gov/tools_search/mccs_display.asp?mcrnumber=006502097C
3,HEATH A SMITH / OWNER,940-552-0687,Expired,"['1529 WILBARGER ST', 'VERNON, TX. 76384']",,
4,HEATH A SMITH / OWNER,9405520687,Expired,"['1529 WILBARGER ST', 'VERNON, TX. 76384']",,
5,WILLIAM THOMAS HYSMITH / PRESIDENT,Owner/Officer: ASHLEY ERIN HYSMITH / TREASURER,Active,"['1210 US 380 BYPASS', 'GRAHAM, TX. 76450']",,
6,HYSMITH ERIN ASHLEY / TREASURER,Owner/Officer: WILLIAM THOMAS HYSMITH / PRESIDENT,Suspended,"['927 LOVING HWY', 'GRAHAM, TX. 76450']",,
7,WILLIAM THOMAS HYSMITH / PRESIDENT,Owner/Officer: ASHLEY ERIN HYSMITH / TREASURER,Active,"['1210 380 BYPASS', 'GRAHAM, TX. 76450']",,
8,JEFF SMITH / PARTNER,Owner/Officer: WENDY SMITH / PARTNER,Suspended,"['10842 FM 2138 N', 'JACKSONVILLE, TX. 75766']",,
9,JEFFREY JOHN SMITH / OWNER,8324354670,Active,"['4338 HARVEY RD', 'CROSBY, TX. 77532']",,
