# Using ML to Predict Illegal U.S. Southwest Cross-Border Activity

'''
My intent is to map out previous years' land-based border activity (since FY 2017) and use ML to predict where future cross-border activity would be most prevalent and at what times of the year. My overall goal is to map out trends in illegal border activity to assist with coordinating U.S. Customs and Border Protection response planning.

Data sets for each year:

U.S. Border Patrol Southwest Border Apprehensions by Sector | U.S. Customs and Border Protection (cbp.gov)
https://www.cbp.gov/newsroom/stats/southwest-land-border-encounters/usbp-sw-border-apprehensions
- **FY 2021 (October 1, 2020 - March 31, 2021)**
 
U.S. Border Patrol Southwest Border Apprehensions by Sector Fiscal Year 2020 | U.S. Customs and Border Protection (cbp.gov)
https://www.cbp.gov/newsroom/stats/sw-border-migration/usbp-sw-border-apprehensions-fy2020
- **FY 2020 (October 1, 2019 - September 30, 2020)**
 
U.S. Border Patrol Southwest Border Apprehensions by Sector Fiscal Year 2019 | U.S. Customs and Border Protection (cbp.gov)
https://www.cbp.gov/newsroom/stats/sw-border-migration/usbp-sw-border-apprehensions-fy2019
- **FY 2019 (October 1, 2018 - September 30, 2019)**
 
U.S. Border Patrol Southwest Border Apprehensions by Sector FY2018 | U.S. Customs and Border Protection (cbp.gov)
https://www.cbp.gov/newsroom/stats/usbp-sw-border-apprehensions
- **FY 2018 (October 1, 2017 - September 30, 2018)**
 
U.S. Border Patrol Southwest Border Apprehensions by Sector FY2017 | U.S. Customs and Border Protection (cbp.gov)
https://www.cbp.gov/newsroom/stats/usbp-sw-border-apprehensions-fy2017
- **FY 2017 (October 1, 2016 - September 30, 2017)**
 
As you begin EDA and review the data, look for elements that you can 'predict' (e.g., months when crossings occur, days when crossings occur, etc.). Look for correlations as well.
'''

"This predictor seems to be highly associated with this outcome..."

USBP and OFO official year end reporting for FY20; USBP and OFO end of month reporting for FY21TD. Data is current as of 7/6/21.

## sklearn
1. Import
2. Instantiate
3. Fit
4. Predict

In [1]:
# Import libraries to interact with html, tables, and plotting data

import pandas as pd
import numpy as np # used for linear algebra and random sampling
import seaborn as sns
from matplotlib import pyplot as plt
from bs4 import BeautifulSoup as bs
from html.parser import HTMLParser
import glob
import requests
import urllib.request
from urllib.request import Request, urlopen
import requests
import lxml
import html5lib
import webbrowser
import joblib
import random

# sklearn libraries for ML
# You need to re-import scikit-learn algorithms after patch_sklearn()
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
# from sklearn.datasets import load_boston
from sklearn.dummy import DummyRegressor
from sklearn import model_selection
from sklearn.model_selection import cross_val_score

# used for plotting charts within the notebook (instead of a separate window)
# Allow plots to appear in the notebook.
%matplotlib inline

# Use Pickle to save data models in their current state so a ML model retrain is not necessary
import pickle

# Report the versions of the key libraries, one "<name> v<version>" line each.
print(
    "\n".join(
        f"{label} v{module.__version__}"
        for label, module in (
            ("Pandas", pd),
            ("Numpy", np),
            ("joblib", joblib),
            ("sklearn", sklearn),
        )
    )
)

Pandas v1.3.4
Numpy v1.21.4
joblib v1.1.0
sklearn v1.0.1


In [2]:
# CBP pages holding the U.S. Border Patrol southwest border apprehension
# tables, one URL per fiscal year (FY 2017 through FY 2021).
data_FY2021_url = "https://www.cbp.gov/newsroom/stats/southwest-land-border-encounters/usbp-sw-border-apprehensions"
data_FY2020_url = "https://www.cbp.gov/newsroom/stats/sw-border-migration/usbp-sw-border-apprehensions-fy2020"
data_FY2019_url = "https://www.cbp.gov/newsroom/stats/sw-border-migration/usbp-sw-border-apprehensions-fy2019"
data_FY2018_url = "https://www.cbp.gov/newsroom/stats/usbp-sw-border-apprehensions"
data_FY2017_url = "https://www.cbp.gov/newsroom/stats/usbp-sw-border-apprehensions-fy2017"
# Non-CBP pages kept as alternative inputs for exercising the scraper.
test_url1 = "https://www.tutorialspoint.com/python_web_scraping/python_web_scraping_testing_with_scrapers.htm"
test_url2 = "https://webscraper.io/test-sites"
test_url3 = "https://towardsdatascience.com/web-scraping-scraping-table-data-1665b6b2271c"
# Grouped so the test pages can be swapped in for the CBP pages in one assignment.
test_urls = [test_url1, test_url2, test_url3]

In [3]:
# Pages to scrape, newest fiscal year first.
websites = [
    data_FY2021_url,
    data_FY2020_url,
    data_FY2019_url,
    data_FY2018_url,
    data_FY2017_url,
]
# Uncomment to run the scraper against the test pages instead.
# websites = test_urls

In [4]:
# Module-level accumulators filled by website_tables_scrape():
# the tables parsed from each page, and the parsed HTML of each page.
webpage_tables, webpages = [], []

def website_tables_scrape(websites):
    """Download each URL and collect its parsed HTML and any tables it contains.

    Each URL is requested with a randomly chosen User-Agent header. A 403
    response is retried once with a different User-Agent. For every page that
    ends up with a 200 response, the BeautifulSoup parse of the page is
    appended to the module-level ``webpages`` list, and the list of DataFrames
    returned by ``pd.read_html`` is appended to the module-level
    ``webpage_tables`` list. Pages without tables still contribute to
    ``webpages`` but not to ``webpage_tables``.

    Failures (network errors, non-200 responses, pages with no tables) are
    reported with ``print`` and the URL is skipped; they do not stop the run.

    Args:
        websites: Iterable of URL strings to scrape.

    Returns:
        None. Results are accumulated in ``webpages`` and ``webpage_tables``.
    """
    # Local import so the block stays self-contained within the notebook.
    from io import StringIO

    user_agent_list = [
        'Mozilla/5.0',  # firefox
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',  # safari
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',  # firefox
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',  # chrome
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',  # firefox
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',  # chrome
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36',  # chrome
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36',  # chrome
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0',  # firefox
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36',  # chrome
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36',  # chrome
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',  # chrome
    ]
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    request_timeout = 30  # seconds

    for url in websites:
        user_agent = random.choice(user_agent_list)

        try:
            resp = requests.get(
                url, headers={'User-Agent': user_agent}, timeout=request_timeout
            )

            if resp.status_code == 403:
                print(f'ERROR: Status code: {resp.status_code} - access forbidden to {url}.\n'
                      'Trying again with a different url agent string.\n')
                # Exclude the agent that was just rejected so the retry
                # really does use a different one.
                user_agent = random.choice(
                    [agent for agent in user_agent_list if agent != user_agent]
                )
                resp = requests.get(
                    url, headers={'User-Agent': user_agent}, timeout=request_timeout
                )
        except requests.RequestException as err:
            # Connection errors and timeouts should not abort the other URLs.
            print(f'ERROR: Request to {url} failed: {err}\n')
            continue

        # Covers both the first attempt and the 403 retry: only a 200
        # response is parsed, so error pages are never stored.
        if resp.status_code != 200:
            print(f'ERROR: Status code {resp.status_code} on website {url}\n')
            continue

        print(f'Status code: {resp.status_code} - {url} was successfully processed\n')

        webpage = resp.text
        # Keep the parsed HTML even if the page turns out to have no tables.
        webpages.append(bs(webpage, "html.parser"))

        try:
            # Wrapped in StringIO because newer pandas releases deprecate
            # passing a literal HTML string to read_html.
            tables = pd.read_html(StringIO(webpage))
        except ValueError:
            # read_html raises ValueError when the page contains no tables.
            print(f"ERROR: No tables found on {url}.")
            continue

        # One entry per page: the list of DataFrames found on it.
        webpage_tables.append(tables)

# Scrape every configured page, then dump both result lists, each one
# preceded by a gap of blank lines to separate it from the status messages.
website_tables_scrape(websites)
print("\n" * 5, webpage_tables, sep="\n")
print("\n" * 5, webpages, sep="\n")

Status code: 200 - https://www.cbp.gov/newsroom/stats/southwest-land-border-encounters/usbp-sw-border-apprehensions was successfully processed

Status code: 200 - https://www.cbp.gov/newsroom/stats/sw-border-migration/usbp-sw-border-apprehensions-fy2020 was successfully processed

Status code: 200 - https://www.cbp.gov/newsroom/stats/sw-border-migration/usbp-sw-border-apprehensions-fy2019 was successfully processed

Status code: 200 - https://www.cbp.gov/newsroom/stats/usbp-sw-border-apprehensions was successfully processed

Status code: 200 - https://www.cbp.gov/newsroom/stats/usbp-sw-border-apprehensions-fy2017 was successfully processed







[[  Unaccompanied Children Encounters by Sector                          \
                                       Sector FY20 TD MAR FY21 TD MAR   
0                                    Big Bend         254         845   
1                                     Del Rio        1166        3431   
2                                   El Centro      

## Code Notes

In [5]:
# ### User Agent Strings

# headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"}
# #Lets test what headers are sent by sending a request to HTTPBin
# r = requests.get('http://httpbin.org/headers',headers=headers)
# print(r.json())


# user_agent_list = [
# 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
# 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
# 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
# ]

# url = 'https://httpbin.org/headers'
# for i in range(1,4):
#     #Pick a random user agent
#     user_agent = random.choice(user_agent_list)
#     #Set the headers 
#     headers = {'User-Agent': user_agent}
#     #Make the request
#     response = requests.get(url,headers=headers)
#     print("Request #%d\nUser-Agent Sent:%s\n\nHeaders Recevied by HTTPBin:"%(i,user_agent))
#     print(response.json())
#     print("-------------------")

In [6]:
# webpage_tables = []
# webpages = []

# def website_tables_scrape(websites):
#      user_agent_list = [
#         ('Mozilla/5.0'),  # firefox
#         ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15'),  # safari
#         ('Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'),  # firefox
#         ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'),  # chrome
#         ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0'),  # firefox
#         ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'),  # chrome
#         ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36'),  # chrome
#         ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36'),  # chrome
#         ('Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0'),  # firefox
#         ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36'),  # chrome
#         ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36'),  # chrome
#         ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'),  # chrome
#         ]

#      for url in websites:
#         #Pick a random user agent
#         user_agent = random.choice(user_agent_list)
#         #Set the headers 
#         headers = {'User-Agent': user_agent}
#         # Retrieve the html from the url
#         resp = requests.get(url, headers=headers)

#         if resp.status_code == 200:
#             print(f'Status code: {resp.status_code} - {url} was successfully processed\n')
            
#             ## Using this process will provide the same html details as resp.text
#             # req = Request(url, headers=headers)
#             # webpage = urlopen(req).read()
#             webpage = resp.text
#             # Use soup to parse the html response
#             soup = bs(webpage,"html.parser")

#             try:
#                 table = pd.read_html(webpage)
#                 # table = pd.read_html(webpage, attrs={'id': 'table'})
#                 # table = pd.read_html(webpage, attrs='table')
#                 # Save all tables into a list dataframe
#                 # tables = soup.find_all("table")  
#                 webpage_tables.append(table)
#             except:
#                 # raise ValueError("No tables found")
#                 print(f"ERROR: No tables found on {url}.")
#                 continue

#             # try:
#             #     webpage_tables.append(table)
#             # except:
#             #     # raise ValueError("No tables found")
#             #     print(f"ERROR: No tables found on {url}.")
#             #     continue

#             # if table:
#             #     webpage_tables.append(table)
#             # elif not table:
#             #     # raise ValueError("No tables found")
#             #     print(f"ERROR: No tables found on {url}.")
#             #     continue

#             webpages.append(soup)

#         elif resp.status_code == 403:
#             print(f'ERROR: Status code: {resp.status_code} - access forbidden to {url}.\n'
#             'Trying again with a different url agent string.\n')
            
#             #Pick a random user agent
#             user_agent = random.choice(user_agent_list)
#             #Set the headers 
#             headers = {'User-Agent': user_agent}
#             # Retrieve the html from the url
#             resp = requests.get(url, headers=headers)
#             webpage = resp.text

#             try:
#                 table = pd.read_html(webpage)
#                 # table = pd.read_html(webpage, attrs={'id': 'table'})
#                 # table = pd.read_html(webpage, attrs='table')
#                 # Save all tables into a list dataframe
#                 # tables = soup.find_all("table")  
#                 webpage_tables.append(table)
#             except:
#                 # raise ValueError("No tables found")
#                 print(f"ERROR: No tables found on {url}.")
#                 continue

#             # try:
#             #     webpage_tables.append(table)
#             # except:
#             #     # raise ValueError("No tables found")
#             #     print(f"ERROR: No tables found on {url}.")
#             #     continue

#             # if table:
#             #     webpage_tables.append(table)
#             # elif not table:
#             #     # raise ValueError("No tables found")
#             #     print(f"ERROR: No tables found on {url}.")
#             #     continue
            
#             webpages.append(soup)

#         else:
#             print(f'ERROR: Status code {resp.status_code} on website {url}\n')
#             continue

# website_tables_scrape(websites)
# print(webpage_tables)
# print(webpages)

In [7]:
# webpage_tables = []
# webpages = []

# def website_tables_scrape(websites):
#      user_agent_list = [
#         ('Mozilla/5.0'),  # firefox
#         ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15'),  # safari
#         ('Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0'),  # firefox
#         ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'),  # chrome
#         ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0'),  # firefox
#         ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'),  # chrome
#         ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36'),  # chrome
#         ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36'),  # chrome
#         ('Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0'),  # firefox
#         ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.91 Safari/537.36'),  # chrome
#         ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36'),  # chrome
#         ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'),  # chrome
#         ]

#      for url in websites:
#         #Pick a random user agent
#         user_agent = random.choice(user_agent_list)
#         #Set the headers 
#         headers = {'User-Agent': user_agent}
#         # Retrieve the html from the url
#         resp = requests.get(url, headers=headers)

#         if resp.status_code == 200:
#             print(f'Status code: {resp.status_code} - {url} was successfully processed\n')
            
#             ## Using this process will provide the same html details as resp.text
#             # req = Request(url, headers=headers)
#             # webpage = urlopen(req).read()
#             webpage = resp.text
#             # Use soup to parse the html response
#             soup = bs(webpage,"html.parser")
#             # Save all tables into a list dataframe
#             table = pd.read_html(webpage)
#             # tables = soup.find_all("table")
            

#             if not table:
#                 print("Test Test Test")
#             else:
#                 webpage_tables.append(table)

            
#             webpages.append(soup)

#         elif resp.status_code == 403:
#             print(f'ERROR: Status code: {resp.status_code} - access forbidden to {url}.\n'
#             'Trying again with a different url agent string.\n')
            
#             #Pick a random user agent
#             user_agent = random.choice(user_agent_list)
#             #Set the headers 
#             headers = {'User-Agent': user_agent}
#             # Retrieve the html from the url
#             resp = requests.get(url, headers=headers)
#             webpage = resp.text
#             ## Using this process will provide the same html details as resp.text
#             # req = Request(url, headers=headers)
#             # webpage = urlopen(req).read()

#             # tables = soup.find_all("table")
#             table = pd.read_html(webpage)
#             if table:
#                 webpage_tables.append(table)
#             else:
#                 continue


#             # Use soup to parse the html response
#             soup = bs(webpage,"html.parser")
#             # Save all tables into a list dataframe
#             try:
#                 # tables = p.parse_tables()
#                 table = pd.read_html(webpage)
#                 webpage_tables.append(table)
#             except: # ValueError as caught:
#                 continue
            
#             webpages.append(soup)

#         else:
#             print(f'ERROR: Status code {resp.status_code} on website {url}\n')
#             continue

# website_tables_scrape(websites)
# print(webpage_tables)
# print(webpages)

In [8]:
# # https://pypi.org/project/scikit-learn-intelex/
# # Intel® Extension for Scikit-learn* offers you a way to 
# # accelerate existing scikit-learn code. The acceleration 
# # is achieved through patching: replacing the stock 
# # scikit-learn algorithms with their optimized versions 
# # provided by the extension.
# ## Intel CPU optimizations patching
# from sklearnex import patch_sklearn, config_context
# ## Intel GPU optimizations patching
# import daal4py
# from daal4py import *
# from daal4py._daal4py import *
# from daal4py.oneapi import sycl_context
# import daal4py.sklearn
# from daal4py.sklearn.neighbors import KNeighborsClassifier
# from daal4py.sklearn.neighbors import KNeighborsRegressor
# from daal4py.sklearn.neighbors import NearestNeighbors
# from daal4py.sklearn.tree import DecisionTreeClassifier
# from daal4py.sklearn.ensemble import RandomForestClassifier
# from daal4py.sklearn.ensemble import RandomForestRegressor
# from daal4py.sklearn.ensemble import AdaBoostClassifier
# from daal4py.sklearn.cluster import KMeans
# from daal4py.sklearn.cluster import DBSCAN
# from daal4py.sklearn.decomposition import PCA
# from daal4py.sklearn.linear_model import Ridge
# from daal4py.sklearn.svm import SVC
# from daal4py.sklearn.linear_model import logistic_regression_path
# from daal4py.sklearn.linear_model import LogisticRegression
# from daal4py.sklearn.linear_model import ElasticNet
# from daal4py.sklearn.linear_model import Lasso
# from daal4py.sklearn.model_selection import _daal_train_test_split
# from daal4py.sklearn.metrics import _daal_roc_auc_score
# patch_sklearn()
# daal4py.sklearn.patch_sklearn()

# patch_sklearn()

# !pip install requests
# !pip install requests-html
# !pip install lxml
# !pip install html5lib
# !pip install tensorflow
# import tensorflow as tf
# print(f'TensorFlow v{tf.__version__}')

In [9]:
### This code works! But, it's not the cleanest way.

# tables_list = []
# html_raw_data = []
# def website_tables(websites):
#     for website in websites:
#         # resp = requests.get(website)

#         if resp.status_code == 200:
#             print(f'Status code: {resp.status_code} - {website} was successfully processed\n')
#             req = Request(website, headers={'User-Agent': 'Mozilla/5.0'})
#             webpage = urlopen(req).read()
#             table = pd.read_html(webpage)
#             # We need a parser, Python built-in HTML parser will work
#             soup = bs(resp.text,"html.parser")
            
#             tables_list.append(table)
#             html_raw_data.append(soup)

#         else:
#             print(f'ERROR: Status code {resp.status_code} on website {website}\n')
#             continue

# website_tables(websites)

# for table in tables_list:
#     print(tables_list)
#     print("\n\n\n")
# # print(tables_list)
# # print("\n\n\n")
# # print(html_raw_data[4])

In [10]:
# def websites():
#     # The websites we want to scrape
#     # Need to put these into a dictionary/list to call later, not sure which one just yet
#     data_FY2021_url = "https://www.cbp.gov/newsroom/stats/southwest-land-border-encounters/usbp-sw-border-apprehensions"
#     data_FY2020_url = "https://www.cbp.gov/newsroom/stats/sw-border-migration/usbp-sw-border-apprehensions-fy2020"
#     data_FY2019_url = "https://www.cbp.gov/newsroom/stats/sw-border-migration/usbp-sw-border-apprehensions-fy2019"
#     data_FY2018_url = "https://www.cbp.gov/newsroom/stats/usbp-sw-border-apprehensions"
#     data_FY2017_url = "https://www.cbp.gov/newsroom/stats/usbp-sw-border-apprehensions-fy2017"
    
#     # Open with GET method
#     resp = requests.get(data_FY2021_url)
    
#     # http_response 200 means OK status
#     if resp.status_code == 200:
#         print("Successfully opened webpage")
#         print("Here is your webpage:-\n")
        
#         # We need a parser, Python built-in HTML parser will work
#         soup = BeautifulSoup(resp.text,'html.parser')
        
#     else:
#         print(f"Error, your response code was {resp.status_code}")
# websites()

In [11]:
#         # text_list is the list that contains all the text
#         text_list = soup.find("ul",{"class":"searchNews"})
#         print(text_list)
        
#         # Now we want to print only the text part of the anchor.
#         # Find all the elements of a, i.e. anchor
# #         for i in text_list.findAll("dt"):
# #             print(i.text)

In [12]:
# soup = BeautifulSoup(resp.text,'html.parser')
# print(soup.head.title)
# print(soup.body.a.text)
# print(soup.body.p.b)     # returns <b>Body's title</b>
# print(soup.body.div)
# print(soup.body.thead)
# print(soup.body.tbody)

# resp = requests.get("https://www.cbp.gov/newsroom/stats/southwest-land-border-encounters/usbp-sw-border-apprehensions")

# soup = BeautifulSoup(resp.text,'html.parser')
# print(resp)
# print(soup)
# soup.body
# resp.status_code
# requests.status_codes
# resp_code  = resp.status_code(data_FY2021_url)
# resp_code

# table = soup.find_all('table')
# df = pd.read_html(str(table))

In [13]:
# URLs from the U.S. Customs and Border Protection website with data tables 

# def websites():
#     # The websites we want to scrape
#     # Need to put these into a dictionary/list to call later, not sure which one just yet
#     data_FY2021_url = "https://www.cbp.gov/newsroom/stats/southwest-land-border-encounters/usbp-sw-border-apprehensions"
#     data_FY2020_url = "https://www.cbp.gov/newsroom/stats/sw-border-migration/usbp-sw-border-apprehensions-fy2020"
#     data_FY2019_url = "https://www.cbp.gov/newsroom/stats/sw-border-migration/usbp-sw-border-apprehensions-fy2019"
#     data_FY2018_url = "https://www.cbp.gov/newsroom/stats/usbp-sw-border-apprehensions"
#     data_FY2017_url = "https://www.cbp.gov/newsroom/stats/usbp-sw-border-apprehensions-fy2017"
    
#     # Open with GET method
#     resp = requests.get(data_FY2021_url)
    
#     # http_response 200 means OK status
#     if resp.status_code == 200:
#         print("Successfully opened webpage")
#         print("Here is your webpage:-\n")
        
#         # We need a parser, Python built-in HTML parser will work
#         soup = BeautifulSoup(resp.text,'html.parser')
        
#         # text_list is the list that contains all the text
#         text_list = soup.find("ul",{"class":"searchNews"})
        
#         # Now we want to print only the text part of the anchor.
#         # Find all the elements of a, i.e. anchor
#         for i in text_list.findAll("a"):
#             print(i.text)
#     else:
#         print("Error")
# websites()

# # Import html file using BeautifulSoup
# with open(data_FY2021_url) as f:
#     # read file
#     content = f.read()
#     # parse html
#     soup = soup(content, 'html.parser')
#     # print Title tag
#     print(soup.title)

# HTMLParser.feed(' ',data_FY2021_url)

# # urllib.request.urlopen(url).read()

# webbrowser.open(data_FY2021_url)

# res = requests.get(data_FY2021_url)

# type(res)
# res.status_code == requests.codes.ok
# len(res.text)
# print(res.text[:250])

# res.raise_for_status()
# noStarchSoup = soup(res.text)
# type(noStarchSoup)


# soup_file = open(data_FY2021_url)
# soup_pull = soup(data_FY2021_url.read())

In [14]:
# # import requests module
# import requests
  
# # Making a get request
# response = requests.get('https://api.github.com/')
  
# # print response
# print(response)
  
# # print check if an error has occurred
# print(response.raise_for_status())
  
# # ping an incorrect url
# response = requests.get('https://geeksforgeeks.org/naveen/')
  
# # print check if an error has occurred
# print(response.raise_for_status())

In [15]:
# def websites_data(URL):
#     # The websites we want to scrape
#     # Need to put these into a dictionary/list to call later, not sure which one just yet
#     data_FY2021_url = "https://www.cbp.gov/newsroom/stats/southwest-land-border-encounters/usbp-sw-border-apprehensions"
#     data_FY2020_url = "https://www.cbp.gov/newsroom/stats/sw-border-migration/usbp-sw-border-apprehensions-fy2020"
#     data_FY2019_url = "https://www.cbp.gov/newsroom/stats/sw-border-migration/usbp-sw-border-apprehensions-fy2019"
#     data_FY2018_url = "https://www.cbp.gov/newsroom/stats/usbp-sw-border-apprehensions"
#     data_FY2017_url = "https://www.cbp.gov/newsroom/stats/usbp-sw-border-apprehensions-fy2017"
    
#     if resp.status_code == 200:
#         print("Successfully opened webpage")
#         print("Here is your webpage:-\n")
#         # Open with GET method
#         resp = requests.get(URL)

In [16]:
# range(len(websites))