<a href="https://colab.research.google.com/github/leonasting/NLP-Devcon/blob/main/Data_Scraping_Page_link_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data Scraping Test Drive

### Importing Libraries

In [28]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup

### Functions to interact with link data extraction.

In [29]:
def get_url_content(url_temp):
  """
  takes the querry for url with page id and returns list 
  """
  try:
    page = requests.get(url_temp)
    soup = BeautifulSoup(page.text, 'html.parser')
    assert soup is not None,"unable to retrieve data."
    ls_a=soup.findAll("listing-titles-only")
    ls_url = [[ind_element["article-url"],ind_element["publish-date-ap"]] for ind_element in ls_a]
    return [ls_url,"Successful in retrieving:"+str(len(ls_url))+" urls"]
  except AssertionError as msg:
    return  [[],msg]
  except:
    return [[],"other error"]



In [30]:
def get_content(link_url,publish_date_ap,page_id):
  link_page = requests.get(link_url)
  soup = BeautifulSoup(link_page.text, 'html.parser')
  assert soup is not None,"unable to retrieve data from url."
  div_content=soup.find("div", {"class":"body"})
  assert div_content is not None,"missing body division"
  ls_p = div_content.findAll("p")
  assert ls_p is not None,"missing paragraphs in body division"
  
  page=page_id
  counter=1
  cat=""
  ls_content = []
  for ind_p in ls_p:
    strong_element = ind_p.find("strong")
    if strong_element:
      cat=strong_element.text
    else:
      ls_content.append({"para_id":"pg_"+str(page)+"_para_"+str(counter),
                        "content":ind_p.text,
                        "category":cat,
                        "publish_date_ap":publish_date_ap
                          })
      counter+=1
  return ls_content 

  



In [31]:
#ls_url[0]
['http://www.defense.gov/News/Contracts/Contract/Article/3175072/',
 'Sept. 29, 2022']
 #link_data = requests.get(ls_url[0][0])

#publish_date_ap= ls_url[0][1]

['http://www.defense.gov/News/Contracts/Contract/Article/3175072/',
 'Sept. 29, 2022']

In [32]:
base_url="https://www.defense.gov/News/Contracts/?Page="
ls_main_content =[]
url_count=0
for i in range(1,4):
  page_id=2*i
  url_temp = base_url+str(page_id)
  ls_url, response = get_url_content(url_temp)  
  
  print("Sample link_url:",ls_url[0][0])
  print("Sample link_url publish date:",ls_url[0][1])
  if "Successful" in response:
    print("Page id:"+str(page_id)+" "+response)
  else:
    print("Page id:"+str(page_id)+" "+response)
    continue
  url_count+=len(ls_url)
  # link handling block
  for link_url, publish_date_ap in ls_url:
    try:
      ls_content = get_content(link_url,publish_date_ap,page_id)    
    except AssertionError as msg:
      print(link_url,"\n Response",msg)
      continue
    except:
      print(link_url,"\n Response")
      print("Failed with Unknown error while reteiving url link content.")
      continue
    if ls_content:
      ls_main_content.extend(ls_content)


Sample link_url: http://www.defense.gov/News/Contracts/Contract/Article/3175072/
Sample link_url publish date: Sept. 29, 2022
Page id:2 Successful in retrieving:10 urls
Sample link_url: http://www.defense.gov/News/Contracts/Contract/Article/3146093/
Sample link_url publish date: Aug. 31, 2022
Page id:4 Successful in retrieving:10 urls
Sample link_url: http://www.defense.gov/News/Contracts/Contract/Article/3115725/
Sample link_url publish date: Aug. 3, 2022
Page id:6 Successful in retrieving:10 urls


In [33]:
url_count

30

## Understanding the Scraped Data

In [34]:
print("Total URL accessed:"+str(url_count))
print("Total Contracts accessed:"+str(len(ls_main_content)))


Total URL accessed:30
Total Contracts accessed:741


In [35]:
df=pd.DataFrame(ls_main_content)

In [36]:
df.head()

Unnamed: 0,para_id,content,category,publish_date_ap
0,pg_2_para_1,"DPR-RQ Construction, LLC, Carlsbad, California...",NAVY,"Sept. 29, 2022"
1,pg_2_para_2,"Lockheed Martin Corp., Fort Worth, Texas, is a...",NAVY,"Sept. 29, 2022"
2,pg_2_para_3,"Innovative Mechanical Contractors LLC, * Westm...",NAVY,"Sept. 29, 2022"
3,pg_2_para_4,"Raytheon Missiles & Defense, Tucson, Arizona, ...",NAVY,"Sept. 29, 2022"
4,pg_2_para_5,"Hornbeck Offshore Operators, LLC, of Covington...",NAVY,"Sept. 29, 2022"


In [37]:
df.iloc[0,1]

'DPR-RQ Construction, LLC, Carlsbad, California (N62473-22-D-1403); Gilbane Federal JV, Concord, California (N62473-22-D-1404); Hensel Phelps Construction Co., Irvine, California (N62473-22-D-1406); Kiewit Building Group Inc., Springfield, Virginia (N62473-22-D-1406); The Robins & Morton Group, Birmingham, Alabama (N62473-22-D-1407); StructSure Projects, Inc., Kansas City, Missouri (N62473-22-D-1408); and Walsh Construction Group, LLC, Chicago, Illinois (N62473-22-D-1409) are awarded a combined $1,000,000,000 firm-fixed-price, indefinite-delivery/indefinite-quantity contract for new construction and repair and renovation of medical treatment facility projects at various government installations located in California, Arizona, Nevada, Hawaii, Utah, Colorado, and New Mexico. The work to be performed provides for but is not limited to hospitals, ambulatory care centers, medical and dental clinics, and medical research laboratories. Each awardee will be awarded $5,000 (minimum contract gua

In [38]:
df.iloc[1,1]

'Lockheed Martin Corp., Fort Worth, Texas, is awarded a not-to-exceed $152,329,105 fixed-price-incentive-firm-target, cost-plus-fixed-fee undefinitized indefinite-delivery/indefinite-quantity contract. This contract provides for the production of the F-35 Logistics Information Systems to include Autonomic Logistics Information System and Operational Data Integrated Network (ODIN), and Mission Planning Environment (MPE) hardware, as well as associated contract management, planning and readiness reviews and non-recurring introduction to service activities necessary to field the F-35 ODIN, MPE, and components of any future ODIN and MPE retrofits for the F-35A, F-35B and F-35C aircraft. Work will be performed in Orlando, Florida (95%) and Fort Worth, Texas (5%), and is expected to be completed in December 2024. No funds will be obligated at the time of award; funds will be obligated on individual orders as they are issued. This contract was not competitively procured pursuant to 10 USC 230

In [39]:
set(df.iloc[:,2])

{'AIR FORCE',
 'AIR FORCE\xa0',
 'ARMY',
 'DEFENSE ADVANCED RESEARCH PROJECTS AGENCY',
 'DEFENSE COUNTERINTELLIGENCE AND SECURITY AGENCY',
 'DEFENSE HEALTH AGENCY',
 'DEFENSE HUMAN RESOURCES ACTIVITY',
 'DEFENSE INFORMATION SYSTEMS AGENCY',
 'DEFENSE LOGISTICS AGENCY',
 'DEFENSE LOGISTICS AGENCY\xa0',
 'DEFENSE THREAT REDUCTION AGENCY',
 'DEPARTMENT OF DEFENSE EDUCATION ACTIVITY',
 'MISSILE DEFENSE AGENCY',
 'NAVY',
 'U.S. SPECIAL OPERATIONS COMMAND',
 'U.S. TRANSPORTATION COMMAND',
 'WASHINGTON HEADQUARTERS SERVICE',
 'WASHINGTON HEADQUARTERS SERVICES'}

In [40]:
set(df.iloc[:,3])

{'Aug. 1, 2022',
 'Aug. 18, 2022',
 'Aug. 19, 2022',
 'Aug. 2, 2022',
 'Aug. 22, 2022',
 'Aug. 23, 2022',
 'Aug. 24, 2022',
 'Aug. 25, 2022',
 'Aug. 26, 2022',
 'Aug. 29, 2022',
 'Aug. 3, 2022',
 'Aug. 30, 2022',
 'Aug. 31, 2022',
 'July 21, 2022',
 'July 22, 2022',
 'July 25, 2022',
 'July 26, 2022',
 'July 27, 2022',
 'July 28, 2022',
 'July 29, 2022',
 'Sept. 16, 2022',
 'Sept. 19, 2022',
 'Sept. 20, 2022',
 'Sept. 21, 2022',
 'Sept. 22, 2022',
 'Sept. 23, 2022',
 'Sept. 26, 2022',
 'Sept. 27, 2022',
 'Sept. 28, 2022',
 'Sept. 29, 2022'}