<a href="https://colab.research.google.com/github/leonasting/NLP-Devcon/blob/main/Data_Scraping_Page_link_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data Scraping Test Drive

### Importing Libraries

In [1]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup

### Functions to interact with link data extraction.

In [2]:
def get_url_content(url_temp):
  """
  takes the querry for url with page id and returns list 
  """
  try:
    page = requests.get(url_temp)
    soup = BeautifulSoup(page.text, 'html.parser')
    assert soup is not None,"unable to retrieve data."
    ls_a=soup.findAll("listing-titles-only")
    ls_url = [[ind_element["article-url"],ind_element["publish-date-ap"]] for ind_element in ls_a]
    return [ls_url,"Successful in retrieving:"+str(len(ls_url))+" urls"]
  except AssertionError as msg:
    return  [[],msg]
  except:
    return [[],"other error"]



In [3]:
def get_content(link_url,publish_date_ap,page_id=None):
  link_page = requests.get(link_url)
  soup = BeautifulSoup(link_page.text, 'html.parser')
  assert soup is not None,"unable to retrieve data from url."
  div_content=soup.find("div", {"class":"body"})
  assert div_content is not None,"missing body division"
  ls_p = div_content.findAll("p")
  assert ls_p is not None,"missing paragraphs in body division"
  link_url_art_id=link_url.split('/')[-2]
  #page=page_id
  counter=1
  cat=""
  ls_content = []
  for ind_p in ls_p:
    strong_element = ind_p.find("strong")
    if strong_element:
      cat=strong_element.text
    else:
      ls_content.append({"para_id":"art_"+link_url_art_id+"_para_"+str(counter),
                        "content":ind_p.text,
                        "category":cat,
                        "publish_date_ap":publish_date_ap
                          })
      counter+=1
  return ls_content 

  



In [None]:
#ls_url[0]
['http://www.defense.gov/News/Contracts/Contract/Article/3175072/',
 'Sept. 29, 2022']
 #link_data = requests.get(ls_url[0][0])

#publish_date_ap= ls_url[0][1]

['http://www.defense.gov/News/Contracts/Contract/Article/3175072/',
 'Sept. 29, 2022']

In [9]:
base_url="https://www.defense.gov/News/Contracts/?Page="
ls_main_content =[]
url_count=0
for i in range(1,30):
  #page_id=2*i
  page_id=i
  url_temp = base_url+str(page_id)
  ls_url, response = get_url_content(url_temp)  
  
  print("Sample link_url:",ls_url[0][0])
  print("Sample link_url publish date:",ls_url[0][1])
  if "Successful" in response:
    print("Page id:"+str(page_id)+" "+response)
  else:
    print("Page id:"+str(page_id)+" "+response)
    continue
  url_count+=len(ls_url)
  # link handling block
  for link_url, publish_date_ap in ls_url:
    try:
      ls_content = get_content(link_url,publish_date_ap,page_id)    
    except AssertionError as msg:
      print(link_url,"\n Response",msg)
      continue
    except:
      print(link_url,"\n Response")
      print("Failed with Unknown error while reteiving url link content.")
      continue
    if ls_content:
      ls_main_content.extend(ls_content)


Sample link_url: http://www.defense.gov/News/Contracts/Contract/Article/3196439/
Sample link_url publish date: Oct. 21, 2022
Page id:1 Successful in retrieving:10 urls
Sample link_url: http://www.defense.gov/News/Contracts/Contract/Article/3182145/
Sample link_url publish date: Oct. 6, 2022
Page id:2 Successful in retrieving:10 urls
Sample link_url: http://www.defense.gov/News/Contracts/Contract/Article/3168041/
Sample link_url publish date: Sept. 22, 2022
Page id:3 Successful in retrieving:10 urls
Sample link_url: http://www.defense.gov/News/Contracts/Contract/Article/3153056/
Sample link_url publish date: Sept. 8, 2022
Page id:4 Successful in retrieving:10 urls
Sample link_url: http://www.defense.gov/News/Contracts/Contract/Article/3138645/
Sample link_url publish date: Aug. 24, 2022
Page id:5 Successful in retrieving:10 urls
Sample link_url: http://www.defense.gov/News/Contracts/Contract/Article/3123651/
Sample link_url publish date: Aug. 10, 2022
Page id:6 Successful in retrieving:

In [10]:
url_count

290

## Understanding the Scraped Data

In [11]:
print("Total URL accessed:"+str(url_count))
print("Total Contracts accessed:"+str(len(ls_main_content)))


Total URL accessed:290
Total Contracts accessed:4542


In [12]:
df=pd.DataFrame(ls_main_content)

In [13]:
df.head()

Unnamed: 0,para_id,content,category,publish_date_ap
0,art_3196439_para_1,"Engineering Research and Consulting Inc., Hunt...",ARMY,"Oct. 21, 2022"
1,art_3196439_para_2,"Lockheed Martin Corp., Grand Prairie, Texas, w...",ARMY,"Oct. 21, 2022"
2,art_3196439_para_3,"Northrop Grumman Systems Corp., doing business...",NAVY,"Oct. 21, 2022"
3,art_3196439_para_4,"Lockheed Martin Aeronautics Co., Fort Worth, T...",NAVY,"Oct. 21, 2022"
4,art_3196439_para_5,"Lockheed Martin Corp., Fort Worth, Texas, is a...",NAVY,"Oct. 21, 2022"


In [14]:
df.iloc[0,1]

'Engineering Research and Consulting Inc., Huntsville, Alabama, was awarded a $640,512,703 cost-plus-fixed-fee contract to support planning, conducting, analyzing and reporting the results of developmental tests, production tests and other tests conducted by the U.S. Army Aberdeen Test Center. Bids were solicited via the internet with three received. Work locations and funding will be determined with each order, with an estimated completion date of Oct. 20, 2027. U.S. Army Contracting Command, Aberdeen Proving Ground, Maryland, is the contracting activity (W91CRB-23-D-0001).'

In [15]:
df.iloc[1,1]

'Lockheed Martin Corp., Grand Prairie, Texas, was awarded a $476,814,000 hybrid (cost-plus-fixed-fee and firm-fixed-price) contract for the Guided Multiple Launch Rocket System. Bids were solicited via the internet with one received. Work locations and funding will be determined with each order, with an estimated completion date of Aug. 15, 2025. U.S. Army Contracting Command, Redstone Arsenal, Alabama, is the contracting activity (W31P4Q-23-D-0003).'

In [16]:
set(df.iloc[:,2])

{'AIR FORCE',
 'AIR FORCE\xa0',
 'ARMY',
 'ARMY\xa0',
 'Air Force',
 'Army',
 'DEFENSE ADVANCED RESEARCH PROJECTS AGENCY',
 'DEFENSE COMMISSARY AGENCY',
 'DEFENSE COUNTERINTELLIGENCE AND SECURITY AGENCY',
 'DEFENSE COUNTERINTELLIGENCE SECURITY AGENCY',
 'DEFENSE FINANCE AND ACCOUNTING SERVICE',
 'DEFENSE HEALTH AGENCY',
 'DEFENSE HUMAN RESOURCES ACTIVITY',
 'DEFENSE INFORMATION SYSTEMS AGENCY',
 'DEFENSE INTELLIGENCE AGENCY',
 'DEFENSE LOGISTICS AGENCY',
 'DEFENSE LOGISTICS AGENCY\xa0',
 'DEFENSE THREAT REDUCTION AGENCY',
 'DEPARTMENT OF DEFENSE EDUCATION ACTIVITY',
 'Defense Health Agency',
 'Defense Logistics Agency',
 'MISSILE DEFENSE AGENCY',
 'MISSLE DEFENSE AGENCY',
 'NAVY',
 'Navy',
 'SPACE DEVELOPMENT AGENCY',
 'U.S. SPECIAL OPERATIONS COMMAND',
 'U.S. Special Operations Command',
 'U.S. TRANSPORTATION COMMAND',
 'UNIFORMED SERVICES UNIVERSITY OF THE HEALTH SCIENCES',
 'WASHINGTON HEADQUARTERS AGENCY',
 'WASHINGTON HEADQUARTERS SERVICE',
 'WASHINGTON HEADQUARTERS SERVICES'}

In [17]:
set(df.iloc[:,3])

{'April 1, 2022',
 'April 11, 2022',
 'April 12, 2022',
 'April 13, 2022',
 'April 14, 2022',
 'April 15, 2022',
 'April 18, 2022',
 'April 19, 2022',
 'April 20, 2022',
 'April 21, 2022',
 'April 22, 2022',
 'April 25, 2022',
 'April 26, 2022',
 'April 27, 2022',
 'April 28, 2022',
 'April 29, 2022',
 'April 4, 2022',
 'April 5, 2022',
 'April 6, 2022',
 'April 7, 2022',
 'April 8, 2022',
 'Aug. 1, 2022',
 'Aug. 10, 2022',
 'Aug. 11, 2022',
 'Aug. 12, 2022',
 'Aug. 15, 2022',
 'Aug. 16, 2022',
 'Aug. 17, 2022',
 'Aug. 18, 2022',
 'Aug. 19, 2022',
 'Aug. 2, 2022',
 'Aug. 22, 2022',
 'Aug. 23, 2022',
 'Aug. 24, 2022',
 'Aug. 25, 2021',
 'Aug. 25, 2022',
 'Aug. 26, 2021',
 'Aug. 26, 2022',
 'Aug. 27, 2021',
 'Aug. 29, 2022',
 'Aug. 3, 2022',
 'Aug. 30, 2021',
 'Aug. 30, 2022',
 'Aug. 31, 2021',
 'Aug. 31, 2022',
 'Aug. 4, 2022',
 'Aug. 5, 2022',
 'Aug. 8, 2022',
 'Aug. 9, 2022',
 'Dec. 1, 2021',
 'Dec. 10, 2021',
 'Dec. 13, 2021',
 'Dec. 14, 2021',
 'Dec. 15, 2021',
 'Dec. 16, 2021',
 'D