<a href="https://colab.research.google.com/github/leonasting/NLP-Devcon/blob/main/Data_Scraping_Page_link_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Scraping Script

This script is used for data scraping from the multiple pages of url patter https://www.defense.gov/News/Contracts/?Page=1 .
It exports the web scrapped data into two file formats JSON: data/data_main.json and CSV (data/data_url.csv)
Please run this script from root directory.

### Importing Libraries

In [1]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup

### Functions for  data extraction.

In [3]:
def get_url_content(url_temp):
  """
  takes the querry for url with page id and returns list of url 
  """
  try:
    page = requests.get(url_temp)
    soup = BeautifulSoup(page.text, 'html.parser')
    assert soup is not None,"unable to retrieve data."
    ls_a=soup.findAll("listing-titles-only")
    ls_url = [[ind_element["article-url"],ind_element["publish-date-ap"]] for ind_element in ls_a]
    return [ls_url,"Successful in retrieving:"+str(len(ls_url))+" urls"]
  except AssertionError as msg:
    return  [[],msg]
  except:
    return [[],"other error"]



In [6]:
def get_content(link_url,publish_date_ap,page_id=None):
    """
    content is extracted and whole extracted code is put into list of dictionaries
    returns list of extraxted content
    """
    
    link_page = requests.get(link_url)
    soup = BeautifulSoup(link_page.text, 'html.parser')
    assert soup is not None,"unable to retrieve data from url."
    div_content=soup.find("div", {"class":"body"})
    assert div_content is not None,"missing body division"
    ls_p = div_content.findAll("p")
    assert ls_p is not None,"missing paragraphs in body division"
    link_url_art_id=link_url.split('/')[-2]
    #page=page_id
    counter=1
    cat=""
    ls_content = []
    for ind_p in ls_p:
        strong_element = ind_p.find("strong")
        if strong_element:
          cat=strong_element.text
        else:
          ls_content.append({"para_id":"art_"+link_url_art_id+"_para_"+str(counter),
                            "content":ind_p.text,
                            "category":cat,
                            "publish_date_ap":publish_date_ap
                              })
          counter+=1
    return ls_content 

  



In [8]:
#ls_url[0]
['http://www.defense.gov/News/Contracts/Contract/Article/3175072/',
 'Sept. 29, 2022']
 #link_data = requests.get(ls_url[0][0])

#publish_date_ap= ls_url[0][1]

['http://www.defense.gov/News/Contracts/Contract/Article/3175072/',
 'Sept. 29, 2022']

## Running on 11/22/2022

In [10]:
base_url="https://www.defense.gov/News/Contracts/?Page="
ls_main_content =[]
url_count=0
for i in range(1,31):
  #page_id=2*i
  page_id=i
  url_temp = base_url+str(page_id)
  ls_url, response = get_url_content(url_temp)  
  
  print("Sample link_url:",ls_url[0][0])
  print("Sample link_url publish date:",ls_url[0][1])
  if "Successful" in response:
    print("Page id:"+str(page_id)+" "+response)
  else:
    print("Page id:"+str(page_id)+" "+response)
    continue
  url_count+=len(ls_url)
  # link handling block
  for link_url, publish_date_ap in ls_url:
    try:
      ls_content = get_content(link_url,publish_date_ap,page_id)    
    except AssertionError as msg:
      print(link_url,"\n Response",msg)
      continue
    except:
      print(link_url,"\n Response")
      print("Failed with Unknown error while reteiving url link content.")
      continue
    if ls_content:
      ls_main_content.extend(ls_content)


Sample link_url: http://www.defense.gov/News/Contracts/Contract/Article/3226621/
Sample link_url publish date: Nov. 22, 2022
Page id:1 Successful in retrieving:10 urls
Sample link_url: http://www.defense.gov/News/Contracts/Contract/Article/3211815/
Sample link_url publish date: Nov. 7, 2022
Page id:2 Successful in retrieving:10 urls
Sample link_url: http://www.defense.gov/News/Contracts/Contract/Article/3198058/
Sample link_url publish date: Oct. 24, 2022
Page id:3 Successful in retrieving:10 urls
Sample link_url: http://www.defense.gov/News/Contracts/Contract/Article/3183579/
Sample link_url publish date: Oct. 7, 2022
Page id:4 Successful in retrieving:10 urls
Sample link_url: http://www.defense.gov/News/Contracts/Contract/Article/3169379/
Sample link_url publish date: Sept. 23, 2022
Page id:5 Successful in retrieving:10 urls
Sample link_url: http://www.defense.gov/News/Contracts/Contract/Article/3154083/
Sample link_url publish date: Sept. 9, 2022
Page id:6 Successful in retrieving:1

In [13]:
print("Total Count of content Extracted from the main webpage",url_count)

Total Count of content Extracted from the main webpage 300


## Understanding the Scraped Data

In [14]:
print("Total URL accessed:"+str(url_count))
print("Total Contracts accessed:"+str(len(ls_main_content)))


Total URL accessed:300
Total Contracts accessed:4564


In [15]:
df=pd.DataFrame(ls_main_content)

## Viewing Data in Data Frame

In [16]:
df.head()

Unnamed: 0,para_id,content,category,publish_date_ap
0,art_3226621_para_1,"Quidel Corp., San Diego, California (SPE2DE-23...",DEFENSE LOGISTICS AGENCY,"Nov. 22, 2022"
1,art_3226621_para_2,"Change Healthcare Technologies LLC, Alpharetta...",DEFENSE LOGISTICS AGENCY,"Nov. 22, 2022"
2,art_3226621_para_3,"Creighton AB Inc., Reidsville, North Carolina,...",DEFENSE LOGISTICS AGENCY,"Nov. 22, 2022"
3,art_3226621_para_4,"IMT Defense Corporation,* Westerville, Ohio, w...",ARMY,"Nov. 22, 2022"
4,art_3226621_para_5,"SilverStar Consulting Inc.,* Falls Church, Vir...",ARMY,"Nov. 22, 2022"


Content Columns Data

In [18]:
df.iloc[0,1]

'Quidel Corp., San Diego, California (SPE2DE-23-D-0009); Access Bio Inc., Somerset, New Jersey (SPE2DE-23-D-0011); iHealth Labs, Sunnydale, California (SPE2DE-23-D-0012); and Orasure Technologies, Bethlehem, Pennsylvania (SPE2DE-23-D-0010), are sharing a maximum $803,000,000 firm-fixed-price, indefinite-delivery/indefinite-quantity contract under solicitation SPE2DE-22-R-0013 for the procurement of over-the-counter rapid antigen COVID-19 test kits. This was a competitive acquisition with 13 responses received. These are one-year contracts with no option periods. The performance completion date is Nov. 21, 2023. Using customer is Department of Health and Human Services. Type of appropriation is fiscal 2023 defense working capital funds, using funds that were allocated to the Department of Health and Human Services in September 2022. The contracting activity is the Defense Logistics Agency Troop Support, Philadelphia, Pennsylvania.'

In [19]:
df.iloc[1,1]

'Change Healthcare Technologies LLC, Alpharetta, Georgia, has been awarded a maximum $328,279,472 modification (P00033) exercising the five-year option period of a five-year base contract (SPE2D1-18-D-0006) with one five-year option period for digital imaging network-picture archive communication systems, components, training, maintenance service and incidental services. This is a fixed-price with economic-price-adjustment, indefinite-delivery/indefinite-quantity contract. The ordering period end date is Dec. 14, 2027. Using customers are Army, Navy, Air Force, Marine Corps and federal civilian agencies. Type of appropriation is fiscal 2023 through 2028 defense working capital funds. The contracting activity is the Defense Logistics Agency Troop Support, Philadelphia, Pennsylvania.'

In [20]:
set(df.iloc[:,2])

{'AIR FORCE',
 'AIR FORCE\xa0',
 'ARMY',
 'ARMY\xa0',
 'Air Force',
 'Army',
 'DEFENSE ADVANCED RESEARCH PROJECTS AGENCY',
 'DEFENSE COMMISSARY AGENCY',
 'DEFENSE COUNTERINTELLIGENCE AND SECURITY AGENCY',
 'DEFENSE COUNTERINTELLIGENCE SECURITY AGENCY',
 'DEFENSE FINANCE AND ACCOUNTING SERVICE',
 'DEFENSE HEALTH AGENCY',
 'DEFENSE HUMAN RESOURCES ACTIVITY',
 'DEFENSE INFORMATION SYSTEMS AGENCY',
 'DEFENSE INTELLIGENCE AGENCY',
 'DEFENSE LOGISTICS AGENCY',
 'DEFENSE LOGISTICS AGENCY\xa0',
 'DEFENSE THREAT REDUCTION AGENCY',
 'DEPARTMENT OF DEFENSE EDUCATION ACTIVITY',
 'Defense Health Agency',
 'Defense Logistics Agency',
 'MISSILE DEFENSE AGENCY',
 'MISSLE DEFENSE AGENCY',
 'NAVY',
 'Navy',
 'SPACE DEVELOPMENT AGENCY',
 'SPECIAL OPERATIONS COMMAND',
 'U.S. SPECIAL OPERATIONS COMMAND',
 'U.S. Special Operations Command',
 'U.S. TRANSPORTATION COMMAND',
 'UNIFORMED SERVICES UNIVERSITY OF THE HEALTH SCIENCES',
 'WASHINGTON HEADQUARTERS AGENCY',
 'WASHINGTON HEADQUARTERS SERVICE',
 'WASHING

In [21]:
set(df.iloc[:,3])

{'April 1, 2022',
 'April 11, 2022',
 'April 12, 2022',
 'April 13, 2022',
 'April 14, 2022',
 'April 15, 2022',
 'April 18, 2022',
 'April 19, 2022',
 'April 20, 2022',
 'April 21, 2022',
 'April 22, 2022',
 'April 25, 2022',
 'April 26, 2022',
 'April 27, 2022',
 'April 28, 2022',
 'April 29, 2022',
 'April 4, 2022',
 'April 5, 2022',
 'April 6, 2022',
 'April 7, 2022',
 'April 8, 2022',
 'Aug. 1, 2022',
 'Aug. 10, 2022',
 'Aug. 11, 2022',
 'Aug. 12, 2022',
 'Aug. 15, 2022',
 'Aug. 16, 2022',
 'Aug. 17, 2022',
 'Aug. 18, 2022',
 'Aug. 19, 2022',
 'Aug. 2, 2022',
 'Aug. 22, 2022',
 'Aug. 23, 2022',
 'Aug. 24, 2022',
 'Aug. 25, 2022',
 'Aug. 26, 2022',
 'Aug. 29, 2022',
 'Aug. 3, 2022',
 'Aug. 30, 2022',
 'Aug. 31, 2022',
 'Aug. 4, 2022',
 'Aug. 5, 2022',
 'Aug. 8, 2022',
 'Aug. 9, 2022',
 'Dec. 1, 2021',
 'Dec. 10, 2021',
 'Dec. 13, 2021',
 'Dec. 14, 2021',
 'Dec. 15, 2021',
 'Dec. 16, 2021',
 'Dec. 17, 2021',
 'Dec. 2, 2021',
 'Dec. 20, 2021',
 'Dec. 21, 2021',
 'Dec. 22, 2021',
 'De

## Saving the content as both JSON and csv format

In [24]:
import json 
      
# Data to be written 
dictionary ={"content":ls_main_content,
             "date":"11/22/22"
} 
      
# Serializing json  
with open("./data/data_main.json", "w") as outfile:
    json.dump(dictionary, outfile)

In [27]:
df.to_csv("./data/data_url.csv",index=False)