Python web scraping to fetch company data from [ambitionbox.com/list-of-companies](https://www.ambitionbox.com/list-of-companies?page=1) using the BeautifulSoup library.

The web pages contain data about different companies.
    

    Columns which are required for analysis:
        -name
        -rating
        -review
        -type
        -location
        -age
        -employee_count
        -description
    
We need to convert this data into a pandas DataFrame.

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Fetch the first listing page so its HTML can be inspected before scraping everything.
url = 'https://www.ambitionbox.com/list-of-companies?page=1'
# Browser-like user-agent header sent with the request
# (presumably the site rejects the default `requests` user-agent -- TODO confirm).
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'}
# A timeout makes a stalled connection raise instead of hanging the kernel forever.
req = requests.get(url=url, headers=headers, timeout=30)
html = req.text

In [3]:
# Convert the raw HTML into a BeautifulSoup object for parsing.
# The parser is named explicitly so bs4 does not guess one (which emits a
# GuessedAtParserWarning and can yield different trees on different machines).
soup = BeautifulSoup(html, 'html.parser')

# Print the parsed document, indented, to inspect its structure.
# NOTE(review): if this shows a short meta-refresh page with a `bm-verify` URL
# instead of company listings, the site likely served a bot-verification
# interstitial and the scraping selectors below will match nothing.
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>
  <meta content="5; URL='/list-of-companies?page=1&amp;bm-verify=AAQAAAAH_____7QChgEqMsM4-mxyub-ysKWbRALxtqx_Qq388Qo24N_R7Zs6aq6w0O8RJoIDjsQredfy3pAuAcLIBv7c82AeXkYz5ol6lXa8jXWC7aG6NJsXWGGCv5BioD6osDMv1CbvfbWqcEBRYWajHCwppNUqs0SbJcKNxDp5y8TDy7Nz0qUYHCJReGjm5AXwzkFFKQ9d_3SqQ-vKA3GoqW3wB_xmXD7bZjpoUBuPt4H7Z0KP065ImkLGJ5wEGfH6z087wsCLDvHYH3BQyGtQYxZsmhleNWLxdw'" http-equiv="refresh">
   <title>
   </title>
   <script>
    var i = 1685560921;
        var j = i + Number("8436" + "47882");
   </script>
  </meta>
 </head>
 <noscript>
  <iframe src="" style="border: none; height: 100%; width: 100%;">
  </iframe>
 </noscript>
 <script>
  var xhr = new XMLHttpRequest();
          xhr.withCredentials = true;
          xhr.addEventListener("loadend", function() {
              try {
                  var data = JSON.parse(xhr.responseText);
            

In [None]:
# Optional sanity check (disabled): text of the first <h1> tag on the fetched page.
# soup.findAll('h1')[0].text

In [5]:
# Scrape every listing page and collect one record (dict) per company.

def _tag_text(tag, default=''):
    """Return the stripped text of a BeautifulSoup tag, or `default` if the tag is None."""
    return tag.text.strip() if tag is not None else default


def _info_text(info, index, default=''):
    """Return the stripped text of `info[index]`, or `default` if that entry does not exist."""
    return info[index].text.strip() if index < len(info) else default


company_data_list = []  # list of each company's data (one dict per company)
page_no = 1  # listing pages are numbered from 1

# The request headers never change, so build them once outside the loop.
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'}

while True:
    url = 'https://www.ambitionbox.com/list-of-companies?page={}'.format(page_no)
    # A timeout makes a stalled connection raise instead of hanging the kernel forever.
    req = requests.get(url=url, headers=headers, timeout=30)
    html = req.text
    # Explicit parser: avoids bs4 guessing one (GuessedAtParserWarning).
    soup = BeautifulSoup(html, 'html.parser')

    # An 'error-page' div marks a page past the end of the listing: stop here.
    if soup.find('div', 'error-page'):
        print('There is no data in further web-page.')
        print('Total web pages scraped = ' + str(page_no-1))
        print('Total no. of companies = ' + str(len(company_data_list)))
        break

    companies = soup.find_all('div', 'company-content-wrapper')  # one element per company on this page

    # Without this guard, a page with neither an error marker nor any company
    # blocks (e.g. a bot-verification interstitial or a changed layout) would
    # make the loop request pages forever.
    if not companies:
        print('No companies found on page {}; stopping.'.format(page_no))
        print('Total web pages scraped = ' + str(page_no-1))
        print('Total no. of companies = ' + str(len(company_data_list)))
        break

    for company in companies:
        # h2 tag holds the name, e.g. '\n\t\t\tTCS\n\t\t' -> 'TCS'
        name = _tag_text(company.find('h2'))

        # p.rating holds the rating (tabs/newlines stripped)
        rating = _tag_text(company.find('p', 'rating'))

        # a.review-count holds e.g. '(51.7k Reviews)' -> '51.7k'
        review_words = _tag_text(company.find('a', 'review-count')).strip('()').split()
        review = review_words[0] if review_words else ''

        # p.infoEntity entries, read by position: type, location, age, employee_count.
        # Entries missing at the end of the list fall back to a default.
        info = company.find_all('p', 'infoEntity')
        company_type = _info_text(info, 0)  # not named `type`, which would shadow the builtin
        location = _info_text(info, 1)
        age = _info_text(info, 2)  # kept as text, e.g. '55 years old'
        # NOTE(review): default is the int 0 while the other fields default to '';
        # kept as-is in case later cells rely on it.
        employee_count = _info_text(info, 3, default=0)

        # p.description holds the description
        description = _tag_text(company.find('p', 'description'))

        company_dict = {'name':     name,
                        'rating':   rating,
                        'review':   review,
                        'type':     company_type,
                        'location': location,
                        'age':      age,
                        'employee_count': employee_count,
                        'description': description}

        company_data_list.append(company_dict)

    page_no += 1

df = pd.DataFrame(company_data_list)
df.shape

In [None]:
# Summarize the scraped frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9959 entries, 0 to 9958
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   name            9959 non-null   object
 1   rating          9959 non-null   object
 2   review          9959 non-null   object
 3   type            9959 non-null   object
 4   location        9959 non-null   object
 5   age             9959 non-null   object
 6   employee_count  9959 non-null   object
 7   description     9959 non-null   object
dtypes: object(8)
memory usage: 622.6+ KB


In [None]:
# Count rows that are exact duplicates (across all columns) of an earlier row.
df.duplicated().sum()

1007