In [45]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [46]:
# A plain scripted request is sometimes answered with HTTP 403, meaning the
# server refuses to serve the data. Sending a browser-like User-Agent header
# makes each request look like it comes from a regular browser.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/109.0",
}

# Empty result frame with the dataset's five columns; the scraping cell fills `data`.
data = pd.DataFrame(
    columns=['Companies', 'Benefits', 'Ratings', 'High_Rated', 'Low_Rated'],
)

In [47]:
def _text_or_blank(node):
    """Return the stripped text of a parsed node, or "" when the node is None."""
    return node.text.strip() if node is not None else ""


def getDetails(company_details):
    """Turn a list of parsed company cards into a DataFrame, one row per card.

    Parameters
    ----------
    company_details : sequence
        Parsed company-card elements. Each must offer BeautifulSoup-style
        ``find`` and ``find_all`` methods.

    Returns
    -------
    pandas.DataFrame
        Columns ``Companies``, ``Benefits``, ``Ratings``, ``High_Rated`` and
        ``Low_Rated``, with rows in input order. Any piece of a card that
        cannot be found is stored as an empty string, so a partially filled
        card no longer aborts the whole page. An empty input yields an empty
        frame that still has the five columns.
    """
    columns = ['Companies', 'Benefits', 'Ratings', 'High_Rated', 'Low_Rated']

    # Rows are gathered in a plain list and the frame is built once at the
    # end; concatenating inside the loop copies all earlier rows every time.
    rows = []

    for card in company_details:
        company_name = _text_or_blank(card.find("h2", class_="companyCardWrapper__companyName"))
        benefits = _text_or_blank(card.find(class_='companyCardWrapper__interLinking'))
        ratings = _text_or_blank(card.find(class_='companyCardWrapper__companyRatingValue'))

        has_high = card.find(class_='companyCardWrapper__ratingHeader--high') is not None
        has_critical = card.find(class_='companyCardWrapper__ratingHeader--critical') is not None
        rated_for = iter(card.find_all(class_="companyCardWrapper__ratingValues"))

        # The value lists are consumed in order: when both headers exist the
        # first list is the high-rated one and the second the low-rated one;
        # when only one header exists, the first list belongs to it.
        # `next(..., None)` leaves the field blank instead of raising when a
        # header has no matching value list.
        high_rated = _text_or_blank(next(rated_for, None)) if has_high else ""
        low_rated = _text_or_blank(next(rated_for, None)) if has_critical else ""

        rows.append([company_name, benefits, ratings, high_rated, low_rated])

    return pd.DataFrame(rows, columns=columns)

In [49]:
# Requests to the website can time out or fail transiently when the server is
# slow or unresponsive, so all traffic goes through a session whose adapter
# retries automatically (up to 3 retries, with backoff, including for the
# listed HTTP status codes).
session = requests.Session()

retry_strategy = Retry(
    total=3,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
)
adapter = HTTPAdapter(max_retries=retry_strategy)

# Use the retrying adapter for both URL schemes.
session.mount('http://', adapter)
session.mount('https://', adapter)

In [50]:
# Monotonic clock: measures elapsed time without being affected by
# system clock adjustments.
start_time = time.perf_counter()

# One DataFrame per successfully scraped page. The frames are combined once
# after the loop: concatenating inside the loop re-copies every earlier row on
# each page, and appending to the existing `data` would duplicate all rows
# whenever this cell is run a second time.
page_frames = []

for i in range(1,501):
    url = f'https://www.ambitionbox.com/list-of-companies?campaign=desktop_nav&page={i}'

    try:
        response = session.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise an error for bad responses (4xx or 5xx)

        soup = BeautifulSoup(response.text, 'lxml')
        company_details = soup.find_all(class_='companyCardWrapper')

        page_frames.append(getDetails(company_details))

    except requests.exceptions.RequestException as e:
        # Best effort: report the failed page and carry on with the next one.
        print(f"Error: {e} - Skipping page {i}")

if page_frames:
    data = pd.concat(page_frames, ignore_index=True)
else:
    # Every page failed; pd.concat rejects an empty list, so fall back to an
    # empty frame with the expected columns.
    data = pd.DataFrame(columns=['Companies', 'Benefits', 'Ratings', 'High_Rated', 'Low_Rated'])

# Elapsed wall-clock time of the whole scrape, in seconds.
print(time.perf_counter() - start_time)

411.9562768936157


In [51]:
# Bare expression as the last line of the cell: the notebook renders the
# scraped DataFrame as the cell's output.
data

Unnamed: 0,Companies,Benefits,Ratings,High_Rated,Low_Rated
0,TCS,IT Services & Consulting | 1 Lakh+ Employees |...,3.8,"Job Security, Work Life Balance","Promotions / Appraisal, Salary & Benefits"
1,Accenture,IT Services & Consulting | 1 Lakh+ Employees |...,4.0,"Company Culture, Skill Development / Learning,...",
2,Cognizant,IT Services & Consulting | 1 Lakh+ Employees |...,3.9,Skill Development / Learning,Promotions / Appraisal
3,Wipro,IT Services & Consulting | 1 Lakh+ Employees |...,3.8,Job Security,"Promotions / Appraisal, Salary & Benefits"
4,Capgemini,IT Services & Consulting | 1 Lakh+ Employees |...,3.9,"Job Security, Work Life Balance, Skill Develop...","Promotions / Appraisal, Salary & Benefits"
...,...,...,...,...,...
9995,Icreon Communications,IT Services & Consulting | 51-200 Employees | ...,3.7,Skill Development / Learning,"Work Life Balance, Work Satisfaction, Job Secu..."
9996,OpenXcell Technolabs,IT Services & Consulting | 201-500 Employees |...,4.2,"Work Life Balance, Company Culture, Job Security",Promotions / Appraisal
9997,Reinforced Earth,Engineering & Construction | 201-500 Employees...,3.9,"Company Culture, Job Security, Skill Developme...",Promotions / Appraisal
9998,Black Turtle,Recruitment | 201-500 Employees | 7 years old ...,3.6,,"Salary & Benefits, Promotions / Appraisal, Wor..."


In [53]:
# Save the scraped table as CSV text in the current working directory.
# NOTE(review): the path has no ".csv" extension, and because `index` is left
# at its default the row index is written as an unnamed first column.
data.to_csv(path_or_buf='Indian Companies Dataset')