## IMPORTING LIBRARIES

In [1]:
import random
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
from nltk.tokenize import word_tokenize

## CREATING A DATAFRAME

In [2]:
df = pd.DataFrame(columns=['Review', 'Rating'])

## FUNCTION TO SCRAPE REVIEW

In [3]:
def scrape_reviews(url):
    global df
    
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    page = requests.get(url, headers=headers)
    
    if page.status_code == 200:
        soup = BeautifulSoup(page.text, 'lxml')
        reviews = soup.find_all('div', {'class': 'company-reviews'})
        data = []
        
        for rev in reviews:
            desc_tag = rev.find('p', {'class': 'comments foggy'})
            description = desc_tag.text.strip() if desc_tag else None
            
            if description:
                rating_container = rev.find('div', {'class': 'rating-container'})
                if rating_container:
                    rating_div = rating_container.find('div', {'class': 'rating large-star'})
                    if rating_div:
                        full_stars = len(rating_div.find_all('span', class_='full'))
                        half_stars = len(rating_div.find_all('span', class_='half'))
                        rating = full_stars + 0.5 * half_stars
                    else:
                        rating = None
                else:
                    rating = None
                
                if rating is not None:
                    row = {
                        'Review': description,
                        'Rating': rating
                    }
                    data.append(row)

        new_df = pd.DataFrame(data)
        df = pd.concat([df, new_df], ignore_index=True)
        print("Data successfully collected and appended.")
    else:
        print(f'Failed to retrieve the page. Status code: {page.status_code}')


## FUNCTION FOR BALANCING DATASET

In [4]:
def balance_dataset(df):
    positive = df[df['Rating'] >= 4]
    negative = df[df['Rating'] <= 2]
    neutral = df[(df['Rating'] > 2) & (df['Rating'] < 4)]
    
    min_count = min(len(positive), len(negative), len(neutral))
    balanced_df = pd.concat([
        positive.sample(min_count, random_state=42),
        negative.sample(min_count, random_state=42),
        neutral.sample(min_count, random_state=42)
    ])
    
    return balanced_df

## LIST OF COMPANIES

In [5]:
companies = ['abbott-laboratories', 'accenture','accountemps','adecco','adp','amd','aerotek','affiliated-computer-services','aflac','agilent-technologies','aig','alcatel-lucent','allstate','amazon','american-airlines','american-express','american-red-cross','amgen','aol','apple','applied-materials','aramark','asu','att','avaya',
            'bae-systems','baker-hughes','bank-of-america','barnes-noble','basf','baxter','bearing-point','bechtel','beckman-coulter','bellsouth','best-buy','blockbuster','bcbs','bmc-software','bmw','bank-new-york-mellon','boeing','boeing-commercial-airplanes','booz-allen-hamilton','boston-scientific','bp','bristol-myers-squibb','broadcom','burger-king',
             'csu','canon','capgemini','capital-one','cardinal-health','cat','cbs-broadcasting','charter-communications','chevron','chrysler','cigna','cingular-wireless','circuit-city','cisco-systems','citibank','citigroup','coca-cola','cognizant','coldwell-banker','comcast','compucom','compusa','computer-sciences-corp','convergys','cvs-pharmacy',
             'davita','dell', 'deloitte','delphi-automotive','delta-air-lines','deutsche-bank','dhl-express','dicks-sporting-goods','digital-equipment','dillards','directv','discover-financial','dish-network','diversyfund','dollar-general','dollar-tree','dominos-pizza','dow-chemical','drs-technologies','duke-energy','dun-bradstreet','dunkin-donuts','dupont','dyncorp',
             'earthlink','eastman-kodak','eaton','ebay','echostar','ecolab','edward-jones','electrolux','electronic-arts','eds','eli-lilly-co','emc-corp','emory-university','enterprise-holdings','enterprise-rent-a-car','epam-systems','epic-systems','ericsson','ernst-young','espn','events','everest-college','experian','express-scripts','exxonmobil',
             'family-dollar','fannie-mae','farmers','federal-reserve-system','federal-mogul','fedex','fidelity-information-services','fidelity-investments','fifth-third-bank','first-american-title','first-data','first-fidelity-bank','first-national-bank','fiserv','flextronics','fiu','florida-power-light','fluor','foot-locker','ford-motor','freddie-mac','fresenius-medical-care','frontier-communications','frys','fujitsu',
             'gamestop','gannett','gateway','ge-capital','ge-energy','ge-healthcare','geico','genentech','general-dynamics','general-electric','gmc','gnc','george-mason-university','georgia-tech','georgia-pacific','glaxosmithkline','goldman-sachs','goodrich','goodwill','goodyear','google','grainger-industrial-supply','gte-corp','guitar-center','gulfstream',
             'hr-block','halliburton','harley-davidson','harris','harvard','hcl-global','hcl-technologies','heb-grocery','hertz','hewitt-associates','hp','hilton-worldwide','hitachi','holiday-inn','hollywood-video','home-depot','honda-motor','honeywell','honeywell-technology','hp-software','hsbc','huawei-technologies','humana','hyatt-hotels','hyundai-motors'
            ]

print(len(companies))

198


## SCRAPING REVIEWS

In [6]:
base_url = 'https://www.careerbliss.com/{}/reviews/'
for company in companies:
    for page in range(20):
        if page == 0:
            url = base_url.format(company)
        else:
            url = base_url.format(company) + f'?page={page}'
        scrape_reviews(url)

  df = pd.concat([df, new_df], ignore_index=True)


Data successfully collected and appended.
Data successfully collected and appended.
Data successfully collected and appended.
Data successfully collected and appended.
Data successfully collected and appended.
Data successfully collected and appended.
Data successfully collected and appended.
Data successfully collected and appended.
Data successfully collected and appended.
Data successfully collected and appended.
Data successfully collected and appended.
Data successfully collected and appended.
Data successfully collected and appended.
Data successfully collected and appended.
Data successfully collected and appended.
Data successfully collected and appended.
Data successfully collected and appended.
Data successfully collected and appended.
Data successfully collected and appended.
Data successfully collected and appended.
Data successfully collected and appended.
Data successfully collected and appended.
Data successfully collected and appended.
Data successfully collected and ap

## SAVING SCRAPED REVIEW

In [7]:
df.to_csv('scraped_reviews.csv',index=False)

## BALANCING THE SCRAPED REVIEWS AND SAVING IT AS A CSV FILE 

In [8]:
balanced_df = balance_dataset(df)

In [9]:
balanced_df.to_csv('balanced_reviews.csv', index=False)
print("Balanced dataset saved to 'balanced_reviews.csv'.")

Balanced dataset saved to 'balanced_reviews.csv'.
