### Scrape Indeed
##### Lakkas Giannis 

*  Basic imports 

In [None]:
# !pip install -U selenium
# !pip install webdriver-manager

In [1]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service

from webdriver_manager.chrome import ChromeDriverManager

import re
import time
import csv
from tqdm import tqdm
import os
import urllib

In [2]:
"""
Create a folder to store the images (the logo from this notebook and the rest from the second notebook, the summarizer).
"""
# exist_ok=True makes the call idempotent: re-running the cell is harmless and
# there is no window between "check" and "create" in which the call could fail.
os.makedirs('customer_analytics1', exist_ok=True)

In [3]:
"""
Extracts descriptions, the ratings and the text from the page
"""
def get_data_from_page(writer:csv.writer):
    """Write one CSV row per review found on the page loaded in ``driver``.

    Every row contains, in order, the review's description, its body text and
    its rating. A field whose element cannot be located inside the review is
    reported on stdout and written as ``'NA'``.

    Relies on the module-level ``driver`` for the page lookup.

    Args:
        writer: CSV writer that receives one row per review.
    """
    # (CSS selector, message printed when the element is missing), listed in
    # the same order as the CSV columns: description, text, rating.
    field_specs = (
        ('[itemprop="author"]', 'could not extract headline'),
        ('[itemprop="reviewBody"]', 'could not extract text'),
        ('[itemprop="reviewRating"]', 'could not extract rating'),
    )

    review_nodes = driver.find_elements(by=By.CSS_SELECTOR, value='[itemprop="review"]')
    print("Reviews:", len(review_nodes))

    for node in review_nodes:
        row = []
        for selector, failure_msg in field_specs:
            try:
                value = node.find_element(by=By.CSS_SELECTOR, value=selector).text
            except NoSuchElementException:
                # The review lacks this element: report it and keep the placeholder.
                print(failure_msg)
                value = 'NA'
            row.append(value)
        # Persist the extracted description, text and rating as a single row.
        writer.writerow(row)
        

In [4]:
"""
Extracts the company's logo from the main page.
"""
def scrape_logo(url:str, query: str):
    """Download the company logo from the page currently loaded in ``driver``.

    The image referenced by the first ``[itemprop="image"]`` element is saved
    as ``customer_analytics1/<query>.jpg``. Relies on the module-level
    ``driver``.

    Args:
        url: Address of the page being scraped. Not used by this function;
            kept so that existing callers continue to work.
        query: Company name, used as the base name of the saved file.

    Raises:
        NoSuchElementException: If the page has no ``[itemprop="image"]``
            element.
    """
    # A bare ``import urllib`` does not load the ``request`` submodule, so
    # ``urllib.request`` is only reachable if some other library happened to
    # import it. Import it explicitly to make the download reliable.
    import urllib.request

    logo = driver.find_element(by=By.CSS_SELECTOR, value='[itemprop="image"]')
    logo_link = logo.get_attribute("src")

    # Create the target folder on demand so the download does not depend on
    # an earlier notebook cell having been executed.
    os.makedirs("customer_analytics1", exist_ok=True)
    urllib.request.urlretrieve(logo_link, os.path.join("customer_analytics1", query + ".jpg"))

In [5]:
def _count_reviews(label: str) -> int:
    """Return the first integer in *label*, ignoring ',' and '.' digit separators."""
    digit_runs = re.findall(r"[0-9]+", label.replace(',', '').replace('.', ''))
    if not digit_runs:
        raise IndexError(f"no review count found in {label!r}")
    return int(digit_runs[0])


def scrape(query:str, delay:int = 2):
    """Scrape all reviews of a company into ``<query>_data.csv``.

    Searches the company on the site, opens its reviews (all countries,
    English language), saves the company logo via ``scrape_logo`` and then
    walks the result pages, writing each page with ``get_data_from_page``
    until no 'Next' link is left. Relies on the module-level ``driver``.

    Args:
        query: Company name to search for; also used as the prefix of the
            CSV file name and as the logo file name.
        delay: Seconds to wait for the 'Next' link to become clickable.

    Raises:
        IndexError: If the search returns no company row, or if the review
            counter on the page contains no number.
    """
    from urllib.parse import quote_plus

    # The site paginates reviews in steps of 20; used only for the progress display.
    reviews_per_page = 20

    # newline='' is what the csv module expects from its file object; without
    # it, line endings (including those inside quoted review texts) are
    # translated by the platform.
    with open(query + '_data.csv', 'w', encoding='utf-8', newline='') as fw:
        writer = csv.writer(fw, lineterminator='\n')
        # Write the headers.
        writer.writerow(['description', 'text', 'rating'])

        # Escape the query so names such as "AT&T" do not break the URL.
        url = 'https://www.indeed.com/companies/search?q=' + quote_plus(query)
        driver.get(url)  # visit the search page

        # Make sure the search produced at least one company before going on.
        row_group = driver.find_element(by=By.CSS_SELECTOR, value='[data-tn-section="CompaniesRowGroup"]')
        company_rows = row_group.find_elements(by=By.CSS_SELECTOR, value='[data-tn-component="CompanyRow"]')
        if not company_rows:
            raise IndexError(f"no company found for query {query!r}")

        # NOTE(review): this changes the driver's implicit wait to 2 seconds;
        # presumably it stays in effect after this function returns - confirm.
        driver.implicitly_wait(2)
        # The lookup is page-wide, so the first 'Reviews' link on the page is
        # clicked. NOTE(review): presumably that link belongs to the first
        # company row - confirm against the page markup.
        driver.find_element(By.LINK_TEXT, 'Reviews').click()

        # Select all countries and the English language, respecting a query
        # string that may already be present in the current URL.
        reviews_url = driver.current_url
        separator = '&' if '?' in reviews_url else '?'
        reviews_url = reviews_url + separator + 'fcountry=ALL&lang=en'
        driver.get(reviews_url)

        # Scrape the company's logo.
        scrape_logo(url=reviews_url, query=query)

        # Find the number of the company's reviews.
        total_reviews = driver.find_element(by=By.CSS_SELECTOR, value='[data-testid="review-count"]').text
        print(total_reviews)
        total_reviews_num = _count_reviews(total_reviews)
        print("Total Reviews ", total_reviews_num)

        # Ceiling division: a partially filled last page still counts as a page.
        total_pages = (total_reviews_num + reviews_per_page - 1) // reviews_per_page
        print("Total Pages ", total_pages)

        page_cnt = 1
        while True:  # keep going until there are no more pages
            # Guard against zero reviews (total_pages == 0) and compute the
            # percentage directly to avoid float artefacts in the output.
            progress = round(100 * page_cnt / total_pages) if total_pages else 100
            print(f"Page: {page_cnt}. Progress: {progress:.1f}%")
            page_cnt += 1

            # Extract the data from the current page.
            get_data_from_page(writer)

            # No 'Next' link means this was the last page.
            if not driver.find_elements(By.LINK_TEXT, 'Next'):
                break

            # Otherwise wait until the link is clickable and click it through
            # JavaScript.
            next_button = WebDriverWait(driver, delay).until(EC.element_to_be_clickable((By.LINK_TEXT, 'Next')))
            driver.execute_script("arguments[0].click();", next_button)

In [6]:
# Start Chrome with the driver binary provided by webdriver-manager's install().
driver = webdriver.Chrome(
    service=Service(
        ChromeDriverManager().install()
    )
)
# Element lookups wait up to 5 seconds before giving up.
driver.implicitly_wait(5)
driver.maximize_window()

In [7]:
# Scrape every review of the company into "Corteva-Agriscience_data.csv".
scrape(query='Corteva-Agriscience')

Showing all 257 reviews
Total Reviews  257
Total Pages  13
Page: 1. Progress: 8.0%
Reviews: 21
Page: 2. Progress: 15.0%
Reviews: 21
Page: 3. Progress: 23.0%
Reviews: 21
Page: 4. Progress: 31.0%
Reviews: 21
Page: 5. Progress: 38.0%
Reviews: 21
Page: 6. Progress: 46.0%
Reviews: 21
Page: 7. Progress: 54.0%
Reviews: 21
Page: 8. Progress: 62.0%
Reviews: 21
Page: 9. Progress: 69.0%
Reviews: 21
Page: 10. Progress: 77.0%
Reviews: 21
Page: 11. Progress: 85.0%
Reviews: 21
Page: 12. Progress: 92.0%
Reviews: 21
Page: 13. Progress: 100.0%
Reviews: 17
