# Grab Glassdoor links and review amounts

### Load dataframe

In [None]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from urllib.parse import unquote
import random
import time
from pathlib import Path
from torpy.http.requests import TorRequests

## switch off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook
pd.options.mode.chained_assignment = None  # default='warn'

from IPython.core.interactiveshell import InteractiveShell
## IPython setting: echo the value of every top-level expression of a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

In [None]:
input_raw = pd.read_csv(r"W:\019_Glassdoor\1 Data\1 Glassdoor Links\0725_UniqueFirmList_raw.csv", sep=";", decimal=",", encoding='unicode_escape')

## sort (descending on all three keys)
input_raw.sort_values(by=["2008_2022", "EnglishSpeaking", "Max_Cosine_Similarity"], ascending=False, inplace=True)
input_raw.reset_index(inplace=True)

## filter
## NOTE: `input` shadows the Python builtin of the same name; the name is kept because
## later cells read the frame through it.
## .copy() makes the cleaning below work on an independent frame instead of a slice of input_raw.
input = input_raw[["CompanyName", "ISIN", "Link_Marius", "Max_Cosine_Similarity", "2008_2022", "EnglishSpeaking"]].copy()

## move ticker (written in brackets for some firms) to new column
if 'Ticker' not in input.columns:
    input.insert(input.columns.get_loc("CompanyName")+1, "Ticker", np.nan)
    ## n has to be passed by keyword: pandas 2.x no longer accepts it positionally
    name_parts = input["CompanyName"].str.split("(", n=1, expand=True)
    input["CompanyName"] = name_parts[0]
    ## the second column only exists if at least one name contains "("
    if 1 in name_parts.columns:
        ## regex=False: ")" is meant literally, not as a regex group delimiter
        input["Ticker"] = name_parts[1].str.replace(")", "", regex=False).str.strip()
    input["CompanyName"] = input["CompanyName"].str.replace("&", "and", regex=False)

## replace special characters that were garbled by the file encoding
## (the previous DataFrame.replace call discarded its result and would only have matched
## whole cell values, so nothing was ever replaced; this replaces substrings in place)
sonderzeichen = {"àö¬ß": "ä", "‚àö‚â•": "o"}
for garbled, repaired in sonderzeichen.items():
    input["CompanyName"] = input["CompanyName"].str.replace(garbled, repaired, regex=False)

## remove legal addons in firm names
## "." is a regex wildcard, so it is wrapped in a character class ([.]) to match a literal dot
input["CompanyName"] = input["CompanyName"].str.replace(",| Inc[.]| Corp[.]| Limited| Ltd[.]| S[.]A[.]| AG| Co[.]| plc| LP| LLC| L[.]L[.]C[.]| Corporation| Holdings", "", regex=True, case=False)
## TODO: run once more WITHOUT this cleaning, e.g. for "Foreign Currency Exchange Corp." the "Corp" matters!

## drop the blanks left behind by the ticker split / addon removal ("Tesla (TSLA)" -> "Tesla")
input["CompanyName"] = input["CompanyName"].str.strip()

# input

## save file with dynamic file name (month+day prefix) in the parent of the working directory
path_with_time = Path.cwd().parent / f'{time.strftime("%m%d")}_UniqueFirmList.csv'
#"W:\019_Glassdoor\1 Data\1 Glassdoor Links\0808_UniqueFirmList_HannesLinks_1500-1509.csv"
input.to_csv(path_with_time, sep=";", decimal=",", index=False)
print(f"{path_with_time} saved")

In [None]:
## reload a previously saved, cleaned firm list (semicolon separated, decimal comma)
input = pd.read_csv(
    r"W:\019_Glassdoor\1 Data\1 Glassdoor Links\0813_UniqueFirmList.csv",
    encoding='unicode_escape',
    decimal=",",
    sep=";",
)

## yahoo scraper

#### proxy rotation via tor

In [None]:
## browser user agents; one is picked at random per request for bot prevention
_USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36 Edg/103.0.1264.71",
]

## attribute filter for the <li> that may carry the rating / review amount of a search result
_RATING_FILTER = {"class": ["tc", "bxz-bb"]}


def _parse_review_amount(rating_text):
    """Return the review amount in brackets as a string: "4/5 (3)" -> "3", "4/5 (5.7K)" -> "5700".

    Returns "" when the text has no bracket (e.g. "Currency: EUR" is displayed instead).
    """
    _, bracket, tail = rating_text.partition("(")
    if not bracket:
        return ""
    amount = tail.split("(")[0].strip().rstrip(")").strip()
    if amount[-1:].upper() == "K":
        ## scale numerically, so that "12K" becomes 12000 and "5.7K" becomes 5700
        try:
            return str(round(float(amount[:-1]) * 1000))
        except ValueError:
            return amount
    return amount


def _first_glassdoor_anchor(containers):
    """Return the first <a> (first anchor of each container) whose href mentions glassdoor, else None."""
    for container in containers:
        anchor = container.find("a")
        ## .get: an anchor without href is skipped instead of aborting the whole row
        if anchor is not None and "glassdoor" in anchor.get("href", ""):
            return anchor
    return None


def _to_reviews_link(link):
    """Turn a glassdoor overview/jobs/salary link into a reviews link; return (link, status text)."""
    if "/Working" in link:
        # from https://www.glassdoor.com/Overview/Working-at-Tesla-EI_IE43129.11,16.htm
        # to https://www.glassdoor.com/Reviews/Tesla-Reviews-E43129.htm
        link = link.replace("/Overview/Working-at-", "/Reviews/")
        link = link.replace("EI_IE", "Reviews-E")
        stem = link.split(".htm")[0]
        ## drop the trailing ".11,16" part whatever its length (it is not always 6 characters)
        head, dot, tail = stem.rpartition(".")
        if dot and "," in tail and tail.replace(",", "").isdigit():
            stem = head
        return stem + ".htm", "transformed overview link"
    if "/Jobs/" in link:
        link = link.replace("/Jobs/", "/Reviews/").replace("-Jobs-", "-Reviews-")
        return link, "transformed jobs link"
    if "/Salary/" in link:
        link = link.replace("/Salary/", "/Reviews/").replace("-Salaries-", "-Reviews-")
        return link, "transformed salary link"
    if "/Reviews/" in link:
        return link, "original review link"
    ## e.g. /job-listing/ or /Job/ links
    return link, "wrong link"


def yahoo_scraper(df, sleeptime_max=1, tor_active="no", request_timeout=30):
    """Search Yahoo for the glassdoor page of one firm and store the result in the row.

    Meant to be used with ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    df : one row of the firm list; must provide "CompanyName" and a numeric ``name`` (row index).
    sleeptime_max : upper bound (seconds) of the random pause before the request.
    tor_active : "yes" sends the request through the module-level ``sess`` object
        (must be opened by the caller beforehand); any other value uses ``requests`` directly.
    request_timeout : seconds after which the request is aborted instead of hanging forever.

    Returns
    -------
    The same row with "Link_Hannes", "link_found" and - if a rating was displayed -
    "review_amount" filled in. If anything fails, the error is printed and the row is
    returned as it is at that point.
    """
    try:
        firmname = df["CompanyName"]
        print(f"Firm #{df.name + 1}: {firmname}")
        ## the quotes around the firm name make the query strict; it may return no results at all
        query = f'site:glassdoor.com "{firmname}"'.replace(' ', '+')
        URL = f"https://search.yahoo.com/search?p={query}&vc=en&pz=20"

        ## random pause and random user agent for bot prevention
        time.sleep(round(random.uniform(0.25,sleeptime_max), 3))
        headers = {"user-agent": random.choice(_USER_AGENTS)}

        ## use tor
        if tor_active == "yes":
            response = sess.get(URL, headers=headers, timeout=request_timeout)
        ## use regular IP
        else:
            response = requests.get(URL, headers=headers, timeout=request_timeout)

        soup = BeautifulSoup(response.content, "html.parser")
        time.sleep(0.2) ## short extra pause between requests

        ## scraper ##
        #############

        alink = None
        ## only the first rating item is used, since it appears to be the most accurate one.
        ## the class is not used exclusively for ratings, hence the length check on its text.
        rating_item = soup.find("li", _RATING_FILTER)
        if rating_item is not None and len(rating_item.text) < 15:
            df["review_amount"] = _parse_review_amount(rating_item.text)
            ## grab the href from the parents of the same search result
            alink = _first_glassdoor_anchor(rating_item.find_parents("div"))

        ## no rating on the page, or no glassdoor link next to it: take the first result heading
        if alink is None:
            alink = _first_glassdoor_anchor(soup.find_all("h3"))

        if alink is None:
            link_found = "no link found"
            link_final = ""
            print("no link found")
        else:
            link_with_yahoo = unquote(alink["href"]) ## unquote decodes URLs
            try:
                ## yahoo redirect: https://r.search.yahoo.com/.../RU=<target link>/RK=2/RS=...
                link_final = link_with_yahoo.split("RU=")[1].split("/RK=2")[0]
            except IndexError: ## weird cases in which e.g. an image link is retrieved
                link_final, link_found = link_with_yahoo, 1
            else:
                link_final, link_found = _to_reviews_link(link_final)

        df["Link_Hannes"] = link_final
        df["link_found"] = link_found
        print(link_final, "\n")

        return df

    except Exception as error: ## keep the apply() run going, but do not swallow Ctrl+C
        print(f"scraping failed: {error!r}\n")
        return df

# yahoo_scraper("El Paso Pipeline Partners")

### execute scraper

In [None]:
## scraping parameters
startrow, endrow = 7000, 8000
tor = "yes" # change to yes or no
sleeptime_max = 0.5 # use >2 with original ip (tor = "no"). around 1000 requests possible until temp. ip block


## work on a slice of the input frame: the old index is kept as an extra column
## and the three result columns start out empty
input_copy = input[startrow:endrow].reset_index().assign(
    Link_Hannes=np.nan,
    review_amount=np.nan,
    link_found=np.nan,
)

## run the yahoo scraper on every row via apply ##
##################################################

if tor != "yes":
    ## regular IP
    input_copy = input_copy.apply(lambda row: yahoo_scraper(row, sleeptime_max, "no"), axis="columns")
else:
    ## through the Tor gateway
    with TorRequests() as tor_requests:
        print("Connecting to tor..")
        ## sess is a module-level name here, which is how yahoo_scraper gets hold of it
        with tor_requests.get_session() as sess:
            current_ip = sess.get("http://httpbin.org/ip").json()
            print(f"Current IP:{current_ip}; sleeptime: {sleeptime_max}\n")
            input_copy = input_copy.apply(lambda row: yahoo_scraper(row, sleeptime_max, "yes"), axis="columns")
print(f"Scraping finished for {startrow} to {endrow}")


## bring the columns into the order used in the result files
column_order = [
    'index', 'ISIN', 'Ticker', '2008_2022', 'EnglishSpeaking', 'CompanyName',
    'Max_Cosine_Similarity', 'Link_Marius', 'Link_Hannes', 'link_found', 'review_amount',
]
input_copy = input_copy[column_order]


## write the batch next to the working directory; file name carries month+day and the row range
path_with_time = Path.cwd().parent / f'{time.strftime("%m%d")}_UniqueFirmList_HannesLinks_{startrow}-{endrow}_strict.csv'
#"W:\019_Glassdoor\1 Data\1 Glassdoor Links\0808_UniqueFirmList_HannesLinks_1500-1509.csv"
input_copy.to_csv(path_with_time, index=False, sep=";", decimal=",")
print(f"{path_with_time} saved")

#### save the current batch manually

In [None]:
## write the current batch to the parent of the working directory (month+day and row range in the name)
path_with_time = Path.cwd().parent / f'{time.strftime("%m%d")}_UniqueFirmList_HannesLinks_{startrow}-{endrow}.csv'
#"W:\019_Glassdoor\1 Data\1 Glassdoor Links\0808_UniqueFirmList_HannesLinks_1500-1509.csv"
input_copy.to_csv(path_with_time, index=False, sep=";", decimal=",")
print(f"{path_with_time} saved")

## append multiple csv files to singular file

In [None]:
import os

## folder with the per-batch result files
## (named csv_folder instead of dir, so the builtin dir() is no longer shadowed)
csv_folder = r"W:\019_Glassdoor\1 Data\1 Glassdoor Links\HannesLinksStrict"

## read all csv files of the folder; sorted by file name so the row order is reproducible
batch_frames = [
    pd.read_csv(os.path.join(csv_folder, file), sep=";", decimal=",", encoding='unicode_escape')
    for file in sorted(os.listdir(csv_folder))
    if file.endswith(".csv")
]

## concatenate once at the end (concatenating inside the loop copies the growing frame every time);
## ignore_index gives a fresh 0..n-1 index. an empty folder yields an empty frame.
df_complete = pd.concat(batch_frames, axis=0, ignore_index=True) if batch_frames else pd.DataFrame()

## save
path_with_time = Path.cwd().parent / f'{time.strftime("%m%d")}_UniqueFirmList_HannesLinks_English-Post2008_Strict.csv'
#"W:\019_Glassdoor\1 Data\1 Glassdoor Links\0808_UniqueFirmList_HannesLinks_1500-1509.csv"
df_complete.to_csv(path_with_time, sep=";", decimal=",", index=False)
print(f"{path_with_time} saved")

## examples

#### example: apply a function row by row that produces 2 output columns

In [None]:
## independent (deep) copy, so the example leaves the input frame untouched
df_example = input.copy(deep=True)

## variant with a single output column (kept for reference only; note that the
## name yahoo_scraper would overwrite the real scraper if this were run)
# def yahoo_scraper(x):
#     return x+2

# df_example["new"] = df_example["Max_Cosine_Similarity"].apply(yahoo_scraper)
# df_example


def example_function(df):
    """Add two example entries to the given row (modified in place) and return it.

    "example_result1" is the row's "Max_Cosine_Similarity" plus 1, "example_result2" is 2.
    """
    new_entries = {
        "example_result1": df["Max_Cosine_Similarity"] + 1,
        "example_result2": 2,
    }
    for key, value in new_entries.items():
        df[key] = value
    return df

## call example_function once per row; the entries it adds become new columns
df_example = df_example.apply(example_function, axis="columns")
df_example

#### proxy rotation

In [None]:
## Tor-Gateway: minimal example of one search request through a tor session
from torpy.http.requests import TorRequests

with TorRequests() as tor_requests:
    print("Connecting..")
    with tor_requests.get_session() as sess:
        ## ask httpbin for the IP the request arrives from and print it
        ip_response = sess.get("http://httpbin.org/ip")
        current_ip = ip_response.json()
        print(f"Current IP:{current_ip}")

        ## fetch and parse one yahoo result page
        URL = "https://search.yahoo.com/search?p=site:glassdoor.com 'Tesla'&vc=en&pz=20"
        url = sess.get(URL, timeout=5)
        soup = BeautifulSoup(url.content, "html.parser")

        ## only the first matching item is printed, since it appears to be the most accurate one
        rating_filter = {"class": ["tc", "bxz-bb"]}
        review_amount = soup.find("li", rating_filter)
        print(review_amount)
