# Import the Libraries

In [26]:
import re
from random import randint
from time import sleep

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as soup

## Function to Scrape Glassdoor for Data Analyst Interview Questions

In [102]:
def _company_from_author(author_text):
    """Return the company from text shaped like '<job title> at <company> was asked ...'."""
    # Split on the FIRST ' at ' only, so a company whose own name contains
    # ' at ' (e.g. 'University of Texas at Austin') is kept whole.
    _, marker, remainder = author_text.partition(' at ')
    if not marker:
        # No ' at ' separator to anchor on: keep the text rather than fail.
        return author_text.strip()
    # Whatever precedes the trailing ' was asked ...' clause is the company.
    return remainder.partition(' was ')[0].strip()


def get_questions(headers, num_pages, sleep_time, timeout=30):
    """
    Get interview questions for data analyst positions from Glassdoor and
    put them into a DataFrame.

    Parameters
    ----------
    headers : dict
        HTTP headers sent with every request (e.g. a browser user-agent).
    num_pages : int
        Exclusive upper bound on the page number: pages 1 through
        num_pages - 1 are requested, so pass 21 to fetch 20 pages.
    sleep_time : int or float
        Seconds to pause after each page request.
    timeout : int or float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per question with the columns 'name' (company) and
        'interview questions'. Empty, but with both columns present,
        when no page is requested or no page contains any question.

    Raises
    ------
    requests.HTTPError
        If a page answers with an HTTP error status.
    ValueError
        If a page yields a different number of company names and questions,
        because the rows could then no longer be paired reliably.
    """
    base_url = 'https://www.glassdoor.com/Interview/data-analyst-interview-questions-SRCH_KO0,12_SDMC'
    company_names = []
    interview_questions = []

    for page in range(1, num_pages):
        # The first results page has no page suffix; later pages use '_IP<n>'.
        page_suffix = '' if page == 1 else '_IP' + str(page)
        response = requests.get(base_url + page_suffix + '.htm', headers=headers, timeout=timeout)
        # Fail loudly on an error status instead of parsing an error page
        # that would silently contribute no rows.
        response.raise_for_status()
        bsobj = soup(response.content, 'lxml')

        # Throttle the loop so the server is not flooded with requests,
        # which could get our IP address blocked.
        sleep(sleep_time)

        # Author lines look like "Data Analyst at [company] was asked ..."
        page_names = [
            _company_from_author(author.a.text.strip())
            for author in bsobj.findAll('span', {'class': 'authorInfo'})
        ]
        # The interview questions that were posted
        page_questions = [
            question.text.strip()
            for question in bsobj.findAll('p', {'class': 'questionText h3'})
        ]

        # Names and questions are matched purely by position, so check the
        # counts page by page; totals compared only at the end could agree
        # by accident while individual rows are misaligned.
        if len(page_names) != len(page_questions):
            raise ValueError(
                'Page ' + str(page) + ' returned ' + str(len(page_names))
                + ' company names but ' + str(len(page_questions)) + ' questions'
            )

        company_names.extend(page_names)
        interview_questions.extend(page_questions)

    # Building from a dict keeps both columns even when nothing was scraped.
    return pd.DataFrame({'name': company_names, 'interview questions': interview_questions})




In [109]:
# User-agent string sent with every request.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36'
headers = {'user-agent': USER_AGENT}

# num_pages is an exclusive upper bound, so 21 requests pages 1-20;
# sleep_time is the pause in seconds after each request.
df = get_questions(headers=headers, num_pages=21, sleep_time=15)
df

Unnamed: 0,name,interview questions
0,Optiver,Two players are at deuce in a tennis match - p...
1,Capital One,Case interview: basic business problem (if pro...
2,Capital One,"They check for your attitude, your approach an..."
3,Uber,"Given three tables, user_dimension (user_id an..."
4,Google,If you have 10 bags of marbles with 10 marbles...
...,...,...
195,ZipRecruiter,What metric would you use for revenue?
196,Myriad Genetics,Where do you see yourself in ten years?
197,Walmart Labs,ask me the project in the resume
198,Bayview Asset Management,How many coins are in a sleeve?


In [110]:
# Count the distinct company names among the scraped questions.
number_of_companies = df['name'].nunique()
print(f'The number of unique companies: {number_of_companies}')

The number of unique comapanies: 150


In [111]:
# Order the rows by company name and renumber the index from 0.
df = df.sort_values('name').reset_index(drop=True)
print('Interview questions sorted by company name: ')
df

Interview questions sorted by company name: 


Unnamed: 0,name,interview questions
0,ATF,The interview question was about my previous j...
1,Acclaris,Most difficult questions were related to SQL s...
2,Amazon,how did you analyze data? Who uses your data a...
3,American Express,How many quarters at any given time are in a s...
4,American Greetings,Tell me about what makes me stand out from oth...
...,...,...
195,WorldQuant,Sort lines in a txt file by one of the columns...
196,Xavier University,What do you know about Jesuit educational phil...
197,Yerba Buena Beverage,How can you judge yourself?
198,ZipRecruiter,What metric would you use for revenue?


In [52]:
# Write the questions table to disk, leaving out the row index.
df.to_csv(path_or_buf='Study_Guide.csv', index=False)

# TODO: The section below is unfinished and needs to be revisited

## CREATE CATEGORY PER QUESTION? 

In [88]:
# Keep a lower-cased copy of every question in the 'q' column.
lowercase_questions = df['interview questions'].str.lower()
df['q'] = lowercase_questions

In [98]:
# Ordered (category, pattern) pairs; the first pattern found in the text wins.
_CATEGORY_PATTERNS = (
    # Plain substring on purpose: also catches mysql, postgresql, nosql, ...
    ('sql', re.compile(r'sql')),
    # Must not touch another letter, so "excellent"/"excellence" do not count.
    ('excel', re.compile(r'(?<![a-z])excel(?![a-z])')),
    ('behavioral', re.compile(r'behavioral')),
    ('experience', re.compile(r'experience')),
)


def get_category(x):
    """
    Assign a coarse category to an interview question by keyword.

    Parameters
    ----------
    x : str
        Question text. Matching is case-insensitive, so the text does not
        have to be lower-cased beforehand.

    Returns
    -------
    str
        'sql', 'excel', 'behavioral' or 'experience' for the first keyword
        found (checked in that order), otherwise 'other'. Non-string input
        such as a missing value (NaN) is also categorised as 'other'.
    """
    if not isinstance(x, str):
        return 'other'

    text = x.lower()
    for category, pattern in _CATEGORY_PATTERNS:
        if pattern.search(text):
            return category
    return 'other'

# Label every lower-cased question with its keyword category.
df['category'] = df['q'].map(get_category)
df

Unnamed: 0,name,interview questions,category,q
0,ATF,The interview question was about my previous j...,other,the interview question was about my previous j...
1,Acclaris,Most difficult questions were related to SQL s...,sql,most difficult questions were related to sql s...
2,Amazon,how did you analyze data? Who uses your data a...,sql,how did you analyze data? who uses your data a...
3,American Express,How many quarters at any given time are in a s...,other,how many quarters at any given time are in a s...
4,American Greetings,Tell me about what makes me stand out from oth...,other,tell me about what makes me stand out from oth...
...,...,...,...,...
195,Wayfair,SQL problem easy programming language a/b test...,sql,sql problem easy programming language a/b test...
196,WorldQuant,Sort lines in a txt file by one of the columns...,other,sort lines in a txt file by one of the columns...
197,Xavier University,What do you know about Jesuit educational phil...,other,what do you know about jesuit educational phil...
198,Yerba Buena Beverage,How can you judge yourself?,other,how can you judge yourself?
