# Scraping Data from the Web for NLP
## Importing libraries

In [1]:
from bs4 import BeautifulSoup
import lxml
import requests
import pandas as pd
import numpy as np

## Let's look at the website we want to scrape data from

In [2]:
# Fetch the reviews landing page once to inspect the raw HTML the parser will see.
# The response gets its own name because `base_url` is later defined as the
# paginated URL *string*; reusing that name for a Response object is confusing.
preview_response = requests.get('https://www.indeed.com/cmp/Google/reviews', timeout=5)
# Show only the beginning of the document; the full page is too long to read inline.
preview_response.text[:1000]

'<!doctype html><html lang="en"><head><title>Working at Google in Noida, Uttar Pradesh: Employee Reviews | Indeed.com</title><meta http-equiv="content-type" content="text/html;charset=UTF-8"><meta name="description" content="Reviews from Google employees about Google culture, salaries, benefits, work-life balance, management, job security, and more."><meta name="viewport" content="width=device-width, initial-scale=1"><link rel="canonical" href="https://www.indeed.co.in/cmp/Google/reviews?fcountry=IN&amp;floc=Noida%2C+Uttar+Pradesh"><link rel="stylesheet" type="text/css" href="/cmp/_s/s/3cb7454/global.css" ><link rel="stylesheet" type="text/css" href="/cmp/_s/w/50c52677e90563f526e2/css/vendor.css" ><link rel="stylesheet" type="text/css" href="/cmp/_s/w/ec604737d8896895d2f1/css/reviews-client.css" ><link rel="icon" href="/images/favicon.ico"><script type="text/javascript">window.__JSERROR__=[];window.onerror=function() { window.__JSERROR__.push(arguments)}</script><script>(function(w,d,s

## Parsing a URL

In [3]:
def _field_text(item, tag, attrs, old, new):
    """Return the text of the first ``tag`` child of ``item`` matching ``attrs``.

    Every occurrence of ``old`` in the text is replaced by ``new``.
    Returns None when ``item`` has no matching element.
    """
    element = item.find(tag, attrs)
    if element is None:
        return None
    return element.text.replace(old, new)


def parse(full_url):
    """Extract the reviews found on one fetched page into a DataFrame.

    Parameters
    ----------
    full_url : response-like object
        Despite the name, this is the fetched response, not a URL string: its
        ``.content`` attribute is handed to BeautifulSoup with the 'lxml' parser.

    Returns
    -------
    pandas.DataFrame
        One row per ``div.cmp-Review-container`` element, with the columns
        'rating', 'rating_title', 'rating_description', 'rating_pros' and
        'rating_cons'. A field whose element is missing from a review is None.
        A page without review containers yields an empty frame with the same
        columns.
    """
    columns = ['rating', 'rating_title', 'rating_description', 'rating_pros', 'rating_cons']

    page_content = BeautifulSoup(full_url.content, 'lxml')
    containers = page_content.find_all('div', {'class': 'cmp-Review-container'})

    # Collect plain dicts and build the frame once: DataFrame.append was removed
    # in pandas 2.0, and growing a frame row by row copies it on every iteration.
    records = [
        {
            'rating': _field_text(item, 'div', {'class': 'cmp-ReviewRating-text'}, '\n', ''),
            'rating_title': _field_text(item, 'div', {'class': 'cmp-Review-title'}, '\n', ''),
            # The review body turns carriage returns into sentence breaks
            # instead of stripping newlines like the other fields.
            'rating_description': _field_text(item, 'span', {'itemprop': 'reviewBody'}, '\r', '. '),
            'rating_pros': _field_text(item, 'div', {'class': 'cmp-ReviewProsCons-prosText'}, '\n', ''),
            'rating_cons': _field_text(item, 'div', {'class': 'cmp-ReviewProsCons-consText'}, '\n', ''),
        }
        for item in containers
    ]
    return pd.DataFrame(records, columns=columns)

## Define the overall DataFrame

In [4]:
# Listing URL without the page offset; `num_reviews` is appended after `start=`
# for every request.
base_url = 'https://www.indeed.com/cmp/Google/reviews?fcountry=ALL&start='

# Empty frame that accumulates the reviews of every page.
all_reviews_df = pd.DataFrame(
    columns=[
        'rating',
        'rating_title',
        'rating_description',
        'rating_pros',
        'rating_cons',
    ]
)

# Offset of the first page to request; the scraping loop raises it by 20 per page.
# NOTE(review): starting at 20 looks like it skips the page at start=0 - confirm.
num_reviews = 20

## Iterate over all the pages

In [5]:
MAX_START_OFFSET = 3000  # stop once the `start=` offset reaches this value
PAGE_SIZE = 20           # the offset advances by this much per request

# Gather the per-page frames and concatenate once after the loop:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copies every previously collected row on each iteration.
page_frames = [all_reviews_df]

while num_reviews < MAX_START_OFFSET:
    print("collecting reviews: "+str(num_reviews))
    full_url = base_url + str(num_reviews)

    get_url = requests.get(full_url, timeout=5)

    if get_url.ok:
        page_frames.append(parse(get_url))
    else:
        # An error page contains no review containers, so it would contribute
        # zero rows anyway; say so instead of failing silently.
        print("  skipped: HTTP " + str(get_url.status_code))

    num_reviews += PAGE_SIZE

# Leave empty frames out of the concat so they cannot influence the result dtypes;
# if nothing was collected, all_reviews_df stays as it was.
non_empty_frames = [frame for frame in page_frames if not frame.empty]
if non_empty_frames:
    all_reviews_df = pd.concat(non_empty_frames, ignore_index=True)

collecting reviews: 20
collecting reviews: 40
collecting reviews: 60
collecting reviews: 80
collecting reviews: 100
collecting reviews: 120
collecting reviews: 140
collecting reviews: 160
collecting reviews: 180
collecting reviews: 200
collecting reviews: 220
collecting reviews: 240
collecting reviews: 260
collecting reviews: 280
collecting reviews: 300
collecting reviews: 320
collecting reviews: 340
collecting reviews: 360
collecting reviews: 380
collecting reviews: 400
collecting reviews: 420
collecting reviews: 440
collecting reviews: 460
collecting reviews: 480
collecting reviews: 500
collecting reviews: 520
collecting reviews: 540
collecting reviews: 560
collecting reviews: 580
collecting reviews: 600
collecting reviews: 620
collecting reviews: 640
collecting reviews: 660
collecting reviews: 680
collecting reviews: 700
collecting reviews: 720
collecting reviews: 740
collecting reviews: 760
collecting reviews: 780
collecting reviews: 800
collecting reviews: 820
collecting reviews: 

## Check the data collected

In [6]:
# Display the accumulated reviews frame as the cell's output.
all_reviews_df

Unnamed: 0,rating,rating_title,rating_description,rating_pros,rating_cons
0,5.0,Fun Place to work,"Google has their own culture, what you see in ...","Free lunches, Lear alot, gBus commute",No cons
1,2.0,Poor Management,The team I work for is structured of people wi...,,
2,4.0,Fun place to work!,"Fun place to work, but the industry can be com...",,
3,5.0,Productive and fun place to work,Definitely a great place to work. Loads of act...,The cafe and lounge areas,"For me, wasn't in my field of study"
4,1.0,No job security,"Very bad local Management, they are not worth ...",Better job outside,"Do not work under INDIAN PEOPLE, They have the..."
...,...,...,...,...,...
3124,4.0,Positive and fun place to work,great supportive atmosphere. lots of opportuni...,"free food, onsite gym, development opportunities",
3125,4.0,Lots of Professional Experience,I learned the ethics working at such a competi...,"Free lunches, free pickups and dropping, snack...",Na
3126,4.0,"Fun place to work, cool gear, local jobs prosp...","Great place to work, great peers,perks and man...",Free lunches,hindered upward mobility for technical staff t...
3127,4.0,Nice place to work,I work as a contractor at Google. I was happy ...,,


## Save data for future use

In [7]:
# Persist the scraped reviews. index=False leaves out the row index, which
# carries no information and would otherwise come back as an "Unnamed: 0"
# column when the CSV is read again.
all_reviews_df.to_csv('indeed_scrape.csv', index=False)