## Getting ready

In [1]:
!pip install bs4



In [4]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from time import sleep
from random import randint
import requests

In [5]:
# Fetch the raw HTML of one listing page (page 2) to prototype the parsing on.

url = "https://www.bankbazaar.com/reviews/andhra-bank/all-products.html?reviewPageNumber=2"
# requests has no default timeout: without one a stalled connection blocks the cell forever.
r = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of parsing an error page as if it held reviews.
r.raise_for_status()
htmlContent = r.content

In [7]:
# Build the parse tree from the downloaded bytes using Python's built-in HTML parser.

soup = BeautifulSoup(markup=htmlContent, features='html.parser')

In [8]:
# Text of the page's title element.
soup.title.get_text()

'Page 2 of 50 for ANDHRA BANK  Reviews by Ratings & City'

## Accessing only 1 page

In [9]:
# Every review is wrapped in its own li element carrying the CSS class "review-box".
containers = soup.find_all('li', class_='review-box')
containers[0]

<li class="review-box">
<div class="review-bank-title">
<img alt="review" src="https://static.bankbazaar.com/images/common/bank-logo/ALL_BANKS.png" title="review">
</img></div>
<a class="user-review-comment js-individual-title" data-action="click:titleLink" data-actionloc="reviews:ReviewCards" href="/showUserReviews-1327841-ANDHRA_BANK-SAVING_ACCOUNT.html" target="_blank" title="Good Service">"Good Service"</a>
<div class="dontshow" itemscope="" itemtype="http://schema.org/Brand">
<span itemprop="name">
                                            ANDHRA BANK</span>
</div>
<div class="rating-section review-user-score">
<div class="medium-rating rating review-score-container" id="review.reviewRating.customer.0">
<span class="star-rating star-rating-5-0"></span>
<input name="review.reviewRating.customer.0" type="hidden" value="5.0"/>
</div> <span> <span class="dontshow">0.5</span>
<span>5.0</span>/<span>5</span></span> "Blown Away!"
                            </div>
<div class="text_here

In [10]:
# Use the first review to work out the selector for each field.
first = containers[0]
# Full review body, with surrounding whitespace removed.
first.find('div', class_='text_here review-desc-more').get_text().strip()

'I am holding a salary account with ANDHRA BANK for past 2 years. Need to maintain a minimum balance of 1K. There is no hidden charges in this account. I am using net banking and mobile app both are user friendly to access. There is ATM charges are applicable if I do transactions more than 5 times in a month.'

In [11]:
# The rating is read from the `value` attribute of the review's first input element.
first.find('input').get('value')

'5.0'

In [12]:
# The review date is read from the `content` attribute of the reviewer-profile div.
first.find('div', class_='reviewer-profile').get('content')

'2019-12-27'

In [13]:
# Reviewer name, trimmed and normalised to "Capitalized" form.
first.find('span', class_='js-author-name').get_text().strip().capitalize()

'Ramesh'

In [14]:
# Review headline; the link text is wrapped in double quotes, which are stripped here.
first.find('a', class_='user-review-comment js-individual-title').get_text().strip('"')

'Good Service'

In [15]:
 # City: third newline-separated piece of the reviewer-profile text, minus the ", " padding.
 first.find('div', class_='reviewer-profile').get_text().split('\n')[2].strip(', ').capitalize()

'Bangalore'

In [16]:
# Extract every field from each review container on the fetched page (page 2 of the listing).

pname, ploc, prate = [], [], []
preview, psummary, pdate = [], [], []

for container in containers:
  # The reviewer-profile div supplies both the city (in its text) and the date (`content` attribute).
  profile = container.find('div', class_='reviewer-profile')
  author = container.find('span', class_='js-author-name').get_text()
  headline = container.find('a', class_='user-review-comment js-individual-title').get_text()
  body = container.find('div', class_='text_here review-desc-more').get_text()

  pname.append(author.strip().capitalize())
  ploc.append(profile.get_text().split('\n')[2].strip(', ').capitalize())
  prate.append(container.find('input').get('value'))
  psummary.append(headline.strip('"'))
  preview.append(body.strip())
  pdate.append(profile.get('content'))

In [17]:
# Stitch the parallel lists into rows, one tuple per review.
details = list(zip(pname, ploc, prate, pdate, psummary, preview))

# Column order mirrors the order of the lists passed to zip above.
column_names = ['Name', 'Location', 'Rating', 'Date', 'Summary', 'Review']
df = pd.DataFrame(data=details, columns=column_names)
df.head()

Unnamed: 0,Name,Location,Rating,Date,Summary,Review
0,Ramesh,Bangalore,5.0,2019-12-27,Good Service,I am holding a salary account with ANDHRA BANK...
1,Anonymous,Kolkata,5.0,2019-12-11,Good Account,"From 2005, i have been using the savings accou..."
2,Ramaiah,Bangalore,5.0,2019-12-05,Good Account,Using the salary account from Andhra bank and ...
3,Nayak,Bangalore,4.0,2019-12-05,Good Account,Andhra bank account is good. In the year of 20...
4,Anonyms,Parvathipuram,5.0,2019-12-03,Good Service,Andhra bank is a very good bank in providing s...


In [18]:
# Summary of the single-page frame; the output lists count/unique/top/freq for each column.
df.describe()

Unnamed: 0,Name,Location,Rating,Date,Summary,Review
count,20,20,20.0,20,20,20
unique,17,11,5.0,17,13,20
top,Kumar,Bangalore,5.0,2019-12-05,Good Service,I am holding a salary account with ANDHRA BANK...
freq,4,4,14.0,2,3,1


## Accessing all the pages

In [19]:
TOTAL_PAGES = 50   # the listing reports "Page 2 of 50" in its title
pages = np.arange(1, TOTAL_PAGES + 1)   # page numbers 1..50 inclusive

# Fresh, independent accumulators for the full crawl, one list per output column.
pname, ploc, prate, preview, psummary, pdate = ([] for _ in range(6))

In [20]:
# Crawl every listing page and accumulate one entry per review in the parallel column lists.

# Start from empty lists so that re-running this cell cannot append duplicate rows.
pname, ploc, prate, preview, psummary, pdate = [], [], [], [], [], []

for page in pages:    # running for every page
  url = "https://www.bankbazaar.com/reviews/andhra-bank/all-products.html?reviewPageNumber="+str(page)
  r = requests.get(url, timeout=30)   # a stalled connection must not hang the crawl forever
  r.raise_for_status()                # stop on 4xx/5xx rather than silently collecting zero reviews
  htmlContent = r.content
  soup = BeautifulSoup(htmlContent, 'html.parser')
  containers = soup.find_all(name='li', attrs={'class':'review-box'})
  sleep(randint(3,6))           # sleeping for 3 to 6 secs to control crawling

  for container in containers:
    # Extract every field before touching any list: if one selector fails, nothing has
    # been appended yet, so the six lists always stay the same length and row-aligned.
    profile = container.find('div', {'class':'reviewer-profile'})
    name = container.find('span',{'class':'js-author-name'}).text.strip().capitalize()
    location = profile.text.split('\n')[2].strip(', ').capitalize()
    rating = container.find('input').get_attribute_list(key='value')[0]
    summary = container.find('a', {'class':'user-review-comment js-individual-title'}).text.strip('"')
    review = container.find('div', {'class':'text_here review-desc-more'}).text.strip()
    date = profile.get_attribute_list(key='content')[0]

    pname.append(name)
    ploc.append(location)
    prate.append(rating)
    psummary.append(summary)
    preview.append(review)
    pdate.append(date)

In [21]:
# One tuple per scraped review, in the column order used below.
details = list(zip(pname, ploc, prate, pdate, psummary, preview))

column_names = ['Name', 'Location', 'Rating', 'Date', 'Summary', 'Review']
df = pd.DataFrame(data=details, columns=column_names)
df.shape

(1000, 6)

In [23]:
# First ten rows of the frame built from all pages.
df.head(10)

Unnamed: 0,Name,Location,Rating,Date,Summary,Review
0,Mavoori m,Hyderabad,4.0,2020-03-19,Good card,I have applied Andhra Bank credit card directl...
1,R n s n,Mahabubnagar,4.0,2020-03-16,Best savings account,"For the past 8 months, i have been using salar..."
2,G hareesh,Medak,5.0,2020-03-14,Good bank,I have opened my savings account in Andhra Ban...
3,Ga,Vijayawada,5.0,2020-03-08,Need to provide more atm,"When I was working in previous company , they ..."
4,Vas,Vijayawada,5.0,2020-03-08,Good card,I have been using ANDHRA card for more than on...
5,S p,Chennai,3.0,2020-02-29,Best savings account,Andhra Bank saving account i am using more tha...
6,Bal t,Chennai,5.0,2020-02-24,Best savings account,I am using ANDHRA Bank savings account more th...
7,Ramesh k a,Hyderabad,5.0,2020-02-21,Good bank,I have been using my savings account from ANDH...
8,Shai,Hyderabad,2.0,2020-02-19,Average service,"From ANDHRA Bank, i have a savings account whi..."
9,B rajesh,Bangalore,4.0,2020-02-19,Need to improve service & technical service ne...,I am using ANDHRA Bank savings account more th...


In [27]:
# Convert the scraped date strings (e.g. '2020-03-19') into pandas datetimes.
df['Date'] = pd.to_datetime(df['Date'])

In [26]:
# Number of reviews per distinct rating value.
df['Rating'].value_counts()

5.0    405
4.0    291
3.0    155
2.0     56
4.5     29
3.5     27
1.0     20
2.5     13
0.5      4
Name: Rating, dtype: int64

In [28]:
# Ratings were scraped as strings (the input's `value` attribute, e.g. '5.0'); make them numeric.
df['Rating'] = pd.to_numeric(df['Rating'])

In [29]:
# Summary statistics after the Rating column has been converted to numbers.
df.describe()

Unnamed: 0,Rating
count,1000.0
mean,4.0455
std,1.027246
min,0.5
25%,3.5
50%,4.0
75%,5.0
max,5.0


In [30]:
# Peek at the first rows after the Date and Rating conversions.
df.head()

Unnamed: 0,Name,Location,Rating,Date,Summary,Review
0,Mavoori m,Hyderabad,4.0,2020-03-19,Good card,I have applied Andhra Bank credit card directl...
1,R n s n,Mahabubnagar,4.0,2020-03-16,Best savings account,"For the past 8 months, i have been using salar..."
2,G hareesh,Medak,5.0,2020-03-14,Good bank,I have opened my savings account in Andhra Ban...
3,Ga,Vijayawada,5.0,2020-03-08,Need to provide more atm,"When I was working in previous company , they ..."
4,Vas,Vijayawada,5.0,2020-03-08,Good card,I have been using ANDHRA card for more than on...


In [37]:
# Number of reviews per reviewer location.
df['Location'].value_counts()

Hyderabad           532
Bangalore           103
Chennai              89
New delhi            26
Vijayawada           25
Visakhapatnam        22
Mumbai               19
Secunderabad         18
Gurgaon              10
Coimbatore           10
Kolkata              10
Kurnool               9
Pune                  9
Nellore               7
Tirupati              7
Lucknow               7
Jaipur                7
Bhubaneshwar          6
Khammam               6
Guntur                6
Vizianagaram          5
Rajahmundry           5
Kochi                 4
Ghaziabad             4
Warangal              4
Noida                 3
Tiruchirapalli        3
Faridabad             2
Madurai               2
Goa                   2
Chandigarh            2
Nagpur                2
Karimnagar            2
Jajpur                2
Kakinada              2
Anantapur             2
Eluru                 2
Medak                 2
Bhimavaram            1
Bhuvanagiri           1
Ramachandrapuram      1
Hosur           

In [36]:
# Five locations with the lowest mean rating.
# NOTE(review): locations with only a handful of reviews can dominate this list - consider a minimum review count.

mean_by_location = df.groupby('Location')['Rating'].mean().reset_index()
mean_by_location.sort_values('Rating').head(5)

Unnamed: 0,Location,Rating
26,Karimnagar,0.5
19,Hosur,2.0
16,Goa,2.5
0,Adilabad,3.0
31,Kota,3.0
