# Scrape Indian College Data

## Import packages
If you do not have Anaconda installed, you will have to install the following packages with your package manager. With pip, you can install them as follows: <br>
```
pip install BeautifulSoup4 pandas
```
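The BeautifulSoup4 package makes extracting information from HTML easier. Saving the results as `.xlsx` in the last step also requires an Excel writer such as `openpyxl` (`pip install openpyxl`).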

In [None]:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import pandas as pd

## Manually list all state urls
You'll have to use the following links one by one to scrape data.

In [None]:
# Listing pages on targetstudy.com, one per state / union territory.
# Copy one of these into `state_url` below to scrape that state.
state_urls = [
    "https://targetstudy.com/colleges/colleges-in-andhra-pradesh.html",
    "https://targetstudy.com/colleges/colleges-in-arunachal-pradesh.html",
    "https://targetstudy.com/colleges/colleges-in-assam.html",
    "https://targetstudy.com/colleges/colleges-in-bihar.html",
    "https://targetstudy.com/colleges/colleges-in-chandigarh.html",
    "https://targetstudy.com/colleges/colleges-in-chhattisgarh.html",
    "https://targetstudy.com/colleges/colleges-in-dadra-and-nagar-haveli.html",
    "https://targetstudy.com/colleges/colleges-in-daman-and-diu.html",
    "https://targetstudy.com/colleges/colleges-in-delhi.html",
    "https://targetstudy.com/colleges/colleges-in-goa.html",
    "https://targetstudy.com/colleges/colleges-in-gujarat.html",
    "https://targetstudy.com/colleges/colleges-in-haryana.html",
    "https://targetstudy.com/colleges/colleges-in-himachal-pradesh.html",
    "https://targetstudy.com/colleges/colleges-in-jammu-and-kashmir.html",
    "https://targetstudy.com/colleges/colleges-in-jharkhand.html",
    "https://targetstudy.com/colleges/colleges-in-karnataka.html",
    "https://targetstudy.com/colleges/colleges-in-kerala.html",
    # Lakshadweep is left out on purpose: its page doesn't have any college data.
    # "https://targetstudy.com/colleges/colleges-in-lakshadweep.html",
    "https://targetstudy.com/colleges/colleges-in-madhya-pradesh.html",
    "https://targetstudy.com/colleges/colleges-in-maharashtra.html",
    "https://targetstudy.com/colleges/colleges-in-manipur.html",
    "https://targetstudy.com/colleges/colleges-in-meghalaya.html",
    "https://targetstudy.com/colleges/colleges-in-mizoram.html",
    "https://targetstudy.com/colleges/colleges-in-nagaland.html",
    "https://targetstudy.com/colleges/colleges-in-orissa.html",
    "https://targetstudy.com/colleges/colleges-in-puducherry.html",
    "https://targetstudy.com/colleges/colleges-in-punjab.html",
    "https://targetstudy.com/colleges/colleges-in-rajasthan.html",
    "https://targetstudy.com/colleges/colleges-in-sikkim.html",
    "https://targetstudy.com/colleges/colleges-in-tamil-nadu.html",
    "https://targetstudy.com/colleges/colleges-in-tripura.html",
    "https://targetstudy.com/colleges/colleges-in-uttar-pradesh.html",
    "https://targetstudy.com/colleges/colleges-in-uttarakhand.html",
    "https://targetstudy.com/colleges/colleges-in-west-bengal.html",
]

## Request data from every listing page of a state
If we scrape all the college data in one go, the site crashes, so it is better to scrape one state at a time. 
1. Change *state_url* in the cell below to the link in *state_urls* for the state you wish to scrape. 
2. Get the number of listing pages for that state.
3. We'll send a request for every listing page, saving college names in *state_college_names* and college urls in *state_college_urls*.


In [None]:
# Listing page of the state to scrape; pick one entry from `state_urls`.
state_url = "https://targetstudy.com/colleges/colleges-in-chandigarh.html"


def _fetch_soup(url):
    """Download *url* with a browser-like User-Agent and return the parsed page."""
    request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the HTTP response as soon as the body has been read.
    with urlopen(request) as http_response:
        response = http_response.read()
    return BeautifulSoup(response, 'html')


def _extract_colleges(soup):
    """Return ``(names, urls)`` for the college links found on one listing page.

    Only ``<a class="heading1">`` tags whose markup contains
    'targetstudy.com/institute' are kept; names and urls are index-aligned.
    """
    links = [a for a in soup.find_all('a', {'class': ['heading1']})
             if 'targetstudy.com/institute' in str(a)]
    return [a.text for a in links], [a.get('href') for a in links]


# send request for first page
state_soup = _fetch_soup(state_url)
top_right = state_soup.find_all('div', {'class': ['top-small-heading-bar']})[0].div.strong.text.split(' ')

# get number of webpages for the state
# NOTE(review): the page count is taken from the last word (minus its final
# character) when the heading has more than four words, otherwise from the
# third word -- presumably two different heading layouts; confirm on the site.
if len(top_right) > 4:
    num_pages = int(top_right[-1][:-1])
else:
    num_pages = int(top_right[2])

print("Total number of pages in this state link: ", num_pages)
print("Getting page 1")

state_college_names = []  # list containing all college names for the state
state_college_urls = []  # list containing all individual college url for the state

page_names, page_urls = _extract_colleges(state_soup)
state_college_names += page_names
state_college_urls += page_urls

# send request for every remaining page (the range is empty for a single page)
for page in range(1, num_pages):
    print("Getting page ", page + 1)
    # Use `state_url` (not a hard-coded state) so changing it above affects
    # every page; `recNo` advances by 10 per listing page.
    state_soup = _fetch_soup(state_url + "?recNo=" + str(page * 10))
    page_names, page_urls = _extract_colleges(state_soup)
    state_college_names += page_names
    state_college_urls += page_urls

print("Urls of {} colleges collected.".format(len(state_college_urls)))

## Request data from every college url
Now, we will send a request to every college url, saving college addresses in *state_college_addresses* and college phone numbers in *state_college_phones*.

In [None]:
def get_phone_from_address(address):
    """Concatenate the text of the cells at positions 2 through 5 of *address*.

    *address* is a list of objects exposing a ``.text`` string (the caller
    passes the ``<td>`` cells of a college page's first table).  At most four
    cells are used; any cells beyond index 5 are ignored.

    Returns the joined text, or "N/A" when there are fewer than three cells.
    """
    phone_cells = address[2:6]
    if not phone_cells:
        return "N/A"
    return "".join(cell.text for cell in phone_cells)

In [None]:
# Visit every college page collected above and pull out its address and phone
# text.  Both lists are filled in the same order as `state_college_urls`.
state_college_addresses = []
state_college_phones = []

for i, college_url in enumerate(state_college_urls):

    r = Request(college_url, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the HTTP response as soon as the body has been read.
    with urlopen(r) as http_response:
        response = http_response.read()
    soup1 = BeautifulSoup(response, 'html')

    # Progress message once every ten colleges.
    if i % 10 == 0:
        print("Getting data of college ", i+1)

    # Cells of the first table on the page: index 1 is used as the address,
    # indices 2..5 are handed to get_phone_from_address().
    full_address = list(soup1.find_all('table')[0].find_all('td'))
    # \xa0 is a non-breaking space; normalise it to a plain space.
    address = full_address[1].text.replace(u'\xa0', u' ')
    phone = get_phone_from_address(full_address)
    phone = phone.replace(u'\xa0', u' ')

    state_college_addresses.append(address)
    state_college_phones.append(phone)
# Report what was actually collected (the original referenced an undefined name).
print("Data from {} colleges collected.".format(len(state_college_addresses)))

## Print collected data

In [None]:
# Summarise how many entries each scraped list holds; the three counts must
# match before the lists can be combined into one table.
collected_lists = (
    ("Total college names collected: ", state_college_names),
    ("Total college addresses collected: ", state_college_addresses),
    ("Total college phone numbers collected: ", state_college_phones),
)
for label, collected in collected_lists:
    print(label, len(collected))
print("Note: All above should be same. If not you'll have to reject the extras and make them all same to save.")

### Convert data to pandas dataframe and print first 5 rows 

In [None]:
# Build the result table; `columnsTitles` fixes the column order.
columnsTitles = ["college_name", "college_address", "college_phone"]
scraped_columns = {
    'college_name': state_college_names,
    'college_address': state_college_addresses,
    'college_phone': state_college_phones,
}
college_dataframe = pd.DataFrame(scraped_columns, columns=columnsTitles)

In [None]:
# Preview the first five rows of the scraped table.
college_dataframe.head(5)

## Save dataframe (REMEMBER TO CHANGE FILENAME)
Set *filename* to the name you want for the output files, without an extension; `.csv` and `.xlsx` are appended automatically.

In [None]:
# Base name of the output files (no extension); change it per state so an
# earlier export is not overwritten.
filename = "college_name"

# index=False (the documented boolean form) keeps the row index out of both
# files, so the CSV and the Excel sheet contain exactly the same columns.
college_dataframe.to_csv(filename + ".csv", sep=',', index=False)
college_dataframe.to_excel(filename + ".xlsx", index=False)
print("Files saved.")