# QUESTION 5
Focused crawler
Description: Crawling the documents: 
A. Start with the following seed URL from Wikipedia: 
https://en.wikipedia.org/wiki/Tropical_cyclone. 


B. Your crawler has to respect the politeness policy by using a delay of at least one second 
between your HTTP requests.


C. Your crawler must assume that the earlier the hyperlink appears in a page, the more important 
it is (and hence must be crawled first) and that shallower depths are more important than deeper 
pages.


D. Follow the links with the prefix https://en.wikipedia.org/wiki that lead to articles only (avoid 
administrative links containing :) Also, make sure to properly treat URLs with # which basically 
denotes a section within the (same) page and not a different one. Non-English articles, external 
links, main Wikipedia page, navigations and marginal/side links must not be followed. You may 
ignore formulas, images, and non-textual media.


E. Crawl to depth 6. The seed page is the first URL in your frontier and thus counts for depth 1. 


F. Stop once you’ve crawled 100-500 unique URLs. Keep a list of these URLs in a text file. You 
should handle redirected pages to avoid duplicates

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import time
import os

def spider(seed_url):
	main_page = 'https://en.wikipedia.org/wiki/Main_Page'
	crawled_count = 0
	max_depth = 1
	frontier_urls = [seed_url]
	seen_urls = []
	newpath = r'Logs' 
	if not os.path.exists(newpath):
		os.makedirs(newpath)
	crawler_log = open("Logs/crawler_log.txt","w")
	crawler_log.write("Seed : "+seed_url+"\n\n")
	crawler_log.write("Depth 1 :\n\n")
	crawled_count+=1
	crawler_log.write(str(crawled_count)+") "+seed_url+"\n\n")
	flag = True
	print("\n----------------------------------------- At depth 1--------------------------------------------------------")
	print(str(crawled_count)+") "+seed_url)
	source_code = requests.get(seed_url)
	plain_text = source_code.text
	soup = BeautifulSoup(plain_text,"html.parser")
	for depth in range (2, 7):
		if flag:
			print("\n----------------------------------------- At depth "+str(depth)+"--------------------------------------------------------")
			crawler_log.write("Depth "+str(depth)+" :\n\n")
			extracted_urls = []
			for frontier_url in frontier_urls:
				if flag:
					source_code = requests.get(frontier_url)
					plain_text = source_code.text
					soup = BeautifulSoup(plain_text,"html.parser")
					for link in soup.find_all('a', href=re.compile('^/wiki/')):
						#crawling only 250 url links
						if crawled_count < 250 and flag:
							url_text = link.text
							href_url = link.get('href')
							if ':' not in href_url:
								if '#' not in href_url:
									url = 'https://en.wikipedia.org'+href_url
									if url not in frontier_urls and url not in extracted_urls and url not in seen_urls and url != main_page:
										time.sleep(1)
										source_code = requests.get(url)
										plain_text = source_code.text
										soup = BeautifulSoup(plain_text,"html.parser")
										
										extracted_urls.append(url)
										crawled_count+=1
										crawler_log.write(str(crawled_count)+") "+url+"\n")
										print(str(crawled_count)+") "+url)

								else:
									# Handle URLs with '#'
									hash_pos = href_url.index('#')

									# Trim the URL from the start till index before '#'
									url = 'https://en.wikipedia.org'+href_url[:hash_pos]

									# URL should not be in either of Frontier, Extracted or Seen lists and should not be Wiki Main Page too
									if url not in frontier_urls and url not in extracted_urls and url not in seen_urls and url != main_page:
										
										# Respecting the Politeness Policy
										time.sleep(1)

										# get the soup
										source_code = requests.get(url)
										plain_text = source_code.text
										soup = BeautifulSoup(plain_text,"html.parser")
										extracted_urls.append(url)
										crawled_count+=1
										crawler_log.write(str(crawled_count)+") "+url+"\n")
										print(str(crawled_count)+") "+url)


						else:
							flag = False
							print("Limit of 250Ls reached")
							max_depth = depth
							break
					seen_urls.append(frontier_url)
			if len(extracted_urls) == 0:
				print("No matching URLs at Depth "+str(depth)+"\n")
				crawler_log.write("No matching URLs at Depth "+str(depth)+"\n\n")
				flag = False
				max_depth = depth
				break
			frontier_urls = extracted_urls
			crawler_log.write("\n")

	# Maximum depth of Depth 6 reached		
	if flag:
		print("Searched till max depth 6")
		max_depth = 6

	crawler_log.write("------------------------------------------------------------------------------------\n")
	crawler_log.write("Logistics :\n\n")
	crawler_log.write("Number of matching searches : "+str(crawled_count)+"\n")
	crawler_log.write("Maximum depth reached : Depth "+str(max_depth)+"\n")

seed_url = 'https://en.wikipedia.org/wiki/Tropical_cyclone'
spider(seed_url)


----------------------------------------- At depth 1--------------------------------------------------------
1) https://en.wikipedia.org/wiki/Tropical_cyclone

----------------------------------------- At depth 2--------------------------------------------------------
2) https://en.wikipedia.org/wiki/Hurricane_(disambiguation)
3) https://en.wikipedia.org/wiki/Hurricane_No._1
4) https://en.wikipedia.org/wiki/Tropical_Depression_(band)
5) https://en.wikipedia.org/wiki/Typhoon
6) https://en.wikipedia.org/wiki/Hurricane_Isabel
7) https://en.wikipedia.org/wiki/International_Space_Station
8) https://en.wikipedia.org/wiki/Eye_(cyclone)
9) https://en.wikipedia.org/wiki/Rainband
10) https://en.wikipedia.org/wiki/Cyclones
11) https://en.wikipedia.org/wiki/Weather
12) https://en.wikipedia.org/wiki/Season
13) https://en.wikipedia.org/wiki/Winter
14) https://en.wikipedia.org/wiki/Spring_(season)
15) https://en.wikipedia.org/wiki/Summer
16) https://en.wikipedia.org/wiki/Autumn
17) https://en.wikipe

157) https://en.wikipedia.org/wiki/List_of_retired_Philippine_typhoon_names
158) https://en.wikipedia.org/wiki/List_of_retired_Australian_region_cyclone_names
159) https://en.wikipedia.org/wiki/List_of_retired_South_Pacific_cyclone_names
160) https://en.wikipedia.org/wiki/Outline_of_tropical_cyclones
161) https://en.wikipedia.org/wiki/Synoptic_scale_meteorology
162) https://en.wikipedia.org/wiki/Subtropics
163) https://en.wikipedia.org/wiki/Cyclonic_rotation
164) https://en.wikipedia.org/wiki/2014%E2%80%9315_South-West_Indian_Ocean_cyclone_season
165) https://en.wikipedia.org/wiki/Convective_storm
166) https://en.wikipedia.org/wiki/Altitude
167) https://en.wikipedia.org/wiki/Rainbands
168) https://en.wikipedia.org/wiki/Angular_momentum
169) https://en.wikipedia.org/wiki/Sea_surface_temperature
170) https://en.wikipedia.org/wiki/Cyclonic_Ni%C3%B1o
171) https://en.wikipedia.org/wiki/Vorticity
172) https://en.wikipedia.org/wiki/Advection
173) https://en.wikipedia.org/wiki/Pressure_altitud