# ypnepal_school.py
# Scrapes the "Schools & Higher Secondary Schools" category of ypnepal.com and
# writes the collected listings to ypnepal_school.csv and ypnepal_school.json.
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import requests
import csv
import json
def removeDuplicates(listofElements):
    """Return a new list with duplicates removed, keeping first-seen order.

    Args:
        listofElements: Iterable of elements to de-duplicate. The input is
            not modified.

    Returns:
        A list containing each distinct element once, in the order in which
        it first appeared in ``listofElements``.
    """
    try:
        # dict keys keep insertion order, so this is an O(n) ordered dedupe.
        return list(dict.fromkeys(listofElements))
    except TypeError:
        # Unhashable elements (e.g. lists) cannot be dict keys; fall back to
        # the equality-based scan so such inputs keep working.
        uniqueLinks = []
        for elem in listofElements:
            if elem not in uniqueLinks:
                uniqueLinks.append(elem)
        return uniqueLinks
# --- Step 1: find every result page of the schools category -----------------
page = ("http://www.ypnepal.com/index.php?cat=530&Schools-&-Higher-Secondary-Schools")
m_page = page
req = Request(m_page)
# Close the HTTP response as soon as the document has been parsed.
with urlopen(req) as html_page:
    soup = BeautifulSoup(html_page, "lxml")

# The start page is itself the first result page; the elements carrying the
# "paging" class supply the links to the remaining ones.
page_no = [page]
for paging_tag in soup.find_all(True, {"class": ["paging"]}):
    href = paging_tag.get('href')
    # Elements without an href would otherwise yield ".../None".
    if href:
        page_no.append('http://www.ypnepal.com/%s' % href)
page_no = removeDuplicates(page_no)

# --- Step 2: collect the links carried by "category" elements on each page --
category_links = []
for url in page_no:
    print(url)
    # A timeout keeps one unresponsive request from hanging the whole run.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'lxml')
    for cat in soup.find_all(True, {"class": ["category"]}):
        href = cat.get('href')
        if href:
            category_links.append('http://www.ypnepal.com/%s' % href)

# Accumulates one dict per scraped listing.
college_data = []
def strip_data(label_data):
    """Return the cleaned text of each element in ``label_data``.

    Every colon, newline and space character is removed from the element's
    ``.text``. Because the colon is removed, a leading ``http://`` or
    ``https://`` turns into ``http//`` / ``https//``; those remnants are
    removed as well, so URLs are reduced to their host/path part.

    Args:
        label_data: Iterable of objects exposing a ``.text`` string attribute
            (the callers in this file pass BeautifulSoup result lists).

    Returns:
        A list of cleaned strings, one per input element, in input order.
    """
    # A single translate pass replaces three chained single-char replace calls.
    drop_chars = str.maketrans("", "", ":\n ")
    lab = []
    for label in label_data:
        label_text = label.text.translate(drop_chars)
        # Remove the scheme remnant left behind once ':' has been dropped.
        for scheme_remnant in ("https//", "http//"):
            label_text = label_text.replace(scheme_remnant, "")
        lab.append(label_text)
    return lab
# --- Step 3: scrape the detail page behind every collected link -------------
for url in category_links:
    try:
        # A timeout keeps one unresponsive request from hanging the whole run.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Skip unreachable pages, the same way non-OK responses are skipped.
        print("skipping %s: %s" % (url, exc))
        continue
    print(url)
    if not response.ok:
        continue

    soup = BeautifulSoup(response.content, "html.parser")
    heading = soup.find("h1")
    if heading is None:
        # No <h1> means there is no name to record for this page.
        print("skipping %s: no <h1> heading found" % url)
        continue
    data = {'Name': heading.text.strip()}

    # "labelLeft" divs supply the field names, "labelRight" divs the values.
    key = strip_data(soup.find_all('div', class_="labelLeft"))
    label_right = soup.find_all('div', class_="labelRight")
    if label_right:
        # The last "labelRight" element is discarded before pairing.
        # NOTE(review): presumably it has no matching "labelLeft" - confirm.
        label_right.pop()
    value = strip_data(label_right)
    data.update(dict(zip(key, value)))

    # These two fields are not exported; drop them even when they are empty.
    data.pop('Description', None)
    data.pop('ListingName', None)
    college_data.append(data)

print(college_data)

# newline='' is what the csv module expects for files it writes; without it
# extra blank rows appear on platforms that translate line endings.
with open('ypnepal_school.csv', 'w', newline='', encoding='utf-8') as csv_file:
    fieldname = ['Name', 'Address', 'City', 'Country', 'Phone', 'P.O.Box',
                 'Email', 'Mobile', 'Webpage', 'Fax', 'Updatedon']
    # extrasaction='ignore': a label outside `fieldname` must not abort the
    # export with ValueError; such fields are still kept in the JSON output.
    scl = csv.DictWriter(csv_file, fieldnames=fieldname, extrasaction='ignore')
    scl.writeheader()
    scl.writerows(college_data)

with open('ypnepal_school.json', 'w', encoding='utf-8') as json_file:
    json.dump(college_data, json_file)