-
Notifications
You must be signed in to change notification settings - Fork 0
/
yellowpagesnepal_college.py
101 lines (74 loc) · 2.7 KB
/
yellowpagesnepal_college.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import requests
import csv
import json
def removeDuplicates(listofElements):
    """Return a new list with duplicates removed, preserving first-seen order.

    Handles both hashable elements (the URL strings) and unhashable ones
    (the scraped ``dict`` records in ``college_data``): hashable items are
    tracked in a set for O(1) membership tests instead of the original
    O(n^2) list scan; unhashable items fall back to the linear scan.
    """
    uniqueLinks = []
    seen = set()
    for elem in listofElements:
        try:
            if elem in seen:
                continue
            seen.add(elem)
        except TypeError:
            # Unhashable element (e.g. dict) -- fall back to list membership.
            if elem in uniqueLinks:
                continue
        uniqueLinks.append(elem)
    return uniqueLinks
# Entry point of the crawl: the college category index on yellowpagesnepal.com.
page = "http://yellowpagesnepal.com/index.php?cat=196"
req = Request(page)
html_page = urlopen(req)
soup = BeautifulSoup(html_page, "lxml")

# The index page itself plus every pagination link (anchors with class "paging").
page_no = [page]
college_data = []
for pager in soup.find_all(True, {"class": ["paging"]}):
    page_no.append('http://yellowpagesnepal.com/%s' % pager.get('href'))
page_no = removeDuplicates(page_no)

# Visit each results page and gather the individual listing links
# (anchors with class "category" inside the "box-content2" container).
category_links = []
for url in page_no:
    response = requests.get(url)
    listing_soup = BeautifulSoup(response.content, 'lxml')
    box = listing_soup.find("div", {"class": "box-content2"})
    for cat in box.find_all(True, {"class": ["category"]}):
        category_links.append('http://yellowpagesnepal.com/%s' % cat.get('href'))
category_links = removeDuplicates(category_links)
college_data = []

def strip_data(label_data):
    """Extract the text of each bs4 tag in *label_data*, removing every
    colon, newline and space character, and return the results as a list."""
    cleaned = []
    for node in label_data:
        text = node.text
        for unwanted in (":", "\n", " "):
            text = text.replace(unwanted, "")
        cleaned.append(text)
    return cleaned
# Scrape every listing page: the <h1> is the college name and the paired
# labelLeft / labelRight divs hold the field-name / field-value columns.
for url in category_links:
    response = requests.get(url)
    print(url)  # progress indicator
    if response.ok:
        soup = BeautifulSoup(response.content, "html.parser")
        # select_one returns None instead of raising IndexError when a
        # listing page has no <h1> at all (the original crashed here).
        heading = soup.select_one("h1")
        name = heading.text.strip() if heading is not None else ""
        data = {'Name': name}
        label_left = soup.findAll('div', class_="labelLeft")
        if label_left:
            # NOTE(review): the original drops the last labelLeft div,
            # presumably non-field trailing markup -- confirm against the site.
            label_left.pop()
        key = strip_data(label_left)
        label_right = soup.findAll('div', class_="labelRight")
        value = strip_data(label_right)
        print(key, value)
        data.update(dict(zip(key, value)))
        # Drop the free-text description unconditionally so rows match the
        # CSV columns (the original kept the key when its value was falsy).
        data.pop('Description', None)
        college_data.append(data)
college_data = removeDuplicates(college_data)
print(college_data)
# Persist the scraped records as CSV.  newline='' prevents the blank rows the
# csv module otherwise produces on Windows; extrasaction='ignore' silently
# skips any scraped field not in the column list (DictWriter raises
# ValueError on unexpected keys by default); restval='' fills missing fields.
with open('yellowpagesnepal_college.csv', 'w', newline='', encoding='utf-8') as csv_file:
    fieldname = ['Name', 'Address', 'P.O.Box', 'Phone', 'Mobile', 'Email', 'Webpage', 'Fax', 'Country']
    scl = csv.DictWriter(csv_file, fieldnames=fieldname, restval='', extrasaction='ignore')
    scl.writeheader()
    scl.writerows(college_data)
# Mirror the records as JSON.  Explicit UTF-8 plus ensure_ascii=False keeps
# any non-ASCII (e.g. Nepali) characters readable in the output file.
with open('yellowpagesnepal_college.json', 'w', encoding='utf-8') as json_file:
    json.dump(college_data, json_file, ensure_ascii=False, indent=2)