scrape.py
#!/usr/bin/env python3
import sys

# Bail out early under Python 2, before any Python-3-only imports run.
if sys.version_info[0] < 3:
    print("You must use python3 to run this script")
    sys.exit(1)

import configparser
import http.cookiejar
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup


def remove_non_ascii(s):
    """Strip any non-ASCII characters from a string."""
    return "".join(c for c in s if ord(c) < 128)

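# Expected config.ini layout, reconstructed from the keys read below
# (the values shown are placeholders):
#   [User]
#   id = your-username
#   password = your-password
#   [Website]
#   rootUrl = https://dhcp.example.com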
config = configparser.ConfigParser()
config.read('config.ini')
username = config['User']['id']
password = config['User']['password']
rootUrl = config['Website']['rootUrl']
# Store cookies and create an opener to hold them
cj = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
# Add our headers
opener.addheaders = [('User-agent', 'WebScrape')]
# Install the opener (replaces the global opener used by urlopen)
urllib.request.install_opener(opener)
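# From here on, plain urllib.request.urlopen() calls go through this opener,
# so the session cookies are sent and stored automatically.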

# The action/target from the login form
authentication_url = rootUrl + '/login.jsp'
payload = {
    'name': username,
    'pass': password,
    'cmd': 'Login'
}
# URL-encode the payload
data = urllib.parse.urlencode(payload)
# Build the request object (supplying 'data' makes it a POST)
req = urllib.request.Request(authentication_url, data.encode('utf-8'))
# Make the request and read the response
resp = urllib.request.urlopen(req)
contents = resp.read()
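# The login response's session cookie is now held in the cookie jar, so the
# requests below should run as the authenticated user.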

# Load the scope list page once before changing its page size
dhcp_url = rootUrl + '/dhcp-admin/ListCNRScopes.jsp'
req = urllib.request.Request(dhcp_url)
urllib.request.urlopen(req)
# Ask the scope list for up to 1000 rows so everything fits on one page
payload = {
    'searchValue': '',
    'pageSize': '1000',
    'Change Page Size': 'Change Page Size',
    'pageName': 'cnrscope-cursor_0_sort-by-name'
}
dhcp_url = rootUrl + '/dhcp-admin/ListCNRScopes.jsp?__vPage=dhcp-cnrscope-list&highlightValue=AP_611'
data = urllib.parse.urlencode(payload)
# Build the request object (supplying 'data' makes it a POST)
req = urllib.request.Request(dhcp_url, data.encode('utf-8'))
# Make the request and read the response
resp = urllib.request.urlopen(req)
contents = resp.read()
# Parse the scope list (naming the parser avoids a bs4 warning)
parsed_html = BeautifulSoup(contents, 'html.parser')
print('\n')
links = []
for node in parsed_html.find_all('a'):
    text = node.get_text()
    # The first two letters of each scope name pick out the subnet links we
    # want and skip the page's other links; adjust these prefixes to match
    # your own scope names.
    if len(text) > 6 and text.startswith(('Su', 'AU', 'ME', 'TW')):
        links.append(node['href'])
with open('report.csv', 'w') as f:
    for link in links:
        # Load the scope's detail page before asking for its leases
        subnet_url = rootUrl + link
        print(subnet_url)
        req = urllib.request.Request(subnet_url)
        urllib.request.urlopen(req)
        # The scope OID rides in the last 23 characters of the link
        scope_oid = link[-23:]
        leases_url = (rootUrl + '/dhcp-admin/ListCNRLeasesForScope.jsp'
                      '?__vPage=dhcp-cnrscope-lease-list&cnrScopeOID=OID-'
                      + scope_oid + '&refreshList=true')
        print(leases_url + '\n')
        req = urllib.request.Request(leases_url)
        urllib.request.urlopen(req)
        # Ask the lease list for up to 1000 rows so one request covers the scope
        leases_url = rootUrl + '/dhcp-admin/ListCNRLeasesForScope.jsp?__vPage=dhcp-cnrscope-lease-list'
        payload = {
            'searchValue': '',
            'pageSize': '1000',
            'Change Page Size': 'Change Page Size',
            'pageName': 'cnrlease-for-scope-by-address-cursor'
        }
        data = urllib.parse.urlencode(payload)
        req = urllib.request.Request(leases_url, data.encode('utf-8'))
        # Make the request and read the response
        resp = urllib.request.urlopen(req)
        contents = resp.read()
        parsed_html = BeautifulSoup(contents, 'html.parser')
        # Lease rows carry either the 'listA' or 'listB' row style; matching
        # both classes in a single pass keeps the rows in page order.
        for node in parsed_html.find_all(attrs={'class': ['listA', 'listB']}):
            cells = node.find_all('td')
            # Only keep rows whose fifth cell actually holds a value
            if len(cells[4].get_text()) > 4:
                row = remove_non_ascii(cells[1].get_text() + ',' + cells[4].get_text() + ','
                                       + cells[3].get_text() + ',' + cells[2].get_text() + '\n')
                f.write(row)
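
# Usage sketch (assumption: config.ini sits next to the script and the
# account in it can reach the dhcp-admin pages):
#   $ python3 scrape.py
# The scope and lease URLs are printed as they are visited, and the lease
# rows land in report.csv as comma-separated values.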