# crawling.py
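"""A simple breadth-first web crawler.

Downloads pages (optionally through an authenticated HTTP proxy),
strips scripts and styles from the HTML, saves each page's visible
text under the crawling/ folder and the list of visited URLs under
ind_url/urls.txt.
"""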
import queue
import socket
import requests
import os
import re
from bs4 import BeautifulSoup, Comment, SoupStrainer
# Proxy credentials; replace these placeholders with your own.
# Values must be URL-encoded ('%40' stands for '@').
username = r'user%40example.com'
password = r'secret'
host = "10.6.100.71"
port = 3128
# Both entries use the http:// scheme: the proxy itself speaks plain
# HTTP and tunnels HTTPS requests through CONNECT.
proxies = {
    'http': f'http://{username}:{password}@{host}:{port}',
    'https': f'http://{username}:{password}@{host}:{port}'
}
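# Quick connectivity check (a sketch; httpbin.org is just a public echo
# service, not something this script depends on):
# requests.get('https://httpbin.org/ip', proxies=proxies, timeout=10)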
def download(url, use_proxy):
    # Fetch a page, going through the authenticated proxy when asked.
    return requests.get(url, proxies=proxies if use_proxy else None, timeout=10).text
def get_links(page, url):
    # Extract absolute http(s) URLs that appear quoted in the raw HTML.
    # Relative links are not resolved here; see the sketch below.
    return re.findall(r'"(https?://.*?)"', page)
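# A more robust alternative (a sketch, not used by crawler() below; the
# function name is ours): parse only the <a> tags with SoupStrainer and
# resolve relative hrefs against the page URL.
from urllib.parse import urljoin

def get_links_from_anchors(page, url):
    anchors = SoupStrainer("a", href=True)
    soup = BeautifulSoup(page, "html.parser", parse_only=anchors)
    return [urljoin(url, a["href"]) for a in soup.find_all("a", href=True)]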
def get_text_from_html(data):
    soup = BeautifulSoup(data, "html5lib")
    # Drop scripts and styles so code (and any /* ... */ comments inside
    # them) never leaks into the extracted text.
    for tag in soup(["script", "style"]):
        tag.decompose()
    # Drop HTML comments as well.
    for comment in soup.find_all(string=lambda s: isinstance(s, Comment)):
        comment.extract()
    return soup.get_text(strip=True, separator=' ')
def get_ip_from_host(url_host):
    # gethostbyname_ex returns (hostname, aliases, ip_list).
    try:
        _, _, ips = socket.gethostbyname_ex(url_host)
    except socket.gaierror:
        ips = []
    return ips[0] if ips else None
def create_name(folder):
    # Name saved documents 0.txt, 1.txt, ... after the number of files
    # already in the folder.
    return os.path.join(folder, str(len(os.listdir(folder))) + ".txt")

def create_url_name(folder):
    return os.path.join(folder, "urls.txt")
def save_doc(url, text):
    os.makedirs('crawling', exist_ok=True)
    with open(create_name('crawling'), 'w', encoding='utf8', errors='ignore') as f:
        f.write(text)
def save_indexer(links):
    directory = os.path.join(os.getcwd(), 'ind_url')
    os.makedirs(directory, exist_ok=True)
    with open(create_url_name('ind_url'), 'w', encoding='utf8', errors='ignore') as f:
        for link in links:
            f.write(link + "\n")
def real_web_name(url: str):
    # Skip URLs that point at static assets rather than pages.
    return not any(ext in url for ext in ('.ico', '.gif', '.jpg', '.js', '.png', '.css'))
def crawler(seed_urls, depth, use_proxy=False):
    # Breadth-first crawl, following links up to `depth` hops from the seeds.
    visited = []
    seen = set(seed_urls)
    q = queue.Queue()
    for url in seed_urls:
        q.put((url, 0))
    while not q.empty():
        url, d = q.get()
        visited.append(url)
        print(url)
        try:
            html = download(url, use_proxy)
        except requests.RequestException:
            continue  # skip pages that fail to download
        save_doc(url, get_text_from_html(html))
        if d >= depth:
            continue
        for link in get_links(html, url):
            if real_web_name(link) and link not in seen:
                seen.add(link)
                q.put((link, d + 1))
    save_indexer(visited)
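# Example invocation (a sketch; the seed URL and depth are illustrative,
# not from the source). Depth 1 fetches the seeds plus the pages they
# link to directly.
if __name__ == "__main__":
    crawler(["https://example.com"], depth=1, use_proxy=False)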