-
Notifications
You must be signed in to change notification settings - Fork 0
/
[PA 4] Robots.cs
130 lines (117 loc) · 4.34 KB
/
[PA 4] Robots.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Xml;
namespace WorkerRole1
{
    /// <summary>
    /// Downloads and parses a site's robots.txt: collects the Disallow rules
    /// listed under "User-agent: *" and expands any advertised sitemaps
    /// (including nested sitemap indexes) into a set of crawlable links.
    /// </summary>
    public class Robots
    {
        private string content;                 // raw robots.txt text fetched in the constructor
        private DateTime excludeBeforeDatetime; // sitemap entries last modified before this are skipped (default: DateTime.MinValue = keep all)
        private string excludeUrlsWithoutToken; // URLs must contain this token; "" accepts everything
        private HashSet<string> disallowed = new HashSet<string>();
        private HashSet<string> links = new HashSet<string>();

        /// <summary>
        /// Fetches {authority}/robots.txt and records every Disallow path in the
        /// "User-agent: *" section as an absolute URL.
        /// </summary>
        /// <param name="authority">Scheme + host, e.g. "http://example.com" (no trailing slash) — TODO confirm against callers.</param>
        public Robots(string authority)
        {
            string robotsTXTURL = authority + "/robots.txt";
            // WebClient is IDisposable — dispose it rather than leaking the instance.
            using (WebClient client = new WebClient())
            {
                content = client.DownloadString(robotsTXTURL);
            }
            int index = content.IndexOf("User-agent: *");
            if (index >= 0)
            {
                string disallowedContent = content.Substring(index);
                // Dash placed first so it is a literal. The original class "[/-_.A-Za-z0-9]"
                // made "/-_" a character RANGE (0x2F-0x5F), unintentionally matching
                // ":;<=>?@[\]^" as well; this matches only "/", "-", "_", ".", letters, digits.
                foreach (Match match in Regex.Matches(disallowedContent, @"Disallow:\s*([-/_.A-Za-z0-9]+)"))
                {
                    disallowed.Add(Urls.BuildAbsUrl(robotsTXTURL, match.Groups[1].Value));
                }
            }
            excludeUrlsWithoutToken = "";
        }

        /// <summary>Skip sitemap entries whose lastmod is earlier than <paramref name="datetime"/>.</summary>
        public void excludeEntriesBefore(DateTime datetime)
        {
            excludeBeforeDatetime = datetime;
        }

        /// <summary>Only keep URLs that contain <paramref name="token"/> as a substring.</summary>
        public void excludeUrlsWithout(string token)
        {
            excludeUrlsWithoutToken = token;
        }

        /// <summary>Absolute URLs the site disallows for all user agents.</summary>
        public HashSet<string> getDisallowed()
        {
            return disallowed;
        }

        /// <summary>Links gathered from the sitemaps; populated by <see cref="parseSitemaps"/>.</summary>
        public HashSet<string> getLinks()
        {
            return links;
        }

        /// <summary>
        /// Recursively parses one sitemap URL. A &lt;sitemapindex&gt; document recurses
        /// into each child sitemap; a &lt;urlset&gt; document yields its &lt;loc&gt; entries.
        /// Entries older than excludeBeforeDatetime or missing the required token are skipped.
        /// Network and XML errors are treated as "no URLs" (best effort).
        /// </summary>
        private HashSet<string> parseSitemap(string url)
        {
            HashSet<string> urls = new HashSet<string>();
            try
            {
                // Use a local rather than clobbering the 'content' field: the field still
                // holds the robots.txt text that parseSitemaps() iterates over.
                string sitemapXml;
                using (WebClient client = new WebClient())
                {
                    sitemapXml = client.DownloadString(url);
                }
                XmlDocument sitemap = new XmlDocument();
                sitemap.LoadXml(sitemapXml);
                XmlNodeList urlNodes = sitemap.GetElementsByTagName("sitemapindex");
                if (urlNodes.Count > 0)
                {
                    // Sitemap index: each child points at another sitemap document.
                    foreach (XmlNode node in urlNodes[0].ChildNodes)
                    {
                        if (node["lastmod"] != null)
                        {
                            DateTime entryLastModified;
                            if (DateTime.TryParse(node["lastmod"].InnerText, out entryLastModified))
                            {
                                if (entryLastModified.CompareTo(excludeBeforeDatetime) < 0)
                                {
                                    continue; // too old — skip this child sitemap entirely
                                }
                            }
                        }
                        if (node["loc"] != null)
                        {
                            string crawlUrl = node["loc"].InnerText;
                            // "" token matches everything, so filtering is opt-in.
                            if (crawlUrl.Contains(excludeUrlsWithoutToken))
                            {
                                urls.UnionWith(parseSitemap(crawlUrl));
                            }
                        }
                    }
                }
                else
                {
                    urlNodes = sitemap.GetElementsByTagName("urlset");
                    if (urlNodes.Count > 0)
                    {
                        // Leaf sitemap: collect the page URLs directly.
                        foreach (XmlNode node in urlNodes[0].ChildNodes)
                        {
                            if (node["loc"] != null)
                            {
                                string crawlUrl = node["loc"].InnerText;
                                if (crawlUrl.Contains(excludeUrlsWithoutToken))
                                {
                                    urls.Add(crawlUrl);
                                }
                            }
                        }
                    }
                }
            }
            catch (WebException) { /* 404 or Network Error */ }
            catch (XmlException) { /* Download Incomplete */ }
            return urls;
        }

        /// <summary>
        /// Finds every "Sitemap:" directive in the robots.txt fetched by the
        /// constructor and merges all URLs found in those sitemaps into the link set.
        /// </summary>
        public void parseSitemaps()
        {
            foreach (Match match in Regex.Matches(content, @"Sitemap:\s*([/_.:A-Za-z0-9-]+)"))
            {
                // (removed an unused local that shadowed this value)
                links.UnionWith(parseSitemap(match.Groups[1].Value));
            }
        }
    }
}