Skip to content
Permalink
Browse files

Remove redundant subroutines

  • Loading branch information
heapsmash committed Oct 13, 2019
1 parent 1c17a10 commit 224a70b196fb55970ff6d58403def6b6ae4c0185
Showing with 53 additions and 89 deletions.
  1. +53 −89 web-crawler.c
@@ -1,10 +1,11 @@
#pragma clang diagnostic push
#pragma ide diagnostic ignored "hicpp-signed-bitwise"
/* Copyright (c) 2019, Tofu von Helmholtz.

/* Copyright (c) 2019, Michael S. Walker <sigmatau@heapsmash.com>
* All rights reserved.
*
* Usage: gcc print-links.c -o print-links
* ./print-links -h 127.0.0.1
* ./print-links -h 127.0.0.1 -p port
*/

#include <stdio.h>
@@ -45,16 +46,14 @@ typedef struct _Connection {
int port_numeric;
int sck;
HttpContent http_data;
struct timeval time;
} Connection;

int Request(Connection *connection, size_t max_read_sz, size_t max_req_sz, const char *request_string, ...);
int GetHeader(Connection *connection, bool close_after_request);
int GetBody(Connection *connection, bool close_after_request);
int SendGetRequest(Connection *con, const char *request, ...);
int ReceiveReply(Connection *con);

int SetupConnection(const char *host, const void *port);
int EstablishConnection(const char *host, const void *port);
int Write(void *request, int sck, size_t n);
int Read(void *usrbuf, int sck, size_t n);
int Connect(Connection *connection);

void FindHyperLinks(char *html, char **remote_links, char **local_links, size_t *n_remote, size_t *n_local);
void CrawlHosts(Connection *con);
@@ -65,12 +64,14 @@ int main(int argc, char *argv[])
{
int opt;
Connection connection;
connection.time.tv_sec = 2;
connection.time.tv_usec = 0;

if (argc != 5) {
PRINT_ERROR_AND_RETURN("Usage: %s -h host -p port\n", argv[0]);
}

while ((opt = getopt(argc, argv, "h:p:")) != -1) {
while ((opt = getopt(argc, argv, "h:p:t:")) != -1) {
switch (opt) { // NOLINT(hicpp-multiway-paths-covered)
case 'h':
connection.raw_host = optarg;
@@ -82,6 +83,8 @@ int main(int argc, char *argv[])
PRINT_ERROR_AND_RETURN("(Usage: %s -h host -p port)\n", argv[0]);
}
break;
case 't':
break;
default:
PRINT_ERROR_AND_RETURN("(Usage: %s -h host -p port)\n", argv[0]);
}
@@ -97,18 +100,27 @@ void CrawlHosts(Connection *con)
char *external_links[MAX_LINK_LEN], *local_links[MAX_LINK_LEN];
size_t n_external_links, n_local_links, i;

if ((GetHeader(con, true) < 0) && (
strstr(con->http_data.received_data, "HTTP/1.1 302") || strstr(con->http_data.received_data, "HTTP/1.1 301")))
if (con->raw_host == NULL)
return;

if (SendGetRequest(con, REQ_GET_HEADER_CLOSE) < 0)
con->raw_host = NULL;
else if (ReceiveReply(con) < 0 &&
(strstr(con->http_data.received_data, "HTTP/1.1 302") || strstr(con->http_data.received_data, "HTTP/1.1 301"))) {
con->raw_host = GetNewLocation(con->http_data.received_data);
CrawlHosts(con);
}

if (con->raw_host != NULL) {
GetBody(con, true);
SendGetRequest(con, REQ_GET_BODY_CLOSE);
ReceiveReply(con);
FindHyperLinks(con->http_data.received_data, external_links, local_links, &n_external_links, &n_local_links);

puts("========external links===========");
for (i = 0; i < n_external_links; i++) {
puts(external_links[i]);
}

puts("\n========internal links===========");
for (i = 0; i < n_local_links; i++) {
puts(local_links[i]);
@@ -159,77 +171,46 @@ void FindHyperLinks(char *html, char **remote_links, char **local_links, size_t
}


int Connect(Connection *connection)
int SendGetRequest(Connection *con, const char *request, ...)
{
connection->sck = SetupConnection(connection->raw_host, connection->port_string);
return connection->sck;
}


int GetHeader(Connection *connection, bool close_after_request)
{
int rv;

if (Connect(connection) < 0) {
connection->raw_host = NULL;
return -1;
}

memset(&connection->http_data.get_http_request, '\0', MAX_REQUEST_SIZE - 1);

if (close_after_request)
rv = Request(connection, MAX_READ_SIZE, MAX_REQUEST_SIZE, REQ_GET_HEADER_CLOSE);
else
rv = Request(connection, MAX_READ_SIZE, MAX_REQUEST_SIZE, REQ_GET_HEADER_DONT_CLOSE, connection->raw_host,
connection->port_numeric);
if (rv < 0)
connection->raw_host = NULL;
return rv;
}
va_list ap;

memset(&con->http_data.get_http_request, '\0', MAX_REQUEST_SIZE - 1);
memset(&con->http_data.received_data, '\0', MAX_REQUEST_SIZE - 1);

int GetBody(Connection *connection, bool close_after_request)
{
if (Connect(connection) < 0)
con->sck = EstablishConnection(con->raw_host, con->port_string);
if (con->sck < 0)
return -1;

memset(&connection->http_data.get_http_request, '\0', MAX_REQUEST_SIZE - 1);

if (close_after_request)
return Request(connection, MAX_READ_SIZE, MAX_REQUEST_SIZE, REQ_GET_BODY_CLOSE);
else
return Request(connection, MAX_READ_SIZE, MAX_REQUEST_SIZE, REQ_GET_HEADER_BODY_DONT_CLOSE, connection->raw_host,
connection->port_numeric);
}


int Request(Connection *connection, size_t max_read_sz, size_t max_req_sz, const char *request_string, ...)
{
va_list ap;

memset(&connection->http_data.received_data, '\0', MAX_READ_SIZE - 1);
va_start(ap, request_string); /* Build request string */
if ((
vsnprintf(connection->http_data.get_http_request, max_req_sz, request_string, ap)) < 0) {
va_start(ap, request); /* Build request string */
if ((vsnprintf(con->http_data.get_http_request, MAX_REQUEST_SIZE, request, ap)) < 0) {
va_end(ap);
PRINT_ERROR_AND_EXIT("vsnprintf error:\n");
}
va_end(ap);

if (Write(connection->http_data.get_http_request, connection->sck, /* send request to host */
strlen(connection->http_data.get_http_request)) < 0) {
if (Write(con->http_data.get_http_request, con->sck, /* send request to host */
strlen(con->http_data.get_http_request)) < 0) {
PRINT_ERROR_AND_EXIT("Write error: (%s)\n", strerror(errno));
}

if (Read(connection->http_data.received_data, connection->sck, max_read_sz) < 0) { /* get response */
return 0;
}


int ReceiveReply(Connection *con)
{
if (read(con->sck, con->http_data.received_data, MAX_READ_SIZE) < 0) { /* get response */
if (errno == EINTR) /* interrupted by sig handler call again */
return ReceiveReply(con);
if (errno != 0) {
PRINT_ERROR_AND_EXIT("Read error: (%s)\n", strerror(errno));
} else {
PRINT_ERROR_AND_EXIT("Read error:");
}
}

if (strstr(connection->http_data.received_data, "HTTP/1.1 200") < 0) {
if (strstr(con->http_data.received_data, "HTTP/1.1 200") < 0) {
ERR("strstr error: (HTTP/1.1 200 OK Not found)\n");
return -1;
}
@@ -238,11 +219,16 @@ int Request(Connection *connection, size_t max_read_sz, size_t max_req_sz, const
}


int SetupConnection(const char *host, const void *port)
int EstablishConnection(const char *host, const void *port)
{
struct addrinfo hints, *listp, *p;
int status, clientfd = 0;

struct timeval time = {
2,
0
};

memset(&hints, 0, sizeof hints); /* Make sure the struct is empty */
hints.ai_family = AF_INET; /* Don't care IPv4 */
hints.ai_socktype = SOCK_STREAM; /* TCP stream sockets */
@@ -260,8 +246,11 @@ int SetupConnection(const char *host, const void *port)
continue; /* Socket failed, try the next */
}

setsockopt(clientfd, SOL_SOCKET, SO_SNDTIMEO, (struct timeval *) &time, sizeof(struct timeval));

if (connect(clientfd, p->ai_addr, p->ai_addrlen) != -1)
break; /* Success */

if (close(clientfd) < 0) { /* Connect failed, try another */
ERR("close failed: (%s)\n", strerror(errno));
return -1;
@@ -297,31 +286,6 @@ int Write(void *request, int sck, size_t n)
}


int Read(void *usrbuf, int sck, size_t n)
{
char *bufp;
ssize_t nread;
size_t nleft = n;
int n_new_read, first_read_flag = 1;
static int stop_loop = 0;
Connection connection;

bufp = (char *) usrbuf;
while (nleft > 0) {
if ((nread = read(sck, bufp, nleft)) < 0) {
if (errno == EINTR) /* Interrupted by sig handler */
nread = 0;
else
return -1;
} else if (nread == 0)
break;
nleft -= nread;
bufp += nread;
}
return (n - nleft); // NOLINT(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
}


char *GetNewLocation(char *html)
{
char *link = strstr(html, "://"); /* no https */

0 comments on commit 224a70b

Please sign in to comment.
You can’t perform that action at this time.