-
Notifications
You must be signed in to change notification settings - Fork 1
/
crawler.c
200 lines (168 loc) · 6.71 KB
/
crawler.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#include "crawler.h"
/*
 * crawler1 - resolve a host name, connect to it on port 80 and download a
 * page into ./test.html via send_and_recv().
 *
 * arg: NUL-terminated host name to resolve (passed as void *).
 *
 * Prints the resolved IPv4 address and the host's first alias (falling back
 * to the canonical name when the resolver reports no alias).
 *
 * Error behaviour:
 *   - resolution or socket() failure: a message is printed to stderr and the
 *     function returns without doing any I/O;
 *   - connect() failure: prints the error and terminates the process with
 *     exit(1), as this function has always done.
 *
 * NOTE(review): the request target is the hard-coded "http://www.baidu.com"
 * regardless of the host in `arg` - confirm whether that is intentional.
 */
void crawler1(void *arg)
{
    const int port = 80;
    char file_loc[] = "./test.html"; /* where the downloaded body is stored */
    struct sockaddr_in serv_socket;
    struct hostent *hostp;
    const char *shown_name;
    char *ip;
    int sockfd;
    int flag;

    if (arg == NULL) {
        fprintf(stderr, "crawler1: no host name given\n");
        return;
    }

    hostp = gethostbyname((const char *)arg);
    if (hostp == NULL || hostp->h_addrtype != AF_INET ||
        hostp->h_length != (int)sizeof serv_socket.sin_addr ||
        hostp->h_addr_list == NULL || hostp->h_addr_list[0] == NULL) {
        fprintf(stderr, "crawler1: cannot resolve host \"%s\"\n", (const char *)arg);
        return;
    }

    /* Fill the address straight from the resolver result; no text round trip. */
    memset(&serv_socket, 0, sizeof serv_socket);
    serv_socket.sin_family = AF_INET;
    serv_socket.sin_port = htons(port);
    memcpy(&serv_socket.sin_addr, hostp->h_addr_list[0], sizeof serv_socket.sin_addr);

    /* inet_ntoa() returns a static buffer; it is only used within this call. */
    ip = inet_ntoa(serv_socket.sin_addr);

    /* The alias list may be empty, in which case its first entry is NULL. */
    if (hostp->h_aliases != NULL && hostp->h_aliases[0] != NULL)
        shown_name = hostp->h_aliases[0];
    else
        shown_name = hostp->h_name;

    printf("ip:%s\n", ip);
    printf("url:%s\n", shown_name);

    sockfd = socket(AF_INET, SOCK_STREAM, 0);
    if (sockfd < 0) {
        perror("crawler1: socket");
        return;
    }

    /* Establish the TCP connection to the HTTP server. */
    flag = connect(sockfd, (struct sockaddr *)&serv_socket, sizeof(serv_socket));
    if (flag < 0) {
        printf("connect error!!! flag = %d\n", flag);
        exit(1);
    }

    /*
     * Main download step. A GET carries no request body, so NULL is passed
     * for `body`; the previous code handed over an uninitialised heap buffer,
     * which send_and_recv() then ran strlen() on.
     */
    send_and_recv(sockfd, "http://www.baidu.com", "GET", "html/text", ip, port, file_loc, NULL, "Close");
    close(sockfd);
}
/*
 * crawler - fetch `url` from the HTTP server at 10.108.106.179:80 and hand
 * the response to cb_func() through a libevent read event.
 *
 * sockfd: stored in the callback argument (args.sockfd) for cb_func(); this
 *         function itself performs no I/O on it.
 * url:    request target placed on the GET line and forwarded to cb_func().
 *
 * A fresh TCP connection is opened, the request is sent, and the event loop
 * runs until it has no more work or the 5.05 s deadline set with
 * event_base_loopexit() expires. The event, the event base, the request
 * buffer and the connection are all released before returning.
 *
 * Returns 1 on success and 0 when a local resource could not be set up or the
 * request could not be sent. A connect() failure prints the error and
 * terminates the process with exit(1), as this function has always done.
 */
int crawler(int sockfd,char *url)
{
    static const char request_fmt[] =
        "GET %s HTTP/1.1\r\nAccept: html/text\r\nHost: 10.108.106.179\r\nConnection: Close\r\n\r\n"; /* option: Accept-Encoding:gzip,deflate\r\n */
    struct timeval tv;
    struct sockaddr_in serv_socket;
    struct event_base *base = NULL;
    struct event *ev = NULL;
    char *request = NULL;
    evutil_socket_t fd = -1;
    size_t sent = 0;
    int request_len;
    int flag;
    int rc = 0;
    args arg; /* lives on this stack frame; only valid while dispatch runs */

    if (url == NULL)
        return 0;

    arg.sockfd = sockfd;
    arg.url = url;
    tv.tv_sec = 5;
    tv.tv_usec = 50 * 1000;

    base = event_base_new();
    if (base == NULL) {
        fprintf(stderr, "crawler: event_base_new failed\n");
        goto out;
    }

    memset(&serv_socket, 0, sizeof serv_socket);
    serv_socket.sin_family = AF_INET;
    serv_socket.sin_port = htons(80);
    inet_pton(AF_INET, "10.108.106.179", &serv_socket.sin_addr);

    fd = socket(AF_INET, SOCK_STREAM, 0);
    if (fd < 0) {
        perror("crawler: socket");
        goto out;
    }

    /* Establish the TCP connection to the HTTP server. */
    flag = connect(fd, (struct sockaddr *)&serv_socket, sizeof(serv_socket));
    if (flag < 0) {
        printf("connect error!!! flag = %d\n", flag);
        exit(1);
    }

    /* Size the request from the actual URL instead of a fixed 1 KiB buffer. */
    request_len = snprintf(NULL, 0, request_fmt, url);
    if (request_len < 0)
        goto out;
    request = malloc((size_t)request_len + 1);
    if (request == NULL) {
        perror("crawler: malloc");
        goto out;
    }
    snprintf(request, (size_t)request_len + 1, request_fmt, url);

    /* write() may accept fewer bytes than asked for; keep going until done. */
    while (sent < (size_t)request_len) {
        ssize_t n = write(fd, request + sent, (size_t)request_len - sent);
        if (n <= 0) {
            perror("crawler: write");
            goto out;
        }
        sent += (size_t)n;
    }

    ev = event_new(base, fd, EV_TIMEOUT|EV_READ|EV_PERSIST, cb_func, &arg);
    if (ev == NULL || event_add(ev, NULL) < 0) {
        fprintf(stderr, "crawler: cannot register read event\n");
        goto out;
    }

    /* Bound the whole exchange: leave the loop after tv even if data still flows. */
    event_base_loopexit(base, &tv);
    event_base_dispatch(base);
    rc = 1;

out:
    if (ev != NULL)
        event_free(ev);
    if (base != NULL)
        event_base_free(base);
    free(request);
    if (fd >= 0)
        close(fd);
    return rc;
}
/* Write exactly `len` bytes from `buf` to `fd`. Returns 0 on success, -1 on error. */
static int sar_write_all(int fd, const char *buf, size_t len)
{
    while (len > 0) {
        ssize_t n = write(fd, buf, len);
        if (n <= 0)
            return -1;
        buf += n;
        len -= (size_t)n;
    }
    return 0;
}

/*
 * send_and_recv - send one HTTP/1.1 request on a connected socket and consume
 * the response until the peer closes the connection.
 *
 * sockfd:          connected stream socket.
 * url:             request target for the request line.
 * fun_type:        HTTP method, e.g. "GET".
 * accept_type:     value of the Accept header.
 * ip, port:        used to build the Host header ("ip:port").
 * file_loc:        if non-NULL, the response body (everything after the first
 *                  "\r\n\r\n") is appended to this file, which is created
 *                  with mode 0644 if missing. If NULL the response is read
 *                  and discarded.
 * body:            if non-NULL, sent as an application/x-www-form-urlencoded
 *                  request body with a matching Content-Length.
 * connection_type: value of the Connection header.
 *
 * The read loop ends when read() returns 0, i.e. when the server closes the
 * connection. With "Connection: Close" that happens after the response; with
 * Keep-Alive the server keeps the connection open and this function would
 * block in read(), so callers using Keep-Alive must detect the end of the
 * response themselves.
 *
 * Errors (allocation, open, read, write) are reported on stderr and end the
 * transfer early; nothing is returned to the caller.
 */
void send_and_recv(int sockfd, char * url, char * fun_type, char * accept_type, char * ip, int port, char * file_loc, char * body, char * connection_type)
{
    enum { RESPONSE_CHUNK = 400 * 1024, FIXED_HEADER_ROOM = 256 };
    static const char header_end[] = "\r\n\r\n";
    const size_t body_len = body ? strlen(body) : 0;
    /* Variable parts plus generous room for the fixed header text and numbers. */
    const size_t request_cap = strlen(fun_type) + strlen(url) + strlen(accept_type) +
                               strlen(ip) + strlen(connection_type) + body_len +
                               FIXED_HEADER_ROOM;
    char *request = malloc(request_cap);
    char *response = NULL;
    size_t matched = 0;   /* how many bytes of header_end have been seen so far */
    int in_body = 0;      /* set once the header terminator has been consumed */
    int file = -1;
    int request_len;

    if (request == NULL) {
        perror("send_and_recv: malloc");
        goto out;
    }

    /* Assemble the request head (and body, if any). */
    if (body)
        request_len = snprintf(request, request_cap, "%s %s HTTP/1.1\r\nAccept: %s\r\nHost: %s:%d\r\nConnection: %s\r\nContent-Type: application/x-www-form-urlencoded\r\nContent-Length: %zu\r\n\r\n%s", fun_type, url, accept_type, ip, port, connection_type, body_len, body);
    else
        request_len = snprintf(request, request_cap, "%s %s HTTP/1.1\r\nAccept: %s\r\nHost: %s:%d\r\nConnection: %s\r\n\r\n", fun_type, url, accept_type, ip, port, connection_type);
    if (request_len < 0 || (size_t)request_len >= request_cap) {
        fprintf(stderr, "send_and_recv: cannot format request\n");
        goto out;
    }

    if (sar_write_all(sockfd, request, (size_t)request_len) < 0) {
        perror("send_and_recv: write request");
        goto out;
    }
    free(request);
    request = NULL;

    response = malloc(RESPONSE_CHUNK);
    if (response == NULL) {
        perror("send_and_recv: malloc");
        goto out;
    }

    if (file_loc) {
        /* O_CREAT requires an explicit mode argument. */
        file = open(file_loc, O_RDWR | O_APPEND | O_CREAT, 0644);
        if (file < 0) {
            perror("send_and_recv: open");
            goto out;
        }
    }

    for (;;) {
        ssize_t length = read(sockfd, response, RESPONSE_CHUNK);
        size_t start = 0;

        if (length < 0) {
            perror("send_and_recv: read");
            break;
        }
        if (length == 0) /* peer closed the connection: response complete */
            break;
        if (file < 0)    /* no output file: just drain the socket */
            continue;

        if (!in_body) {
            /*
             * Skip the response head. The terminator is matched byte by byte
             * so it is found even when split across two reads, and it is
             * only looked for once so "\r\n\r\n" inside the body is kept.
             */
            while (start < (size_t)length && matched < sizeof header_end - 1) {
                char c = response[start++];
                if (c == header_end[matched])
                    matched++;
                else
                    matched = (c == '\r') ? 1 : 0;
            }
            if (matched < sizeof header_end - 1)
                continue; /* still inside the headers */
            in_body = 1;
        }

        if (start < (size_t)length &&
            sar_write_all(file, response + start, (size_t)length - start) < 0) {
            perror("send_and_recv: write file");
            break;
        }
    }

out:
    if (file >= 0)
        close(file);
    free(response);
    free(request);
}
/*int ipc_cra(char **urlset,int n)
{
int sz_msg = strlen (urlset[n]) + 1; // '\0' too
int sock = nn_socket (AF_SP, NN_PUSH);
assert (sock >= 0);
assert (nn_connect (sock, IPC_URL) >= 0);
//printf ("NODE1: SENDING \"%s\"\n", msg);
int bytes = nn_send (sock, urlset[n], sz_msg, 0);
assert (bytes == sz_msg);
return nn_shutdown (sock, 0);
}
int ipc_cra(const char *url,char *buf)
{
int sock = nn_socket (AF_SP, NN_PAIR);
assert (sock >= 0);
assert (nn_bind (sock, IPC_URL) >= 0);
send_recv(sock, url,buf);
return nn_shutdown (sock, 0);
}*/
/*
 * cb_func - libevent read callback: read one chunk of the HTTP response from
 * `fd`, tag it with the URL it belongs to and append it to ./cra_send.dat.
 *
 * fd:   socket that became readable.
 * what: event flags from libevent (unused).
 * arg:  pointer to an `args` struct; only its `url` member is used here.
 *
 * Each record written has the form
 *     "<count> response:<data>!@#<url>#@!\n"
 * where <data> is the bytes just read, "!@#" and "#@!" mark the start and end
 * of the URL, and <count> is len + strlen(url) + 13.
 *
 * Nothing is written when read() reports end-of-file or an error, or when
 * memory or the output file is unavailable.
 *
 * NOTE(review): this callback cannot remove its own event, so after the peer
 * closes the connection a persistent read event keeps firing until the loop
 * exits - confirm the owner of the event handles that.
 */
void cb_func(evutil_socket_t fd, short what,void *arg)
{
    enum { MAX_RESPONSE = 1024 * 1024, MARK_LEN = 3 };
    args *argus = (args *)arg;
    const char *url;
    size_t url_len;
    size_t used;
    char *response;
    ssize_t len;
    FILE *file;

    (void)what;

    if (argus == NULL || argus->url == NULL)
        return;
    url = argus->url;
    url_len = strlen(url);

    /* Room for the data plus "!@#" + url + "#@!" and a terminating NUL. */
    response = malloc((size_t)MAX_RESPONSE + MARK_LEN + url_len + MARK_LEN + 1);
    if (response == NULL) {
        perror("cb_func: malloc");
        return;
    }

    len = read(fd, response, MAX_RESPONSE);
    if (len <= 0) {
        if (len < 0)
            perror("cb_func: read");
        free(response);
        return;
    }

    /* Append the URL trailer right after the bytes actually received. */
    used = (size_t)len;
    memcpy(response + used, "!@#", MARK_LEN);
    used += MARK_LEN;
    memcpy(response + used, url, url_len);
    used += url_len;
    memcpy(response + used, "#@!", MARK_LEN);
    used += MARK_LEN;
    response[used] = '\0';

    file = fopen("./cra_send.dat", "a+");
    if (file == NULL) {
        perror("cb_func: fopen");
        free(response);
        return;
    }
    fprintf(file, "%zu response:", (size_t)len + url_len + 13);
    /* fwrite keeps the record intact even if the data contains NUL bytes. */
    fwrite(response, 1, used, file);
    fputc('\n', file);
    fclose(file);

    free(response);
}