# main.py
import json
import requests
import time
import os
import urllib.request
import argparse
import pandas as pd
from tools.util import get_current_time_format, generate_url_with_xbs, sleep_random
from config import IS_SAVE, SAVE_FOLDER, USER_SEC_UID, IS_WRITE_TO_CSV, LOGIN_COOKIE, CSV_FILE_NAME
class DouYinUtil(object):
    """Collect the posts of one Douyin user and download their media.

    The post list is fetched page by page from the ``aweme/post`` web
    endpoint; every raw post record is cached on the instance so that
    :meth:`get_video_detail_info` can later build a flat summary from it.
    """

    # Seconds to wait for the API before giving up; without a timeout a
    # stalled connection would block the crawl forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, sec_uid: str):
        """
        :param sec_uid: Douyin user id (sent as the ``sec_user_id`` query parameter)
        """
        self.sec_uid = sec_uid
        self.is_save = IS_SAVE
        self.save_folder = SAVE_FOLDER
        self.is_write_to_csv = IS_WRITE_TO_CSV
        self.csv_name = CSV_FILE_NAME
        self.video_api_url = ''
        self.api_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
            'Referer': 'https://www.douyin.com/',
            'Cookie': LOGIN_COOKIE
        }
        self.cursor = 0  # paging cursor sent as ``max_cursor``
        self.videos_list = []  # post ids, in the order they were fetched
        self.video_info_list = []  # raw post records, in fetch order
        self.video_info_dict = {}  # post id -> raw post record
        self.stop_flag = False  # becomes True once the last page was read

    def get_user_video_info(self, url: str):
        """
        Fetch one page of the post list and decode it.

        :param url: fully signed API url
        :return: the decoded JSON payload
        :raises requests.exceptions.RequestException: on network failure or timeout
        :raises ValueError: if the response body is not valid JSON
        """
        res = requests.get(url, headers=self.api_headers, timeout=self.REQUEST_TIMEOUT)
        res.encoding = 'utf-8'
        return json.loads(res.text)

    def get_all_videos(self):
        """
        Fetch every page of the user's post list.

        :return: list of all post ids collected so far
        """
        while not self.stop_flag:
            self.video_api_url = (
                'https://www.douyin.com/aweme/v1/web/aweme/post/'
                f'?aid=6383&sec_user_id={self.sec_uid}&count=35'
                f'&max_cursor={self.cursor}&cookie_enabled=true&platform=PC&downlink=10'
            )
            xbs = generate_url_with_xbs(self.video_api_url, self.api_headers.get('User-Agent'))
            user_video_url = self.video_api_url + '&X-Bogus=' + xbs
            user_info = self.get_user_video_info(user_video_url)

            # A page may carry a null or missing list; treat that as "no posts"
            # instead of crashing on iteration.
            for aweme_info in user_info.get('aweme_list') or []:
                aweme_id = aweme_info['aweme_id']
                self.video_info_list.append(aweme_info)
                self.video_info_dict.setdefault(aweme_id, aweme_info)
                self.videos_list.append(aweme_id)

            # A missing ``has_more`` is read as "no more pages" so the loop ends.
            if int(user_info.get('has_more') or 0) == 0:
                self.stop_flag = True
            else:
                self.cursor = user_info['max_cursor']
            sleep_random()
        return self.videos_list

    def _ensure_folder(self, folder: str):
        """Create ``folder`` and any missing parents; no-op if it already exists."""
        os.makedirs(folder, exist_ok=True)

    def _download_to(self, url: str, target: str):
        """Download ``url`` to ``target``, replacing a file already stored there."""
        if os.path.exists(target):
            os.remove(target)
        urllib.request.urlretrieve(url, target)

    def download_video(self, video_url: str, file_name: str = None):
        """
        Download a video into ``<save_folder>/<sec_uid>/``.

        :param video_url: video address
        :param file_name: file name inside the user's folder; the default
            ``None`` is formatted into the path as the text ``None``
        :return:
        """
        if not self.is_save:
            print("当前不需要保存")
            return
        save_folder = f"{self.save_folder}/{self.sec_uid}"
        self._ensure_folder(save_folder)
        self._download_to(video_url, f"{save_folder}/{file_name}")

    def download_images(self, image_list: list, image_dir: str = None):
        """
        Download the images of one post into ``<save_folder>/<sec_uid>/<image_dir>/``.

        Files are named ``1.jpeg``, ``2.jpeg``, ... following the order of
        ``image_list``.

        :param image_list: image addresses
        :param image_dir: sub-folder for this post; the default ``None`` is
            formatted into the path as the text ``None``
        :return:
        """
        if not self.is_save:
            print("当前不需要保存")
            return
        save_folder = f"{self.save_folder}/{self.sec_uid}/{image_dir}"
        print(f"save-dir:{save_folder}")
        self._ensure_folder(save_folder)
        # Numbering starts at 1 for the first image.
        for num, image_url in enumerate(image_list, start=1):
            print(f"image_url:{image_url} {num}")
            real_file_name = f"{save_folder}/{num}.jpeg"
            print(f"下载url:{image_url}\n保存文件名:{real_file_name}")
            self._download_to(image_url, real_file_name)

    def get_video_detail_info(self, video_id: str):
        """
        Build a flat summary of one cached post.

        :param video_id: post id, as returned by :meth:`get_all_videos`
        :return: dict with the keys ``video_id``, ``link``, ``is_video``,
            ``title``, ``thumb_up_num``, ``comment_num``, ``cover_url``,
            ``publish_time``, ``record_time`` and ``preview_title``; when the
            id is not cached the placeholder values below are returned as-is.
            For image posts ``link`` is a list of urls, otherwise a single url.
        """
        default_response = {
            'video_id': video_id,  # post id
            'link': 'None',  # media link(s)
            'is_video': True,  # False for image posts
            'title': 'None',  # title
            'thumb_up_num': 0,  # like count
            'comment_num': 0,  # comment count
            'cover_url': 'http://www.baidu.com',  # cover image
            'publish_time': '',  # publish date
            'record_time': '记录日期',  # date this record was built
            "preview_title": "",
        }
        res_info = self.video_info_dict.get(video_id)
        if res_info is None:
            return default_response

        default_response['title'] = res_info['desc']
        default_response["preview_title"] = res_info.get("preview_title", "")
        local_time = time.localtime(res_info['create_time'])
        default_response['publish_time'] = time.strftime("%Y-%m-%d %H:%M:%S", local_time)
        default_response['record_time'] = get_current_time_format()

        # A record without an image list (missing or null) is a video post.
        images = res_info.get('images')
        if images is None:
            video = res_info["video"]
            default_response['link'] = video["play_addr"]["url_list"][0]
            default_response['cover_url'] = video["cover"]["url_list"][0]
            default_response['is_video'] = True
        else:
            default_response['link'] = [image["url_list"][-1] for image in images]
            default_response['is_video'] = False

        statistics = res_info.get('statistics') or {}
        # NOTE(review): the like count is read from 'admire_count'; confirm this
        # is the intended field (the payload may expose likes under another key).
        default_response['thumb_up_num'] = statistics.get('admire_count', 0)
        default_response['comment_num'] = statistics.get('comment_count', 0)
        return default_response
if __name__ == '__main__':
    import sys

    # Optional command-line overrides of the configured values:
    #   one argument  -> user id
    #   two arguments -> user id, save folder
    # Any other argument count leaves the configuration untouched.
    params_list_size = len(sys.argv)
    if params_list_size in (2, 3):
        USER_SEC_UID = sys.argv[1]
    if params_list_size == 3:
        SAVE_FOLDER = sys.argv[2]

    dy_util = DouYinUtil(sec_uid=USER_SEC_UID)
    all_video_list = dy_util.get_all_videos()

    csvVideos = []
    for video_id in all_video_list:
        video_info = dy_util.get_video_detail_info(video_id)
        if video_info['is_video']:
            dy_util.download_video(video_info['link'], f"{video_id}.mp4")
        else:
            dy_util.download_images(video_info["link"], f"{video_id}")
        title = video_info["title"]
        preview_title = video_info["preview_title"]
        print(f"file:{video_id}.mp4,title:{title} , preview_title:{preview_title}")
        # The CSV stores the post id in the link column and a prefixed id in
        # the id column instead of the raw media url(s).
        video_info["link"] = video_id
        video_info["video_id"] = f"id:{video_id}"
        csvVideos.append(video_info)

    # NOTE(review): IS_WRITE_TO_CSV is imported but not consulted here —
    # confirm whether the CSV export should be gated on it.
    # An empty frame has no columns, so writing it with ten header names would
    # raise; skip the export when nothing was collected.
    if csvVideos:
        data = pd.DataFrame(csvVideos)
        csvHeaders = ["视频id","视频链接","是否为视频","标题","点赞数","评论数","视频封面","发布日期","更新日期","预览标题"]
        # Rows are appended exactly once; the header is written only when the
        # file is new or empty so repeated runs do not scatter header rows.
        needs_header = (
            not os.path.exists(CSV_FILE_NAME)
            or os.path.getsize(CSV_FILE_NAME) == 0
        )
        try:
            data.to_csv(
                CSV_FILE_NAME,
                header=csvHeaders if needs_header else False,
                index=False,
                mode='a',
                encoding='utf-8',
            )
        except UnicodeEncodeError:
            print("编码错误, 该数据无法写到文件中, 直接忽略该数据")