/
crawler_chinese.py
161 lines (145 loc) · 6.25 KB
/
crawler_chinese.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
from urllib import request
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
import sqlite3
import re
import json
class crawler:
def __init__(self,playList_url,table_name,dbname='Music_data_chinese.db'):
self.url=playList_url
self.con=sqlite3.connect(dbname)
self.table_name=table_name
self.create_table(table_name)
self.headers = {
'Referer': 'http://music.163.com/',
'Host': 'music.163.com',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0 Iceweasel/38.3.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
}
def get_all_pages(self):
pages=[]
nextPage=self.get_next_page(self.url)
while nextPage:
pages.append(nextPage)
nextPage=self.get_next_page(nextPage)
return pages
def get_next_page(self,url):
try:
page = request.urlopen(url)
except:
print("Counld not open %s " % url)
soup = BeautifulSoup(page.read(),'lxml')
try:
nextPage = soup.find('a', class_='zbtn znxt')['href']
nextPage = "http://music.163.com" + nextPage
except:
nextPage = None
print("Could not find next page for %s,\t maybe has over!" % url)
return nextPage
'''
获取所有页面中的歌单,并存入数据库
done
'''
def get_playList(self):
print("Start:\n\t爬取当前类别歌单列表 ...\n")
pages=self.get_all_pages()
playList_links_all=[]
for page in pages:
try:
page=request.urlopen(self.url)
except:
print("Counld not open %s " % self.url)
soup=BeautifulSoup(page.read(),'lxml')
playList_links=soup.find_all('a',class_='msk')
# 需要从html标签中提取url
play_list_url=[]
for play_list in playList_links:
play_list_url.append(play_list['href'])
playList_links_all.extend(play_list_url)
print("Successfully: \t歌单爬取完毕!"
"\n\tPlay list counts:",len(playList_links_all))
return playList_links_all
'''
获取某个歌单中的所有歌曲链接
down
'''
def get_musics_in_playList(self):
play_lists=self.get_playList()
print("Start:开始爬取歌曲列表...")
song_counts = 0
s = requests.session()
for playList_url in play_lists:
try:
playList_url='http://music.163.com'+playList_url
print('url:',playList_url)
page= s.get(playList_url,headers=self.headers)
except:
print("Counld not open %s " % playList_url)
soup=BeautifulSoup(page.content)
song_links=soup.find_all('a' , href = re.compile("^/song\?id=[0-9]+"))
for link in song_links:
if link.has_attr('class'):
continue
# print('link:',link)
songId=int(re.search("[0-9]+$",link['href']).group())
# print('songid:',songId)
if self.con.execute("select * from {} where song_id={}".format(self.table_name,songId)).fetchone():
# print('歌曲已存在,ID:%d' % songId)
continue
self.con.execute("insert into song_list_chinese(song_id,song_name) values(?,?)",(songId,link.string))
song_counts+=1
if song_counts!=0 and song_counts%300==0:
print('已获取歌曲数量:',song_counts)
self.con.commit()
print("Successfully:歌曲爬取完毕,已存入数据库!歌曲数量:",song_counts)
'''
获取某歌曲的歌词内容
'''
def get_lyric_in_music(self):
# self.get_musics_in_playList()
print("Start : 开始爬取歌词内容...")
song_list = self.con.execute("select song_id from %s " % self.table_name).fetchall()
song_counts = 0
for songUrl in song_list:
try:
lyric_url = 'http://music.163.com/api/song/lyric?' + 'id=' + str(songUrl[0]) + '&lv=1&kv=1&tv=-1'
# s = requests.session()
# page = s.get(songUrl, headers=self.headers)
lyric=requests.get(lyric_url)
json_obj=lyric.text
j=json.loads(json_obj)
except:
print("Counld not open %s " % lyric_url)
try:
lrc=j['lrc']['lyric']
pat = re.compile(r'\[.*\]') # 下面这三行正则匹配删除时间轴
lrc = re.sub(pat, "", lrc)
lrc = lrc.strip()
# self.con.execute("update {} set lyric='{}' where song_id={}".format(self.table_name,lrc,songUrl[0]))
# 由于部分歌词中存在 '、''问题,所以上一种情况可能会导致语法错误从,又因为sql中只有values可用?占位符,表名不可以,所以手动写上表名
self.con.execute("update song_list_chinese set lyric=? where song_id=?" ,(lrc,songUrl[0]))
song_counts+=1
# 之前当出现异常时except中输出lrc,怀疑输出的是之前上一次循环的lrc,本次的lrc由于关键字错误(不存在lrc)未创建成功??待排查!!
except KeyError as e:
pass
if song_counts!=0 and song_counts%200==0:
print("已爬取歌词:%d.." % song_counts)
self.con.commit()
self.con.close()
print("Successfully:歌词爬取完毕!")
'''
创建相关表
done
'''
def create_table(self,table_name):
print("创建数据表:%s ..." % table_name)
self.con.execute('create table if not exists %s(song_id Integer primary key,'
'song_name Text,lyric Text,lyric_segmented text )' % table_name)
self.con.commit()
print("创建完毕!")
if __name__ =='__main__':
url = 'http://music.163.com/discover/playlist/?cat=%E5%8D%8E%E8%AF%AD'
table_name="song_list_chinese"
crawler=crawler(url,table_name=table_name)
crawler.get_lyric_in_music()