# 做个爬虫，爬点歌词

### 依赖的库
- **requests:** 简单好用的 Python HTTP 客户端库。
- **bs4:** Beautiful Soup 是一个可以从 HTML 或 XML 文件中提取数据的 Python 库。

In [2]:
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import json
import re
import os

# Fetch the lyric text of a song by its music id
def get_lyric_by_music_id(music_id):
    """Return the plain lyric text for ``music_id``, or None if there is none.

    The lyric JSON is requested from the music.163.com lyric endpoint, the
    ``lrc.lyric`` field is extracted, every ``[...]`` tag is removed and the
    result is stripped of leading/trailing whitespace.

    Args:
        music_id: Song id; converted with ``str()`` before being placed in
            the URL.

    Returns:
        The cleaned lyric text, or None when the response carries no
        ``lrc``/``lyric`` entry (some songs have no lyric).

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails or
            times out.
        ValueError: If the response body is not valid JSON.
    """
    lrc_url = 'http://music.163.com/api/song/lyric?' + 'id=' + str(music_id) + '&lv=1&kv=1&tv=-1'

    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(lrc_url, timeout=10)
    payload = json.loads(response.text)

    # Some songs have no lyric: the key may be missing (KeyError) or an
    # intermediate value may be null / not a mapping (TypeError).
    try:
        raw_lyric = payload['lrc']['lyric']
    except (KeyError, TypeError):
        return None
    if raw_lyric is None:
        return None

    # Remove each "[...]" tag on its own. A greedy r'\[.*\]' would also
    # delete any lyric text that happens to sit between two tags on a line.
    tag_pattern = re.compile(r'\[[^\]\n]*\]')
    return tag_pattern.sub('', raw_lyric).strip()
    
# Download the lyrics of every song listed on a singer's page, given the
# singer id (the original author describes these as the singer's fifty
# hottest songs)
def spider_by_singer(singer_id):
    """Crawl a singer's artist page and save all song lyrics to one file.

    The artist page is fetched and parsed, each song link found in the
    ``<ul class="f-hide">`` list is printed, and the lyric of every song is
    written to ``./lyrics/<singer name>.txt`` (UTF-8). Songs are separated
    by a newline so that the last line of one lyric does not run into the
    first line of the next. An existing output file is overwritten.

    Args:
        singer_id: Artist id; converted with ``str()`` before being placed
            in the URL.

    Returns:
        None. Progress and a final summary are printed to stdout.

    Raises:
        IndexError: If the page has no ``#artist-name`` element.
        AttributeError: If the page has no ``<ul class="f-hide">`` list.
    """
    singer_url = 'http://music.163.com/artist?id=' + str(singer_id)
    # A timeout keeps a stalled connection from hanging the crawl.
    web_data = requests.get(singer_url, timeout=10)
    soup = BeautifulSoup(web_data.text, 'lxml')
    singer_name = soup.select("#artist-name")[0].get('title')

    song_links = soup.find('ul', {'class': 'f-hide'}).find_all('a')
    music_id_set = []
    for link in song_links:
        # Drop the first 9 characters of the href to keep only the id
        # (presumably the prefix is '/song?id=' -- verify against the page).
        song_id = link.attrs["href"][9:]
        music_id_set.append(song_id)
        print('music id: %s \t name: %s' % (song_id, link.text))

    out_file_path = "./lyrics/%s.txt" % singer_name
    # Make sure the output directory exists; open() would fail otherwise.
    out_dir = os.path.dirname(out_file_path)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    success_count = 0
    # One file per singer, named after the singer. Opening it once in "wb"
    # mode replaces any previous run's file and guarantees the handle is
    # closed even when a song has no lyric.
    with open(out_file_path, "wb") as out_file:
        for music_id in music_id_set:
            lyric = get_lyric_by_music_id(music_id)
            if lyric is None:
                print('error when deal music: %s' % music_id)
                continue
            out_file.write(lyric.encode('utf-8'))
            out_file.write(b'\n')
            success_count += 1

    print('\nSinger: %s \tSucceed Count: %s' % (singer_name, success_count))
    
# Known artist ids:
#   Jay Chou    6452
#   Eason Chan  2116
#   Zhao Lei    6731
# Guarded so that importing this file does not start a network crawl.
if __name__ == '__main__':
    spider_by_singer('6731')


music id: 436514312 	 name: 成都
music id: 517567264 	 name: 静下来
music id: 202373 	 name: 南方姑娘
music id: 29567192 	 name: 少年锦时
music id: 29567189 	 name: 理想
music id: 202369 	 name: 画
music id: 29567193 	 name: 我们的时光
music id: 29567191 	 name: 三十岁的女人
music id: 447926067 	 name: 鼓楼
music id: 437608773 	 name: 无法长大
music id: 29567187 	 name: 吉姆餐厅
music id: 33166602 	 name: 让我偷偷看你
music id: 447925066 	 name: 八十年代的歌
music id: 29567188 	 name: 家乡
music id: 447925058 	 name: 玛丽
music id: 447926063 	 name: 朵
music id: 202368 	 name: 未给姐姐递出的信
music id: 29567185 	 name: 北京的冬天
music id: 28111471 	 name: 已是两条路上的人
music id: 447925059 	 name: 阿刁
music id: 447925063 	 name: 孤独
music id: 29567194 	 name: 小屋
music id: 433018042 	 name: 南方姑娘 (弹唱版)
music id: 447925067 	 name: 再见北京
music id: 34852810 	 name: 再也不会去丽江
music id: 202376 	 name: 背影
music id: 202377 	 name: 妈妈
music id: 202370 	 name: 不开的唇
music id: 29567186 	 name: 浮游
music id: 29567190 	 name: 梦中的哈德森
music id: 447926068 	 name: 窑上路
music id: 2