forked from SIG-IR/SentiLyricAnalysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_lyrics.py
40 lines (36 loc) · 1.09 KB
/
scrape_lyrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from bs4 import BeautifulSoup
from bs4.element import Tag
import requests
import json
import sys
import time
import os
import re
# Defaults used when the script is run directly.
LINKS_FILE = 'links.json'
OUTPUT_DIR = 'songs'

# Seconds to wait before retrying a request that failed.
RETRY_DELAY_SECONDS = 5
# Seconds to wait on the server before a request counts as failed.
REQUEST_TIMEOUT_SECONDS = 30

# Bracketed section labels such as "[Verse 1]" together with the newline
# that follows them. Compiled once because it is applied to every page.
_SECTION_LABEL_RE = re.compile(r'(?is)\[.*?\]\n')


def _fetch_page(url):
    """Return the HTTP response for *url*, retrying until a request succeeds.

    Only errors raised by ``requests`` are retried, so Ctrl-C and genuine
    programming errors propagate instead of being swallowed by the loop.
    There is deliberately no retry limit: the call blocks until the page
    can be fetched.
    """
    while True:
        try:
            # Without a timeout a stalled connection would hang forever;
            # a timeout raises a RequestException and is retried below.
            return requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        except requests.exceptions.RequestException:
            time.sleep(RETRY_DELAY_SECONDS)


def _extract_lyrics(html):
    """Return the lyrics text found in *html* with section labels removed.

    The lyrics are taken from the first ``<p>`` inside the first
    ``<div class="lyrics">``. Returns ``None`` when the page has no such
    element (for example an error page).
    """
    soup = BeautifulSoup(html, 'html5lib')
    lyrics_div = soup.find('div', class_='lyrics')
    if lyrics_div is None:
        return None
    lyrics_p = lyrics_div.find('p')
    if lyrics_p is None:
        return None
    # Remove [Verse], [Chorus], etc. labels.
    return _SECTION_LABEL_RE.sub('', lyrics_p.text)


def main(links_path=LINKS_FILE, output_dir=OUTPUT_DIR):
    """Download the lyrics for every URL in *links_path* into *output_dir*.

    Args:
        links_path: Path to a JSON file containing a list of song page URLs.
        output_dir: Folder that receives one UTF-8 ``.txt`` file per song;
            created if it does not exist yet.

    Pages without a lyrics block are reported on stderr and skipped so that
    one bad page does not abort the whole run.
    """
    with open(links_path) as links_file:
        song_urls = json.load(links_file)

    # Creates the output folder if it doesn't already exist.
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    for url in song_urls:
        page = _fetch_page(url)
        lyrics_text = _extract_lyrics(page.text)
        if lyrics_text is None:
            sys.stderr.write('No lyrics found at %s; skipping\n' % url)
            continue

        # The output name is whatever follows the first 19 characters of the
        # URL (artist + song name).
        # NOTE(review): 19 presumably matches a fixed scheme-and-host prefix
        # shared by every URL in the links file -- confirm against the data.
        out_path = os.path.join(output_dir, url[19:] + '.txt')

        # Binary mode, because the text is encoded to UTF-8 bytes explicitly;
        # writing bytes to a text-mode file raises TypeError on Python 3.
        with open(out_path, 'wb') as outfile:
            outfile.write(lyrics_text.encode('utf-8'))


if __name__ == '__main__':
    main()