In [1]:
from __future__ import print_function

import os;
import shutil;
import codecs;
import zipfile;
import json;

import re;
import requests
from lxml import html
from string import ascii_uppercase;
import chardet;

import random;

In [3]:
# Resolve the current user's home directory; data paths such as
# home + '/letras/artists.txt' are built from it further down.
from os.path import expanduser
home = expanduser("~")

In [3]:
# Root URL of the lyrics site; the scraping helpers append site-relative paths to it.
baseAddress = 'https://www.letras.mus.br';

# Utility Functions

In [4]:
def slugify(value):
    """
    Build a lowercase, ASCII-only, hyphen-separated slug from a text string.

    The text is NFKD-normalised so accented letters decompose into a base
    letter plus combining marks, everything outside ASCII is dropped, every
    character that is not a word character, whitespace or a hyphen is
    removed, and each run of whitespace/hyphens collapses into one hyphen.

    value: a text string (``unicode`` on Python 2, ``str`` on Python 3).
    Returns the slug as a text string.
    """
    import unicodedata
    # Decode the ASCII bytes straight back to text instead of calling the
    # Python 2-only ``unicode`` builtin, so the helper runs on Python 2 and 3.
    asciiText = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    # Raw literals keep the regex escapes from being read as string escapes.
    kept = re.sub(r'[^\w\s-]', '', asciiText).strip().lower()
    return re.sub(r'[-\s]+', '-', kept)

In [5]:
# Use chardet to detect the actual encoding of the content and then decode it.
def decode(content):
    """
    Decode raw bytes to text using the encoding guessed by chardet.

    content: the undecoded bytes of a downloaded page.
    Returns the decoded text; bytes that are invalid in the chosen encoding
    are dropped (``errors='ignore'``).
    """
    detected = chardet.detect(content)['encoding']
    try:
        # chardet reports None when it cannot guess (e.g. empty input);
        # passing None to decode() raises TypeError, so fall back to UTF-8.
        return content.decode(detected or 'utf-8', errors='ignore')
    except LookupError:
        # The guessed name is not a codec known to this Python build.
        return content.decode('utf-8', errors='ignore')

# Web Scraping Related Functions

In [6]:
def listArtistByLetter(letter):
    """
    Return the artist links listed on the site's index page for ``letter``.

    letter: the index letter inserted into the ``/letra/<letter>/artistas.html`` URL.
    Returns a list with the ``href`` of every anchor found in the artist list.
    """
    url = baseAddress + '/letra/' + letter + '/artistas.html'
    response = requests.get(url)
    document = html.fromstring(response.content)

    links = []
    for anchor in document.xpath('//div[@class="artistas-a js-alphabet-cnt"]/ul/li//a'):
        links.append(anchor.attrib['href'])
    return links

In [7]:
def listSongsFromArtist(artist):
    """
    Return the songs linked from an artist page as ``"<href>|<link text>"`` strings.

    artist: site-relative path of the artist page, appended to ``baseAddress``.
    Anchors are collected from the alphabetical list and from the ``cnt_top``
    block; duplicates are dropped, so the order of the result is not defined.
    """
    response = requests.get(baseAddress + artist)
    document = html.fromstring(response.content)

    found = []
    for query in ('//div[@class="cnt-list--alp"]/li//a',
                  '//*[@id="cnt_top"]//li//a'):
        for anchor in document.xpath(query):
            found.append(anchor.attrib['href'] + "|" + anchor.text_content())

    # Going through a set removes the entries present in both lists.
    return list(set(found))

In [8]:
# Retrieve the page of a single song.
def getLyrics(id):
    """
    Download one song page.

    id: site-relative path of the song, appended to ``baseAddress``.
    Returns the undecoded body of the HTTP response.
    """
    response = requests.get(baseAddress + id)
    return response.content

In [117]:
def htmlRemoveNotContentTags(htmlString):
    """
    Strip elements that carry no page content from an HTML document.

    Scripts, styles, ``noscript`` and ``footer`` elements, elements whose
    ``rel`` is ``dns-prefetch`` or ``alternate``, and childless
    ``div``/``span``/``hr``/``i`` elements are removed.

    htmlString: the HTML markup to clean.
    Returns the cleaned document serialised by ``html.tostring``.
    """
    tree = html.fromstring(htmlString)

    # Every entry is already a complete document-wide query. Prefixing another
    # "//" (as was done before) produced paths such as "////script", which is
    # not a valid XPath 1.0 location path.
    unwanted = ["//script",
                "//style",
                "//noscript",
                "//footer",
                "//*[@rel='dns-prefetch']",
                "//*[@rel='alternate']",
                "//div[not(text()) and not(node())]",
                "//span[not(text()) and not(node())]",
                "//hr[not(text()) and not(node())]",
                "//i[not(text()) and not(node())]"]

    for query in unwanted:
        for bad in tree.xpath(query):
            # The root element has no parent and cannot be detached.
            if bad.getparent() is None:
                continue
            # drop_tree() removes the element and its children but keeps the
            # text that follows it; getparent().remove() silently discarded
            # that tail text together with the element.
            bad.drop_tree()

    return html.tostring(tree, pretty_print=False)

# Reading all artists into an array for further processing

## Reading the whole artist list and saving it into a file

In [4]:
# Gather the artist links of every A-Z index page, then store them one per line.
artists = []
for letter in ascii_uppercase:
    artists.extend(listArtistByLetter(letter))

with open(home+'/letras/artists.txt','w') as f:
    f.write("".join("%s\n" % link for link in artists))

In [5]:
# Number of artist links collected from the index pages.
len(artists)

214346

## Reading only the most visited artists (this list probably reflects the weekly ranking, not the all-time ranking)

In [9]:
# Scrape the "mais-acessadas" page and keep the href of every artist link in its ranking list.
_rankingResponse = requests.get(baseAddress+'/mais-acessadas/')
_rankingTree = html.fromstring(_rankingResponse.content)

maisAcessados = []
for a in _rankingTree.xpath('//*[@class="top-list_art"]/li//a'):
    maisAcessados.append(a.attrib['href'])

In [10]:
# Number of artist links found on the most-visited page.
len(maisAcessados)

302

In [11]:
# Peek at the first ten most-visited artist links.
maisAcessados[0:10]

['/1kilo/',
 '/matheus-kauan/',
 '/henrique-e-juliano/',
 '/jojo-maronttinni/',
 '/harpa-crista/',
 '/jorge-mateus/',
 '/ed-sheeran/',
 '/gabriela-rocha/',
 '/pineapple/',
 '/marchinhas-de-carnaval/']

## Reading the previously saved artist list from file

In [12]:
# Reload the saved artist links; every entry keeps its trailing newline.
artists = []
with open(home+'/letras/artists.txt','r') as f:
    artists.extend(f)

In [13]:
# First artist link as stored in the file, without its trailing newline.
artists[0].strip()

'/anavitoria/'

In [14]:
# Shuffle the list in place so artists are visited in random order.
# NOTE(review): no seed is set in the visible cells, so the order differs between runs.
random.shuffle(artists);
artists[0].strip()

'/timbre-original/'

In [15]:
def downloadArtist(currentArtist):
    """
    Download every song page of one artist into ``<home>/letras/<artist>.zip``.

    currentArtist: site-relative artist link such as ``/1kilo/``. Its last
        character (presumably the trailing "/") is dropped before it is used
        in the URL and in the archive name.

    Nothing is downloaded when the archive already exists or when the artist
    lists fewer than three songs. Each song is stored as ``<slug>.html`` after
    decoding and cleaning; songs whose slug is already in the archive are
    skipped. Returns None.
    """
    artistPath = currentArtist[0:-1]
    artistZip = home + "/letras" + artistPath + ".zip"

    # Look for an existing archive before touching the network, so artists
    # that are already downloaded cost no request.
    if os.path.exists(artistZip):
        return

    songList = listSongsFromArtist(artistPath)
    if len(songList) < 3:
        return

    inZip = set()  # additional check for duplicated file names
    try:
        with zipfile.ZipFile(artistZip, "w") as thezip:
            for s in songList:
                # Entries look like "<href>|<title>"; split on the first "|"
                # only, so a "|" inside the title does not truncate it.
                href, title = s.strip().split("|", 1)

                # ``unicode`` is the Python 2 text type this cell was written for.
                fileName = slugify(unicode(title)) + ".html"
                if fileName in inZip:
                    continue

                content = htmlRemoveNotContentTags(decode(getLyrics(href)))
                if not isinstance(content, bytes):
                    content = content.encode("UTF-8")

                thezip.writestr(fileName, content)
                inZip.add(fileName)
    except BaseException:
        # A partial archive would make the existence check above skip this
        # artist on every later run, so discard it before re-raising.
        if os.path.exists(artistZip):
            os.remove(artistZip)
        raise

### maisAcessadas

In [None]:
print ("Faltando %d artistas para ler " % len(maisAcessados))

# Work through the most-visited queue. An artist leaves the queue only after
# its download returned, so an interrupted run can be resumed by re-running.
while len(maisAcessados) > 0:
    currentArtist = maisAcessados[0].strip()
    print (currentArtist)

    downloadArtist(currentArtist)

    maisAcessados.pop(0)

    # Drop the artist that was just handled from the full list as well.
    # (Previously the check ran after the removal, so it looked at the *next*
    # artist and raised IndexError once the queue became empty.) Entries read
    # back from artists.txt keep their trailing newline, so match both forms.
    for candidate in (currentArtist, currentArtist + "\n"):
        while candidate in artists:
            artists.remove(candidate)

In [None]:
updateSongs = False
count = 30  # at most this many artists per run, to avoid being blocked

print ("Faltando %d artistas para ler " % len(artists))

while len(artists) > 0 and count > 0:
    count -= 1

    # Pick and remove one artist at a time.
    currentArtist = artists.pop(0).strip()
    # Pass the picked artist; the whole list was passed here before, which
    # made downloadArtist fail as soon as it built the URL.
    downloadArtist(currentArtist)

In [28]:
# List the artists that are still in the most-visited queue.
for f in maisAcessados:
    print(f);

/belo/
/heloisa-rosa/
/delacruz/
/soraya-moraes/
/sandy-e-junior-musicas/
/clarice-falcao/
/john-legend/
/amy-winehouse/
/lea-mendonca/
/fifth-harmony/
/tiao-carreiro-e-pardinho/
/little-mix/
/one-ministry/
/miley-cyrus/
/kendrick-lamar/
/iron-maiden/
/falamansa/
/lorde/
/cancao-nova/
/junior-angelim/
/priscilla-alcantara/
/avril-lavigne/
/amado-batista/
/led-zeppelin/
/onze20/
/charlie-puth/
/nani-azevedo/
/gilberto-gil/
/eliane-fernandes/
/ludmilla/
/gabriel-pensador/
/vanessa-da-mata/
/ministerio-sarando-terra-ferida/


# Limpeza do HTML

In [197]:
from lxml import etree
import glob, shutil;
import json;

In [123]:
def cleanZip(zippath):
    """
    Write a cleaned copy of an archive next to it as ``<zippath>.clean.zip``.

    zippath: path of the archive to read.
    Every member is decoded as UTF-8, passed through
    ``htmlRemoveNotContentTags`` and stored under the same member name.
    """
    cleanedPath = zippath + ".clean.zip"
    with zipfile.ZipFile(zippath, "r") as source:
        with zipfile.ZipFile(cleanedPath, "w") as target:
            for member in source.namelist():
                markup = source.read(member).decode("UTF-8")
                target.writestr(member, htmlRemoveNotContentTags(markup))

In [None]:
# Clean every downloaded archive. The data directory is derived from `home`,
# like the cells that wrote the archives, instead of a machine-specific path.
for g in glob.glob(home + "/letras/*.zip"):
    # "*.zip" also matches the "*.clean.zip" files written by an earlier run;
    # cleaning those again would only pile up "*.clean.zip.clean.zip" copies.
    if g.endswith(".clean.zip"):
        continue
    cleanZip(g)


In [126]:
cleanDir = home + "/letrasClean"

# shutil.move() only moves a file *into* the destination when that destination
# is an existing directory; otherwise it renames the file to that path, so
# every archive would end up overwriting the same file. Create it first.
if not os.path.isdir(cleanDir):
    os.makedirs(cleanDir)

for g in glob.glob(home + "/letras/*.clean.zip"):
    shutil.move(g, cleanDir)

# Alternative to the loop above: put each cleaned archive back under the name
# of the archive it was made from. It matches the same files, so it finds
# nothing once the loop above has moved them away.
for g in glob.glob(home + "/letras/*.clean.zip"):
    shutil.move(g, g.replace(".clean.zip", ""))

In [224]:
def _firstStripped(values):
    """Return the first XPath text result without surrounding whitespace, or None if there is none."""
    return values[0].strip() if values else None


def parseSong(songFile):
    """
    Extract the lyrics and metadata of one saved song page.

    songFile: the HTML markup of the song page.
    Returns None when the page has no ``cnt-letra`` article, otherwise a dict
    with the keys ``genre``, ``artist``, ``title``, ``songHtml``,
    ``exbibitions``, ``contributor`` and ``compositor``. ``genre``, ``artist``,
    ``title`` and ``compositor`` are None when the page lacks them (they used
    to raise IndexError). ``contributor`` is the first contributor link, or
    an empty list when there is none.
    """
    tree = html.fromstring(songFile)

    lyricNodes = tree.xpath("//*[contains(@class,'cnt-letra')]/article")
    if len(lyricNodes) == 0:
        return None
    songText = html.tostring(lyricNodes[0], pretty_print=True)
    # tostring() yields bytes; turn them into text so the replacement below
    # works on Python 3 as well as Python 2.
    if isinstance(songText, bytes):
        songText = songText.decode("utf-8")
    songText = songText.replace("<br>", "\n")

    breadCrumb = tree.xpath("//*[@id='breadcrumb']//*[@itemprop='name']//text()")
    songTile = _firstStripped(tree.xpath("//*[@id='js-lyric-cnt']//h1/text()"))
    exbibitions = tree.xpath("//*[@class='cnt-info_exib']/b/text()")

    contributor = tree.xpath("//*[@class='letra-info_user']/a/@href")
    if len(contributor) > 0:
        contributor = contributor[0].strip()

    compositor = _firstStripped(tree.xpath("//*[@class='letra-info_comp']/text()"))
    if compositor is not None:
        compositor = compositor.replace(u"Composi\xe7\xe3o:", "").strip()

    song = {
        # Breadcrumb entries 1 and 2 hold the genre and the artist; a shorter
        # breadcrumb leaves the missing fields as None.
        "genre": breadCrumb[1].strip() if len(breadCrumb) > 1 else None,
        "artist": breadCrumb[2].strip() if len(breadCrumb) > 2 else None,
        'title': songTile,
        'songHtml': songText,
        'exbibitions': exbibitions,
        'contributor': contributor,
        'compositor': compositor,
    }
    return song

In [225]:
# Parse every song of every artist archive, then dump the collected records as JSON.
songs = []
archivePaths = glob.glob(r"/Users/joseeleandrocustodio/letras/*.zip")
archivePaths.sort()

for zippath in archivePaths:
    print(zippath)
    try:
        with zipfile.ZipFile(zippath,"r") as thezip:
            for fname in thezip.namelist():
                parsed = parseSong(thezip.read(fname).decode("utf-8"))
                if not parsed:
                    continue
                songs.append(parsed)
    except IndexError as e:
        # Report the offending archive/song and stop processing further archives.
        print ("Error reading file:"+zippath +" song:"+fname)
        print (e)
        break

with codecs.open(r'/Users/joseeleandrocustodio/letras/songs.json', mode='w') as outfile:
    json.dump(songs, outfile)

/Users/joseeleandrocustodio/letras/1kilo.zip
/Users/joseeleandrocustodio/letras/3030.zip
/Users/joseeleandrocustodio/letras/adele.zip
/Users/joseeleandrocustodio/letras/alceu-valenca.zip
/Users/joseeleandrocustodio/letras/aline-barros.zip
/Users/joseeleandrocustodio/letras/alisson-e-neide.zip
/Users/joseeleandrocustodio/letras/almir-sater.zip
/Users/joseeleandrocustodio/letras/alok.zip
/Users/joseeleandrocustodio/letras/amado-batista.zip
/Users/joseeleandrocustodio/letras/amanda-wanessa.zip
/Users/joseeleandrocustodio/letras/amy-winehouse.zip
/Users/joseeleandrocustodio/letras/ana-nobrega.zip
/Users/joseeleandrocustodio/letras/ana-vilela.zip
/Users/joseeleandrocustodio/letras/anavitoria.zip
/Users/joseeleandrocustodio/letras/anderson-freire.zip
/Users/joseeleandrocustodio/letras/anitta.zip
/Users/joseeleandrocustodio/letras/anjos-de-resgate-musicas.zip
/Users/joseeleandrocustodio/letras/antonia-gomes.zip
/Users/joseeleandrocustodio/letras/arctic-monkeys.zip
/Users/joseeleandrocustodio/

/Users/joseeleandrocustodio/letras/luan-santana.zip
/Users/joseeleandrocustodio/letras/lucas-lucco.zip
/Users/joseeleandrocustodio/letras/ludmila-ferber.zip
/Users/joseeleandrocustodio/letras/ludmilla.zip
/Users/joseeleandrocustodio/letras/luiz-gonzaga.zip
/Users/joseeleandrocustodio/letras/luiz-lins.zip
/Users/joseeleandrocustodio/letras/lulu-santos.zip
/Users/joseeleandrocustodio/letras/luma-elpidio.zip
/Users/joseeleandrocustodio/letras/maiara-maraisa.zip
/Users/joseeleandrocustodio/letras/maluma.zip
/Users/joseeleandrocustodio/letras/maneva.zip
/Users/joseeleandrocustodio/letras/mano-walter.zip
/Users/joseeleandrocustodio/letras/mara-lima.zip
/Users/joseeleandrocustodio/letras/marcela-tais.zip
/Users/joseeleandrocustodio/letras/marchinhas-de-carnaval.zip
/Users/joseeleandrocustodio/letras/marcos-belutti.zip
/Users/joseeleandrocustodio/letras/maria-bethania.zip
/Users/joseeleandrocustodio/letras/maria-gadu.zip
/Users/joseeleandrocustodio/letras/marilia-mendonca.zip
/Users/joseeleand

In [7]:
# Load the song records written by the parsing step.
lyrics = None
with codecs.open(home+r'/letras/songs.json', mode='r') as outfile:
    lyrics = json.loads(outfile.read())

In [16]:
# Raw literals keep the regex escapes from being read as (invalid) string
# escapes; the patterns are compiled once instead of once per song.
_articleTag = re.compile(r'\s*</?article>\s*')
_spaceRun = re.compile(r'\s{2,}')
_nonDigit = re.compile(r'[^\d]')

for l in lyrics:
    if l['compositor']:
        # NOTE(review): the last two characters are dropped unconditionally, so
        # re-running this cell on the same in-memory list trims the name again.
        comp = l['compositor'][0:-2].strip()
        l['compositor'] = _spaceRun.sub(' ', comp)
    if l['contributor']:
        # Keep only the digits of the contributor link.
        l['contributor'] = _nonDigit.sub('', l['contributor'])

    # Remove the <article> wrapper (and the whitespace around it) from the lyrics markup.
    l['songHtml'] = _articleTag.sub('', l['songHtml'])


with codecs.open(home + r'/letras/songs2.json', mode='w') as outfile:
    json.dump(lyrics, outfile)

In [15]:
# Show the second song's lyrics markup without its <article> wrapper
# (raw literal: the regex escapes are not valid string escapes).
print(re.sub(r'\s*</?article>\s*', '', lyrics[1]['songHtml']))

<p>[Pablo]
Aten&#231;&#227;o de soldados na guerra
Acreditamos na reza sincera
As nossas ambi&#231;&#245;es s&#227;o maiores que a terra
Voc&#234; vai ver</p>
<p>[Knust]
Reza Sincera, sobe a vela
Prece contra os pela
Filho de Atena, na guerra com ambi&#231;&#227;o da terra
A prote&#231;&#227;o no gatilho, a bala brilha, apaga o brilho
O breu te cega, a praga pega, o padre prega e mata o filho
O breu te cega, a praga pega, o padre prega e mata o filho
N&#243;s s&#243; usamos o necess&#225;rio pra expandir o saldo banc&#225;rio
Uma Ambi&#231;&#227;o suicida, numeros pagam sua vida
Amor t&#225; caro, rir t&#225; raro e tu achando que t&#225; bom
Onde a rainha do baixinhos n&#227;o &#233; a Xuxa, &#233; o Bolsonaro
For&#231;a de Zeus, blinda os meus nessa miss&#227;o, Thor bate o martelo
Que falte amor, n&#227;o muni&#231;&#227;o
&#201; viver pra ser, meu conselho &#233; n&#227;o seja
N&#227;o d&#225; as costas pra rua nem pra tomar uma cerveja
Hora pra pular a fogueira n&#227;o para
J&#22