In [1]:
import h5py
import os
import glob
import hdf5_getters
import tables
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

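# hdf5_getters is an external helper module (not defined in this notebook); its functions are used below to open the dataset's .h5 files and read fields such as title and artist name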

In [2]:
def count_all_files(basedir, ext='.h5'):
    """Count the files under ``basedir`` (searched recursively) that match ``*<ext>``.

    Args:
        basedir (str): Root directory to walk.
        ext (str): Suffix appended to ``*`` to build the glob pattern
            (default ``'.h5'``).

    Returns:
        int: Number of matching paths found in ``basedir`` and all of its
        sub-directories. Returns 0 when ``basedir`` does not exist, because
        ``os.walk`` then yields nothing.
    """
    cnt = 0
    for root, _dirs, _files in os.walk(basedir):
        # Escape the directory part so that glob metacharacters in a folder
        # name ('[', ']', '*', '?') are matched literally instead of being
        # interpreted as a pattern, which would silently skip that folder.
        pattern = os.path.join(glob.escape(root), '*' + ext)
        cnt += len(glob.glob(pattern))
    return cnt

In [3]:
def get_all_titles(basedir, ext='.h5'):
    """Collect the title stored in every ``*<ext>`` file under ``basedir``.

    Each matching file is opened with ``hdf5_getters.open_h5_file_read``, its
    title is read with ``hdf5_getters.get_title``, and the handle is closed
    again before moving on.

    Args:
        basedir (str): Root directory to walk recursively.
        ext (str): Suffix appended to ``*`` to build the glob pattern
            (default ``'.h5'``).

    Returns:
        list: One entry per matched file, in traversal order, holding whatever
        ``hdf5_getters.get_title`` returned for that file.
    """
    titles = []
    for root, _dirs, _files in os.walk(basedir):
        # glob.escape keeps folder names containing '[', '*' or '?' from being
        # treated as patterns (which would silently skip their files).
        pattern = os.path.join(glob.escape(root), '*' + ext)
        for path in glob.glob(pattern):
            h5 = hdf5_getters.open_h5_file_read(path)
            try:
                titles.append(hdf5_getters.get_title(h5))
            finally:
                # Always release the handle, even if reading the title fails.
                h5.close()
    return titles

In [4]:
# Sanity check: count the .h5 files under ./MillionSongSubset.
# NOTE: despite its name, `titles` holds the integer returned by
# count_all_files (a file count), not a collection of song titles.
titles = count_all_files('./MillionSongSubset', ext='.h5')

In [5]:
titles

10000

# creating the columns for the not_hot_songs dataframe 

In [6]:
# Read the title from every .h5 file under ./MillionSongSubset (opens each file once).
# The values are passed through .str.decode('utf-8') further down, so they are
# presumably bytes at this point — TODO confirm against hdf5_getters.get_title.
all_titles = get_all_titles('./MillionSongSubset', ext='.h5')

In [7]:
def get_all_artists(basedir, ext='.h5'):
    """Collect the artist name stored in every ``*<ext>`` file under ``basedir``.

    Each matching file is opened with ``hdf5_getters.open_h5_file_read``, its
    artist name is read with ``hdf5_getters.get_artist_name``, and the handle
    is closed again before moving on.

    Args:
        basedir (str): Root directory to walk recursively.
        ext (str): Suffix appended to ``*`` to build the glob pattern
            (default ``'.h5'``).

    Returns:
        list: One entry per matched file, in traversal order, holding whatever
        ``hdf5_getters.get_artist_name`` returned for that file.
    """
    artists = []
    for root, _dirs, _files in os.walk(basedir):
        # glob.escape keeps folder names containing '[', '*' or '?' from being
        # treated as patterns (which would silently skip their files).
        pattern = os.path.join(glob.escape(root), '*' + ext)
        for path in glob.glob(pattern):
            h5 = hdf5_getters.open_h5_file_read(path)
            try:
                artists.append(hdf5_getters.get_artist_name(h5))
            finally:
                # Always release the handle, even if reading the name fails.
                h5.close()
    return artists

In [8]:
# Read the artist name from every .h5 file under ./MillionSongSubset.
# This is a second, separate walk over the same directory as get_all_titles;
# the DataFrame built from both lists pairs entries by position, so it relies
# on both walks visiting the files in the same order.
all_artists = get_all_artists('./MillionSongSubset', ext='.h5')

In [9]:
# Pair titles and artists positionally: row i combines all_titles[i] with all_artists[i].
not_hot_songs = pd.DataFrame({"Title": all_titles, "Artist": all_artists})

In [10]:
# Decode both text columns as UTF-8 and trim single quotes from each end.
not_hot_songs[['Title', 'Artist']] = not_hot_songs[['Title', 'Artist']].apply(
    lambda column: column.str.decode('utf-8').str.strip("'")
)

In [11]:
not_hot_songs

Unnamed: 0,Title,Artist
0,I Didn't Mean To,Casual
1,Soul Deep,The Box Tops
2,Amor De Cabaret,Sonora Santanera
3,Something Girls,Adam Ant
4,Face the Ashes,Gob
...,...,...
9995,The Hanged Man,Moonspell
9996,The Wonderful World Of The Young,Danny Williams
9997,Sentimental Man,Winston Reedy
9998,Zydeco In D-Minor,"Myrick ""Freeze"" Guillory"


# scraping the current Billboard Hot 100 into hot_100_songs, removing from not_hot_songs any song that also appears there, and saving not_hot_songs to a .csv file

In [12]:
def scrape_100(url="https://www.billboard.com/charts/hot-100/", timeout=10):
    """
    Scrapes the Billboard Hot 100 chart and returns a DataFrame with song titles and artists.

    Args:
        url (str): Chart page to download. Defaults to the Hot 100 page.
        timeout (float): Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        pandas.DataFrame: DataFrame containing the song titles and artists
        (columns "Title" and "Artist"), one row per scraped chart entry.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server answers with an HTTP error status.
        ValueError: If the number of scraped titles and artists differs, which
            means the two lists cannot be paired row by row.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into an empty chart.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Scraping the titles of the Hot 100
    title = [
        heading.get_text().strip()
        for heading in soup.select("li.lrv-u-width-100p li h3")
    ]

    # Scraping their respective artists. The same selector also returns spans
    # that are not artist names, so only text that starts with an ASCII letter
    # is kept.
    # NOTE(review): this filter also rejects artist names that begin with a
    # digit or a non-ASCII letter; such an entry would make the lists unequal.
    artist_pattern = re.compile(r'^[A-Za-z\s]+(?:[A-Za-z0-9\s\W]*[A-Za-z0-9])?$')
    artist = []
    for span in soup.select("li.lrv-u-width-100p li span"):
        text = span.get_text().strip()
        if artist_pattern.match(text):
            artist.append(text)

    # Titles and artists are paired purely by position, so a length mismatch
    # means the pairing is unreliable; report it explicitly.
    if len(title) != len(artist):
        raise ValueError(
            "Scraped %d titles but %d artists from %s; cannot pair them."
            % (len(title), len(artist), url)
        )

    # Saving the data into a DataFrame
    hot_100_songs = pd.DataFrame({"Title": title, "Artist": artist})

    return hot_100_songs


In [13]:
# Download and parse the chart now; scrape_100 performs a live HTTP request to billboard.com.
hot_100_songs = scrape_100()

In [14]:
def remove_duplicates(not_hot_songs, hot_100_songs):
    """
    Removes rows from the 'not_hot_songs' dataframe whose (Title, Artist) pair
    also appears in the 'hot_100_songs' dataframe.

    Rows that are merely repeated inside 'not_hot_songs' itself are kept; only
    rows that match a Hot 100 entry on BOTH columns are removed.

    Args:
        not_hot_songs (pandas.DataFrame): DataFrame with 'Title' and 'Artist' columns.
        hot_100_songs (pandas.DataFrame): DataFrame with 'Title' and 'Artist' columns.

    Returns:
        pandas.DataFrame: 'not_hot_songs' without the rows found in
        'hot_100_songs' (original index labels preserved). When nothing
        matches, "No Duplicates" is printed and the input frame itself is
        returned unchanged.
    """
    keys = ['Title', 'Artist']

    # De-duplicate the right-hand keys so the left merge yields exactly one
    # output row per row of 'not_hot_songs', in the same order.
    hot_keys = hot_100_songs[keys].drop_duplicates()
    flagged = not_hot_songs[keys].merge(hot_keys, on=keys, how='left', indicator=True)

    # '_merge' is 'both' for rows that also exist in the Hot 100. The merge
    # resets the index, so convert to a positional boolean mask.
    in_hot_100 = (flagged['_merge'] == 'both').to_numpy()

    if not in_hot_100.any():
        print("No Duplicates")
        return not_hot_songs

    # Keep only the rows that are NOT present in 'hot_100_songs'.
    return not_hot_songs[~in_hot_100]

In [15]:
# Filter not_hot_songs against the scraped Hot 100
# (remove_duplicates prints "No Duplicates" when the two frames share no row)
filtered_songs = remove_duplicates(not_hot_songs, hot_100_songs)

# Print the resulting dataframe as plain text
print(filtered_songs)

No Duplicates
                                 Title                    Artist
0                     I Didn't Mean To                    Casual
1                            Soul Deep              The Box Tops
2                      Amor De Cabaret          Sonora Santanera
3                      Something Girls                  Adam Ant
4                       Face the Ashes                       Gob
...                                ...                       ...
9995                    The Hanged Man                 Moonspell
9996  The Wonderful World Of The Young            Danny Williams
9997                   Sentimental Man             Winston Reedy
9998                 Zydeco In D-Minor  Myrick "Freeze" Guillory
9999                    Shattered Life       Seventh Day Slumber

[10000 rows x 2 columns]


In [16]:
# `filtered_songs` is the frame returned by remove_duplicates(): its columns
# are already decoded to text and it has already been checked against the
# Hot 100. Rebuilding it here from the raw `all_titles` / `all_artists` lists
# would silently throw that filtering away, so carry the filtered frame
# forward under the name used by the export step instead.
not_hot_songs = filtered_songs

In [19]:
# Write the frame to disk. DataFrame.to_csv returns None when given a path, so
# its result must not be assigned back to `not_hot_songs`; doing so would
# replace the frame with None and make this cell fail when run a second time.
not_hot_songs.to_csv('not_hot_songs.csv', index=False)