# Table of Contents

[Finding Data](#Finding-Datasets) 

[Exploring Data](#Exploring-Datasets)

[Cleaning](#Cleaning-Datasets)

[Outputs](#Outputting)

### Everynoise
Source: [Massive dump of Spotify-created playlists (r/spotify)](https://www.reddit.com/r/spotify/comments/4r3r1c/massive_dump_of_spotify_created_playlists_14k/)

## APIs

In [None]:
import pandas as pd

# Location of the Excel workbook on disk
file_path = 'C:\\Users\\jerry\\Documents\\Ucla\\100 days of data\\Orpheus\\Spotify Playlist Dump.xlsx'

# sheet_name=None loads every worksheet and returns a {sheet name: DataFrame} mapping
all_sheets = pd.read_excel(file_path, sheet_name=None)

# Pull out a single worksheet by its name
sheet1_df = all_sheets['The Sound']

# Preview each worksheet: its name followed by its first rows
for sheet_name in all_sheets:
    df = all_sheets[sheet_name]
    print(f"Sheet name: {sheet_name}")
    print(df.head())

### MusixMatch
API documentation: <https://developer.musixmatch.com>

In [None]:
# Example: Getting list of genres with description on Musixmatch Api

import requests
api_key = 'yourApiKey'

# Endpoint that returns the Musixmatch genre catalogue
url = 'https://api.musixmatch.com/ws/1.1/music.genres.get'

# Query parameters sent with the request
params = {
    'apikey': api_key,
    # Add other parameters as needed
}

# Send the request; the timeout keeps the cell from hanging on a stalled connection
response = requests.get(url, params=params, timeout=30)

# Stop on an HTTP failure instead of continuing with an undefined
# (or, in a notebook, stale) musixMatchData.
if response.status_code != 200:
    raise RuntimeError(f'Error: {response.status_code}')

# Parse the JSON response
musixMatchData = response.json()

# Musixmatch reports API-level failures (e.g. a bad API key) inside the JSON
# header while still answering with HTTP 200, so check that status as well.
api_status = musixMatchData['message'].get('header', {}).get('status_code', 200)
if api_status != 200:
    raise RuntimeError(f'Error: {api_status}')

# Extracting the list of genres
genres = musixMatchData['message']['body']['music_genre_list']

# Creating a list of genre names
musixMatch_genre_names = [genre['music_genre']['music_genre_name'] for genre in genres]

# Counting the number of genres
genre_count = len(musixMatch_genre_names)

# Output the list and count
print("List of Genres:")
print(musixMatch_genre_names)
print("\nCount of Genres:")
print(genre_count)

# Persist the names as a one-column CSV with the header "Genre"
musixMatch_genre_names = pd.Series(musixMatch_genre_names)
musixMatch_genre_names.to_csv('musixMatchGenreList.csv', index=False, header=['Genre'])

Extra work:

- Normalize the genre names and use partial (substring) matching to link the names in a list to the genres returned by the Musixmatch API.

In [None]:
import re
import unicodedata

def normalize_string(s):
    """Return a canonical form of *s* for loose genre-name comparison.

    The result is lowercased, has accents removed, treats hyphens as word
    separators, and has every run of whitespace collapsed to a single space
    with no leading or trailing whitespace.

    Args:
        s: The string to normalize.

    Returns:
        The normalized string.
    """
    s = s.lower()
    # NFKD splits accented characters into base letter + combining mark,
    # so dropping the combining marks removes the accents.
    s = unicodedata.normalize('NFKD', s)
    s = ''.join(c for c in s if not unicodedata.combining(c))
    # Turn hyphens into spaces BEFORE cleaning up whitespace; otherwise
    # "hip - hop" ends up with three spaces and "-rock" keeps a leading space.
    s = re.sub(r'-+', ' ', s)
    s = re.sub(r'\s+', ' ', s)
    return s.strip()

def is_substring(sub, string):
    """Return True when *sub* occurs inside *string*, otherwise False."""
    found = sub in string
    return found

# Normalize the genre names
thePulseOfNames_normalized = [normalize_string(name) for name in thePulseOfNames]
musixMatch_genre_names_normalized = [normalize_string(name) for name in musixMatch_genre_names]

# Create sets for normalized names (removes duplicates)
thePulseOfNames_set = set(thePulseOfNames_normalized)
musixMatch_genre_names_set = set(musixMatch_genre_names_normalized)

# An empty string is a substring of every string, so a single blank name
# would make everything "overlap". Blank names are left out of the matching.
musixMatch_candidates = [name for name in musixMatch_genre_names_set if name]

# Find overlap with partial matching: a name overlaps when it contains,
# or is contained in, at least one Musixmatch genre name.
overlap = {
    pulse_name
    for pulse_name in thePulseOfNames_set
    if pulse_name and any(
        is_substring(pulse_name, match_name) or is_substring(match_name, pulse_name)
        for match_name in musixMatch_candidates
    )
}

# Find non-overlap in PulseOf
non_overlap = thePulseOfNames_set - overlap
non_overlap_count = len(non_overlap)

### Wikipedia API

In [None]:
# !pip install wikipedia-api

In [None]:
import requests
import pandas as pd

def check_wikipedia(name):
    """
    Check if a Wikipedia page exists for the given name.

    Args:
        name: Page title to look up; it is converted to a string first.

    Returns:
        True when the English Wikipedia summary endpoint answers with
        HTTP 200 and the JSON payload contains an 'extract' key,
        otherwise False.

    Raises:
        requests.RequestException: On connection problems or a timeout.
    """
    from urllib.parse import quote

    # Percent-encode the title. safe='' also escapes '/', so names such as
    # "AC/DC" stay a single path segment, and '?' / '#' cannot start a
    # query string or fragment.
    title = quote(str(name), safe='')
    url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{title}"
    # The timeout stops one stalled request from blocking a whole batch run
    response = requests.get(url, timeout=10)

    if response.status_code != 200:
        return False
    return 'extract' in response.json()

def filter_names_with_wikipedia(df, column_name):
    """
    Filter the DataFrame by checking if Wikipedia pages exist for names in the given column.

    Args:
        df: DataFrame holding the names to check.
        column_name: Label of the column that contains the names.

    Returns:
        A DataFrame with only the rows for which check_wikipedia returned a
        truthy value. The input DataFrame is not modified.

    Side effects:
        Prints how many names were checked, how many were removed, and the
        list of removed names.
    """
    initial_count = len(df)

    # One lookup per row. astype(bool) guarantees a boolean mask even when
    # the frame is empty (an empty apply result may not have a bool dtype).
    has_page = df[column_name].apply(check_wikipedia).astype(bool)

    # Track names without Wikipedia results
    names_to_delete = df.loc[~has_page, column_name].tolist()

    # Select by mask rather than dropping by index label, so rows that share
    # an index label with a removed row are not removed along with it.
    df = df[has_page]

    final_count = len(df)
    deleted_count = initial_count - final_count

    # Report how many were deleted and which ones
    print(f"Total names checked: {initial_count}")
    print(f"Names deleted: {deleted_count}")
    print("Names without Wikipedia results:", names_to_delete)

    return df


# Run the Wikipedia filter over the first 100 rows (names in column 0) and keep the result
cleaned_df = filter_names_with_wikipedia(df=thePulseOfList.head(100), column_name=0)
print("\n")

# Finding Datasets

# Exploring Datasets

# Cleaning Datasets

# Outputting