# Clean Genres:
This notebook cleans the raw genre data by mapping it to standardized genres.

**TODO**:
- Make `map_genres()` prettier (even though all my attempts so far have made it significantly slower)

In [3]:
# Imports here
import pandas as pd
import ast

In [5]:
# Functions here
def map_genres(genres):
    '''
    Maps a list of raw genre names to a list of standardized genres.

    Every raw name is lowercased and stripped of hyphens and spaces, then
    searched for keyword substrings. A single raw name can match several
    standardized genres (e.g. 'alternative pop' yields both
    'Alternative/Indie' and 'Pop').

    Args:
        genres (list of str): Raw genre names. Items that are not strings
            are ignored. None or an empty list yields ['Not Found'].

    Returns:
        list of str: Unique standardized genre names in alphabetical order
            (so repeated runs give identical output), or ['Not Found'] if
            no keyword matched.

    Example:
        >>> map_genres(['k-Pop', 'film'])
        ['Asian Music', 'Movie/Game Music', 'Pop']
        >>> map_genres(['alternative pop'])
        ['Alternative/Indie', 'Pop']
        >>> map_genres(['unknown genre'])
        ['Not Found']
    '''
    if not genres:  # Empty list or None type
        return ['Not Found']

    # Normalize each name for easier matching. Because hyphens and spaces
    # are removed here, every keyword below must be written without them.
    masks = [genre.lower().replace("-", "").replace(" ", "")
             for genre in genres if isinstance(genre, str)]

    # Manual keyword mapping. The explicit chain of substring tests is kept
    # on purpose: it short-circuits and avoids per-call helper overhead.
    found = set()
    for mask in masks:

        if 'rock' in mask:
            found.add('Rock')

        if ('film' in mask or 'videospiel' in mask or 'movie' in mask
                or 'game' in mask or 'musical' in mask):
            found.add('Movie/Game Music')

        if ('asia' in mask or 'kpop' in mask or 'anime' in mask
                or 'korean' in mask or 'jpop' in mask or 'japan' in mask):
            found.add('Asian Music')

        if 'pop' in mask or ('singer' in mask and 'songwriter' in mask):
            found.add('Pop')

        # 'rap' is a substring of 'trap', which belongs to Electronic, so
        # any mask containing 'trap' is excluded from Hip Hop.
        if ('trap' not in mask
                and ('hiphop' in mask or 'gangsta' in mask
                     or 'rap' in mask or 'dirtysouth' in mask)):
            found.add('Hip Hop')

        if 'metal' in mask or 'punk' in mask:
            found.add('Metal')

        if 'jazz' in mask or 'swing' in mask:
            found.add('Jazz')

        if 'classic' in mask or 'orchestra' in mask:
            found.add('Classical')

        if 'folk' in mask:
            found.add('Folk')

        if 'r&b' in mask or 'soul' in mask or 'rnb' in mask:
            found.add('R&B/Soul')

        if 'reggae' in mask:
            found.add('Reggae')

        if 'country' in mask:
            found.add('Country')

        if 'blues' in mask:
            found.add('Blues')

        if 'alternative' in mask or 'indie' in mask:
            found.add('Alternative/Indie')

        # 'lo-fi' is already normalized to 'lofi' by the hyphen removal.
        if 'chill' in mask or 'relax' in mask or 'lofi' in mask:
            found.add('Chill')

        if ('xmas' in mask or 'christmas' in mask or 'weihnacht' in mask
                or 'snow' in mask):
            found.add('Christmas')

        if 'afro' in mask or 'africa' in mask or 'afrika' in mask:
            found.add('African Music')

        if ('latin' in mask or 'corridos' in mask or 'ranchera' in mask
                or 'brasil' in mask or 'brazil' in mask or 'india' in mask
                or 'indisch' in mask or 'bollywood' in mask
                or 'spanish' in mask or 'argentine' in mask):
            found.add('Latin Music')

        if ('electro' in mask or 'techno' in mask or 'house' in mask
                or 'dance' in mask or 'rave' in mask or 'synth' in mask
                or 'trap' in mask or 'party' in mask or 'disco' in mask):
            found.add('Electronic')

    if found:
        # Sorted so the result does not depend on set iteration order.
        return sorted(found)

    return ['Not Found']

In [None]:
# Add genres from unique cleaned data to raw data

raw_file = '../local/data/daily17-24.csv'
clean_file = '../local/data/cleaned_global_17-24.csv'

# Load raw and cleaned data
raw_df = pd.read_csv(raw_file)
clean_df = pd.read_csv(clean_file)

# Keep a single genre entry per uri so the left merge below cannot
# multiply raw rows if a uri appears more than once in the cleaned data.
genre_lookup = clean_df[['uri', 'genres']].drop_duplicates(subset='uri')

# Merge the dataframes (every raw row is kept; unmatched uris get NaN genres)
merged_df = pd.merge(raw_df, genre_lookup, on='uri', how='left')

# Save the merged data next to the raw file.
# Only the extension is stripped: splitting on the first '.' would return
# an empty string here because the path starts with '..'.
merged_df.to_csv(raw_file.rsplit('.', 1)[0] + '_with_genres.csv', index=False)

In [None]:
# Input file and the two output files
in_file = '../local/data/daily17-24_with_genres.csv'
out_file1 = '../local/data/daily17-24_with_cleaned_genres.csv'
out_file2 = '../local/data/daily17-24_with_exploded_genres.csv'


def _parse_genre_cell(cell):
    '''Turns a string representing a list into an actual list; other values pass through.'''
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    return cell


# Load data
df = pd.read_csv(in_file)

# Three steps on the 'genres' column:
#   1. replace NaN values with the string of an empty list,
#   2. convert the list strings to actual lists for later list operations,
#   3. map the raw genres to standardized genres (duplicates removed).
df['genres'] = (
    df['genres']
    .fillna('[]')
    .apply(_parse_genre_cell)
    .apply(map_genres)
)

# Save cleaned data
df.to_csv(out_file1, index=False)
print('Saved cleaned genres to', out_file1)

# Explode the 'genres' column:
# each list element in each row of 'genres' now gets its own row.
df = df.explode('genres')

# Save exploded data
df.to_csv(out_file2, index=False)
print('Saved exploded genres to', out_file2)

Saved cleaned genres to local/data/daily17-24_with_cleaned_genres.csv
Saved exploded genres to local/data/daily17-24_with_exploded_genres.csv
