In [116]:
import os
import pandas as pd
pd.options.mode.copy_on_write = False

In [125]:
def convert_song_genres(batch):
    """
    converts the genre of all the songs in a single batch (size of 100) to simplified genres
    """
    batch['genres'] = batch['genres'].apply(simplify_genres)
    return batch

def simplify_genres(genre_str):
    """
    simplify the genre string of a single song to a list of mapped genres
    """
    if 'Need Manual Check' in genre_str:
        return None
    
    else: 
        genre_list = eval(genre_str)
        simplified_genres = set()
        
        genre_mapping = {
            'pop': 'Pop',
            'rock': 'Rock',
            'hip hop': 'Hip Hop',
            'rap': 'Hip Hop',
            'r&b': 'R&B',
            'electronic': 'Electronic/Dance',
            'edm': 'Electronic/Dance',
            'dance': 'Electronic/Dance',
            'country': 'Country',
            'jazz': 'Jazz',
            'reggae': 'Reggae',
            'latin': 'Latin',
            'classical': 'Classical'
        }
        
        for g in genre_list:
            g = g.lower()
            for k, v in genre_mapping.items():
                if k == g:
                    simplified_genres.add(v)
        
        return list(simplified_genres) if simplified_genres else None

In [126]:
input_folder = "output"
output_folder = "simplified_output"

os.makedirs(output_folder, exist_ok=True)

batch_list = os.listdir(input_folder)

for batch in batch_list:
    if batch.endswith(".csv"):
        file_path = os.path.join(input_folder, batch)
        
        if os.path.getsize(file_path) == 0:
            print(f"Empty file: {batch}")
            continue
                
        df = pd.read_csv(file_path)
        new_file_path = os.path.join(output_folder, batch)
        df2 = convert_song_genres(df)
        df2.to_csv(new_file_path, index=False)

print("파일 수정 및 저장이 완료되었습니다.")

파일 수정 및 저장이 완료되었습니다.


In [127]:
df2.head()

Unnamed: 0,date,rank,song_title,artist,genres
0,2017-11-18,1.0,Rockstar,Post Malone,"[Pop, Hip Hop]"
1,2017-11-18,2.0,Havana,Camila Cabello,[Pop]
2,,,Bodak Yellow (Money Moves),Cardi B,
3,2017-11-18,4.0,1-800-273-8255,Logic,[Hip Hop]
4,2017-11-18,5.0,Thunder,Imagine Dragons,"[Rock, Pop]"


In [130]:
# merge all the files into one

import glob

files = os.path.join("simplified_output/", "genres_data_batch_*.csv")
joined_list = glob.glob(files)

joined_df = pd.concat([pd.read_csv(f) for f in joined_list], ignore_index=True)
joined_df

Unnamed: 0,date,rank,song_title,artist,genres
0,2016-01-02,1.0,Hello,Adele,['Pop']
1,2016-01-02,2.0,Sorry,Justin Bieber,['Pop']
2,2016-01-02,3.0,Hotline Bling,Drake,['Hip Hop']
3,2016-01-02,4.0,Love Yourself,Justin Bieber,['Pop']
4,2016-01-02,5.0,What Do You Mean?,Justin Bieber,['Pop']
...,...,...,...,...,...
30595,,,Too Hotty,"Quavo, Takeoff",
30596,2017-11-18,97.0,All The Pretty Girls,Kenny Chesney,['Country']
30597,2017-11-18,98.0,All On Me,Devin Dawson,
30598,2017-11-18,99.0,More Girls Like You,Kip Moore,['Country']


In [131]:
joined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30600 entries, 0 to 30599
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        29953 non-null  object 
 1   rank        29953 non-null  float64
 2   song_title  30600 non-null  object 
 3   artist      30364 non-null  object 
 4   genres      23852 non-null  object 
dtypes: float64(1), object(4)
memory usage: 1.2+ MB
