## Data Notes:


In [39]:
import os
import re
from typing import List, Tuple
import json

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import lyricsgenius
# Apply matplotlib's "dark_background" style sheet to every figure drawn after this point.
plt.style.use("dark_background")
# Alternative seaborn theme, currently disabled:
# sns.set_theme(style="darkgrid")

%matplotlib inline

In [20]:
def clean_lyrics(text: str):
    """
    Drops the first line of the text and flattens the rest onto a single line.

    Everything up to and including the first newline is discarded, and every
    remaining newline is replaced by a single space. Text that contains no
    newline at all therefore yields an empty string.

    NOTE(review): the dropped first line is presumably the header line that
    precedes the lyrics body in the Genius response -- confirm against the
    raw `song.lyrics` value.

    Args:
        text (str): The raw, newline-separated lyrics text.

    Returns:
        str: The text after the first line, with newlines turned into spaces.
    """
    _header, _separator, body = text.partition("\n")
    return body.replace("\n", " ")

In [6]:
def split_text_into_languages(text: str) -> Tuple[List[str], List[str]]:
    """
    Extracts the English and the Korean words from a mixed-language text.

    A word is a maximal run of ASCII letters (English) or of precomposed
    Hangul syllables (Korean) that is delimited by regex word boundaries on
    both sides. Consequences of that definition:

    * Contractions are split at the apostrophe ("don't" -> "don", "t").
    * A run that directly touches another word character -- a digit, an
      underscore, or a letter of the other script -- is not delimited by a
      word boundary and is skipped entirely ("abc123", "love해").

    Args:
        text (str): The input mixed-language (Korean and English) text.

    Returns:
        Tuple[List[str], List[str]]: The English words followed by the Korean
        words, each list in order of appearance.
    """
    latin_run = re.compile(r'\b[a-zA-Z]+\b')
    hangul_run = re.compile(r'\b[가-힣]+\b')

    latin_tokens = [match.group(0) for match in latin_run.finditer(text)]
    hangul_tokens = [match.group(0) for match in hangul_run.finditer(text)]
    return latin_tokens, hangul_tokens

In [None]:
################################################
## Used for collecting lyric data from Genius ##
################################################

# token = ""
# genius = lyricsgenius.Genius(token)
# genius.remove_section_headers = True # Remove section headers (e.g. [Chorus]) from lyrics when searching
# genius.skip_non_songs = True # Skip hits thought to be non-songs (e.g. track lists)
# genius.excluded_terms = ["(Remix)", "(Live)", "(Cover)"] # Exclude songs with these words in their title

# artists = [
#     "NewJeans",
#     "BTS",
#     "(G)I-DLE",
#     "FIFTY FIFTY",
#     "AESPA",
#     "BLACKPINK",
#     "TWICE",
#     "Enhyphen",
#     "Stray Kids",
#     "TOMORROW X TOGETHER"
# ]

# data = {}
# for artist in artists:
#     data[artist] = {}
#     songs = genius.search_artist(artist, sort="title").songs
#     for song in songs:
#         print(f"Parsing {song} by {artist}")
#         title = song.title
#         eng_words, kor_words = split_text_into_languages(
#             clean_lyrics(song.lyrics)
#         )
#         data[artist][title] = {
#             "Total English Words": len(eng_words),
#             "Total Korean Words": len(kor_words)
#         }

In [57]:
# Location of the persisted per-song word counts.
LANGUAGE_DISTRIBUTION_CSV = "data/language_distribution_data.csv"

if "data" in globals():
    # Lyric counts were collected in this kernel session: flatten the nested
    # {artist: {song title: counts}} mapping into one row per (artist, song)
    # and refresh the saved copy.
    per_song_counts = {
        (artist, title): counts
        for artist, songs in data.items()
        for title, counts in songs.items()
    }
    df = (
        pd.DataFrame.from_dict(per_song_counts, orient="index")
        # The (artist, title) tuple keys form a two-level index; reset_index
        # turns them into the columns "level_0" and "level_1".
        .reset_index()
        .rename(columns={"level_0": "Artist", "level_1": "Song Name"})
    )
    # to_csv does not create missing parent directories, so make sure the
    # output folder exists before writing.
    os.makedirs(os.path.dirname(LANGUAGE_DISTRIBUTION_CSV), exist_ok=True)
    df.to_csv(LANGUAGE_DISTRIBUTION_CSV)
else:
    # `data` is only produced by the Genius collection step. When that step
    # has not been run in this session (e.g. after Restart & Run All), load
    # the previously saved table instead of failing with a NameError.
    df = pd.read_csv(
        LANGUAGE_DISTRIBUTION_CSV,
        # The file is written with the row index as its first, unnamed column.
        index_col=0,
        # Keep titles such as "NA" or "None" as text instead of parsing them as missing values.
        keep_default_na=False,
    )

In [60]:
# Structural overview of the per-song table: column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1104 entries, 0 to 1103
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Artist               1104 non-null   object
 1   Song Name            1104 non-null   object
 2   Total English Words  1104 non-null   int64 
 3   Total Korean Words   1104 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 34.6+ KB


In [61]:
# Summary statistics for the numeric word-count columns.
# NOTE(review): in the saved output the English-word maximum (5948) is far above the
# 75th percentile (233) -- possibly a non-song page that slipped through; TODO confirm.
df.describe()

Unnamed: 0,Total English Words,Total Korean Words
count,1104.0,1104.0
mean,183.580616,114.571558
std,308.235994,107.870857
min,0.0,0.0
25%,78.0,0.0
50%,147.0,116.5
75%,233.0,192.0
max,5948.0,792.0
