In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [31]:
import os
import re
from typing import List, Tuple

In [None]:
%matplotlib inline

In [33]:
def split_text_into_languages(text: str) -> Tuple[List[str], List[str]]:
    """
    Splits a mixed-language text into separate lists of English and Korean words.

    Args:
        text (str): The input mixed-language (Korean and English) text.

    Returns:
        Tuple[List[str], List[str]]: A tuple containing two lists of words,
        where the first list contains English words and the second list contains Korean words.
    """
    english_words = re.findall(r'\b[a-zA-Z]+\b', text)
    korean_words = re.findall(r'\b[가-힣]+\b', text)
    return english_words, korean_words

In [44]:
raw_data = []
for song in os.listdir("data/"):
    total_english_words = 0
    total_korean_words = 0
    with open(f"data/{song}") as f:
        while True:
            line = f.readline()
            if not line:
                break
            if "[" in line or "]" in line:
                continue
            eng_words, kor_words = split_text_into_languages(line)
            total_english_words += len(eng_words)
            total_korean_words += len(kor_words)
    raw_data.append({
        "song_name": song[:song.index(".")].replace("_", " ").title(),
        "total_english_words": total_english_words,
        "total_korean_words": total_korean_words,
        "overall_total": sum([total_english_words, total_korean_words])
    })

In [48]:
df = pd.DataFrame.from_records(raw_data)
df.head(13)

Unnamed: 0,song_name,total_english_words,total_korean_words,overall_total
0,Super Shy,355,30,385
1,Hype Boy,268,91,359
2,New Jeans,181,19,200
3,Zero,267,79,346
4,Ditto,233,69,302
5,Get Up,49,0,49
6,Cookie,154,188,342
7,Asap,598,14,612
8,Cool With You,166,39,205
9,Omg,375,140,515


In [49]:
billboard_df = pd.read_csv("data/Hot 100.csv")

In [55]:
billboard_df[billboard_df["performer"] == "NewJeans"]

Unnamed: 0,chart_position,chart_date,song,performer,song_id,instance,time_on_chart,consecutive_weeks,previous_week,peak_position,worst_position,chart_debut,chart_url
63926,96,2023-01-21,Ditto,NewJeans,DittoNewJeans,1.0,1,,,96,96,2023-01-21,https://www.billboard.com/charts/hot-100/2023-...
63927,85,2023-01-28,Ditto,NewJeans,DittoNewJeans,1.0,2,1.0,96.0,85,96,2023-01-21,https://www.billboard.com/charts/hot-100/2023-...
63928,89,2023-02-04,Ditto,NewJeans,DittoNewJeans,1.0,3,2.0,85.0,85,96,2023-01-21,https://www.billboard.com/charts/hot-100/2023-...
63929,90,2023-02-11,Ditto,NewJeans,DittoNewJeans,1.0,4,3.0,89.0,85,96,2023-01-21,https://www.billboard.com/charts/hot-100/2023-...
63930,82,2023-02-18,Ditto,NewJeans,DittoNewJeans,1.0,5,4.0,90.0,82,96,2023-01-21,https://www.billboard.com/charts/hot-100/2023-...
205295,91,2023-01-28,OMG,NewJeans,OMGNewJeans,1.0,1,,,91,91,2023-01-28,https://www.billboard.com/charts/hot-100/2023-...
205296,79,2023-02-04,OMG,NewJeans,OMGNewJeans,1.0,2,1.0,91.0,79,91,2023-01-28,https://www.billboard.com/charts/hot-100/2023-...
205297,77,2023-02-11,OMG,NewJeans,OMGNewJeans,1.0,3,2.0,79.0,77,91,2023-01-28,https://www.billboard.com/charts/hot-100/2023-...
205298,74,2023-02-18,OMG,NewJeans,OMGNewJeans,1.0,4,3.0,77.0,74,91,2023-01-28,https://www.billboard.com/charts/hot-100/2023-...
205299,76,2023-02-25,OMG,NewJeans,OMGNewJeans,1.0,5,4.0,74.0,74,91,2023-01-28,https://www.billboard.com/charts/hot-100/2023-...
