In [1]:
import pandas as pd
import numpy as np

import re
import os

import urllib.request
from bs4 import BeautifulSoup

In [2]:
# os.listdir() returns every directory entry in arbitrary order; keep only the
# subtitle files and sort them so the batch conversion below is deterministic.
fn_list = sorted(name for name in os.listdir('data/raw/') if name.lower().endswith('.srt'))

### Get timed data from YouTube SRT subtitles

In [3]:
def create_lyrics_table(fn):
    """Parse an SRT subtitle file into a table of timed lyric lines.

    Parameters
    ----------
    fn : str
        Path to the ``.srt`` file. It is read as UTF-8 (a leading BOM is
        tolerated).

    Returns
    -------
    pandas.DataFrame
        One row per subtitle cue with the columns ``start_time`` and
        ``end_time`` (the ``HH:MM:SS,mmm`` strings as written in the file),
        ``lyric`` (the first text line of the cue with ``(...)`` / ``[...]``
        annotations removed, ``''`` when the cue has no text) and
        ``duration`` (cue length in seconds, as a float).

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file is not valid UTF-8.
    ValueError
        If an end timestamp cannot be parsed as a time offset.
    """
    # "HH:MM:SS,mmm --> <end>". Both stamps are captured by the same match that
    # delimits the cues, so the time columns and the lyric column cannot drift
    # out of step with each other.
    time_re = re.compile(r'\s*([0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3}) --> (\S+)')
    # Raw string, and no trailing comma inside the pattern: the previous
    # pattern only matched a bracketed annotation when a comma followed it.
    annotation_re = re.compile(r'[\(\[].*?[\)\]]')

    # Explicit encoding so the result does not depend on the platform default.
    with open(fn, 'r', encoding='utf-8-sig') as h:
        sub = h.readlines()

    start_times, end_times = [], []
    groups = [[]]  # groups[0] collects whatever precedes the first timestamp
    for sentence in sub:
        stamp = time_re.match(sentence)
        if stamp:
            start_times.append(stamp.group(1))
            end_times.append(stamp.group(2))
            # The line right before a timestamp is the cue's sequence number,
            # which has already been appended to the previous group: drop it.
            if groups[-1] and groups[-1][-1].isdigit():
                groups[-1].pop()
            groups.append([])
        else:
            groups[-1].append(annotation_re.sub('', sentence.strip()).strip())

    # Only the first text line of each cue is kept; a cue without text gives ''.
    lines = [group[0] if group else '' for group in groups[1:]]

    # construct dataframe
    df = pd.DataFrame({'start_time': start_times,
                       'end_time': end_times,
                       'lyric': lines})

    # The stamps are offsets into the video, not calendar dates, so parse them
    # as timedeltas (SRT writes the decimal separator as a comma).
    starts = pd.to_timedelta([stamp.replace(',', '.') for stamp in start_times])
    ends = pd.to_timedelta([stamp.replace(',', '.') for stamp in end_times])
    df['duration'] = np.asarray((ends - starts) / np.timedelta64(1, 's'), dtype=float)
    return df

In [95]:
# Parse one subtitle file and preview the first rows of the resulting table.
fn = 'data/raw/ICantStopMe_eng.srt'
df = create_lyrics_table(fn)
df.head()

Unnamed: 0,start_time,end_time,lyric,duration
0,"00:00:15,370","00:00:18,953",Alarm goes off Ring ring a ling,3.583
1,"00:00:18,953","00:00:22,220",Whenever our eyes meet,3.267
2,"00:00:22,220","00:00:25,522",I notice but just spin around,3.302
3,"00:00:25,522","00:00:28,814",Yet I keep getting closer I know it’s too late,3.292
4,"00:00:28,814","00:00:31,814",In my heart I already know,3.0


In [32]:
# Convert a single subtitle file and save it next to the other processed
# subtitles, with an empty `member` column added.
fn = "More&More_ita.srt"
df = create_lyrics_table("data/raw/"+fn).assign(member='')
csv_path = "data/processed/srts/"+fn.split(".srt")[0]+".csv"
df.to_csv(csv_path, index=False, encoding='utf8')



In [100]:
# Convert every file in `fn_list` to a CSV with an empty `member` column.
for fn in fn_list:
    try:
        df = create_lyrics_table("data/raw/"+fn)
        df['member'] = ''
        df.to_csv("data/processed/"+fn.split(".srt")[0]+".csv", index=False, encoding='utf8')
    except Exception as e:
        # Best-effort batch: say which file failed and why, then carry on.
        print("{}: {}: {}".format(fn, type(e).__name__, e))
        

### Get member-tagged lines

In [36]:
def _style_color(style):
    """Return the lower-cased CSS ``color`` value of an inline style string.

    ``'color: #00ccff'`` gives ``'#00ccff'``. Returns ``''`` when ``style`` is
    missing or declares no ``color`` property.
    """
    for declaration in (style or '').split(';'):
        prop, sep, value = declaration.partition(':')
        if sep and prop.strip().lower() == 'color':
            return value.strip().lower()
    return ''


def get_member_lines(lyric_url):
    """Scrape a colour-coded lyrics page into a table of (line, member).

    Parameters
    ----------
    lyric_url : str
        URL of the song page to download.

    Returns
    -------
    pandas.DataFrame
        Columns ``line`` (text of each ``<span>`` in the lyrics container) and
        ``member`` (name looked up from the span's colour in the page's
        member legend, ``''`` when the colour is unknown or absent).

    Raises
    ------
    IndexError
        If no member colour legend is found in any of the supported layouts.
    ValueError
        If the page has neither a ``wp-block-group`` div nor a ``<td>``.

    Notes
    -----
    Prints the URL, the colour-to-member mapping and a blank line.
    """
    with urllib.request.urlopen(lyric_url) as response:
        content = response.read()

    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed, so the parse tree can differ between environments.
    soup = BeautifulSoup(content, 'html.parser')
    print(lyric_url)

    # Locate the element holding the member colour legend. The layouts are
    # tried in order; each attempt raises IndexError when nothing matches.
    try:
        # M&M
        member_colors_raw = soup.find_all('p', attrs={'class': "has-text-align-center"})[1]
    except IndexError:
        try:
            # ICSM
            member_colors_raw = [p for p in soup.find_all('p', attrs={'style': "text-align: center"})
                                 if "Jihyo" in str(p)][0]
        except IndexError:
            # all else
            member_colors_raw = [td for td in soup.find_all('td', attrs={'style': "text-align: center"})
                                 if "Jihyo" in str(td)][0]

    # colour -> member name, using the same colour extraction as the lyric
    # spans below so the two sides always agree on the key format.
    member_colors_dict = {}
    for mc in member_colors_raw.find_all('span'):
        color = _style_color(mc.get('style'))
        if color:
            member_colors_dict[color] = mc.text
    print(member_colors_dict)
    print()

    # Pick the lyrics container: the longest "wp-block-group" div (by markup
    # length), or the longest <td> on pages without such divs. When several
    # candidates qualify, the last one wins.
    groups = soup.find_all('div', attrs={'class': "wp-block-group"})
    if groups:
        max_length_div = max(len(str(div)) for div in groups)
        is_icsm = "twice-i-can-t-stop-me" in lyric_url
        for n, div in enumerate(groups):
            # On the "I Can't Stop Me" page the third group also qualifies.
            if (n == 2 and is_icsm) or len(str(div)) == max_length_div:
                lyrics_table = div
    else:
        cells = soup.find_all('td')
        if not cells:
            raise ValueError("no lyrics container found at " + lyric_url)
        max_length_div = max(len(str(td)) for td in cells)
        for td in cells:
            if len(str(td)) == max_length_div:
                lyrics_table = td

    if "wp-block-group__inner-container" in str(lyrics_table):
        lt = lyrics_table.find_all("div", attrs={'class': "wp-block-group__inner-container"})
        if lt:
            # NOTE(review): len() of a bs4 tag appears to count child nodes,
            # not characters, unlike the len(str(...)) used above; kept as-is.
            lyrics_table = max(lt, key=len)

    # first table is romanized table: every span is one colour-coded line
    lyrics_raw = lyrics_table.find_all('span')

    # build dataframe
    df = pd.DataFrame({
        'line': [span.text for span in lyrics_raw],
        'member': [member_colors_dict.get(_style_color(span.get('style')), '')
                   for span in lyrics_raw],
    })
    return df

In [30]:
# (song name, lyrics page URL) pairs. The two parallel lists used by the cells
# below are derived from this single table.
song_pages = [
    ("ICantStopMe", 'https://colorcodedlyrics.com/2020/10/twice-i-can-t-stop-me'),
    ("DanceTheNightAway", 'https://colorcodedlyrics.com/2018/07/twice-dance-night-away'),
    ("KnockKnock", 'https://colorcodedlyrics.com/2017/02/twice-knock-knock'),
    ("TT", 'https://colorcodedlyrics.com/2016/10/twice-tt'),
    ("Likey", 'https://colorcodedlyrics.com/2017/10/twice-likey'),
    ("HeartShaker", 'https://colorcodedlyrics.com/2017/12/twice-heart-shaker'),
    ("LikeOohAhh", 'https://colorcodedlyrics.com/2015/10/twice-like-ooh-ahh-ooh-ahhhage'),
    ("CheerUp", 'https://colorcodedlyrics.com/2016/04/twice-cheer-up'),
    ("Signal", 'https://colorcodedlyrics.com/2017/05/twice-signal'),
    ("WhatIsLove", 'https://colorcodedlyrics.com/2018/04/twice-what-is-love'),
    ("Fancy", 'https://colorcodedlyrics.com/2019/04/twice-fancy'),
    ("YesOrYes", 'https://colorcodedlyrics.com/2018/11/twice-yes-yes'),
    ("FeelSpecial", 'https://colorcodedlyrics.com/2019/09/twice-feel-special'),
    ("More&More", 'https://colorcodedlyrics.com/2020/06/twice-more-amp-more'),
    ("CryForMe", 'https://colorcodedlyrics.com/2020/12/twice-cry-for-me'),
]
lyric_urls = [page_url for page_name, page_url in song_pages]
song_names = [page_name for page_name, page_url in song_pages]

In [37]:
# Scrape a single song page, preview the result and save it as CSV.
song_name = 'LikeOohAhh'
lyric_url = 'https://colorcodedlyrics.com/2015/10/twice-like-ooh-ahh-ooh-ahhhage'
df = get_member_lines(lyric_url)
print(df.head())
csv_path = "data/processed/color_coded_lyrics/"+song_name+".csv"
df.to_csv(csv_path, index=False)

https://colorcodedlyrics.com/2015/10/twice-like-ooh-ahh-ooh-ahhhage
{'#00ccff': 'Nayeon', '#84cc00': 'Jeongyeon', '#ffb1b8': 'Momo', '#996de7': 'Sana', '#ffb74d': 'Jihyo', '#1af0af': 'Mina', '#ffe7ff': 'Dahyun', '#ff1744': 'Chaeyoung', '#396ad8': 'Tzuyu'}

                                        line  member
0     modu nareul gajigo maeil gaman an dujo  Nayeon
1                       naega neomu yeppeujo    Momo
2                   na ttaemune da himdeuljo    Momo
3  eodil geotgo isseodo ppalgan badagingeojo  Nayeon
4                    Red carpet gateun gibun    Sana


In [31]:
# Scrape every listed song page and save each table as CSV. A failure on one
# page is printed and the loop moves on to the next song.
for n, lyric_url in enumerate(lyric_urls):
    current_song = song_names[n]
    print(current_song)
    try:
        df = get_member_lines(lyric_url)
        print(df.head())
        csv_path = "data/processed/color_coded_lyrics/"+current_song+".csv"
        df.to_csv(csv_path, index=False)
    except Exception as e:
        print(e)

ICantStopMe
https://colorcodedlyrics.com/2020/10/twice-i-can-t-stop-me
{'#00ccff': 'Nayeon', '#84cc00': 'Jeongyeon', '#ffb1b8': 'Momo', '#996de7': 'Sana', '#ffb74d': 'Jihyo', '#1af0af': 'Mina', '#ffe7ff': 'Dahyun', '#ff1744': 'Chaeyoung', '#396ad8': 'Tzuyu'}

                                       line  member
0         allami ullyeodae Ring ring a ling  Nayeon
1           seoroye nungiri daheul ttaemada  Nayeon
2              almyeonseo bingbing doneunde    Mina
3  jeomjeom dagagajana I know it’s too late    Mina
4           maeumsogeuroneun da algo itjana    Sana
DanceTheNightAway
https://colorcodedlyrics.com/2018/07/twice-dance-night-away
{'#00ccff': 'Nayeon', '#84cc00': 'Jeongyeon', '#ffb1b8': 'Momo', '#996de7': 'Sana', '#ffb74d': 'Jihyo', '#1af0af': 'Mina', '#ffe7ff': 'Dahyun', '#ff1744': 'Chaeyoung', '#396ad8': 'Tzuyu'}

                            line  member
0  Lalalalalalala lalalalalalala    Mina
1  Lalalalalalala lalalalalalala    Mina
2    You and me in the moonlight  Naye

In [367]:
# Scrape the "Cry For Me" page and display the whole returned table.
lyric_url = 'https://colorcodedlyrics.com/2020/12/twice-cry-for-me'
df = get_member_lines(lyric_url)
df

https://colorcodedlyrics.com/2020/12/twice-cry-for-me
{'#00ccff': 'Nayeon', '#84cc00': 'Jeongyeon', '#ffb1b8': 'Momo', '#996de7': 'Sana', '#ffb74d': 'Jihyo', '#1af0af': 'Mina', '#ffe7ff': 'Dahyun', '#ff1744': 'Chaeyoung', '#396ad8': 'Tzuyu'}
<div class="wp-block-group__inner-container"><p><span style="color: #00ccff">I know gochyeo sseul gachido eoptan geol</span><br/><span style="color: #00ccff">hajiman geunyeowa dalli nan neol</span><br/><span style="color: #00ccff">shwipge nwajul mami eopgeodeun</span> (<span style="color: #ff1744">Never let go</span>)</p><p><span style="color: #1af0af">You don’t know me</span><br/><span style="color: #1af0af">L O V E or hatred</span><br/><span style="color: #1af0af">ibyeol daeshin nan sunjinhan misoman</span><br/><span style="color: #1af0af">oneuldo ne pume angillae</span></p><p><span style="color: #ffe7ff">amugeotto moreuneun cheok</span><br/><span style="color: #ffb1b8">Baby no more real love</span><br/><span style="color: #ffe7ff">neoye gyeote i

Unnamed: 0,member,line
0,Nayeon,I know gochyeo sseul gachido eoptan geol
1,Nayeon,hajiman geunyeowa dalli nan neol
2,Nayeon,shwipge nwajul mami eopgeodeun
3,Chaeyoung,Never let go
4,Mina,You don’t know me
...,...,...
78,Jihyo,yongseohal pinggyereul mandeureoga
79,Chaeyoung,I want you to
80,Chaeyoung,I want you to
81,Chaeyoung,I want you to


In [None]:
# Scrape the "Dance The Night Away" page, preview it and save it as CSV.
# NOTE(review): this unexecuted cell holds exactly the same code as the cell
# that follows it; one of the two looks redundant.
lyric_url = 'https://colorcodedlyrics.com/2018/07/twice-dance-night-away'
df = get_member_lines(lyric_url)
print(df.head())
df.to_csv("data/processed/color_coded_lyrics/DanceTheNightAway.csv", index=False)

In [295]:
# Scrape the "Dance The Night Away" page, preview it and save it as CSV.
lyric_url = 'https://colorcodedlyrics.com/2018/07/twice-dance-night-away'
df = get_member_lines(lyric_url)
print(df.head())
df.to_csv("data/processed/color_coded_lyrics/DanceTheNightAway.csv", index=False)

{'#00ccff': 'Nayeon', '#84cc00': 'Jeongyeon', '#ffb1b8': 'Momo', '#996de7': 'Sana', '#ffb74d': 'Jihyo', '#1af0af': 'Mina', '#ffe7ff': 'Dahyun', '#ff1744': 'Chaeyoung', '#396ad8': 'Tzuyu'}
   member                           line
0    Mina  Lalalalalalala lalalalalalala
1    Mina  Lalalalalalala lalalalalalala
2  Nayeon    You and me in the moonlight
3  Nayeon  byeol kkot chukje yeollin bam
4    Sana           pado sorireul teulgo


In [250]:
# Scrape the "I Can't Stop Me" page, preview it and save it as CSV.
lyric_url = 'https://colorcodedlyrics.com/2020/10/twice-i-can-t-stop-me'
df = get_member_lines(lyric_url)
print(df.head())
df.to_csv("data/processed/color_coded_lyrics/ICantStopMe.csv", index=False)

ICAN’TSTOPME
   member                                      line
0  Nayeon         allami ullyeodae Ring ring a ling
1  Nayeon           seoroye nungiri daheul ttaemada
2    Mina              almyeonseo bingbing doneunde
3    Mina  jeomjeom dagagajana I know it’s too late
4    Sana           maeumsogeuroneun da algo itjana


In [249]:
# Scrape the "More & More" page, preview it and save it as CSV.
lyric_url = 'https://colorcodedlyrics.com/2020/06/twice-more-amp-more'
df = get_member_lines(lyric_url)
print(df.head())
df.to_csv("data/processed/color_coded_lyrics/More&More.csv", index=False)

MORE&MORE
   member                                               line
0  Nayeon                                   I know I want it
1  Nayeon                   ibe bareun sorin ije geumanhalge
2  Nayeon                      Cause I deserve it deserve it
3    Mina  hokshi jamkkan naega miweojideorado geokjeong ...
4    Mina                        Cause I know you I know you


In [187]:
# Run the scraper on the "More & More" page and display the returned table
# (nothing is saved by this cell).
lyric_url = 'https://colorcodedlyrics.com/2020/06/twice-more-amp-more'
get_member_lines(lyric_url)

MORE&MORE


Unnamed: 0,member,line
0,Nayeon,I know I want it
1,Nayeon,ibe bareun sorin ije geumanhalge
2,Nayeon,Cause I deserve it deserve it
3,Mina,hokshi jamkkan naega miweojideorado geokjeong ...
4,Mina,Cause I know you I know you
...,...,...
56,Tzuyu,geureoni han beon deo
57,Nayeon,geureoni han beon deo
58,Mina,meomchugiga shireo
59,Nayeon,More and more


In [281]:
# Download one lyrics page and parse it, so the cells below can explore its
# structure through `soup`.
lyric_url = 'https://colorcodedlyrics.com/2018/07/twice-dance-night-away'
with urllib.request.urlopen(lyric_url) as response:
    content = response.read()
# Name the parser explicitly: without it bs4 picks whichever parser is
# installed, so the parse tree can differ between environments.
soup = BeautifulSoup(content, 'html.parser')

In [285]:
"Jihyo" in str(soup.find_all('td',attrs = {'style':"text-align: center"})[0])

True

In [253]:
# Inspect the centre-aligned <td> cells that may hold the member colour legend.
member_colors_raw = soup.find_all('td',attrs = {'style':"text-align: center"})
# (disabled) build the colour -> member mapping from the legend's spans
#member_colors_dict = dict([(mc['style'].split(':')[1],mc.text)\
#                      for mc in member_colors_raw.find_all('span')])
member_colors_raw

[<td style="text-align: center"><span style="color: #00ccff">Nayeon</span>, <span style="color: #84cc00">Jeongyeon</span>, <span style="color: #ffb1b8">Momo</span>,<br/>
 <span style="color: #996de7">Sana</span>, <span style="color: #ffb74d">Jihyo</span>, <span style="color: #1af0af">Mina</span>,<br/>
 <span style="color: #ffe7ff">Dahyun</span>, <span style="color: #ff1744">Chaeyoung</span>, <span style="color: #396ad8">Tzuyu</span></td>]

In [293]:
# Largest len() among the page's <td> elements.
# NOTE(review): len(td) on a bs4 tag appears to count child nodes rather than
# characters, whereas get_member_lines compares len(str(td)) — confirm which
# measure is intended here.
max_length_div = max([len(td) for td in soup.find_all('td')])


21

In [206]:
# Try the "has-text-align-center" layout: take the second matching element as
# the member legend and map each span's style value to the span text.
# NOTE(review): the recorded output of this cell is an IndexError, i.e. fewer
# than two elements matched on the page that was loaded at the time.
member_colors_raw = soup.find_all('',attrs = {'class':"has-text-align-center"})[1]
member_colors_dict = dict([(mc['style'].split(':')[1],mc.text)\
                      for mc in member_colors_raw.find_all('span')])
member_colors_dict

IndexError: list index out of range

In [220]:
# Use the third "wp-block-group" div as the lyrics container, then collect the
# text of every span and the part of its style attribute that follows ': '.
lyric_table = soup.find_all('div', attrs = {'class':"wp-block-group"})[2]
lyrics_raw = lyric_table.find_all('span')
lyrics = [l.text for l in lyrics_raw]
member_tags = [l['style'].split(': ')[1] for l in lyrics_raw]


['#00ccff',
 '#00ccff',
 '#1af0af',
 '#1af0af',
 '#996de7',
 '#996de7',
 '#ff1744',
 '#ff1744',
 '#84cc00',
 '#84cc00',
 '#ffb74d',
 '#ffb74d',
 '#ffb74d',
 '#ffb1b8',
 '#00ccff',
 '#00ccff',
 '#00ccff',
 '#ffe7ff',
 '#396ad8',
 '#ffb1b8',
 '#ffe7ff',
 '#ffb1b8',
 '#84cc00',
 '#ffb74d',
 '#396ad8',
 '#396ad8',
 '#996de7',
 '#996de7',
 '#ff1744',
 '#ff1744',
 '#00ccff',
 '#00ccff',
 '#00ccff',
 '#ffe7ff',
 '#ffb74d',
 '#00ccff',
 '#ffb74d',
 '#ffb74d',
 '#1af0af',
 '#ff1744',
 '#ff1744',
 '#ffe7ff',
 '#ffe7ff',
 '#ffb1b8',
 '#ffb1b8',
 '#ffe7ff',
 '#ffe7ff',
 '#1af0af',
 '#996de7',
 '#396ad8',
 '#00ccff',
 '#84cc00',
 '#84cc00',
 '#84cc00',
 '#ff1744',
 '#00ccff',
 '#00ccff',
 '#00ccff',
 '#996de7',
 '#396ad8']

In [173]:
# Romanized lyrics: use the first <td> of the page as the lyrics container,
# then collect the text of every span and the part of its style attribute
# that follows ': '.
lyric_table = soup.find_all('td')[0]
lyrics_raw = lyric_table.find_all('span')
lyrics = [l.text for l in lyrics_raw]
member_tags = [l['style'].split(': ')[1] for l in lyrics_raw]

In [176]:
# Pair every lyric line with its colour tag, then translate the colour into a
# member name. A colour that is missing from member_colors_dict raises
# KeyError here.
df = pd.DataFrame({'member':member_tags,
                  'line':lyrics})
df['member'] = df['member'].apply(lambda x: member_colors_dict[x])
df

Unnamed: 0,member,line
0,Nayeon,I know I want it
1,Nayeon,ibe bareun sorin ije geumanhalge
2,Nayeon,Cause I deserve it deserve it
3,Mina,hokshi jamkkan naega miweojideorado geokjeong ...
4,Mina,Cause I know you I know you
...,...,...
56,Tzuyu,geureoni han beon deo
57,Nayeon,geureoni han beon deo
58,Mina,meomchugiga shireo
59,Nayeon,More and more


In [None]:
# Display the raw subtitle lines.
# NOTE(review): in the visible cells `sub` is only assigned inside
# create_lyrics_table, so it has to be defined at notebook level before this
# cell and the ones below can run.
sub

In [None]:
# Pattern for the start of an SRT timing line: "HH:MM:SS,mmm -->".
re_pattern = r'[0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3} -->'
regex = re.compile(re_pattern)
# Keep only the lines containing a timestamp, then split each one on spaces:
# the first token is the start time, the third one the end time.
times = list(filter(regex.search, sub))
start_times = [time.split(' ')[0] for time in times]    
end_times = [time.split(' ')[2].strip('\n') for time in times]   
list(zip(start_times,end_times))

In [None]:
# Group the text lines of each cue and keep the first line of every group.
# Raw string, and no trailing comma inside the pattern: the previous pattern
# only matched a bracketed annotation when a comma followed it.
annotation_re = re.compile(r'[\(\[].*?[\)\]]')
lines = [[]]
for sentence in sub:
    if re.match(re_pattern, sentence):
        # The line right before a timestamp is the cue's sequence number,
        # already appended to the previous group: drop it if it is there.
        if lines[-1] and lines[-1][-1].isdigit():
            lines[-1].pop()
        lines.append([])
    else:
        clean_line = annotation_re.sub("", sentence.strip()).strip()
        lines[-1].append(clean_line)
lines = lines[1:]
# A cue without any text yields '' instead of raising IndexError.
lines = [x[0] if x else '' for x in lines]
lines

In [None]:
# Combine the three parallel lists into one table. zip() stops at the
# shortest list, so a length mismatch silently drops rows.
df = pd.DataFrame(list(zip(start_times, end_times, lines)),\
                  columns=['start_time','end_time','lyric'])
df

In [None]:
# Cue length: difference between the parsed end and start times ...
df['duration'] = pd.to_datetime(df['end_time'])-pd.to_datetime(df['start_time']) 
# ... divided by one second, which turns the time difference into a number of seconds.
df['duration'] = df['duration']/ np.timedelta64(1, 's')
df

In [None]:
# NOTE(review): once the preceding cell has divided by np.timedelta64(1, 's'),
# `duration` holds plain numbers, so `.total_seconds()` is presumably not
# available here — this looks like leftover experimentation; confirm before
# running.
df['duration'].values[0].item().total_seconds()