# 0. Model

In [60]:
from dataclasses import dataclass

@dataclass
class Word:
    """One transcribed token together with the time span it occupies.

    Instances are populated from the ``text``/``start``/``end`` keys of a
    word entry in the transcription JSON.
    """

    # Token text exactly as transcribed (attached punctuation is kept).
    content: str
    # Moment the token begins -- presumably seconds into the audio; TODO confirm.
    start: float
    # Moment the token ends, expressed in the same unit as ``start``.
    end: float


# 1. Data

In [61]:
from pathlib import Path
import json

def get_gen_words(file:str|Path) -> str:    
    with open(file, 'r') as file:
        json_lyrics:dict = json.loads(file.read())
        text:str = json_lyrics.get('text')
       
    return text

In [62]:
# Reference lyrics that the generated transcript is compared against.
original_lyrics = """Passin' around, your blood and your body
No conversation to ease up your mind
And nobody seems to see the outside of it
Preoccupied, and nobody's hiding it
Preoccupied, nobody's hiding it
Preoccupied, nobody's hiding it
Opened your mind, filled it with bullshit 
Locked up your heart, without even knowing it
It must be a sign, the days that we're living in
Preoccupied, and nobody's hiding it
Preoccupied, and nobody's hiding it
Preoccupied, nobody's hiding it
Preoccupied, yeah, and nobody's hiding it
Preoccupied, and nobody's hiding it"""

# Full transcript text, read from the "text" key of the generated JSON file.
generated_lyrics = get_gen_words(Path("./gen_lyrics.json"))

In [63]:
# from pathlib import Path
# import json

def get_segments_lyrics(file:str|Path) -> list[dict]:
    segments = []
    with open(file, 'r') as file:
        json_lyrics:dict = json.loads(file.read())
        segments = json_lyrics.get('segments')
    return segments

def get_words_with_timestamp(file:str|Path) -> list[Word]:
    """Flatten every timed word of a lyrics JSON file into ``Word`` objects.

    Args:
        file: Location of a JSON document whose ``"segments"`` entries each
            hold a ``"words"`` list of objects with ``"text"``, ``"start"``
            and ``"end"`` keys.

    Returns:
        One ``Word`` per word entry, in segment order and then in word order
        within each segment. Missing or null ``"segments"``/``"words"``
        entries contribute no words rather than raising.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If a word entry lacks ``"text"``, ``"start"`` or ``"end"``.
    """
    # JSON is UTF-8 by specification, so do not rely on the platform default
    # encoding. The handle has its own name so that it no longer shadows the
    # ``file`` argument.
    with open(file, 'r', encoding='utf-8') as handle:
        json_lyrics:dict = json.load(handle)

    # ``or []`` keeps the loops below safe when a key is absent or null.
    segments:list[dict] = json_lyrics.get('segments') or []
    return [
        Word(
            content=word_json['text'],
            start=word_json['start'],
            end=word_json['end'],
        )
        for seg in segments
        for word_json in seg.get('words') or []
    ]


In [64]:
# Flat list of every transcribed word with its start/end times, in file order.
words_with_timestamp = get_words_with_timestamp(Path("./gen_lyrics.json"))

# 2. Clean

In [65]:
# Whitespace-tokenise both texts; punctuation stays attached to its word here.
words_org:list[str] = original_lyrics.split()
words_gen:list[str] = generated_lyrics.split()

In [66]:
# Remove commas from both ends of every token so that "mind," lines up with "mind".
words_org_cleaned:list[str] = [token.strip(',') for token in words_org]
words_gen_cleaned:list[str] = [token.strip(',') for token in words_gen]

In [78]:
from tabulate import tabulate
# Side-by-side listing of the raw (uncleaned) tokens, one row per position.
print(
    tabulate(
        dict(original=words_org, generated=words_gen),
        headers="keys",
        showindex="always",
    )
)

    original      generated
--  ------------  -------------
 0  Passin'       Passing
 1  around,       around,
 2  your          you're
 3  blood         blind
 4  and           in
 5  your          your
 6  body          body,
 7  No            no
 8  conversation  conversation,
 9  to            to
10  ease          ease
11  up            up
12  your          your
13  mind          mind,
14  And           nobody
15  nobody        sings,
16  seems         to
17  to            see
18  see           the
19  the           outside
20  outside       of
21  of            it,
22  it            preoccupied,
23  Preoccupied,  nobody's
24  and           hiding
25  nobody's      it,
26  hiding        preoccupied,
27  it            nobody's
28  Preoccupied,  hiding
29  nobody's      it,
30  hiding        preoccupied,
31  it            nobody's
32  Preoccupied,  hiding
33  nobody's      it,
34  hiding        opened
35  it            your
36  Opened        mind,
37  your          filled
38  mind, 

# 3. Algorithm

In [None]:
from pyphonetics import RefinedSoundex

def balk_spread_algorithm_with_phonetics(original:list[str], gen:list[str], spread:int = 5, distance:int = 1) -> list[tuple]:
    """Greedily pair words of ``original`` with phonetically close words of ``gen``.

    For each original word a window of ``spread`` consecutive positions in
    ``gen`` is scanned left to right, and the first candidate whose
    ``RefinedSoundex`` distance is at most ``distance`` is accepted. The
    window normally starts ``spread // 2`` positions before the original
    word's own index, but never before the position right after the previous
    match, so accepted pairs never cross and a ``gen`` word is used at most
    once.

    Args:
        original: Reference tokens.
        gen: Generated (transcribed) tokens to align against.
        spread: Number of consecutive ``gen`` positions inspected per word.
        distance: Largest phonetic distance still counted as a match.

    Returns:
        ``(index_in_original, index_in_gen)`` tuples in increasing order.
        Original words for which no candidate qualified are simply absent.
    """
    gen_count = len(gen)
    encoder = RefinedSoundex()
    half_window = spread // 2

    pairs = []
    next_free = 0  # first position in ``gen`` not yet consumed by a match

    for org_pos, word in enumerate(original):
        # Centre the window on the word's own index unless that would reach
        # back into positions that an earlier match already consumed.
        window_start = max(next_free, org_pos - half_window)
        # Clip the window to the end of ``gen``.
        window_stop = min(window_start + spread, gen_count)

        for gen_pos in range(window_start, window_stop):
            if encoder.distance(word, gen[gen_pos]) <= distance:
                pairs.append((org_pos, gen_pos))
                next_free = gen_pos + 1
                break

    return pairs

In [84]:
# (original index, generated index) pairs found with the default spread and distance.
indexes_of_similar_words = balk_spread_algorithm_with_phonetics(words_org_cleaned,words_gen_cleaned)

In [85]:
# Report every original word that the alignment left without a partner.

# All positions in the original token list.
original_indexes = set(range(len(words_org)))

# Positions in the original token list that received a match.
connected_indexes = {con[0] for con in indexes_of_similar_words}

diff_indexes = original_indexes.difference(connected_indexes)

# Sets have no guaranteed iteration order; sort so the report always reads
# in lyric order and is reproducible across runs.
for index in sorted(diff_indexes):
    print(words_org[index], index)
    

and 4
And 14
and 24
nobody's 25
hiding 26
it 27
Preoccupied, 28
nobody's 29
hiding 30
it 31
Preoccupied, 32
nobody's 33
hiding 34
it 35
Opened 36
your 37
mind, 38
filled 39
with 41
bullshit 42
Locked 43
up 44
your 45
heart, 46
without 47
even 48
knowing 49
it 50
It 51
must 52
be 53
a 54
sign, 55
days 57
that 58
we're 59
living 60
in 61
Preoccupied, 62
and 63
nobody's 64
hiding 65
it 66
Preoccupied, 67
and 68
yeah, 77
Preoccupied, 82
and 83
nobody's 84
hiding 85
it 86


In [86]:
# One line per matched pair: both indexes, their offset, the original token
# and the timed word it was aligned with.
for org_idx, gen_idx in indexes_of_similar_words:
    offset = org_idx - gen_idx
    print('{', org_idx, '  ', gen_idx, '}', " -> ", offset, ' ----- ', words_org[org_idx], " --- ", words_with_timestamp[gen_idx])


{ 0    0 }  ->  0  -----  Passin'  ---  Word(content='Passing', start=24.57, end=25.31)
{ 1    1 }  ->  0  -----  around,  ---  Word(content='around,', start=25.31, end=26.25)
{ 2    2 }  ->  0  -----  your  ---  Word(content="you're", start=29.77, end=30.65)
{ 3    3 }  ->  0  -----  blood  ---  Word(content='blind', start=30.65, end=31.01)
{ 5    5 }  ->  0  -----  your  ---  Word(content='your', start=31.31, end=31.61)
{ 6    6 }  ->  0  -----  body  ---  Word(content='body,', start=31.61, end=32.51)
{ 7    7 }  ->  0  -----  No  ---  Word(content='no', start=36.15, end=36.91)
{ 8    8 }  ->  0  -----  conversation  ---  Word(content='conversation,', start=36.91, end=38.11)
{ 9    9 }  ->  0  -----  to  ---  Word(content='to', start=41.65, end=42.7)
{ 10    10 }  ->  0  -----  ease  ---  Word(content='ease', start=42.7, end=43.08)
{ 11    11 }  ->  0  -----  up  ---  Word(content='up', start=43.08, end=43.38)
{ 12    12 }  ->  0  -----  your  ---  Word(content='your', start=43.38, e