Goal is to detect three types of rhyming techniques: 
- assonance.
- mutli-syllable rhymes.
- rhyming schemes. 

- Read data: 
Read the lyrics in a list of lines. 
- preprocessing steps: 
1. remove special characters. 
2. remove maningless words? (implement it but keep it open). 
3. create a vowel representation for each word (assonance detection)
4. create a syllable representation of each word (multis detection)

In [1]:
import pronouncing


In [2]:
import pyphen

# Create an instance of the Pyphen class using the 'en' dictionary for English language
dic = pyphen.Pyphen(lang='en_US')

# Get the syllables of a word
word = 'income'
syllables = dic.inserted(word).split('-')

print(syllables)  # Output: ['ex', 'am', 'ple']


['in', 'come']


## Read & Preprocess Data

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zohabidi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
new_stop_words = ['ooh','yeah','hey','whoa','woah', 'ohh', 'was', 'mmm', 'oooh','yah','yeh','mmm', 'hmm','deh','doh','jah','wa']
stop_words.extend(new_stop_words)

In [5]:
stop_words[:30]

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself']

In [7]:
import string
lines,words = [], []
punctuation = string.punctuation.replace("'", "")
translation_table = str.maketrans(punctuation, ' ' * len(punctuation))

def remove_extra_spaces(input_string):
    # Split the string into words and join them with a single space
    return ' '.join(input_string.split())

with open("BarBreakDown\\lyrics_en\\2pac\\Hit_'Em_Up.txt", 'r') as file:
    # Iterate over each line in the file
    for line in file:
        # Strip leading/trailing whitespace from the line
        line = line.replace("\n", " ")
        # Replace punctuation marks with spaces
        line = line.translate(translation_table)
        line = remove_extra_spaces(line)
        ## remove stopwords not so sure about it. 
        
        lines.append(line)
        words.append(line.split())

In [8]:
words[0]

['I', "ain't", 'got', 'no', "motherfuckin'", 'friends', 'sucka', 'ass']

In [9]:
## create vowel representation: 
import phonetics
#ph.get_phonetic_transcription('bath')
phonetics.metaphone('motherfuckin')

'M0RFKN'

In [10]:
# check rhyming words
phonetics.metaphone('friends')[-1] == phonetics.metaphone('ass')[-1] 

True

In [11]:
bar1 = "sling some raps"
bar2 = "income tax"

words_bar1 = bar1.split(" ")
words_bar2 = bar2.split(" ")

bar1_syllables,bar2_syllables = [],[]
bar1_ph_syllables,bar2_ph_syllables = [],[]


for word in words_bar1:
    syllables = dic.inserted(word).split('-')
    bar1_syllables.append(syllables)
    syllables_ph = []
    for syllable in syllables:
        syllables_ph.append(phonetics.nysiis(syllable))
    bar1_ph_syllables.append(syllables_ph)

for word in words_bar2:
    syllables = dic.inserted(word).split('-')
    bar2_syllables.append(syllables)
    syllables_ph = []
    for syllable in syllables:
        syllables_ph.append(phonetics.nysiis(syllable))
    bar2_ph_syllables.append(syllables_ph)

In [12]:
bar1_ph_syllables

[['SA'], ['SANA'], ['RA']]

In [13]:
bar2_ph_syllables

[['IA', 'CANA'], ['TA']]

In [14]:
bar1 = "Cold Winter Day so I wear my jacket" 
bar2 = "gold winners play because life is a game"

words_bar1 = bar1.split(" ")
words_bar2 = bar2.split(" ")

bar1_syllables,bar2_syllables = [],[]
bar1_ph_syllables,bar2_ph_syllables = [],[]


for word in words_bar1:
    syllables = dic.inserted(word).split('-')
    bar1_syllables.append(syllables)
    syllables_ph = []
    for syllable in syllables:
        syllables_ph.append(phonetics.metaphone(syllable))
    bar1_ph_syllables.append(syllables_ph)

for word in words_bar2:
    syllables = dic.inserted(word).split('-')
    bar2_syllables.append(syllables)
    syllables_ph = []
    for syllable in syllables:
        syllables_ph.append(phonetics.metaphone(syllable))
    bar2_ph_syllables.append(syllables_ph)


In [15]:
bar1_ph_syllables

[['KLT'], ['AN', 'TR'], ['T'], ['S'], ['A'], ['AR'], ['M'], ['JK', 'AT']]

In [16]:
bar2_ph_syllables

[['KLT'], ['AN', 'NRS'], ['PL'], ['P', 'KS'], ['LF'], ['AS'], ['A'], ['KM']]

In [17]:
import Levenshtein

def levenshtein_similarity(str1, str2):
    # Calculate the Levenshtein distance
    distance = Levenshtein.distance(str1, str2)
    # Calculate the similarity ratio
    similarity = 1 - (distance / max(len(str1), len(str2)))
    return similarity

# Example usage
str1 = "KLT AN TR T S A AR"
str2 = "KLT AN NRS PL P KS LF"
similarity = levenshtein_similarity(str1, str2)
print(f"Levenshtein similarity: {similarity:.2f}")

Levenshtein similarity: 0.57


In [18]:
from difflib import SequenceMatcher

def difflib_similarity(str1, str2):
    # Create a SequenceMatcher object
    matcher = SequenceMatcher(None, str1, str2)
    # Calculate the similarity ratio
    similarity = matcher.ratio()
    return similarity

#similarity = difflib_similarity(str1, str2)
#print(f"difflib similarity: {similarity:.2f}")

In [19]:
# function to extract multi syllable rhymes --> assonance extraction --> rhyme schemes. 


In [20]:
similarity_threshold = 0.75
def detect_multi_rhymes(current_line_syllables, next_line_syllables):
    ## for each syllable in the current line check current line and next line. 
    # if you find matching syllables, start concatenating the matching syllable strings, stop when the similarity is under a specific threshold
    ## Sample input
    ## current_line_syllables = ["A", "B", "C"]
    ## next_line_syllables = ["D", "B", "C"]
    i=0
    mutli_rhymes = []

    while(i<(len(current_line_syllables)-1)):

        j,extension = 0, 0
        lookup_field = current_line_syllables[i+1:] + next_line_syllables

        while((i+extension)<len(current_line_syllables) and (j+extension) < len(lookup_field)): 
            str1 = ' '.join(current_line_syllables[i:i+1+extension])
            str2 = ' '.join(lookup_field[j:j+1+extension])

            similarity = difflib_similarity(str1, str2)
            if similarity >= similarity_threshold:
                extension +=1
            elif similarity<similarity_threshold and extension >1:
                break
            elif similarity<similarity_threshold and extension <=1:
                j+=1
                extension = 0
        
        if extension >1:
            mutli_rhyme = (current_line_syllables[i:i+extension],lookup_field[j:j+extension])   
            mutli_rhymes.append(mutli_rhyme)
        
        i = i+extension+1
    
    return mutli_rhymes

detect_multi_rhymes(["A", "B", "C"], ["D", "B", "C"])
#detect_multi_rhymes(["A", "B", "C","D", "M", "N", "O"], ["E","F", "B", "L","B", "C", "D"])


[(['B', 'C'], ['B', 'C'])]

In [21]:
phonetics.metaphone("com")

'KM'

In [22]:
def create_phonetic_syllable_represtation(line):
    words_list = line.split(" ")
    line_syllables,line_syllables_ph = [],[]

    for word in words_list:
        syllables = dic.inserted(word).split('-')
        line_syllables.extend(syllables)
        for syllable in syllables:
            line_syllables_ph.append(phonetics.metaphone(syllable))
    return line_syllables,line_syllables_ph

create_phonetic_syllable_represtation("sling some raps")

(['sling', 'some', 'raps'], ['SLNK', 'SM', 'RPS'])

In [23]:
# for each line -> create syllable phonetic representation of current & next line --> call the multi function
# -> in case you find something add it to a dict: key:line number -> value: multi tuple. 
multi_rhymes = {}
for line_index in range(len(lines)-1):
    _,current_line_syllables_ph=create_phonetic_syllable_represtation(lines[line_index])
    _,next_line_syllables_ph=create_phonetic_syllable_represtation(lines[line_index+1])
    
    multi_rhyme = detect_multi_rhymes(current_line_syllables_ph, next_line_syllables_ph)
    if multi_rhyme is not None:
        multi_rhymes[line_index] = multi_rhyme


In [24]:
number_multi_rhymes, average_len_multi_rhymes = 0,0
for index, (key,value) in enumerate(multi_rhymes.items()):
    if len(value)>0:
        number_multi_rhymes += 1
        average_len_multi_rhymes += len(value[0])

print(number_multi_rhymes, average_len_multi_rhymes/number_multi_rhymes) 

85 2.0


## Assonance

In [25]:
# same logic, you check current line and next line 
# you need a vowel representation 
def cut_at_last_vowel(line):
    vowels = "aeiou"
    words = line.split(" ")
    line_vowel_repr = []

    for word in words:
        last_vowel_pos = -1

        # Find the position of the last vowel
        for i, char in enumerate(reversed(word.lower())):
            if char in vowels:
                last_vowel_pos = len(word) - 1 - i
                break
    
        # If no vowel is found, return the original string
        if last_vowel_pos != -1:
            line_vowel_repr.append(word[:last_vowel_pos + 1].lower()) 

    return line_vowel_repr

def delete_indices_from_list(input_list, indices_to_delete):
    # Sort the indices in reverse order
    indices_to_delete = sorted(indices_to_delete, reverse=True)
    
    # Delete elements at the specified indices
    for index in indices_to_delete:
        del input_list[index]
    
    return input_list

def detect_assonance(current_line_vowel_repr, next_line_vowel_repr):
    assonances = []
    i = 0
    while(i<len(current_line_vowel_repr)):
        j = i+1
        assonance = []
        indices_to_delete_current_line = []
        while (j<len(current_line_vowel_repr)):
            if current_line_vowel_repr[i][-1] == current_line_vowel_repr[j][-1]: 
                assonance.append(current_line_vowel_repr[j])
                indices_to_delete_current_line.append(j)
            j+=1

        j = 0
        while (j<len(next_line_vowel_repr)):
            if current_line_vowel_repr[i][-1] == next_line_vowel_repr[j][-1]: 
                assonance.append(next_line_vowel_repr[j])
            j+=1

        if len(assonance):
            assonance.insert(0,current_line_vowel_repr[i])
            assonances.append(list(set(assonance)))
            indices_to_delete_current_line.append(i)
            delete_indices_from_list(current_line_vowel_repr, indices_to_delete_current_line)
        else:
            i+=1
    
    return assonances

#detect_assonance([ "BA", "MAI", "LOO", "FA"], ["TAA", "LBO", "KKO"])

In [26]:
assonances = {}
for line_index in range(len(lines)-1):
    current_line_vowel_repr = cut_at_last_vowel(lines[line_index])
    if not len(current_line_vowel_repr):
        continue
    next_line_vowel_repr = cut_at_last_vowel(lines[line_index+1])
    
    assonance = detect_assonance(current_line_vowel_repr, next_line_vowel_repr)
    if len(assonance):
        assonances[line_index] = assonance

In [None]:
assonances

## Rhyme Schemes

In [None]:
four_line_schemes = ["ABAB","XAXA", "AAAA", "AABB", "AXAA", "AAXA", "ABBA", "AXXA", "AAAX"]
six_line_schems = ["XXAXXA", "AABCCB", "XAAXBB", "AABAAB"]

In [30]:
twopac_lines = lines 

In [42]:
# comparison not between actualy characters but pronunciation. 
lines = ["That girl is a real crowd pleaser",
         "Small world, all her friends know me",
         "Young bull livin' like an old geezer",
         "Release the cash, watch it fall slowle"] #ABAB 
lines = ["I got the horses in the back",
         "Horse tack is attached",
         "Hat is matte black",
         "Got the boots that's black to match"] ##AAAA
lines = ["Spend all your time waiting",
         "For that second chance",
         "For a break that would make it okay",
         "There's always some reason",
         "To feel not good enough",
         "And it's hard at the end of the day"] #XXAXXA

In [44]:
def add_to_rhyme_schemes(dictionary, key_, value_):
    if key_ not in rhyme_schemes:
        dictionary[key_] = [value_]
    else:
        dictionary[key_].append(value_)
    return dictionary

def are_lists_equal(list1, list2):
    set1 = set(map(tuple, list1))
    set2 = set(map(tuple, list2))
    return set1 == set2

def analyze_line_endings(line_endings):
    # Dictionary to store character information
    char_info = {}

    # Iterate over the string to collect information
    for index, char in enumerate(line_endings):
        if char not in char_info:
            char_info[char] = {'count': 0, 'indices': []}
        char_info[char]['count'] += 1
        char_info[char]['indices'].append(index)
    
    list_indices=[] 
    for k,v in char_info.items():
        list_indices.append(v["indices"])

    return char_info,list_indices

rhyme_schemes = {}

for line_index in range(len(lines)-3):

    remaining_lines = len(lines)-line_index
    four_line_endings = lines[line_index][-1] + lines[line_index+1][-1] + lines[line_index+2][-1] + lines[line_index+3][-1]

    if remaining_lines >5:
        six_line_endings = four_line_endings + lines[line_index+4][-1] + lines[line_index+5][-1]
        six_line_info, list_indices = analyze_line_endings(six_line_endings)

        unique_chars = len(six_line_info)
        match unique_chars:
            case 1:
                pass
            case 2:
                #check if it's AABAAB otherwise pass.
                scheme_AABAAB = are_lists_equal(list_indices, [[0,1,3,4], [2,5]])
                if scheme_AABAAB:
                    rhyme_schemes = add_to_rhyme_schemes(rhyme_schemes, "AABAAB", lines[line_index:line_index+4])
                    scheme_AABAAB = False
                    continue
                continue
                
            case 3:
                #check if it's AABCCB otherwise pass.
                scheme_AABCCB = are_lists_equal(list_indices, [[0,1], [2,5], [3,4]])
                if scheme_AABCCB:
                    rhyme_schemes = add_to_rhyme_schemes(rhyme_schemes, "AABCCB", lines[line_index:line_index+4])
                    scheme_AABCCB = False
                    continue
                continue
            case 4:
                pass
            case 5:
                #check if it's XXAXXA otherwise pass.
                scheme_XXAXXA = are_lists_equal(list_indices, [[0],[1],[2,5], [3],[4]]) 
                if scheme_XXAXXA:
                    rhyme_schemes = add_to_rhyme_schemes(rhyme_schemes, "XXAXXA", lines[line_index:line_index+4])
                    scheme_XXAXXA = False
                    continue
                continue
            case 6:
                continue #no rhyme scheme found.
    
    
    four_line_info, list_indices = analyze_line_endings(four_line_endings)



    unique_chars = len(four_line_info)

    match unique_chars:
        case 1:
            rhyme_schemes = add_to_rhyme_schemes(rhyme_schemes, "AAAA", lines[line_index:line_index+4])
            continue
        case 2:
            # can be ABAB, AABB, ABBA,AAXA
            scheme_ABAB = are_lists_equal(list_indices, [[0,2],[1,3]])
            if scheme_ABAB:
                rhyme_schemes = add_to_rhyme_schemes(rhyme_schemes, "ABAB", lines[line_index:line_index+4])
                scheme_ABAB = False
                continue
            
            scheme_AABB = are_lists_equal(list_indices, [[0,1], [2,3]])
            if scheme_AABB:
                rhyme_schemes = add_to_rhyme_schemes(rhyme_schemes, "AABB", lines[line_index:line_index+4])
                scheme_AABB = False
                continue

            scheme_ABBA = are_lists_equal(list_indices, [[0,3], [1,2]])
            if scheme_ABBA:
                rhyme_schemes = add_to_rhyme_schemes(rhyme_schemes, "ABBA", lines[line_index:line_index+4])
                scheme_ABBA = False
                continue

            scheme_AAXA = are_lists_equal(list_indices, [[0,1,3], [2]])
            if scheme_AAXA:
                rhyme_schemes = add_to_rhyme_schemes(rhyme_schemes, "AAXA", lines[line_index:line_index+4])
                scheme_AAXA = False
                continue

            continue
        case 3:
            #can be XAXA, AXXA
            scheme_XAXA = are_lists_equal(list_indices, [[0],[1,3],[2]])
            if scheme_XAXA:
                rhyme_schemes = add_to_rhyme_schemes(rhyme_schemes, "XAXA", lines[line_index:line_index+4])
                scheme_XAXA = False
                continue
            
            scheme_AXXA = are_lists_equal(list_indices, [[0,3],[1], [2]])
            if scheme_AXXA:
                rhyme_schemes = add_to_rhyme_schemes(rhyme_schemes, "AXXA", lines[line_index:line_index+4])
                scheme_AXXA = False
                continue
            continue
        case 4:
            continue # no rhyme scheme found


## Sentiment Analysis Twitter & Youtube comments

In [49]:
import certifi
print(certifi.where())

c:\Users\zohabidi\Desktop\Hackathon\beef_analysis\myenv\Lib\site-packages\certifi\cacert.pem


In [48]:
import tweepy
 
# Add Twitter API key and secret
consumer_key = ""
consumer_secret = ""
 
# Handling authentication with Twitter
auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
 
# Create a wrapper for the Twitter API
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

SSLError: HTTPSConnectionPool(host='api.twitter.com', port=443): Max retries exceeded with url: /oauth2/token (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1006)')))

## Google Trends? 