In [None]:
# Mount Google Drive at /content/drive; the Excel inputs read by this notebook live under that path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install pronouncing

In [None]:
import pandas
import seaborn as sns
import pronouncing 
from google.colab import files

In [None]:
# Load the spreadsheet with the last words of the songs (one row per song).
# Cells that read back as NaN are dropped, so each song keeps only the
# entries that were actually present in its row.
lyrics_table = pandas.read_excel('/content/drive/MyDrive/thesis/output.xlsx')
out = [
    [word for word in row if str(word) != 'nan']
    for row in lyrics_table.values.tolist()
]

print("Done")

In [None]:
# Indices of the songs for which no lyrics were found.
# The values are read from the column labelled 0 of the spreadsheet.
missing_lyrics_table = pandas.read_excel('/content/drive/MyDrive/thesis/indices_no_lyrics.xlsx')
no_lyrics = missing_lyrics_table[0].tolist()

print(len(no_lyrics))

In [None]:
# Number of retained entries per song.
# Used further down as the denominator when normalizing the rhyme scores.
length = [len(song) for song in out]

print("Done")

In [None]:
# Distribution of the song lengths. The extraction step may sometimes have
# captured non-lyric text and treated it as lyrics.
# Author's reading of the plot: values above 100 are probably outliers, but to
# be safe the threshold for flagging non-lyrics is set to 150.
sns.boxplot(x=length)

In [None]:
# Positions of the songs that probably do not contain actual lyrics:
# any song with more than 150 entries. These rows are removed later on.
too_long = [idx for idx, n_entries in enumerate(length) if n_entries > 150]

print(len(too_long))

In [None]:
#score based on rhymes
#coupled rhymes: entries are paired as (0, 1), (2, 3), ... within each song and
#a pair scores when the first word appears among the rhymes of the second.

# pronouncing.rhymes() is evaluated once per distinct word and stored as a set,
# instead of being recomputed (and scanned as a list) for every pair. The
# resulting counts are unchanged.
# NOTE(review): the membership test is case-sensitive - confirm the words in
# `out` use the same casing as the words returned by pronouncing.rhymes().
_coupled_rhyme_cache = {}

count_rhymes_coupled = []

for song in out:
    count = 0
    # zip() stops at the shorter slice, so a trailing unpaired word is skipped.
    for first, second in zip(song[0::2], song[1::2]):
        if second not in _coupled_rhyme_cache:
            _coupled_rhyme_cache[second] = set(pronouncing.rhymes(second))
        if first in _coupled_rhyme_cache[second]:
            count += 1
    count_rhymes_coupled.append(count)

print("Done")

In [None]:
#score based on rhymes
#alternating rhymes: every entry is compared with the one two positions later,
#i.e. pairs (0, 2), (1, 3), (2, 4), ... within each song.

# pronouncing.rhymes() is evaluated once per distinct word and stored as a set,
# instead of being recomputed (and scanned as a list) for every pair. The
# resulting counts are unchanged.
# NOTE(review): the membership test is case-sensitive - confirm the words in
# `out` use the same casing as the words returned by pronouncing.rhymes().
_alternating_rhyme_cache = {}

count_rhymes_alternating = []

for song in out:
    count = 0
    # song[2:] is two entries shorter, so zip() yields exactly the (j, j+2) pairs.
    for earlier, later in zip(song, song[2:]):
        if later not in _alternating_rhyme_cache:
            _alternating_rhyme_cache[later] = set(pronouncing.rhymes(later))
        if earlier in _alternating_rhyme_cache[later]:
            count += 1
    count_rhymes_alternating.append(count)

print("Done")

In [None]:
#score based on rhymes
#same words: entries are paired as (0, 1), (2, 3), ... within each song and a
#pair scores when both entries are equal.

count_rhymes_same = []

for song in out:
    # zip() stops at the shorter slice, so a trailing unpaired word is skipped.
    identical_pairs = sum(
        1 for first, second in zip(song[0::2], song[1::2]) if first == second
    )
    count_rhymes_same.append(identical_pairs)

print("Done")

In [None]:
# Per-song total of the three rhyme counts (coupled + alternating + same).
sum_rhymes = [
    coupled + alternating + same
    for coupled, alternating, same in zip(
        count_rhymes_coupled, count_rhymes_alternating, count_rhymes_same
    )
]
print("Done")

In [None]:
# Distribution of the summed rhyme counts per song (before normalization).
sns.boxplot(x=sum_rhymes)

In [None]:
#normalization of the rhyme values
def _rate_per_entry(counts, lengths):
    """Divide each count by the matching song length, rounded to 2 decimals.

    A song whose length is 0 has no defined rate; it gets NaN instead of
    aborting the whole cell with ZeroDivisionError.
    """
    return [
        round(count / n_entries, 2) if n_entries else float("nan")
        for count, n_entries in zip(counts, lengths)
    ]


normalization_rhymes = _rate_per_entry(sum_rhymes, length)
norm_coupled = _rate_per_entry(count_rhymes_coupled, length)
norm_alternating = _rate_per_entry(count_rhymes_alternating, length)
norm_same = _rate_per_entry(count_rhymes_same, length)

print("Done")

In [None]:
#loading data with all the songs and information about their peak rank
data = pandas.read_excel('/content/drive/MyDrive/thesis/dataset.xlsx')
peak_rank = data['peak.rank'].tolist()
print("Done")

In [None]:
#which 10th of the chart it is
rank_10 = []
for i in range (0, len(peak_rank)):
    if 10 >= peak_rank[i] > 0:
        rank_10.append(1)
    elif 20 >= peak_rank[i] >= 11:
        rank_10.append(2)
    elif 30 >= peak_rank[i] >= 21:
        rank_10.append(3)
    elif 40 >= peak_rank[i] >= 31:
        rank_10.append(4)
    elif 50 >= peak_rank[i] >= 41:
        rank_10.append(5)
    elif 60 >= peak_rank[i] >= 51:
        rank_10.append(6)
    elif 70 >= peak_rank[i] >= 61:
        rank_10.append(7)
    elif 80 >= peak_rank[i] >= 71:
        rank_10.append(8)
    elif 90 >= peak_rank[i] >= 81:
        rank_10.append(9)
    else:
        rank_10.append(10)
        
print("Done")

In [None]:
#adding new columns to the dataset 
data['ranks_10'] = rank_10
data['rhyme_score'] = normalization_rhymes
data['coupled'] = norm_coupled
data['alternating'] = norm_alternating
data['same'] = norm_same
data['length'] = length

print(data)

In [None]:
#deleting rows which don't have lyrics
no_lyrics_total = no_lyrics + too_long
end_data = data.drop(no_lyrics_total)
print(end_data)

In [None]:
#loading data with the number of second-person pronouns per song
data_you = pandas.read_excel('/content/drive/MyDrive/thesis/you_all.xlsx')
number_you = data_you[0].tolist()
print("Done")

In [None]:
#normalization of the number of second-person pronouns
updated_length = list(end_data['length'])
norm_you = [round(number_you[i]/updated_length[i], 2) for i in range(len(number_you))]
print("Done")

In [None]:
# Store the raw and the length-normalized second-person pronoun counts
# as two new columns of the filtered dataset.
for column_name, column_values in (('number_you', number_you), ('norm_you', norm_you)):
    end_data[column_name] = column_values
print("Done")

In [None]:
# Write the filtered dataset to an Excel file in the working directory.
# ExcelWriter.save() is deprecated and no longer exists in pandas 2.x; the
# context manager saves and closes the workbook on exit, also if writing fails.
with pandas.ExcelWriter('end_data.xlsx', engine='xlsxwriter') as writer:
    end_data.to_excel(writer, index=False)
print("Done")

In [None]:
# Download the exported workbook from the Colab runtime.
files.download('end_data.xlsx') 

In [None]:
#subset that is going to be used for the models
subset = end_data[end_data["ranks_10"].isin([1, 10])]

ranks_10 = subset['ranks_10'].tolist()

hit = []
for i in range (0, len(ranks_10)):
    if ranks_10[i] == 1:
        hit.append(1)
    else:
        hit.append(0)
       
subset['hit'] = hit

subset

In [None]:
# Keep only the songs whose 'length' is greater than 15.
is_long_enough = subset["length"] > 15
end_subset = subset.loc[is_long_enough]

In [None]:
# Write the modelling subset to an Excel file in the working directory.
# ExcelWriter.save() is deprecated and no longer exists in pandas 2.x; the
# context manager saves and closes the workbook on exit, also if writing fails.
with pandas.ExcelWriter('subset.xlsx', engine='xlsxwriter') as writer:
    end_subset.to_excel(writer, index=False)
print("Done")

In [None]:
# Download the exported subset workbook from the Colab runtime.
files.download('subset.xlsx') 