In [None]:
# IPython magic: execute ./pkg/db.py inside this notebook's namespace.
# NOTE(review): get_dataframe, used further down, is presumably defined there — confirm.
%run ./pkg/db.py

In [None]:
import nltk
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from nltk.corpus import stopwords, wordnet
from stemming.porter2 import stem


In [None]:
# fetch the NLTK resources used by this notebook
for resource in ('stopwords', 'words', 'wordnet'):
    nltk.download(resource)

In [None]:

# `os` is not imported anywhere else in this notebook; import it here so this
# cell does not depend on whatever names `%run ./pkg/db.py` happens to leave
# in the namespace.
import os

# load_dotenv() is expected to populate the environment from a local .env file,
# after which the Postgres connection string is read from the environment
# (conn_string is None when the variable is not set).
load_dotenv()
conn_string = os.getenv('POSTGRES_CONNECTION_STRING')

In [None]:
# load the lyrics and songs tables, lyrics first
# NOTE(review): get_dataframe is defined outside this notebook; presumably it
# runs the named SQL file against the database — confirm.
df_lyrics, df_songs = [
    get_dataframe(sql_file, conn_string)
    for sql_file in ('lyrics.sql', 'songs.sql')
]

In [None]:
# drop stop words: every row whose word matches a stemmed English stop word
# (assumes the 'word' column already holds stemmed tokens — TODO confirm)
stops = set(stopwords.words('english'))
stops_stem = {stem(stop_word) for stop_word in stops}
df_stop = df_lyrics.loc[~df_lyrics['word'].isin(stops_stem)]
df_stop.info()

In [None]:
# English vocabulary from WordNet, kept both verbatim and in stemmed form
eng_words = set(wordnet.words())
eng_words_stem = {stem(entry) for entry in eng_words}

def is_english(word: str) -> bool:
    """Return True when `word` is in the stemmed or the verbatim English vocabulary."""
    if word in eng_words_stem:
        return True
    return word in eng_words

In [None]:
def get_percentage_english(group: pd.DataFrame) -> float:
    """Return the fraction of a track's word occurrences that are English.

    Args:
        group: rows of a single track, with a 'word' column and an integer
            'count' column giving how often that word occurs.

    Returns:
        A value between 0.0 and 1.0: English occurrences divided by all
        occurrences. Returns 0.0 when the group has no occurrences at all
        (no rows, or only non-positive counts) instead of dividing by zero.
    """
    total = 0
    english = 0
    # weight each word by its count rather than materialising a repeated list
    for word, count in zip(group['word'], group['count']):
        if count <= 0:
            # a non-positive count contributes no occurrences
            continue
        total += count
        if is_english(word):
            english += count
    if total == 0:
        return 0.0
    return english / total

In [None]:
# keep only tracks whose word occurrences are more than 90% English
# (a track at exactly 90% is dropped)
df_eng = (
    df_stop
    .groupby(['track_id'])
    .filter(lambda track: get_percentage_english(track) > 0.9)
)

In [None]:
# drop the remaining rows whose word is in neither English vocabulary set
df_eng = df_eng[df_eng['word'].isin(eng_words | eng_words_stem)]

In [None]:
# assemble the fragment column: each word repeated `count` times.
# assign() returns a new frame, so this does not write into a filtered slice
# (avoids SettingWithCopyWarning), and the list build also works when df_eng
# has no rows, where a row-wise apply would not yield a column.
df_eng = df_eng.assign(
    fragment=[
        [word] * count
        for word, count in zip(df_eng['word'], df_eng['count'])
    ]
)

In [None]:
# build track dataframe: one row per track holding the list of its fragments
df_track = df_eng.groupby('track_id')['fragment'].apply(list).reset_index()
# flatten the per-word fragments into one word list in a single pass
# (sum(x, []) copies the growing list once per fragment, i.e. quadratic work)
df_track['fragment'] = df_track['fragment'].apply(
    lambda fragments: [word for fragment in fragments for word in fragment]
)
# shuffle the words of each track
# NOTE(review): the shuffle is unseeded, so the word order differs between
# runs — seed numpy beforehand if reproducible output is required.
df_track['fragment'] = df_track['fragment'].apply(
    lambda words: np.random.permutation(words).tolist()
)

In [None]:
# attach song metadata to each track (inner join on track_id)
df_final = df_track.merge(df_songs, on='track_id')

In [None]:
# keep only the genres of interest
df_final = df_final.loc[
    df_final['genre'].isin([
        'Reggae',
        'Rap',
        'Pop_Rock',
        'Country',
        'RnB',
    ])
]

In [None]:
# down-sample Pop_Rock to at most 3000 tracks to equalize genre counts.
# The sample size is capped at the number of available Pop_Rock rows, because
# DataFrame.sample raises ValueError when n exceeds the population (without
# replacement). The fixed random_state keeps the selection reproducible.
is_pop_rock = df_final['genre'] == 'Pop_Rock'
pop_rock = df_final[is_pop_rock]
df_final = pd.concat([
    df_final[~is_pop_rock],
    pop_rock.sample(n=min(3000, len(pop_rock)), random_state=1),
])

In [None]:
# save the final, genre-balanced track dataframe as CSV.
# The original wrote df_eng here, so the merged/filtered/sampled df_final
# built above was never persisted despite the df_final.csv file name.
df_final.to_csv('../data/df_final.csv', encoding='utf-8')