In [19]:
# Execute the local helper script so the names it defines become available in this notebook.
# NOTE(review): `get_dataframe` (and possibly `os`) used in later cells are presumably
# defined/imported by this script — confirm against pkg/db.py.
%run ./pkg/db.py

In [20]:
import os

import nltk
import pandas as pd
from dotenv import load_dotenv
from nltk.corpus import stopwords, wordnet
from stemming.porter2 import stem


In [21]:
# Fetch the NLTK corpora one after another, in the same order as before.
for corpus_name in ('stopwords', 'words', 'wordnet'):
    download_ok = nltk.download(corpus_name)
# Echo the status returned by the last download as the cell output.
download_ok

[nltk_data] Downloading package stopwords to /home/jmoeh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/jmoeh/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jmoeh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:

# Load variables from a .env file (if present) into the process environment,
# then read the PostgreSQL connection string from the environment.
load_dotenv()
conn_string = os.getenv('POSTGRES_CONNECTION_STRING')
# Fail fast with a readable message instead of handing None to the database helper.
if not conn_string:
    raise RuntimeError(
        "POSTGRES_CONNECTION_STRING is not set; define it in the environment or in a .env file."
    )

In [23]:
# Load the lyrics sample through the database helper.
# NOTE(review): get_dataframe is defined outside this notebook; it appears to run the
# SQL stored in the named file and return a DataFrame — confirm.
lyrics_query_file = 'lyrics_limit.sql'
df = get_dataframe(lyrics_query_file, conn_string)

SELECT * FROM lyrics WHERE track_id IN (
    SELECT track_id FROM lyrics GROUP BY track_id ORDER BY random() LIMIT 1000
)


In [24]:
# Drop stopwords: the English stopword list is stemmed first, and every row whose
# `word` equals one of the stemmed stopwords is discarded.
stops = set(stopwords.words('english'))
stops_stem = {stem(stop_word) for stop_word in stops}
is_stopword = df['word'].isin(stops_stem)
df_stop = df[~is_stopword]
df_stop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51982 entries, 20 to 80182
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   track_id  51982 non-null  object
 1   mxm_tid   51982 non-null  int64 
 2   word      51982 non-null  object
 3   count     51982 non-null  int64 
 4   is_test   51982 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 2.4+ MB


In [25]:
# English vocabulary taken from WordNet, plus the Porter2-stemmed form of every entry.
eng_words = set(wordnet.words())
eng_words_stem = set(map(stem, eng_words))

def is_english(word: str) -> bool:
    """Return True if `word` is in the English vocabulary, raw or stemmed.

    The check is an exact membership test against the module-level sets
    `eng_words_stem` and `eng_words`; `word` is not normalised in any way.
    """
    return any(word in vocabulary for vocabulary in (eng_words_stem, eng_words))

In [26]:
def get_percentage_english(group: pd.DataFrame) -> float:
    """Return the share of a track's word occurrences that are English.

    Args:
        group: Rows belonging to one track, with a `word` column and a `count`
            column holding how many times that word occurs.

    Returns:
        A fraction (not a 0-100 percentage): occurrences of words accepted by
        `is_english` divided by all occurrences in the group. For non-negative
        counts this lies between 0 and 1. Returns 0.0 when the group has no
        occurrences at all, instead of raising ZeroDivisionError.
    """
    counts = group['count']
    total_occurrences = counts.sum()
    if total_occurrences == 0:
        return 0.0
    # Weight each row by its count instead of expanding every word into a
    # Python list row by row; the resulting ratio is the same.
    english_rows = group['word'].map(is_english).astype(bool)
    return float(counts[english_rows].sum() / total_occurrences)

In [27]:
# Keep only tracks whose share of English word occurrences is strictly above 90%.
min_english_share = 0.9
tracks = df_stop.groupby(['track_id'])
df_eng = tracks.filter(
    lambda track_rows: get_percentage_english(track_rows) > min_english_share
)

In [28]:
# Within the remaining tracks, keep only rows whose word is in the raw or the
# stemmed English vocabulary.
known_vocabulary = eng_words | eng_words_stem
df_eng = df_eng[df_eng['word'].isin(known_vocabulary)]

In [29]:
# Export the three relevant columns with a fresh 0..n-1 index; the index is still
# written as the first, unnamed CSV column, as before.
export_columns = ['track_id', 'word', 'count']
df_export = df_eng[export_columns].reset_index(drop=True)
df_export.to_csv('../data/df_eng_1000.csv', encoding='utf-8')