In [1]:
import pandas as pd
import ast
import re
import nltk
from nltk.corpus import stopwords, words
from nltk import pos_tag

In [2]:
def parse_musixmatch(path):    
    d = {
        'track_id': [],
        'word_counts': []
        }

    with open(path, 'r', encoding='utf-8') as file:
        top_words = None

        # line_count = 0
        for line in file:
            # if line_count >= 25:
            #     break
            # line_count += 1

            line = line.strip()

            if line.startswith('#'):
                continue

            elif line.startswith('%'):
                top_words = line[1:].split(',')
                continue
            else:
                parts = line.split(',')
                track_id = parts[0]
                
                word_counts = {}
                for part in parts[2:]:
                    match = re.match(r'(\d+):(\d+)', part)
                    if match:
                        word = top_words[int(match.group(1)) - 1]
                        count = int(match.group(2))
                        word_counts[word] = count

                d['track_id'].append(track_id)
                d['word_counts'].append(word_counts)

    df = pd.DataFrame(d)
    return df

In [4]:
dataset_path = 'data/mxm_dataset_train.txt'
df = parse_musixmatch(dataset_path)
df = df.head(15000)
df

Unnamed: 0,track_id,word_counts
0,TRAAAAV128F421A322,"{'i': 6, 'the': 4, 'you': 2, 'to': 2, 'and': 5..."
1,TRAAABD128F429CF47,"{'i': 10, 'you': 17, 'to': 8, 'and': 2, 'a': 2..."
2,TRAAAED128E0783FAB,"{'i': 28, 'the': 15, 'you': 2, 'to': 12, 'and'..."
3,TRAAAEF128F4273421,"{'i': 5, 'the': 4, 'you': 3, 'to': 2, 'and': 1..."
4,TRAAAEW128F42930C0,"{'i': 4, 'to': 5, 'and': 7, 'a': 2, 'me': 4, '..."
...,...,...
14995,TRBVHSJ128F1471230,"{'i': 9, 'the': 3, 'you': 4, 'to': 2, 'and': 3..."
14996,TRBVHUT128F42481F8,"{'i': 9, 'the': 1, 'a': 7, 'it': 1, 'not': 3, ..."
14997,TRBVHVW128F428C9C0,"{'i': 1, 'the': 17, 'you': 4, 'a': 1, 'me': 6,..."
14998,TRBVIBY128F932C19F,"{'i': 2, 'the': 12, 'you': 2, 'to': 7, 'and': ..."


In [5]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to /Users/mfardin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mfardin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/mfardin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [6]:
progress = 1

def filter_stop_interjunction(word_counts, total_rows):
    global progress
    print(f"Progress: {(progress * 100 / total_rows):.2f}% → working on {progress} of {total_rows}", end='\r', flush=True)
    progress += 1

    stop_words = set(stopwords.words('english'))

    filtered = {}
    for word, count in word_counts.items():
        if word.lower() not in stop_words:
            tagged = pos_tag([word])
            if tagged[0][1] != 'UH':
                filtered[word] = count

    return filtered      


In [7]:
df['filtered_word_counts'] = df['word_counts'].apply(filter_stop_interjunction, total_rows=len(df))
df

Progress: 100.00% → working on 15000 of 15000

Unnamed: 0,track_id,word_counts,filtered_word_counts
0,TRAAAAV128F421A322,"{'i': 6, 'the': 4, 'you': 2, 'to': 2, 'and': 5...","{'like': 2, 'de': 1, 'got': 1, 'would': 1, 'se..."
1,TRAAABD128F429CF47,"{'i': 10, 'you': 17, 'to': 8, 'and': 2, 'a': 2...","{'know': 5, 'time': 3, 'la': 7, 'get': 2, 'got..."
2,TRAAAED128E0783FAB,"{'i': 28, 'the': 15, 'you': 2, 'to': 12, 'and'...","{'love': 11, 'like': 1, 'time': 6, 'come': 4, ..."
3,TRAAAEF128F4273421,"{'i': 5, 'the': 4, 'you': 3, 'to': 2, 'and': 1...","{'know': 1, 'got': 3, 'feel': 1, 'let': 1, 'wo..."
4,TRAAAEW128F42930C0,"{'i': 4, 'to': 5, 'and': 7, 'a': 2, 'me': 4, '...","{'like': 1, 'take': 1, 'would': 1, 'wo': 1, 's..."
...,...,...,...
14995,TRBVHSJ128F1471230,"{'i': 9, 'the': 3, 'you': 4, 'to': 2, 'and': 3...","{'know': 2, 'get': 1, 'got': 1, 'feel': 2, 'wa..."
14996,TRBVHUT128F42481F8,"{'i': 9, 'the': 1, 'a': 7, 'it': 1, 'not': 3, ...","{'thing': 1, 'world': 1, 'caus': 6, 'gonna': 3..."
14997,TRBVHVW128F428C9C0,"{'i': 1, 'the': 17, 'you': 4, 'a': 1, 'me': 6,...","{'like': 1, 'come': 1, 'well': 1, 'noth': 1, '..."
14998,TRBVIBY128F932C19F,"{'i': 2, 'the': 12, 'you': 2, 'to': 7, 'and': ...","{'know': 1, 'time': 2, 'come': 4, 'go': 1, 'ma..."


In [8]:
nltk.download('words')

[nltk_data] Downloading package words to /Users/mfardin/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [9]:
progress = 1

def is_english_word(word_counts, total_rows):
    global progress
    print(f"Progress: {(progress * 100 / total_rows):.2f}% → working on {progress} of {total_rows}", end='\r', flush=True)
    progress += 1
    
    english_words = set(words.words())

    non_english_count = 0
    filtered_english_counts = {}
    for word, count in word_counts.items():        
        if word not in english_words:
            non_english_count += count
        else:
            filtered_english_counts[word] = count

    total = sum(word_counts.values())

    if non_english_count > total / 2:
        return None
    else:
        return filtered_english_counts

In [10]:
df['filtered_english'] = df['filtered_word_counts'].apply(is_english_word, total_rows=len(df))
df = df.dropna(subset=['filtered_english'])
df

Progress: 100.00% → working on 15000 of 15000

Unnamed: 0,track_id,word_counts,filtered_word_counts,filtered_english
0,TRAAAAV128F421A322,"{'i': 6, 'the': 4, 'you': 2, 'to': 2, 'and': 5...","{'like': 2, 'de': 1, 'got': 1, 'would': 1, 'se...","{'like': 2, 'de': 1, 'got': 1, 'would': 1, 'se..."
1,TRAAABD128F429CF47,"{'i': 10, 'you': 17, 'to': 8, 'and': 2, 'a': 2...","{'know': 5, 'time': 3, 'la': 7, 'get': 2, 'got...","{'know': 5, 'time': 3, 'la': 7, 'get': 2, 'got..."
2,TRAAAED128E0783FAB,"{'i': 28, 'the': 15, 'you': 2, 'to': 12, 'and'...","{'love': 11, 'like': 1, 'time': 6, 'come': 4, ...","{'love': 11, 'like': 1, 'time': 6, 'come': 4, ..."
3,TRAAAEF128F4273421,"{'i': 5, 'the': 4, 'you': 3, 'to': 2, 'and': 1...","{'know': 1, 'got': 3, 'feel': 1, 'let': 1, 'wo...","{'know': 1, 'got': 3, 'feel': 1, 'let': 1, 'wo..."
4,TRAAAEW128F42930C0,"{'i': 4, 'to': 5, 'and': 7, 'a': 2, 'me': 4, '...","{'like': 1, 'take': 1, 'would': 1, 'wo': 1, 's...","{'like': 1, 'take': 1, 'would': 1, 'wo': 1, 's..."
...,...,...,...,...
14995,TRBVHSJ128F1471230,"{'i': 9, 'the': 3, 'you': 4, 'to': 2, 'and': 3...","{'know': 2, 'get': 1, 'got': 1, 'feel': 2, 'wa...","{'know': 2, 'get': 1, 'got': 1, 'feel': 2, 'wa..."
14996,TRBVHUT128F42481F8,"{'i': 9, 'the': 1, 'a': 7, 'it': 1, 'not': 3, ...","{'thing': 1, 'world': 1, 'caus': 6, 'gonna': 3...","{'thing': 1, 'world': 1, 'right': 2, 'girl': 1..."
14997,TRBVHVW128F428C9C0,"{'i': 1, 'the': 17, 'you': 4, 'a': 1, 'me': 6,...","{'like': 1, 'come': 1, 'well': 1, 'noth': 1, '...","{'like': 1, 'come': 1, 'well': 1, 'wait': 7, '..."
14998,TRBVIBY128F932C19F,"{'i': 2, 'the': 12, 'you': 2, 'to': 7, 'and': ...","{'know': 1, 'time': 2, 'come': 4, 'go': 1, 'ma...","{'know': 1, 'time': 2, 'come': 4, 'go': 1, 'ma..."


In [17]:
emotions_list = pd.read_csv('data/emotion_lists.csv', header=0, converters={'emotions':ast.literal_eval})
emotions_list

Unnamed: 0,word,emotions
0,aback,[]
1,abacus,[]
2,abandon,"[fear, sad]"
3,abandoned,"[angry, fear, sad]"
4,abandonment,"[angry, fear, sad, surprise]"
...,...,...
14149,zone,[]
14150,zoo,[]
14151,zoological,[]
14152,zoology,[]


In [19]:
progress = 1

def emotion_map(word_counts, total_rows):
    global progress
    print(f"Progress: {(progress * 100 / total_rows):.2f}% → working on {progress} of {total_rows}", end='\r', flush=True)
    progress += 1

    emotions = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
    freqs = {emotion:0 for emotion in emotions}

    total = 0
    for word, count in word_counts.items():
        
        if word in emotions_list['word'].values:
            emotions = emotions_list.loc[emotions_list['word'] == word, 'emotions'].values[0]
            if not emotions:
                continue

            for emotion in emotions:
                freqs[emotion] += count
                total += count

    if total != 0:
        for emotion in freqs.keys():
            freqs[emotion] /= total

    return freqs


In [20]:
df['emotion_freqs'] = df['filtered_english'].apply(emotion_map, total_rows=len(df))

Progress: 100.00% → working on 13427 of 13427

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['emotion_freqs'] = df['filtered_english'].apply(emotion_map, total_rows=len(df))


In [21]:
final_df = df[['track_id', 'filtered_english', 'emotion_freqs']]
final_df = final_df.rename(columns={'filtered_english':'word_counts'})
final_df

Unnamed: 0,track_id,word_counts,emotion_freqs
0,TRAAAAV128F421A322,"{'like': 2, 'de': 1, 'got': 1, 'would': 1, 'se...","{'angry': 0.05, 'disgust': 0.15, 'fear': 0.15,..."
1,TRAAABD128F429CF47,"{'know': 5, 'time': 3, 'la': 7, 'get': 2, 'got...","{'angry': 0.0, 'disgust': 0.0, 'fear': 0.0, 'h..."
2,TRAAAED128E0783FAB,"{'love': 11, 'like': 1, 'time': 6, 'come': 4, ...","{'angry': 0.0, 'disgust': 0.0, 'fear': 0.02941..."
3,TRAAAEF128F4273421,"{'know': 1, 'got': 3, 'feel': 1, 'let': 1, 'wo...","{'angry': 0.07142857142857142, 'disgust': 0.07..."
4,TRAAAEW128F42930C0,"{'like': 1, 'take': 1, 'would': 1, 'wo': 1, 's...","{'angry': 0.125, 'disgust': 0.5, 'fear': 0.125..."
...,...,...,...
14995,TRBVHSJ128F1471230,"{'know': 2, 'get': 1, 'got': 1, 'feel': 2, 'wa...","{'angry': 0.09523809523809523, 'disgust': 0.0,..."
14996,TRBVHUT128F42481F8,"{'thing': 1, 'world': 1, 'right': 2, 'girl': 1...","{'angry': 0.1791044776119403, 'disgust': 0.149..."
14997,TRBVHVW128F428C9C0,"{'like': 1, 'come': 1, 'well': 1, 'wait': 7, '...","{'angry': 0.05263157894736842, 'disgust': 0.10..."
14998,TRBVIBY128F932C19F,"{'know': 1, 'time': 2, 'come': 4, 'go': 1, 'ma...","{'angry': 0.03571428571428571, 'disgust': 0.0,..."


In [22]:
final_df.to_csv('data/training_data.csv', index=False)