In [2]:
import pandas as pd
from pymongo import MongoClient
from recommender.infrastructure.repository.mongodb import (
    MongoDBTracksRepository
)

Open connection to DB

In [3]:
client = MongoClient()
db = client.mgr
plays_repository = MongoDBTracksRepository(db.playedtracks)

Get all rows from DB, and create a dataframe with all the song plays

In [4]:
records = plays_repository.all_raw()
plays = pd.DataFrame(list(records))
plays.head()

Unnamed: 0,_id,artist,name,tags,loved,user_playcount,total_playcount,playback_utc_date,mbid
0,615da9a3690c7486296ebdb9,"{'name': 'The Crystal Method', 'mbid': 'aaf09f...",Name of the Game (edit),"[electronic, electronica, the crystal method, ...",False,0,20575,2007-07-31 01:12:10,
1,615da9a0690c7486296ebdb8,"{'name': 'Tiësto', 'mbid': 'aabb1d9f-be12-45b3...",Sweet Things,"[trance, Tiesto, electronic, vocal trance, ele...",False,0,131760,2007-07-31 01:13:02,c97cf355-964c-4481-8c72-e3e3b7b31399
2,615da99d690c7486296ebdb7,"{'name': 'Massive Attack', 'mbid': '10adbe5e-a...",Protection (The Eno Mix),"[trip-hop, chillout, Massive Attack, electroni...",False,0,53855,2007-07-31 01:16:23,03c6743e-a300-49ec-bf9f-e66505c95bdd
3,615da99b690c7486296ebdb6,"{'name': 'Apollo 440', 'mbid': '1ff10dff-7ac7-...",The Machine in the Ghost,"[electronic, electronic rock, chillout, ambien...",False,0,67939,2007-07-31 01:16:23,
4,615da998690c7486296ebdb5,"{'name': 'Chicane', 'mbid': 'c3480e79-a8d9-44f...",Early,"[chillout, ambient, trance, electronic, Chican...",False,0,209607,2007-07-31 12:44:18,f5f63906-1c43-4602-8fe2-0648923759ed


Total tracks played

In [5]:
count = plays.shape[0]
count

90030

First track played

In [6]:
plays_sorted = plays.sort_values("playback_utc_date")
first = plays_sorted.iloc[0]

print("Date: ", str(first.playback_utc_date))
print("Artist: ", first.artist)
print("Song: ", first["name"])

Date:  2007-07-31 01:12:10
Artist:  {'name': 'The Crystal Method', 'mbid': 'aaf09f31-bb5c-43e5-9f54-bb6554c33a71'}
Song:  Name of the Game (edit)


Last track played

In [7]:
last = plays_sorted.iloc[count - 1]
last["name"] + " - " + str(last.playback_utc_date)

print("Date: ", str(last.playback_utc_date))
print("Artist: ", last.artist)
print("Song: ", last["name"])

Date:  2021-10-02 21:08:41
Artist:  {'name': 'Technique', 'mbid': 'e602592e-3c4b-454a-97d4-f0659ee5ff05'}
Song:  Sun Is Shining - Mash Up Matt Remix


# Tags analysis

Unique tag count

In [8]:
from itertools import chain
from collections import Counter

# Flatten different lists into a single 1-dimensional list
all_tags = chain.from_iterable(plays["tags"])

# To lower case
all_tags = [t.lower() for t in all_tags]

len(Counter(all_tags).keys())

12870

Top-20 tags of all time

In [9]:
from itertools import chain
from collections import Counter

Counter(all_tags).most_common(50)

[('electronic', 35234),
 ('electronica', 20845),
 ('chillout', 15048),
 ('ambient', 12893),
 ('dance', 12384),
 ('indie', 11001),
 ('electro', 9758),
 ('alternative', 9219),
 ('downtempo', 8764),
 ('techno', 7986),
 ('house', 7984),
 ('rock', 7888),
 ('pop', 7347),
 ('chill', 7056),
 ('instrumental', 6730),
 ('trip-hop', 6344),
 ('female vocalists', 5689),
 ('british', 5479),
 ('minimal', 4480),
 ('lounge', 4414),
 ('soundtrack', 4154),
 ('trance', 4064),
 ('indie rock', 3965),
 ('90s', 3716),
 ('experimental', 3656),
 ('idm', 3639),
 ('beautiful', 3275),
 ('synthpop', 3051),
 ('french', 3009),
 ('alternative rock', 2918),
 ('electropop', 2510),
 ('disco', 2388),
 ('dreamy', 2366),
 ('indie pop', 2310),
 ('minimal techno', 2285),
 ('jazz', 2277),
 ('mellow', 2187),
 ('funk', 2176),
 ('80s', 1928),
 ('00s', 1868),
 ('indietronica', 1847),
 ('deep house', 1750),
 ('trip hop', 1738),
 ('psychedelic', 1713),
 ('atmospheric', 1645),
 ('remix', 1621),
 ('sexy', 1591),
 ('soul', 1588),
 ('sho

Get top-10 tags by year

In [53]:
TAGS_DISPLAYED_PER_YEAR = 10

tags = plays[["tags", "playback_utc_date"]]

def most_common_tag(series):
    # Flatten different lists into a single 1-dimensional list
    tags = chain.from_iterable(series)
    # To lower case
    tags = [t.lower() for t in tags]
    # Return most common
    return (Counter(tags), )

most_common_tags_by_year = (
    tags
        .resample(rule='y', on='playback_utc_date')["tags"]
        .apply(most_common_tag)
        .reset_index()
        .set_index("playback_utc_date"))

for moment in most_common_tags_by_year.index:
    print("\n====", moment.year, "====" )
    for tags in most_common_tags_by_year.loc[moment]:
        counter = tags[0]
        total_count = sum(counter.values())
        print(f"Total tag plays: {total_count}")

        for t, count in counter.most_common(TAGS_DISPLAYED_PER_YEAR):
            percent = 100 * count / total_count
            print("-", f"{t}:", count, f"({percent:.2f}%)") 



==== 2007 ====
Total tag plays: 10974
- electronic: 753 (6.86%)
- electronica: 508 (4.63%)
- dance: 403 (3.67%)
- chillout: 330 (3.01%)
- trance: 312 (2.84%)
- techno: 275 (2.51%)
- alternative: 236 (2.15%)
- rock: 212 (1.93%)
- house: 206 (1.88%)
- ambient: 205 (1.87%)

==== 2008 ====
Total tag plays: 56060
- electronic: 4361 (7.78%)
- electronica: 3102 (5.53%)
- chillout: 2421 (4.32%)
- dance: 1839 (3.28%)
- ambient: 1611 (2.87%)
- trip-hop: 1333 (2.38%)
- downtempo: 1330 (2.37%)
- electro: 1222 (2.18%)
- techno: 1189 (2.12%)
- house: 1108 (1.98%)

==== 2009 ====
Total tag plays: 54880
- electronic: 4130 (7.53%)
- electronica: 2819 (5.14%)
- chillout: 1795 (3.27%)
- electro: 1700 (3.10%)
- dance: 1602 (2.92%)
- ambient: 1449 (2.64%)
- downtempo: 1147 (2.09%)
- alternative: 973 (1.77%)
- indie: 961 (1.75%)
- trip-hop: 946 (1.72%)

==== 2010 ====
Total tag plays: 69915
- electronic: 5079 (7.26%)
- electronica: 2797 (4.00%)
- ambient: 1906 (2.73%)
- electro: 1630 (2.33%)
- chillout: 15

What about the least common? Probably, lots of noisy tags

In [56]:
import numpy as np

TAGS_DISPLAYED_PER_YEAR = 20

tags = plays[["tags", "playback_utc_date"]]

def most_common_tag2(series):
    # Flatten different lists into a single 1-dimensional list
    tags = chain.from_iterable(series)
    # To lower case
    tags = [t.lower() for t in tags]
    # Return most common
    return (Counter(tags), )

least_common_tags_by_year = (
    tags
        .resample(rule='y', on='playback_utc_date')["tags"]
        .apply(most_common_tag2)
        .reset_index()
        .set_index("playback_utc_date"))

for moment in least_common_tags_by_year.index:
    print("\n====", moment.year, "====" )
    for tags in least_common_tags_by_year.loc[moment]:
        counter = tags[0]
        total_count = sum(counter.values())
        print(f"Total tag plays: {total_count}")

        for t, count in counter.most_common()[-TAGS_DISPLAYED_PER_YEAR:]:
            percent = 100 * count / total_count
            print("-", f"{t}:", count, f"({percent:.3f}%)") 


==== 2007 ====
Total tag plays: 10974
- mostly instrumental: 1 (0.009%)
- all time favourite: 1 (0.009%)
- beck: 1 (0.009%)
- laidback electronica: 1 (0.009%)
- 2005: 1 (0.009%)
- gentle: 1 (0.009%)
- soulseekartists: 1 (0.009%)
- winter mix: 1 (0.009%)
- post rock: 1 (0.009%)
- samples: 1 (0.009%)
- rjd2: 1 (0.009%)
- club-dance: 1 (0.009%)
- robot rock: 1 (0.009%)
- hope sandoval: 1 (0.009%)
- females: 1 (0.009%)
- dantopluv: 1 (0.009%)
- letsdance: 1 (0.009%)
- robot: 1 (0.009%)
- french touch: 1 (0.009%)
- best song: 1 (0.009%)

==== 2008 ====
Total tag plays: 56060
- tipejas: 1 (0.002%)
- psych folk: 1 (0.002%)
- jefferson airplane: 1 (0.002%)
- triad: 1 (0.002%)
- brass band: 1 (0.002%)
- march: 1 (0.002%)
- drum gods: 1 (0.002%)
- kanzleramt: 1 (0.002%)
- wicked: 1 (0.002%)
- canibal morte songs: 1 (0.002%)
- nothing: 1 (0.002%)
- boys and girls singing together: 1 (0.002%)
- claude vonstroke: 1 (0.002%)
- ravel habanera: 1 (0.002%)
- habanera: 1 (0.002%)
- vc: 1 (0.002%)
- mut

In [109]:
TAGS_DISPLAYED_PER_YEAR = 10

tags = plays[["tags", "playback_utc_date"]]

def get_counter(series):
    # Flatten different lists into a single 1-dimensional list
    tags = chain.from_iterable(series)
    # To lower case
    tags = [t.lower() for t in tags]
    # Return most common
    return (Counter(tags), )
    

counter_by_year = (
    tags
        .resample(rule='y', on='playback_utc_date')["tags"]
        .apply(get_counter)
        .reset_index()
        .set_index("playback_utc_date"))

prev_moment = None
for moment in counter_by_year.index:
    print("\n====", moment.year, "====" )
    if prev_moment:
        prev_counter = counter_by_year.loc[prev_moment][0][0].copy()
        counter = counter_by_year.loc[moment][0][0].copy()
        counter.subtract(prev_counter)
        common = counter.most_common()
        print("Up:")
        for t, delta in common[:TAGS_DISPLAYED_PER_YEAR]:
            print("-", t, delta)
        
        print("Down:")
        for t, delta in common[:-TAGS_DISPLAYED_PER_YEAR:-1]:
            print("-", t, delta)
    prev_moment = moment

    


==== 2007 ====

==== 2008 ====
Up:
- electronic 3608
- electronica 2594
- chillout 2091
- dance 1436
- ambient 1406
- trip-hop 1192
- downtempo 1166
- electro 1040
- techno 914
- house 902
Down:
- dave gahan -16
- spanish fusion -8
- depeche mode -8
- math rock -7
- strings organs rubber wind -6
- emd -6
- variables -6
- 22-20s -6
- something  peaceful -6

==== 2009 ====
Up:
- electro 478
- indie 311
- synthpop 224
- remix 204
- electropop 175
- instrumental 154
- rock 129
- electroclash 129
- french 107
- indietronica 107
Down:
- chillout -626
- trance -468
- trip-hop -387
- house -284
- electronica -283
- techno -282
- lounge -262
- chill -239
- dance -237

==== 2010 ====
Up:
- electronic 949
- idm 763
- minimal 748
- experimental 716
- indie 571
- techno 539
- ambient 457
- minimal techno 410
- indie rock 371
- tech house 355
Down:
- dance -387
- trip-hop -339
- female vocalists -288
- chillout -223
- lounge -205
- pop -199
- trance -168
- downtempo -167
- norwegian -155

==== 2011

## Next steps

* Most common tags visualized in stacked area charts
* Alternative temporal aggregations (by month?)
* Tags correlation
* Clean tags with incorrect / irrelevant meaning (NLP)
* How tags map to or predict genres?
* Find sets of songs played together in the same time window (identify potential mixtapes, playlists...) and their inner relationships
* Introduce Spotify API audio features in the experiments. The API provides audio features for each song