In [None]:
import pandas as pd
from pymongo import MongoClient
from recommender.infrastructure.repository.mongodb import (
    MongoDBTracksRepository
)

Open connection to DB

In [2]:
# Connect with MongoClient's default settings and select the "mgr" database.
client = MongoClient()
db = client["mgr"]

# Wrap the "playedtracks" collection in the project's tracks repository.
plays_repository = MongoDBTracksRepository(db["playedtracks"])

Get all rows from DB, and create a dataframe with all the song plays

In [96]:
# Fetch everything the repository exposes through `all_raw()`
# (presumably one raw document per play -- confirm against the repository).
records = plays_repository.all_raw()

# Materialise the records into a list and build the plays DataFrame from it.
plays = pd.DataFrame(data=list(records))

# Preview the first five rows.
plays.head(n=5)

Unnamed: 0,_id,artist,name,tags,loved,user_playcount,total_playcount,playback_utc_date,mbid
0,615da9a3690c7486296ebdb9,"{'name': 'The Crystal Method', 'mbid': 'aaf09f...",Name of the Game (edit),"[electronic, electronica, the crystal method, ...",False,0,20575,2007-07-31 01:12:10,
1,615da9a0690c7486296ebdb8,"{'name': 'Tiësto', 'mbid': 'aabb1d9f-be12-45b3...",Sweet Things,"[trance, Tiesto, electronic, vocal trance, ele...",False,0,131760,2007-07-31 01:13:02,c97cf355-964c-4481-8c72-e3e3b7b31399
2,615da99d690c7486296ebdb7,"{'name': 'Massive Attack', 'mbid': '10adbe5e-a...",Protection (The Eno Mix),"[trip-hop, chillout, Massive Attack, electroni...",False,0,53855,2007-07-31 01:16:23,03c6743e-a300-49ec-bf9f-e66505c95bdd
3,615da99b690c7486296ebdb6,"{'name': 'Apollo 440', 'mbid': '1ff10dff-7ac7-...",The Machine in the Ghost,"[electronic, electronic rock, chillout, ambien...",False,0,67939,2007-07-31 01:16:23,
4,615da998690c7486296ebdb5,"{'name': 'Chicane', 'mbid': 'c3480e79-a8d9-44f...",Early,"[chillout, ambient, trance, electronic, Chican...",False,0,209607,2007-07-31 12:44:18,f5f63906-1c43-4602-8fe2-0648923759ed


Total tracks played

In [97]:
# One row per play, so the number of rows is the total number of plays.
count = len(plays)
count

90030

First track played

In [98]:
# Order plays chronologically; the earliest one ends up in the first row.
plays_sorted = plays.sort_values(by="playback_utc_date")
first = plays_sorted.iloc[0]

print("Date: ", str(first["playback_utc_date"]))
print("Artist: ", first["artist"])
print("Song: ", first["name"])

Date:  2007-07-31 01:12:10
Artist:  {'name': 'The Crystal Method', 'mbid': 'aaf09f31-bb5c-43e5-9f54-bb6554c33a71'}
Song:  Name of the Game (edit)


Last track played

In [99]:
# `plays_sorted` is ordered ascending by playback time, so its final row is
# the most recent play. Negative indexing selects it directly and does not
# depend on any separately computed row count.
last = plays_sorted.iloc[-1]

print("Date: ", str(last.playback_utc_date))
print("Artist: ", last.artist)
print("Song: ", last["name"])

Date:  2021-10-02 21:08:41
Artist:  {'name': 'Technique', 'mbid': 'e602592e-3c4b-454a-97d4-f0659ee5ff05'}
Song:  Sun Is Shining - Mash Up Matt Remix


# Tags analysis

Unique tag count

In [100]:
from itertools import chain
from collections import Counter

# Flatten the per-play tag lists into one flat list, lower-casing each tag
# so that differently-cased spellings are counted as the same tag.
all_tags = [tag.lower() for tag in chain.from_iterable(plays["tags"])]

# A Counter holds one entry per distinct tag, so its length is the number
# of unique tags.
len(Counter(all_tags))

12870

Top-50 tags of all time

In [101]:
from itertools import chain
from collections import Counter

# Count every occurrence of each (lower-cased) tag across all plays ...
tag_counter = Counter(all_tags)

# ... and show the 50 most frequent ones as (tag, count) pairs.
tag_counter.most_common(50)

[('electronic', 35234),
 ('electronica', 20845),
 ('chillout', 15048),
 ('ambient', 12893),
 ('dance', 12384),
 ('indie', 11001),
 ('electro', 9758),
 ('alternative', 9219),
 ('downtempo', 8764),
 ('techno', 7986),
 ('house', 7984),
 ('rock', 7888),
 ('pop', 7347),
 ('chill', 7056),
 ('instrumental', 6730),
 ('trip-hop', 6344),
 ('female vocalists', 5689),
 ('british', 5479),
 ('minimal', 4480),
 ('lounge', 4414),
 ('soundtrack', 4154),
 ('trance', 4064),
 ('indie rock', 3965),
 ('90s', 3716),
 ('experimental', 3656),
 ('idm', 3639),
 ('beautiful', 3275),
 ('synthpop', 3051),
 ('french', 3009),
 ('alternative rock', 2918),
 ('electropop', 2510),
 ('disco', 2388),
 ('dreamy', 2366),
 ('indie pop', 2310),
 ('minimal techno', 2285),
 ('jazz', 2277),
 ('mellow', 2187),
 ('funk', 2176),
 ('80s', 1928),
 ('00s', 1868),
 ('indietronica', 1847),
 ('deep house', 1750),
 ('trip hop', 1738),
 ('psychedelic', 1713),
 ('atmospheric', 1645),
 ('remix', 1621),
 ('sexy', 1591),
 ('soul', 1588),
 ('sho

Get top-10 tags by year

In [103]:
# How many tags to list for each year.
TAGS_DISPLAYED_PER_YEAR = 10

# Only the tag lists and the playback timestamps are needed here.
tags = plays[["tags", "playback_utc_date"]]

def most_common_tag(series):
    """Return the most frequent tags found in ``series``.

    Parameters
    ----------
    series : iterable of iterables of str
        One collection of tags per play.

    Returns
    -------
    list of (str, int)
        Up to ``TAGS_DISPLAYED_PER_YEAR`` ``(tag, count)`` pairs, most
        frequent first. Tags are lower-cased before counting.
    """
    # Flatten the per-play tag lists and normalise the case in one pass
    lowered = [tag.lower() for tag in chain.from_iterable(series)]
    return Counter(lowered).most_common(TAGS_DISPLAYED_PER_YEAR)

# Group plays into yearly bins on the playback timestamp and rank the tags
# inside each bin. The result is a frame indexed by the bin timestamp.
most_common_tags_by_year = (
    tags
        .resample(rule='y', on='playback_utc_date')["tags"]
        .apply(most_common_tag)
        .reset_index()
        .set_index("playback_utc_date"))

# The loop variables deliberately use their own names so that printing the
# report does not overwrite the notebook-level `tags` frame or a `count`
# value defined in another cell.
for moment in most_common_tags_by_year.index:
    print("\n====", moment.year, "====" )
    # `.loc[moment]` is the row for that year; each cell in it holds the
    # list of (tag, count) pairs produced by `most_common_tag`.
    for year_tags in most_common_tags_by_year.loc[moment]:
        for tag, tag_count in year_tags:
            print("-", tag, f"({tag_count})")
            


==== 2007 ====
- electronic (753)
- electronica (508)
- dance (403)
- chillout (330)
- trance (312)
- techno (275)
- alternative (236)
- rock (212)
- house (206)
- ambient (205)

==== 2008 ====
- electronic (4361)
- electronica (3102)
- chillout (2421)
- dance (1839)
- ambient (1611)
- trip-hop (1333)
- downtempo (1330)
- electro (1222)
- techno (1189)
- house (1108)

==== 2009 ====
- electronic (4130)
- electronica (2819)
- chillout (1795)
- electro (1700)
- dance (1602)
- ambient (1449)
- downtempo (1147)
- alternative (973)
- indie (961)
- trip-hop (946)

==== 2010 ====
- electronic (5079)
- electronica (2797)
- ambient (1906)
- electro (1630)
- chillout (1572)
- indie (1532)
- techno (1446)
- minimal (1306)
- dance (1215)
- house (1062)

==== 2011 ====
- electronic (4144)
- electronica (2157)
- ambient (1954)
- chillout (1713)
- indie (1465)
- instrumental (1279)
- electro (1082)
- downtempo (971)
- alternative (855)
- soundtrack (848)

==== 2012 ====
- electronic (4571)
- indie (

What about the least common? Probably, lots of noisy tags

In [125]:
# How many of the rarest tags to list for each year.
TAGS_DISPLAYED_PER_YEAR = 20

# Only the tag lists and the playback timestamps are needed here.
tags = plays[["tags", "playback_utc_date"]]

def most_common_tag(series):
    """Return the complete tag ranking for ``series``.

    Unlike a top-N helper, this returns every distinct tag so that the
    caller can slice the tail of the ranking to get the least common ones.

    Parameters
    ----------
    series : iterable of iterables of str
        One collection of tags per play.

    Returns
    -------
    list of (str, int)
        All ``(tag, count)`` pairs, most frequent first. Tags are
        lower-cased before counting.
    """
    # Flatten the per-play tag lists and normalise the case in one pass
    lowered = [tag.lower() for tag in chain.from_iterable(series)]
    # No limit: keep the whole ranking, rarest tags at the end
    return Counter(lowered).most_common()

# Group plays into yearly bins on the playback timestamp and compute the
# full tag ranking inside each bin.
least_common_tags_by_year = (
    tags
        .resample(rule='y', on='playback_utc_date')["tags"]
        .apply(most_common_tag)
        .reset_index()
        .set_index("playback_utc_date"))

# Read from `least_common_tags_by_year` (the full ranking computed in this
# cell) and print the last TAGS_DISPLAYED_PER_YEAR entries of each year.
# Loop variables use their own names so the notebook-level `tags` frame and
# any `count` value defined in another cell are left untouched.
for moment in least_common_tags_by_year.index:
    print("\n====", moment.year, "====" )
    for year_tags in least_common_tags_by_year.loc[moment]:
        for tag, tag_count in year_tags[-TAGS_DISPLAYED_PER_YEAR:]:
            print("-", tag, f"({tag_count})")


==== 2007 ====
- mostly instrumental (1)
- all time favourite (1)
- beck (1)
- laidback electronica (1)
- 2005 (1)
- gentle (1)
- soulseekartists (1)
- winter mix (1)
- post rock (1)
- samples (1)
- rjd2 (1)
- club-dance (1)
- robot rock (1)
- hope sandoval (1)
- females (1)
- dantopluv (1)
- letsdance (1)
- robot (1)
- french touch (1)
- best song (1)

==== 2008 ====
- tipejas (1)
- psych folk (1)
- jefferson airplane (1)
- triad (1)
- brass band (1)
- march (1)
- drum gods (1)
- kanzleramt (1)
- wicked (1)
- canibal morte songs (1)
- nothing (1)
- boys and girls singing together (1)
- claude vonstroke (1)
- ravel habanera (1)
- habanera (1)
- vc (1)
- mute records (1)
- back ground music (1)
- bounce (1)
- wills chill radio (1)

==== 2009 ====
- miike snow (1)
- feina (1)
- psychobilly (1)
- prestissimo (1)
- surf music (1)
- f (1)
- 2011 (1)
- beatles cover (1)
- instrumental surf (1)
- dba (1)
- chispas (1)
- surf del bueno (1)
- dbasuper (1)
- meh (1)
- cosmic baby (1)
- darkness

## Next steps

* Show percentages in most common tags
* Tags rising in the most-common ranking, compared to last year
* Tags falling in the most-common ranking, compared to last year
* Most common tags visualized in stacked area charts
* Alternative temporal aggregations (by month?)
* Tags correlation
* Clean tags with incorrect / irrelevant meaning (NLP)
* How do tags map to, or predict, genres?
* Find sets of songs played together in the same time window (identify potential mixtapes, playlists...) and their inner relationships
* Introduce Spotify API audio features in the experiments. The API provides audio features for each song