# Data and Filter Collaborations

In [66]:
# DB CONFIG
from pymongo import MongoClient

client = MongoClient('mongo', 27017)
db = client.music_db
songs_collection = db.songs

In [67]:
# Pull all songs from DB
songs = list(songs_collection.find())

In [68]:
# Filter Collaboration Songs 
SEPARATORS = ['&', 'Featuring', 'With', ',', 'Feat.']

def is_collaboration(artist):
    return any([word in artist for word in SEPARATORS])

collaborations = [song for song in songs if is_collaboration(song['artist'])]

### Artist List Format

In [115]:
import re
from typing import List

def clean(collaborators: List) -> List:
    return [artist.strip() for artist in collaborators]

def separate(artist: str) -> List:
    return re.split('|'.join(SEPARATORS), artist)

collaborators = [song['artist'] for song in collaborations]
collaborators = map(separate, collaborators)
collaborators = map(clean, collaborators)
collaborators = list(collaborators)
len(collaborators), collaborators[:10]

(2919,
 [['Cardi B', 'Bad Bunny', 'J Balvin'],
  ['6ix9ine', 'Nicki Minaj', 'Murda Beatz'],
  ['Tyga', 'Offset'],
  ['DJ Khaled', 'Justin Bieber', 'Chance The Rapper', 'Quavo'],
  ['Post Malone', 'Ty Dolla $ign'],
  ['Lil Baby', 'Drake'],
  ['YG', '2 Chainz', 'Big Sean', 'Nicki Minaj'],
  ['Nicki Minaj', 'Ariana Grande'],
  ['Nicki Minaj', 'Lil Wayne'],
  ['Nicki Minaj', 'Eminem', 'Labrinth']])

# Associations of Artists 

In [157]:
from mlxtend.frequent_patterns import apriori
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

dataset = collaborators
te = TransactionEncoder()
oht_ary = te.fit(dataset).transform(dataset, sparse=True)
df = pd.SparseDataFrame(te_ary, columns=te.columns_, default_fill_value=False)

In [158]:
frequent_itemsets = apriori(df, min_support=0.001, use_colnames=True)   # 1/1000 out of 2919

In [159]:
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

In [160]:
# 563 3-artist collaborations, 2135 2-artist collaborations
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
filtered_itemsets = frequent_itemsets[(frequent_itemsets['length'] > 1)]   # Remove 1-item sets
filtered_itemsets = filtered_itemsets.reset_index()

In [161]:
# 2 item ordered by support
two_itemsets = filtered_itemsets[(filtered_itemsets['length'] == 2)].sort_values(by='support', ascending=False)
print(len(two_itemsets), '2-Itemsets')
two_itemsets[:15]

184 2-Itemsets


Unnamed: 0,index,support,itemsets,length
71,558,0.007194,"(Lil Wayne, Drake)",2
60,547,0.005824,"(Rick Ross, DJ Khaled)",2
34,521,0.005481,"(Birdman, Lil Wayne)",2
133,620,0.004796,"(Lil Jon, The East Side Boyz)",2
69,556,0.004796,"(Future, Drake)",2
135,622,0.004111,"(Nicki Minaj, Lil Wayne)",2
136,623,0.003768,"(Rick Ross, Lil Wayne)",2
170,657,0.003426,"(Rick Ross, T-Pain)",2
117,604,0.003083,"(Jay-Z, R. Kelly)",2
103,590,0.003083,"(Future, Young Thug)",2


In [162]:
# 3 item ordered by support
three_itemsets = filtered_itemsets[(filtered_itemsets['length'] == 3)].sort_values(by='support', ascending=False)
print(len(three_itemsets), '3-Itemsets')
three_itemsets

14 3-Itemsets


Unnamed: 0,index,support,itemsets,length
188,675,0.002055,"(Rick Ross, Lil Wayne, DJ Khaled)",3
184,671,0.00137,"(Offset, 21 Savage, Metro Boomin)",3
196,683,0.00137,"(Fire, Earth, Wind)",3
185,672,0.001028,"(August Alsina, Chris Brown, DJ Khaled)",3
186,673,0.001028,"(Rick Ross, Birdman, Lil Wayne)",3
187,674,0.001028,"(Drake, Rick Ross, DJ Khaled)",3
189,676,0.001028,"(Nicki Minaj, Rick Ross, DJ Khaled)",3
190,677,0.001028,"(Rick Ross, Plies, DJ Khaled)",3
191,678,0.001028,"(Rick Ross, T-Pain, DJ Khaled)",3
192,679,0.001028,"(Rick Ross, Trick Daddy, DJ Khaled)",3


In [163]:
# Ocurrences of a sample itemset
itemset = ('Rick Ross', 'Lil Wayne', 'DJ Khaled')
count = [artists for artists in collaborators if all([x in artists for x in itemset])]   
print("Number of occurences for", itemset, '=', len(count))

Number of occurences for ('Rick Ross', 'Lil Wayne', 'DJ Khaled') = 6
