# Data and Filter Collaborations

In [1]:
# DB CONFIG
from pymongo import MongoClient

# Connection settings for the MongoDB instance that stores the songs.
MONGO_HOST, MONGO_PORT = 'mongo', 27017

client = MongoClient(MONGO_HOST, MONGO_PORT)
db = client['music_db']          # same database as attribute access `client.music_db`
songs_collection = db['songs']   # same collection as `db.songs`

In [2]:
# Materialise every document of the songs collection into an in-memory list
songs = [document for document in songs_collection.find()]

In [3]:
# Filter Collaboration Songs
# Tokens that join several acts inside a single `artist` credit string.
SEPARATORS = [
    '&',
    'Featuring',
    'With',
    ',',
    'Feat.',
]

def is_collaboration(artist, separators=None):
    """Tell whether an artist credit contains a collaboration separator.

    Word-like separators (those starting/ending with a letter or digit, such
    as 'With' or 'Featuring') only count when they appear as whole words, so
    a name like 'Bill Withers' is not mistaken for a collaboration.
    Punctuation separators ('&', ',') match anywhere.

    Args:
        artist: The raw artist credit string of a song.
        separators: Optional iterable of separator tokens. Defaults to the
            module-level SEPARATORS list.

    Returns:
        True if at least one separator occurs in `artist`, otherwise False.
    """
    # Local import: this cell runs before the notebook cell that imports `re`.
    import re

    if separators is None:
        separators = SEPARATORS

    for separator in separators:
        # Escape so characters such as '.' in 'Feat.' are matched literally.
        pattern = re.escape(separator)
        # Anchor only the alphanumeric edges; '\b'-style anchors would never
        # match around pure punctuation like '&' or ','.
        if separator[:1].isalnum():
            pattern = r'(?<!\w)' + pattern
        if separator[-1:].isalnum():
            pattern = pattern + r'(?!\w)'
        if re.search(pattern, artist):
            return True
    return False

# Keep only the songs whose artist credit names several acts, then peek at the first one
collaborations = list(filter(lambda song: is_collaboration(song['artist']), songs))
collaborations[0]

{'_id': ObjectId('5b927851ea311f002198aadb'),
 'title': 'I Like It',
 'artist': 'Cardi B, Bad Bunny & J Balvin',
 'peakPos': 5,
 'lastPos': 0,
 'weeks': 0,
 'rank': 5,
 'isNew': True,
 'identifier': "'I Like It' by Cardi B, Bad Bunny & J Balvin",
 'lyrics': {'error': 'Lyric no found, try again later.'}}

### Artist List Format

In [4]:
import re
from typing import List

def clean(collaborators: List) -> List:
    """Normalise a list of artist-name fragments.

    Surrounding whitespace is stripped from every fragment, and fragments that
    are empty after stripping are dropped. Empty fragments appear when two
    separators are adjacent in the credit (e.g. 'A, & B') and would otherwise
    be treated as an artist named ''.

    Args:
        collaborators: Name fragments produced by splitting an artist credit.

    Returns:
        A new list of non-empty, stripped names in their original order.
    """
    stripped = (artist.strip() for artist in collaborators)
    return [artist for artist in stripped if artist]

def separate(artist: str, separators=None) -> List:
    """Split an artist credit into its individual name fragments.

    Each separator is matched literally (regex metacharacters such as the '.'
    in 'Feat.' are escaped). Separators that start or end with a letter or
    digit must appear as whole words, so 'With' does not split 'Bill Withers'
    and 'Feat.' does not split 'Feather'.

    Fragments are returned un-stripped; surrounding whitespace is left for the
    caller to remove.

    NOTE(review): group names that legitimately contain a separator (e.g. a
    comma or '&' inside a band name) are still split — handling those needs a
    list of known names, which is not available here.

    Args:
        artist: The raw artist credit string of a song.
        separators: Optional iterable of separator tokens. Defaults to the
            module-level SEPARATORS list.

    Returns:
        The list of fragments between separators; a single-element list when
        no separator matches.
    """
    if separators is None:
        separators = SEPARATORS

    alternatives = []
    for separator in separators:
        pattern = re.escape(separator)
        # Anchor only alphanumeric edges; punctuation like '&' has no word boundary.
        if separator[:1].isalnum():
            pattern = r'(?<!\w)' + pattern
        if separator[-1:].isalnum():
            pattern = pattern + r'(?!\w)'
        alternatives.append(pattern)

    if not alternatives:
        # An empty pattern would split between every character.
        return [artist]
    return re.split('|'.join(alternatives), artist)

# For every collaboration song: split its artist credit, then tidy the pieces.
collaborators = [
    clean(separate(song['artist']))
    for song in collaborations
]
len(collaborators), collaborators[:10]

(2919,
 [['Cardi B', 'Bad Bunny', 'J Balvin'],
  ['6ix9ine', 'Nicki Minaj', 'Murda Beatz'],
  ['Tyga', 'Offset'],
  ['DJ Khaled', 'Justin Bieber', 'Chance The Rapper', 'Quavo'],
  ['Post Malone', 'Ty Dolla $ign'],
  ['Lil Baby', 'Drake'],
  ['YG', '2 Chainz', 'Big Sean', 'Nicki Minaj'],
  ['Nicki Minaj', 'Ariana Grande'],
  ['Nicki Minaj', 'Lil Wayne'],
  ['Nicki Minaj', 'Eminem', 'Labrinth']])

# Associations of Artists 

In [40]:
from mlxtend.frequent_patterns import apriori
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

# One-hot encode the per-song artist lists: `te.columns_` holds the distinct
# artist names and `oht_ary` is the sparse boolean membership matrix.
dataset = collaborators
te = TransactionEncoder()
oht_ary = te.fit(dataset).transform(dataset, sparse=True)

# pd.SparseDataFrame was deprecated in pandas 0.25 and removed in 1.0.
# Use the sparse accessor when this pandas provides it, and fall back to the
# legacy constructor on older installations so the cell runs on both.
if hasattr(pd.DataFrame, 'sparse'):
    df = pd.DataFrame.sparse.from_spmatrix(oht_ary, columns=te.columns_)
else:
    df = pd.SparseDataFrame(oht_ary, columns=te.columns_, default_fill_value=False)

# Number of distinct artist names (one column each).
len(df.columns)

2153

In [6]:
# Minimum share of transactions an itemset must appear in (1/1000 out of 2919).
MIN_SUPPORT = 0.001
frequent_itemsets = apriori(df, min_support=MIN_SUPPORT, use_colnames=True)

In [8]:
# Original note: 563 3-artist collaborations, 2135 2-artist collaborations
# Record how many artists each frequent itemset contains, then expose the
# row labels as a regular `index` column.
itemset_sizes = frequent_itemsets['itemsets'].apply(len)
frequent_itemsets['length'] = itemset_sizes
filtered_itemsets = frequent_itemsets.reset_index()

In [9]:
# Artist pairs, highest support first
is_pair = filtered_itemsets['length'] == 2
two_itemsets = filtered_itemsets.loc[is_pair].sort_values(by='support', ascending=False)
print(len(two_itemsets), '2-Itemsets')
two_itemsets.head(15)

184 2-Itemsets


Unnamed: 0,index,support,itemsets,length
558,558,0.007194,"(Drake, Lil Wayne)",2
547,547,0.005824,"(Rick Ross, DJ Khaled)",2
521,521,0.005481,"(Birdman, Lil Wayne)",2
620,620,0.004796,"(The East Side Boyz, Lil Jon)",2
556,556,0.004796,"(Drake, Future)",2
622,622,0.004111,"(Nicki Minaj, Lil Wayne)",2
623,623,0.003768,"(Lil Wayne, Rick Ross)",2
657,657,0.003426,"(T-Pain, Rick Ross)",2
604,604,0.003083,"(Jay-Z, R. Kelly)",2
590,590,0.003083,"(Young Thug, Future)",2


In [10]:
# Artist triples, highest support first
is_triple = filtered_itemsets['length'] == 3
three_itemsets = filtered_itemsets.loc[is_triple].sort_values(by='support', ascending=False)
print(len(three_itemsets), '3-Itemsets')
three_itemsets

14 3-Itemsets


Unnamed: 0,index,support,itemsets,length
675,675,0.002055,"(Lil Wayne, Rick Ross, DJ Khaled)",3
671,671,0.00137,"(Metro Boomin, 21 Savage, Offset)",3
683,683,0.00137,"(Earth, Fire, Wind)",3
672,672,0.001028,"(August Alsina, Chris Brown, DJ Khaled)",3
673,673,0.001028,"(Birdman, Lil Wayne, Rick Ross)",3
674,674,0.001028,"(Drake, Rick Ross, DJ Khaled)",3
676,676,0.001028,"(Nicki Minaj, Rick Ross, DJ Khaled)",3
677,677,0.001028,"(Plies, Rick Ross, DJ Khaled)",3
678,678,0.001028,"(T-Pain, Rick Ross, DJ Khaled)",3
679,679,0.001028,"(Trick Daddy, Rick Ross, DJ Khaled)",3


In [11]:
# Sanity check: count the songs whose artist list contains every member of a sample itemset
itemset = ('Rick Ross', 'Lil Wayne', 'DJ Khaled')
wanted = set(itemset)
count = [artists for artists in collaborators if wanted.issubset(artists)]
print("Number of occurences for", itemset, '=', len(count))

Number of occurences for ('Rick Ross', 'Lil Wayne', 'DJ Khaled') = 6


In [38]:
from mlxtend.frequent_patterns import association_rules
# Derive rules with confidence of at least 0.5 from the frequent itemsets.
sets = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

# Size of each rule's antecedent; show the two-artist antecedents, most confident first.
sets['length'] = sets['antecedents'].apply(len)
two_antecedent_rules = sets.loc[sets['length'] == 2]
two_antecedent_rules.sort_values(by='confidence', ascending=False)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,length
83,"(Plies, DJ Khaled)",(Rick Ross),0.001028,0.029462,0.001028,1.0,33.94186,0.000997,inf,2
82,"(Plies, Rick Ross)",(DJ Khaled),0.001028,0.012333,0.001028,1.0,81.083333,0.001015,inf,2
99,"(Eddie Levert, Gerald Levert)",(Sr.),0.001028,0.00137,0.001028,1.0,729.75,0.001026,inf,2
98,"(Sr., Eddie Levert)",(Gerald Levert),0.001028,0.002055,0.001028,1.0,486.5,0.001026,inf,2
94,"(Fire, Wind)",(Earth),0.00137,0.00137,0.00137,1.0,729.75,0.001368,inf,2
93,"(Earth, Wind)",(Fire),0.00137,0.00137,0.00137,1.0,729.75,0.001368,inf,2
92,"(Earth, Fire)",(Wind),0.00137,0.00137,0.00137,1.0,729.75,0.001368,inf,2
89,"(Drag-On, DMX)",(Eve),0.001028,0.006166,0.001028,1.0,162.166667,0.001021,inf,2
88,"(Eve, DMX)",(Drag-On),0.001028,0.001713,0.001028,1.0,583.8,0.001026,inf,2
87,"(Eve, Drag-On)",(DMX),0.001028,0.006509,0.001028,1.0,153.631579,0.001021,inf,2
