# Load Data and Filter Collaborations

In [66]:
# DB CONFIG
from pymongo import MongoClient

# Connect to the MongoDB server at host 'mongo', port 27017, then take handles
# to the music database and the songs collection stored in it.
client = MongoClient(host='mongo', port=27017)
db = client['music_db']
songs_collection = db['songs']

In [67]:
# Pull all songs from DB.
# find() is called without a filter and its cursor is materialised into a list,
# so every document of the collection is held in memory at once.
songs = list(songs_collection.find())

In [68]:
# Filter Collaboration Songs
import re

# Tokens that join several acts inside a single artist credit.
SEPARATORS = ['&', 'Featuring', 'With', ',', 'Feat.']


def _collaboration_pattern(separators):
    """Compile a regex that matches any separator as a standalone token.

    Each separator is escaped so it matches literally. Where a separator
    starts or ends with an alphanumeric character, that edge must not touch
    another word character, so 'With' no longer matches inside 'Withers'.
    """
    alternatives = []
    for separator in separators:
        token = re.escape(separator)
        if separator[:1].isalnum():
            token = r'(?<!\w)' + token
        if separator[-1:].isalnum():
            token += r'(?!\w)'
        alternatives.append(token)
    return re.compile('|'.join(alternatives))


# Compiled once; SEPARATORS is treated as a constant.
_COLLABORATION_RE = _collaboration_pattern(SEPARATORS)


def is_collaboration(artist):
    """Return True when the artist credit contains a collaboration separator.

    Matching is case-sensitive. Word-like separators ('Featuring', 'With',
    'Feat.') only count when they stand on their own, so a name such as
    'Bill Withers' is not reported as a collaboration.

    Args:
        artist: The raw artist credit of a song.

    Returns:
        bool: True if any separator occurs in ``artist``, False otherwise.
    """
    return _COLLABORATION_RE.search(artist) is not None

# Keep only the songs whose 'artist' field is flagged by is_collaboration.
collaborations = [song for song in songs if is_collaboration(song['artist'])]

### Artist List Format

In [115]:
import re
from typing import List

def clean(collaborators: List) -> List:
    """Return a new list with surrounding whitespace removed from every name."""
    return list(map(lambda name: name.strip(), collaborators))

def separate(artist: str, separators=None) -> List:
    """Split an artist credit into the individual artist names.

    Separators are matched literally (regex metacharacters such as the '.'
    in 'Feat.' are escaped). Where a separator starts or ends with an
    alphanumeric character, that edge must not touch another word character,
    so 'With' does not split a name like 'Bill Withers'.

    Args:
        artist: Raw artist credit, e.g. ``'A Featuring B & C'``.
        separators: Tokens to split on. Defaults to the module-level
            ``SEPARATORS`` when omitted.

    Returns:
        The pieces found between separators. Whitespace around each piece
        is left in place.
    """
    if separators is None:
        separators = SEPARATORS
    if not separators:
        # An empty alternation would match the empty string at every position.
        return [artist]

    alternatives = []
    for separator in separators:
        token = re.escape(separator)
        if separator[:1].isalnum():
            token = r'(?<!\w)' + token
        if separator[-1:].isalnum():
            token += r'(?!\w)'
        alternatives.append(token)
    return re.split('|'.join(alternatives), artist)

# Split every collaboration credit into its artists and trim each name,
# then show how many credits there are together with the first ten.
collaborators = [clean(separate(song['artist'])) for song in collaborations]
len(collaborators), collaborators[:10]

(2919,
 [['Cardi B', 'Bad Bunny', 'J Balvin'],
  ['6ix9ine', 'Nicki Minaj', 'Murda Beatz'],
  ['Tyga', 'Offset'],
  ['DJ Khaled', 'Justin Bieber', 'Chance The Rapper', 'Quavo'],
  ['Post Malone', 'Ty Dolla $ign'],
  ['Lil Baby', 'Drake'],
  ['YG', '2 Chainz', 'Big Sean', 'Nicki Minaj'],
  ['Nicki Minaj', 'Ariana Grande'],
  ['Nicki Minaj', 'Lil Wayne'],
  ['Nicki Minaj', 'Eminem', 'Labrinth']])

# Associations of Artists 

In [157]:
from mlxtend.frequent_patterns import apriori
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

# One-hot encode the artist lists: one row per collaboration, one boolean
# column per artist. The encoder output is kept sparse.
dataset = collaborators
te = TransactionEncoder()
oht_ary = te.fit(dataset).transform(dataset, sparse=True)

# Build the frame from the matrix computed above (the original read an
# undefined name, te_ary).
if hasattr(pd.DataFrame, 'sparse'):
    # pandas >= 0.25: the sparse accessor replaces SparseDataFrame, which was
    # removed in pandas 1.0.
    df = pd.DataFrame.sparse.from_spmatrix(oht_ary, columns=te.columns_)
else:
    # Older pandas without the accessor still provides SparseDataFrame.
    df = pd.SparseDataFrame(oht_ary, columns=te.columns_, default_fill_value=False)

In [None]:
# Mine frequent artist itemsets. min_support is a fraction of all transactions,
# so 0.001 keeps itemsets present in at least 0.1% of the 2919 collaborations.
frequent_itemsets = apriori(df, min_support=0.001, use_colnames=True)

In [None]:
# Record the number of items (artists) contained in each frequent itemset.
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

In [None]:
# 563 3-artist collaborations, 2135 2-artist collaborations
# Keep only the itemsets that contain more than one artist; reset_index()
# renumbers the rows and keeps the previous index as an 'index' column.
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
filtered_itemsets = (
    frequent_itemsets
    .loc[frequent_itemsets['length'] > 1]
    .reset_index()
)

In [None]:
# Itemsets of exactly two artists, most frequent first.
two_itemsets = (
    filtered_itemsets
    .loc[filtered_itemsets['length'] == 2]
    .sort_values(by='support', ascending=False)
)
print(len(two_itemsets), '2-Itemsets')
two_itemsets[:15]

In [None]:
# Itemsets of exactly three artists, most frequent first.
three_itemsets = (
    filtered_itemsets
    .loc[filtered_itemsets['length'] == 3]
    .sort_values(by='support', ascending=False)
)
print(len(three_itemsets), '3-Itemsets')
three_itemsets

In [None]:
# Cross-check one itemset against the raw data: collect every collaboration
# whose artist list contains all members of the itemset.
itemset = ('Rick Ross', 'Lil Wayne', 'DJ Khaled')
count = [artists for artists in collaborators if set(itemset).issubset(artists)]
print("Number of occurences for", itemset, '=', len(count))