# Determining Most Frequent Collaborators for Songwriter Dataset 

In [1]:
from collections import defaultdict
from functools import reduce
from itertools import combinations
import pickle

import pandas as pd 
import numpy as np

import dask.bag as db

from sklearn.metrics import pairwise_distances

## Table of Contents 

1. Functions
2. Importing Dataset
3. Computing Most Frequent Collaborators
4. Comparing Collaborators with Euclidean Model

## 1. Functions

In [2]:
def import_and_strip_song_dataset_to_WID_CID(path):
    '''
    Imports song csv file at path and strips it to only 'WID' and 'CID' columns
    '''
    song_df = pd.read_csv(path, index_col=0)
    filtered_song_df = song_df[['WID', 
                               'CID']]
    return filtered_song_df

In [3]:
def mk_song_count_per_songwriter_dict(song_df_with_writer_and_composition_id):
    '''
    Creates a dictionary with writer id's as keys and each of the writers songs as values
    '''
    songwriter_corpus_dict = defaultdict(set)
    song_dict = song_df_with_writer_and_composition_id.to_dict(orient='index')
    for i in song_dict:
        songwriter_corpus_dict[song_dict[i]['WID']].add(song_dict[i]['CID'])
    songwriter_corpus_dict.default_factory = None
    return songwriter_corpus_dict

In [4]:
def remove_songwriters_w_less_than_4_songs(songwriter_corpus_dict):
    new_songwriter_dict = dict(map(lambda x: None if len(x.values()) < 4 else x, 
                                  songwriter_corpus_dict))
    return new_songwriter_dict

In [5]:
def get_intersection_of_writers_and_count_matches(songwriter_corpus_dict):
    '''
    Matches all songwriters in `songwriter_corpus_dict` with one another and determines
    what each pair has in common
    '''
    match_count_dict = {}
    combos = combinations(songwriter_corpus_dict.keys(), 2)
    for pair in combos:
        match_count_dict[pair] = len(songwriter_corpus_dict[pair[0]]\
                                     .intersection(songwriter_corpus_dict[pair[1]]))
    return match_count_dict

In [24]:
def remove_zero_match_counts(match_count_dict):
    for pair, value in list(match_count_dict.items()):
        if value == 0:
            del match_count_dict[pair]

## 2. Importing Dataset

In [7]:
df = import_and_strip_song_dataset_to_WID_CID('../data/matched_tracks_w_writers_20190504.csv')

## 3. Computing Most Frequent Collaborators

In [8]:
songwriter_songs_dict = mk_song_count_per_songwriter_dict(df)

In [9]:
# pruned_songwriter_songs_dict = remove_songwriters_w_less_than_4_songs(songwriter_songs_dict)

In [10]:
match_counts = get_intersection_of_writers_and_count_matches(songwriter_songs_dict)

In [25]:
remove_zero_match_counts(match_counts)

In [31]:
with open('../data/interim/match_counts.pkl', 'wb') as f:
    pkl.dump(match_counts, f)

In [26]:
len(match_counts)

150508

In [27]:
np.mean(list(map(len, songwriter_songs_dict.values())))

5.9075419614811056

## 4. Comparing Collaborators with Euclidean Model

### 4a. Loading Euclidean Based Songwriter Model

In [4]:
with open('../data/final_model_poc/top_12_writer_by_votes_euclidean_20190523.pkl', 'rb') as f:
    top_12s = pickle.load(f)

### Loading Match Counts Dict

In [5]:
with open('../data/interim/match_counts.pkl', 'rb') as f:
    match_counts = pickle.load(f)

In [31]:
match_counts

{(7280, 7281): 76,
 (7280, 63156): 1,
 (7280, 9384): 1,
 (7280, 4577): 1,
 (7280, 4581): 1,
 (7280, 7067): 1,
 (7280, 11294): 1,
 (7280, 64264): 3,
 (7280, 56967): 1,
 (7280, 56968): 1,
 (7280, 9987): 1,
 (7280, 401528): 1,
 (7280, 90281): 1,
 (7280, 90284): 1,
 (7280, 64265): 3,
 (7280, 38513): 1,
 (7280, 85609): 1,
 (7280, 90285): 1,
 (7280, 200872): 2,
 (7280, 251704): 1,
 (7280, 311804): 1,
 (7280, 28815): 1,
 (7281, 63156): 1,
 (7281, 9384): 1,
 (7281, 4577): 1,
 (7281, 4581): 1,
 (7281, 7067): 1,
 (7281, 11294): 1,
 (7281, 56967): 1,
 (7281, 56968): 1,
 (7281, 9987): 1,
 (7281, 401528): 1,
 (7281, 90281): 1,
 (7281, 90284): 1,
 (7281, 38513): 1,
 (7281, 85609): 1,
 (7281, 90285): 1,
 (7281, 191721): 1,
 (7281, 200872): 2,
 (7281, 251704): 1,
 (7281, 311804): 1,
 (7281, 28815): 1,
 (2642, 6145): 54,
 (2642, 9495): 28,
 (2642, 60204): 23,
 (2642, 60244): 2,
 (2642, 60324): 5,
 (2642, 14498): 8,
 (2642, 60215): 10,
 (2642, 60281): 5,
 (2642, 60235): 7,
 (2642, 60163): 1,
 (2642, 601

In [4]:
type((filter(lambda x: 110784 in x, match_counts)))

filter

In [78]:
[[number for number in tupl] for tupl in list(filter(lambda x: 110784 in x, match_counts))][:12]

[[110783, 110784],
 [110784, 110785],
 [110784, 14200],
 [110784, 110788],
 [110784, 27448],
 [110784, 110786],
 [110784, 110896],
 [110784, 110916],
 [110784, 110841]]

In [5]:
for i, v in pd.DataFrame(top_12s[110784]['top_matches']).iterrows():
    print(v[0])

110784
110785
110783
27448
92607
167896
9007
379
14200
110916
695
38897


In [55]:
set(pd.DataFrame(top_12s[110784]['top_matches'])[0])

{379,
 695,
 9007,
 14200,
 27448,
 38897,
 92607,
 110783,
 110784,
 110785,
 110916,
 167896}

In [31]:
set(pd.DataFrame(top_12s[110784]['top_matches']))

{0, 1}

### Sorting Match Counts by Count 

In [14]:
# https://stackoverflow.com/questions/20944483/python-3-sort-a-dict-by-its-values
match_counts_sorted = [(k, match_counts[k]) for k in sorted(match_counts, key=match_counts.get, reverse=True)]

### Comparing Collabs vs Model Results

In [9]:
# https://stackoverflow.com/a/1388836
# https://stackoverflow.com/a/18072799

def check_freq_collabs_vs_model_results(freq_model : dict, distance_model : dict) -> dict:
    """
    Compares songwriter frequency counts vs distance based model results.
    
    Returns:
    - 'overlap' : int, the number of results per writer in common between the two
                     datasets
    - 'num_in_same_order' : int, the number of results per writer in the same ranked
                                 order between the two datasets
    
    Dependencies:
        - numpy
        - pandas
        - sklearn
    """
    results = {}
    for writer_id in distance_model:
        if writer_id in [number for tupl in list(filter(lambda x: writer_id in x, freq_model)) for number in tupl]:
            top_ids_freq_nested = [[number for number in tupl] for tupl in \
                               list(filter(lambda x: writer_id in x, freq_model))][:13]
            top_ids_freq_single_list = [number for tupl in list(filter(lambda x: writer_id in x, freq_model)) for number in tupl]
            top_ids_dist = pd.DataFrame(distance_model[writer_id]['top_matches'])\
                                            .rename(columns={0: 'WID', 1: 'freq'})

            results[writer_id] = {
                'overlap' : len(set(top_ids_freq_single_list).intersection(set(top_ids_dist['WID']))),
                'num_in_same_order' : len([i for i, j in zip(top_ids_dist, top_ids_freq_nested) if i in j])
            }
    return results

In [10]:
test_results = check_freq_collabs_vs_model_results(match_counts, top_12s)

In [11]:
len(test_results)

29887

In [12]:
with open('../data/interim/freq_collab_model_comparison.pkl', 'wb') as f:
    pickle.dump(test_results, f)

In [13]:
test_results

{499782: {'overlap': 4, 'num_in_same_order': 0},
 27407: {'overlap': 3, 'num_in_same_order': 0},
 334033: {'overlap': 4, 'num_in_same_order': 0},
 499753: {'overlap': 4, 'num_in_same_order': 0},
 499755: {'overlap': 3, 'num_in_same_order': 0},
 5011: {'overlap': 8, 'num_in_same_order': 0},
 5014: {'overlap': 8, 'num_in_same_order': 0},
 4985: {'overlap': 8, 'num_in_same_order': 0},
 4986: {'overlap': 8, 'num_in_same_order': 0},
 4987: {'overlap': 8, 'num_in_same_order': 0},
 4988: {'overlap': 8, 'num_in_same_order': 0},
 5010: {'overlap': 8, 'num_in_same_order': 0},
 5015: {'overlap': 8, 'num_in_same_order': 0},
 197387: {'overlap': 4, 'num_in_same_order': 0},
 3145: {'overlap': 7, 'num_in_same_order': 0},
 12804: {'overlap': 6, 'num_in_same_order': 0},
 124370: {'overlap': 1, 'num_in_same_order': 0},
 11787: {'overlap': 2, 'num_in_same_order': 0},
 48751: {'overlap': 3, 'num_in_same_order': 0},
 58542: {'overlap': 8, 'num_in_same_order': 0},
 138577: {'overlap': 7, 'num_in_same_order'