# Computing Nearest Neighbors Model for Songwriter Dataset

In [226]:
from collections import Counter
import json
import pickle

import dask
import dask.dataframe as dd
from dask_ml.preprocessing import DummyEncoder
from dask.distributed import Client
import numpy as np
import pandas as pd

from sklearn.neighbors import NearestNeighbors, kneighbors_graph

## Table of Contents

1. Loading Data
2. Computing Nearest Neighbors
3. Instituting Voting

## 1. Loading Data

In [69]:
df = pd.read_csv('../data/matched_tracks_w_writers_20190422.csv',
                 index_col = 0,
                 dtype = {
                     'track_id' : str,
                     'Song Title' : str,
                     'Artist' : str,
                     'artist_id' : str,
                     'name' : str,
                     'popularity' : np.int8,
                     'followers' : np.int32,
                     'artist_name' : str,
                     'song_id' : str,
                     'song_title' : str,
                     'CID' : np.int32,
                     'PID' : np.int32,
                     'Title': str,
                     'Performer Name' : str,
                     'WID' : np.int32,
                     'Writer Name' : str,
                     'IPI' : str,
                     'PRO' : 'category'
                 })

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 462965 entries, 0 to 462964
Data columns (total 18 columns):
track_id          462965 non-null object
Song Title        462965 non-null object
Artist            462965 non-null object
artist_id         462965 non-null object
name              462965 non-null object
popularity        462965 non-null int8
followers         462965 non-null int32
artist_name       462965 non-null object
song_id           462965 non-null object
song_title        462965 non-null object
CID               462965 non-null int32
PID               462965 non-null int32
Title             462965 non-null object
Performer Name    462965 non-null object
WID               462965 non-null int32
Writer Name       462965 non-null object
IPI               462965 non-null object
PRO               462965 non-null category
dtypes: category(1), int32(4), int8(1), object(12)
memory usage: 53.9+ MB


In [10]:
df['IPI'] = df['IPI'].apply(lambda x: np.NaN if x == '—' else x)

In [22]:
df['IPI'].iloc[18456]

'9211814'

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 455492 entries, 0 to 455491
Data columns (total 17 columns):
track_id          455492 non-null object
Song Title        455492 non-null object
Artist            455492 non-null object
artist_id         455492 non-null object
name              455492 non-null object
popularity        455492 non-null int8
followers         455492 non-null int32
artist_name       455492 non-null object
song_id           455492 non-null object
song_title        455492 non-null object
CID               455492 non-null int32
PID               455492 non-null int32
Title             455492 non-null object
Performer Name    455492 non-null object
IPI               438607 non-null float64
Writer Name       455492 non-null object
PRO               455492 non-null object
dtypes: float64(1), int32(3), int8(1), object(12)
memory usage: 54.3+ MB


In [4]:
df['Writer Name'].isnull().any()

False

In [5]:
df[df['Writer Name'] == 'SMITH YOMO ']

Unnamed: 0,track_id,Song Title,Artist,artist_id,name,popularity,followers,artist_name,song_id,song_title,CID,PID,Title,Performer Name,WID,Writer Name,IPI,PRO
224501,6FHj7FegwTzdrcuQDCo8jT,Foe Tha Love of $,Bone Thugs-N-Harmony,5spEJXLwD1sKUdC2bnOHPg,Bone Thugs-N-Harmony,68,847771,bone thugs-n-harmony,6FHj7FegwTzdrcuQDCo8jT,foe tha love of $,0,3,foe tha love of $,bone thugs-n-harmony,8,SMITH YOMO,—,BMI


In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,track_id,Song Title,Artist,artist_id,name,popularity,followers,artist_name,song_id,song_title,CID,PID,Title,Performer Name,IPI,Writer Name,PRO
0,0,0007aPK8VmXN4ycL2OcBFa,Bodhisattva - Live,Toto,0PFtn5NtBbbUNbU9EAmIWF,Toto,76,753405.0,toto,0007aPK8VmXN4ycL2OcBFa,bodhisattva,126127.0,6030.0,bodhisattva,toto,89150949,BECKER WALTER CARL,ASCAP
1,1,4F6419huGNh7rXB8Kr1rBf,Bodhisattva - Live,Toto,0PFtn5NtBbbUNbU9EAmIWF,Toto,76,753405.0,toto,4F6419huGNh7rXB8Kr1rBf,bodhisattva,126127.0,6030.0,bodhisattva,toto,89150949,BECKER WALTER CARL,ASCAP
2,2,6auUOYXyP76l4fiqC86c9E,Bodhisattva - Live,Toto,0PFtn5NtBbbUNbU9EAmIWF,Toto,76,753405.0,toto,6auUOYXyP76l4fiqC86c9E,bodhisattva,126127.0,6030.0,bodhisattva,toto,89150949,BECKER WALTER CARL,ASCAP
3,3,10JB78fwqaUrCFZj3XNn6L,Bodhisattva,Steely Dan,6P7H3ai06vU1sGvdpBwDmE,Steely Dan,68,601480.0,steely dan,10JB78fwqaUrCFZj3XNn6L,bodhisattva,126127.0,1699.0,bodhisattva,steely dan,89150949,BECKER WALTER CARL,ASCAP
4,4,6cdLlECd80adBM3fgEk9Km,Bodhisattva,Steely Dan,6P7H3ai06vU1sGvdpBwDmE,Steely Dan,68,601480.0,steely dan,6cdLlECd80adBM3fgEk9Km,bodhisattva,126127.0,1699.0,bodhisattva,steely dan,89150949,BECKER WALTER CARL,ASCAP


#### Dropping non-numeric columns

In [60]:
df = df[['popularity', 'followers', 'CID', 'PID', 'WID']]

## 2. Computing Nearest Neighbors

In [34]:
nn = NearestNeighbors(n_neighbors=10,
                      metric = 'euclidean')

In [35]:
fit_df = nn.fit(df)

In [36]:
fit_df

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=None, n_neighbors=10, p=2, radius=1.0)

In [61]:
euclidean_distances, euclidean_indices = nn.kneighbors(df)

In [63]:
euclidean_distances.tofile('../data/final_model_test/distances_20190428.fid')
euclidean_indices.tofile('../data/final_model_test/indices_20190428.fid')

In [43]:
import os

os.getcwd()

'/Users/jonjohnson/dev/SWI_2/Songwriter_Index/notebooks'

In [44]:
os.chdir('../data/')

In [45]:
os.mkdir('final_model_test')

In [46]:
os.chdir('../notebooks')

In [47]:
distances.tofile('../data/final_model_test/distances_20190424.fid')
indices.tofile('../data/final_model_test/indices_20190424.fid')

### 2a. Merging Observations with Indices and Distances

In [41]:
euclidean_distances = np.fromfile('../data/final_model_test/distances_20190428.fid')
euclidean_indices = np.fromfile('../data/final_model_test/indices_20190428.fid')

In [64]:
euclidean_indices.astype(np.int16)[:900]

array([[   2,    0,    1, ...,  643,  639,  642],
       [   2,    0,    1, ...,  643,  639,  642],
       [   2,    0,    1, ...,  643,  639,  642],
       ...,
       [ 896,  897,  898, ..., 1002, 1001, 1000],
       [ 896,  897,  898, ..., 1002, 1001, 1000],
       [ 899,  894, 1007, ...,  999,  900, 1241]], dtype=int16)

In [55]:
euclidean_distances.astype(np.int16)[:60]

array([   0,    0,    1,    1,    1, 1397, 1397, 1397, 1397, 1397,    0,
          0,    1,    1,    1, 1397, 1397, 1397, 1397, 1397,    0,    0,
          1,    1,    1, 1397, 1397, 1397, 1397, 1397,    0,    0,    1,
          1,    1,    1,    1,    1,    2,    2,    0,    0,    1,    1,
          1,    1,    1,    1,    2,    2,    0,    0,    1,    1,    1,
          1,    1,    1,    2,    2], dtype=int16)

In [6]:
distances = np.fromfile('../data/final_model_test/distances_20190424.fid')
indices = np.fromfile('../data/final_model_test/indices_20190424.fid')

In [8]:
distances.shape

(4629650,)

In [65]:
df_distances = pd.DataFrame(np.vstack(np.split(euclidean_distances, 462965)))
df_indices = pd.DataFrame(np.vstack(np.split(euclidean_indices, 462965)))

In [66]:
df_distances.shape

(462965, 10)

In [67]:
df_indices.shape

(462965, 10)

In [70]:
X_distances = pd.concat([df[['track_id',
                             'Song Title',
                             'artist_name',
                             'Writer Name',
                             'WID']],
                         df_distances],
                        axis = 1)

In [71]:
X_indices = pd.concat([df[['track_id',
                             'Song Title',
                             'artist_name',
                             'Writer Name',
                             'WID']],
                         df_indices],
                        axis = 1)

In [72]:
X_distances.head()

Unnamed: 0,track_id,Song Title,artist_name,Writer Name,WID,0,1,2,3,4,5,6,7,8,9
0,0007aPK8VmXN4ycL2OcBFa,Bodhisattva - Live,toto,BECKER WALTER CARL,7280,0.0,0.0,0.0,1.0,1.0,1.0,1397.30097,1397.30097,1397.30097,1397.30097
1,4F6419huGNh7rXB8Kr1rBf,Bodhisattva - Live,toto,BECKER WALTER CARL,7280,0.0,0.0,0.0,1.0,1.0,1.0,1397.30097,1397.30097,1397.30097,1397.30097
2,6auUOYXyP76l4fiqC86c9E,Bodhisattva - Live,toto,BECKER WALTER CARL,7280,0.0,0.0,0.0,1.0,1.0,1.0,1397.30097,1397.30097,1397.30097,1397.30097
3,10JB78fwqaUrCFZj3XNn6L,Bodhisattva,steely dan,BECKER WALTER CARL,7280,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.414214,2.0
4,6cdLlECd80adBM3fgEk9Km,Bodhisattva,steely dan,BECKER WALTER CARL,7280,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.414214,2.0


In [74]:
X_indices.head()

Unnamed: 0,track_id,Song Title,artist_name,Writer Name,WID,0,1,2,3,4,5,6,7,8,9
0,0007aPK8VmXN4ycL2OcBFa,Bodhisattva - Live,toto,BECKER WALTER CARL,7280,2,0,1,148,146,147,641,643,639,642
1,4F6419huGNh7rXB8Kr1rBf,Bodhisattva - Live,toto,BECKER WALTER CARL,7280,2,0,1,148,146,147,641,643,639,642
2,6auUOYXyP76l4fiqC86c9E,Bodhisattva - Live,toto,BECKER WALTER CARL,7280,2,0,1,148,146,147,641,643,639,642
3,10JB78fwqaUrCFZj3XNn6L,Bodhisattva,steely dan,BECKER WALTER CARL,7280,3,5,4,151,53,149,150,103,207,74
4,6cdLlECd80adBM3fgEk9Km,Bodhisattva,steely dan,BECKER WALTER CARL,7280,3,5,4,151,53,149,150,103,207,74


In [76]:
X_indices.iloc[146]

track_id       0007aPK8VmXN4ycL2OcBFa
Song Title         Bodhisattva - Live
artist_name                      toto
Writer Name          FAGEN DONALD JAY
WID                              7281
0                                 147
1                                 148
2                                 146
3                                   0
4                                   1
5                                   2
6                                 641
7                                 643
8                                 639
9                                 642
Name: 146, dtype: object

### 3. Determing Voting Mechanism

In [79]:
X_voters = X_indices[['WID',
                      0,
                      1,
                      2,
                      3,
                      4,
                      5,
                      6,
                      7,
                      8,
                      9]]

In [126]:
X_voters.columns

Index(['WID', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='object')

In [104]:
X_voters_dict = X_voters.to_dict(orient = 'index')

In [105]:
X_voters_dict[0]

{'WID': 7280,
 0: 2,
 1: 0,
 2: 1,
 3: 148,
 4: 146,
 5: 147,
 6: 641,
 7: 643,
 8: 639,
 9: 642}

In [107]:
index_wid_dct = df['WID'].to_dict()

In [118]:
(355, 7280) in index_wid_dct.items()

False

In [133]:
len(index_wid_dct)

462965

In [143]:
def index_to_wid_mapping(df):
    index_wid_dict = df['WID'].to_dict()
    just_voters_df = df.drop('WID', axis = 1)
    new_voters_dict = {i:[] for i in range(len(just_voters_df))}
    
    count = 0
    for entry in just_voters_df.itertuples():
        for item in entry[:]:
            if item in index_wid_dict:
                new_voters_dict[count].append(index_wid_dict[item])
        count += 1
    
    new_voters_df = pd.DataFrame.from_dict(new_voters_dict,
                                           orient= 'index',
                                           columns = [i for i in range(11)])
    
    new_voters_df = pd.concat([pd.DataFrame.from_dict(index_wid_dict,
                                                      orient = 'index',
                                                      columns = ['WID']),
                               new_voters_df], axis = 1)

    return new_voters_df

In [144]:
X_voters_2 = index_to_wid_mapping(X_voters)

In [194]:
df[df['WID'] == 35426]

Unnamed: 0,track_id,Song Title,Artist,artist_id,name,popularity,followers,artist_name,song_id,song_title,CID,PID,Title,Performer Name,WID,Writer Name,IPI,PRO


In [189]:
X_voters[X_voters['WID'] == 105444]

Unnamed: 0,WID,0,1,2,3,4,5,6,7,8,9


In [190]:
X_voters_2[X_voters_2['WID'] == 105444]

Unnamed: 0,WID,0,1,2,3,4,5,6,7,8,9,10


In [100]:
def count_vals(df):
    id_dict = {wid : Counter() for wid in df['WID'].unique()}
    for entry in df.itertuples():
            id_dict[entry[1]].update(entry[2:])
    return id_dict

In [101]:
# with just index values

count_df = count_vals(X_voters)

In [145]:
# with WID values instead of index values

count_df_2 = count_vals(X_voters_2)

In [182]:
count_df[7280].most_common(10)

[(41, 10),
 (40, 10),
 (43, 10),
 (42, 10),
 (120, 10),
 (119, 10),
 (118, 10),
 (53, 9),
 (207, 9),
 (60, 9)]

In [196]:
count_df_2[1940].most_common(10)

[(1940, 1979),
 (1979, 55),
 (1981, 45),
 (1945, 45),
 (347, 42),
 (1252, 38),
 (2530, 29),
 (1622, 27),
 (2871, 25),
 (3747, 24)]

In [168]:
count_df_2.keys()

dict_keys([7280, 7281, 2642, 6145, 9495, 60204, 60244, 60324, 14498, 60215, 60281, 3208, 60235, 60163, 60164, 60168, 19791, 60265, 104338, 6144, 60268, 5898, 60243, 60327, 347, 207636, 221189, 261342, 1786, 60306, 16868, 253407, 60318, 190351, 28, 77282, 429000, 60353, 50609, 56336, 1956, 253373, 20019, 86354, 8882, 11354, 14877, 3135, 46853, 305835, 78042, 410015, 16231, 410069, 46854, 305850, 356861, 261732, 15789, 11767, 305862, 178351, 53387, 149564, 16858, 16859, 16860, 200418, 39422, 59369, 200457, 61574, 351485, 200402, 200530, 12616, 200553, 200554, 200620, 22475, 90143, 90145, 200480, 16552, 200410, 200392, 200383, 200389, 200390, 31152, 31153, 351487, 200465, 52615, 200411, 17344, 17345, 17346, 17347, 200484, 48305, 253578, 253522, 253537, 240729, 253587, 29697, 65613, 132884, 132891, 132895, 92870, 92871, 132885, 132886, 20391, 8667, 29963, 138111, 6209, 14260, 30104, 9700, 9701, 10290, 26168, 26145, 12156, 29987, 10196, 10197, 9677, 30091, 26138, 3250, 138193, 11855, 30114,

##### Saving Writer Count Dict to File

In [204]:
with open('../data/final_model_test/writer_count_dict_20190504.pkl', 'wb') as f:
    pickle.dump(count_df_2, f)

#### 3a. Mapping Votes back Against Writers

In [223]:
# Will need to update this function as I figure out the logic for this algo

def combine_writer_counts_and_info(count_dict, writer_df):
    
    top_12_dict = {wid : {'writer_name' : '',
                          'ipi' : '',
                          'top_matches' : ''}\
                   for wid in count_dict.keys()}
    
    # Retreive Top 12 Results for each Songwriter
    for entry in count_dict:
        top_12_dict[entry]['top_matches'] = count_dict[entry].most_common(12)
    
    # add writer name and info to dictionary
    for wid in top_12_dict:
        if wid in writer_df['WID']:
            top_12_dict[wid]['writer_name']\
            = writer_df['Writer Name'][writer_df['WID'] == wid].values[0]
            top_12_dict[wid]['ipi']\
            = writer_df['IPI'][writer_df['WID'] == wid].values[0]
    
    return top_12_dict

In [222]:
top_12s = combine_writer_counts_and_info(count_df_2, writers)

In [224]:
top_12s[4692]

{'writer_name': 'CARTER SHAWN  ',
 'ipi': '240089589',
 'top_matches': [(4692, 224),
  (3747, 30),
  (3772, 24),
  (4561, 23),
  (4754, 21),
  (5519, 20),
  (3729, 18),
  (5565, 15),
  (7091, 14),
  (4575, 14),
  (3835, 12),
  (16895, 12)]}

In [148]:
writers = pd.read_csv('../data/writers.csv', index_col = 0)