# Computing Nearest Neighbors Model for Songwriter Dataset

In [1]:
from collections import Counter
import json
import pickle

import dask
import dask.dataframe as dd
from dask_ml.wrappers import Incremental

import numpy as np
import pandas as pd

from sklearn.neighbors import NearestNeighbors, kneighbors_graph
from sklearn.externals import joblib
from scipy.spatial.distance import jaccard

## Table of Contents

1. Loading Data
2. Computing Nearest Neighbors
3. Instituting Voting

## 1. Loading Data

In [None]:
print('hello')

In [2]:
ddf = dd.read_csv('../data/interim/scaled_ddf_201901516.csv')\
        .rename(columns = {'Unnamed: 0' : 'index'})\
        .set_index('index')

### 1a. Loading Labels 

In [3]:
df = pd.read_csv('../data/interim/non_normalized_tracks_20190516.csv',
                 index_col = 0)

In [4]:
df.shape

(577426, 144)

In [5]:
df_labels = df[[
                'track_id',
                'Song Title',
                'Artist',
                'artist_id',
                'name',
                'popularity',
                'followers',
                'artist_name',
                'song_id',
                'song_title',
                'CID',
                'WID',
                'PID',
                'Title',
                'Performer Name',
                'Writer Name',
                'IPI',
                'PRO'
            ]]

## 2. Computing Nearest Neighbors

In [6]:
nn = NearestNeighbors(n_neighbors=10,
                      metric = 'jaccard')

In [None]:
nn.fit()

In [7]:
nn_incremental = Incremental(nn)

In [8]:
fit_ddf = nn_incremental.fit(ddf)

TypeError: If no scoring is specified, the estimator passed should have a 'score' method. The estimator NearestNeighbors(algorithm='auto', leaf_size=30, metric='jaccard',
         metric_params=None, n_jobs=None, n_neighbors=10, p=2, radius=1.0) does not.

In [None]:
euclidean_distances, euclidean_indices = nn.kneighbors(df)

In [None]:
euclidean_distances.tofile('../data/final_model_poc/distances_20190428.fid')
euclidean_indices.tofile('../data/final_model_poc/indices_20190428.fid')

### 2a. Merging Observations with Indices and Distances

In [30]:
euclidean_distances = np.fromfile('../data/final_model_poc/distances_20190517.fid')
euclidean_indices = np.fromfile('../data/final_model_poc/indices_20190517.fid')

In [18]:
euclidean_distances.shape, euclidean_indices.shape

((5774260,), (5774260,))

In [32]:
euclidean_indices.astype(np.int16)

array([0, 0, 0, ..., 0, 0, 0], dtype=int16)

In [19]:
euclidean_indices = euclidean_indices.astype(np.int16)

In [20]:
euclidean_distances = euclidean_distances.astype(np.int16)

In [21]:
df_distances = pd.DataFrame(np.vstack(np.split(euclidean_distances, 577426)))
df_indices = pd.DataFrame(np.vstack(np.split(euclidean_indices, 577426)))

In [22]:
df_distances.shape, df_distances.index

((577426, 10), RangeIndex(start=0, stop=577426, step=1))

In [23]:
df_indices.shape, df_indices.index

((577426, 10), RangeIndex(start=0, stop=577426, step=1))

In [26]:
df_labels.shape, df_labels.index

((577426, 18),
 Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                 9,
             ...
             57464, 57465, 57466, 57467, 57468, 57469, 57470, 57471, 57472,
             57473],
            dtype='int64', length=577426))

In [24]:
df_labels.reset_index(inplace = True)

In [25]:
df_labels = df_labels.drop('index', 1)

In [26]:
X_distances = pd.concat([df_labels,
                         df_distances],
                        axis = 1)

In [27]:
X_indices = pd.concat([df_labels,
                         df_indices],
                        axis = 1)

In [28]:
X_distances.head()

Unnamed: 0,track_id,Song Title,Artist,artist_id,name,popularity,followers,artist_name,song_id,song_title,...,0,1,2,3,4,5,6,7,8,9
0,0fr7Y70YYTKiXbRRBlg4bW,Cult of Personality - Recorded Live,Living Colour,6Uhp7WA6sjm5ZL6Xz561de,Living Colour,53,132456.0,living colour,0fr7Y70YYTKiXbRRBlg4bW,cult of personality,...,0,80,98,106,114,117,121,123,126,133
1,0fr7Y70YYTKiXbRRBlg4bW,Cult of Personality - Recorded Live,Living Colour,6Uhp7WA6sjm5ZL6Xz561de,Living Colour,53,132456.0,living colour,0fr7Y70YYTKiXbRRBlg4bW,cult of personality,...,0,35,46,49,49,53,53,53,54,54
2,0fr7Y70YYTKiXbRRBlg4bW,Cult of Personality - Recorded Live,Living Colour,6Uhp7WA6sjm5ZL6Xz561de,Living Colour,53,132456.0,living colour,0fr7Y70YYTKiXbRRBlg4bW,cult of personality,...,0,2,33,33,38,39,40,41,42,42
3,0fr7Y70YYTKiXbRRBlg4bW,Cult of Personality - Recorded Live,Living Colour,6Uhp7WA6sjm5ZL6Xz561de,Living Colour,53,132456.0,living colour,0fr7Y70YYTKiXbRRBlg4bW,cult of personality,...,0,4,33,35,38,39,43,44,45,45
4,54DMD1zBaX2aeMe7eE9N4v,Cult Of Personality,Living Colour,6Uhp7WA6sjm5ZL6Xz561de,Living Colour,53,132456.0,living colour,54DMD1zBaX2aeMe7eE9N4v,cult of personality,...,0,34,38,39,46,46,46,47,48,48


In [29]:
X_indices.head()

Unnamed: 0,track_id,Song Title,Artist,artist_id,name,popularity,followers,artist_name,song_id,song_title,...,0,1,2,3,4,5,6,7,8,9
0,0fr7Y70YYTKiXbRRBlg4bW,Cult of Personality - Recorded Live,Living Colour,6Uhp7WA6sjm5ZL6Xz561de,Living Colour,53,132456.0,living colour,0fr7Y70YYTKiXbRRBlg4bW,cult of personality,...,0,0,0,0,0,0,0,0,0,0
1,0fr7Y70YYTKiXbRRBlg4bW,Cult of Personality - Recorded Live,Living Colour,6Uhp7WA6sjm5ZL6Xz561de,Living Colour,53,132456.0,living colour,0fr7Y70YYTKiXbRRBlg4bW,cult of personality,...,0,0,0,0,0,0,0,0,0,0
2,0fr7Y70YYTKiXbRRBlg4bW,Cult of Personality - Recorded Live,Living Colour,6Uhp7WA6sjm5ZL6Xz561de,Living Colour,53,132456.0,living colour,0fr7Y70YYTKiXbRRBlg4bW,cult of personality,...,0,0,0,0,0,0,0,0,0,0
3,0fr7Y70YYTKiXbRRBlg4bW,Cult of Personality - Recorded Live,Living Colour,6Uhp7WA6sjm5ZL6Xz561de,Living Colour,53,132456.0,living colour,0fr7Y70YYTKiXbRRBlg4bW,cult of personality,...,0,0,0,0,0,0,0,0,0,0
4,54DMD1zBaX2aeMe7eE9N4v,Cult Of Personality,Living Colour,6Uhp7WA6sjm5ZL6Xz561de,Living Colour,53,132456.0,living colour,54DMD1zBaX2aeMe7eE9N4v,cult of personality,...,0,0,0,0,0,0,0,0,0,0


In [59]:
X_indices.iloc[544021]

track_id          2e4RcQohKWo8oZIUfjxiGo
Song Title           Holidays In The Sun
Artist                       Sex Pistols
artist_id         1u7kkVrr14iBvrpYnZILJR
name                         Sex Pistols
popularity                            57
followers                         769809
artist_name                  sex pistols
song_id           2e4RcQohKWo8oZIUfjxiGo
song_title           holidays in the sun
CID                               143256
WID                                12557
PID                                14131
Title                holidays in the sun
Performer Name               sex pistols
Writer Name                  LYDON JOHN 
IPI                          2.45851e+07
PRO                                  PRS
0                                      0
1                                      0
2                                      0
3                                      0
4                                      0
5                                      0
6               

### 3. Determing Voting Mechanism

In [None]:
X_voters = X_indices[['WID',
                      0,
                      1,
                      2,
                      3,
                      4,
                      5,
                      6,
                      7,
                      8,
                      9]]

In [None]:
X_voters.columns

In [None]:
X_voters_dict = X_voters.to_dict(orient = 'index')

In [None]:
X_voters_dict[0]

In [None]:
index_wid_dct = df['WID'].to_dict()

In [None]:
(355, 7280) in index_wid_dct.items()

In [None]:
len(index_wid_dct)

In [None]:
def index_to_wid_mapping(df):
    index_wid_dict = df['WID'].to_dict()
    just_voters_df = df.drop('WID', axis = 1)
    new_voters_dict = {i:[] for i in range(len(just_voters_df))}
    
    count = 0
    for entry in just_voters_df.itertuples():
        for item in entry[:]:
            if item in index_wid_dict:
                new_voters_dict[count].append(index_wid_dict[item])
        count += 1
    
    new_voters_df = pd.DataFrame.from_dict(new_voters_dict,
                                           orient= 'index',
                                           columns = [i for i in range(11)])
    
    new_voters_df = pd.concat([pd.DataFrame.from_dict(index_wid_dict,
                                                      orient = 'index',
                                                      columns = ['WID']),
                               new_voters_df], axis = 1)

    return new_voters_df

In [None]:
X_voters_2 = index_to_wid_mapping(X_voters)

In [None]:
df[df['WID'] == 35426]

In [None]:
X_voters[X_voters['WID'] == 105444]

In [None]:
X_voters_2[X_voters_2['WID'] == 105444]

In [None]:
def count_vals(df):
    id_dict = {wid : Counter() for wid in df['WID'].unique()}
    for entry in df.itertuples():
            id_dict[entry[1]].update(entry[2:])
    return id_dict

In [None]:
# with just index values

count_df = count_vals(X_voters)

In [None]:
# with WID values instead of index values

count_df_2 = count_vals(X_voters_2)

In [None]:
count_df[7280].most_common(10)

In [None]:
count_df_2[1940].most_common(10)

In [None]:
count_df_2.keys()

##### Saving Writer Count Dict to File

In [None]:
with open('../data/final_model_test/writer_count_dict_20190504.pkl', 'wb') as f:
    pickle.dump(count_df_2, f)

In [12]:
with open('../data/writer_count_dict_20190504.pkl', 'rb') as f:
    count_df_2 = pickle.load(f)

#### 3a. Mapping Votes back Against Writers

In [14]:
# Will need to update this function as I figure out the logic for this algo

def combine_writer_counts_and_info(count_dict, writer_df):
    
    top_12_dict = {wid : {'writer_name' : '',
                          'ipi' : '',
                          'top_matches' : ''}\
                   for wid in count_dict.keys()}
    
    # Retreive Top 12 Results for each Songwriter
    for entry in count_dict:
        top_12_dict[entry]['top_matches'] = count_dict[entry].most_common(12)
    
    # add writer name and info to dictionary
    for wid in top_12_dict:
        if wid in writer_df['WID']:
            top_12_dict[wid]['writer_name']\
            = writer_df['Writer Name'][writer_df['WID'] == wid].values[0]
            top_12_dict[wid]['ipi']\
            = writer_df['IPI'][writer_df['WID'] == wid].values[0]
    
    return top_12_dict

In [15]:
top_12s = combine_writer_counts_and_info(count_df_2, writers)

In [16]:
top_12s[4692]

{'writer_name': 'CARTER SHAWN  ',
 'ipi': '240089589',
 'top_matches': [(4692, 224),
  (3747, 30),
  (3772, 24),
  (4561, 23),
  (4754, 21),
  (5519, 20),
  (3729, 18),
  (5565, 15),
  (7091, 14),
  (4575, 14),
  (3835, 12),
  (16895, 12)]}

In [13]:
writers = pd.read_csv('writers.csv', index_col = 0)

In [2]:
with open('../data/final_model_poc/top_12_writer_by_votes_manhattan_20190517.pkl', 'rb') as f:
    top_12 = pickle.load(f)

In [4]:
with open('../data/final_model_poc/top_12_writer_by_votes_euclidean_20190523.pkl', 'rb') as f:
    top_12_euclidean = pickle.load(f)

In [13]:
writers[writers['WID'] == 694]

Unnamed: 0,WID,Writer Name,IPI,PRO
437,694,GARCIA JEROME J,41050341,ASCAP


In [20]:
writers[writers['Writer Name'].str.startswith("CARTER SHAWN")]

Unnamed: 0,WID,Writer Name,IPI,PRO
956,1940,CARTER SHAWN C,240089393,ASCAP
2155,4692,CARTER SHAWN,240089589,ASCAP
30523,102371,CARTER SHAWN KEVIN,849140133,BMI
48606,201860,CARTER SHAWN C.,—,ASCAP
49565,206931,CARTER SHAWN,—,NS
88593,508205,CARTER SHAWN 'JAY Z',—,BMI
88600,508254,CARTER SHAWN,—,ASCAP


In [5]:
top_12_euclidean[1940]

{'writer_name': 'CARTER SHAWN C',
 'ipi': '240089393',
 'top_matches': [(1940, 381),
  (1980, 34),
  (1979, 31),
  (12544, 27),
  (1283, 26),
  (56427, 22),
  (694, 18),
  (695, 15),
  (90313, 14),
  (56424, 13),
  (9168, 12),
  (1981, 11)],
 'top_matches_w_names': {'CARTER SHAWN C': (1940, 381),
  'STEWART CHRISTOPHER A': (1980, 34),
  'NASH TERIUS YOUNGDELL': (1979, 31),
  'LEVINE ADAM NOAH': (12544, 27),
  'VALENTINE JAMES B': (1283, 26),
  'MADDEN MICHAEL ALLEN': (56427, 22),
  'GARCIA JEROME J': (694, 18),
  'HUNTER ROBERT C': (695, 15),
  'HARRELL THADDIS LAPHONIA': (90313, 14),
  'CARMICHAEL JESSE ROYAL': (56424, 13),
  'HERMANSEN TOR ERIK': (9168, 12),
  'WEST KANYE OMARI': (1981, 11)}}

In [5]:
top_12[18876]

{'writer_name': 'HALL ROBERT BRYSON II',
 'ipi': '696894362',
 'top_matches': [(18876, 40),
  (119661, 4),
  (183281, 3),
  (32746, 3),
  (8450, 3),
  (276772, 3),
  (48216, 3),
  (1783, 2),
  (85335, 2),
  (3725, 2),
  (14273, 2),
  (27644, 2)],
 'top_matches_w_names': {'HALL ROBERT BRYSON II': (18876, 40),
  'REDDICK JARET RAY': (119661, 4),
  'FILKINS ZACHARY DOUGLAS': (183281, 3),
  'LORD JON ': (32746, 3),
  'TEDDER RYAN B': (8450, 3),
  'JOKER  ': (276772, 3),
  'GANO GORDON JAMES': (48216, 3),
  'BACHARACH BURT F': (1783, 2),
  'FARRAR SAM JOHN': (85335, 2),
  'GOTTWALD LUKASZ ': (3725, 2),
  'WADE RICO RENARD': (14273, 2),
  'TITUS ANDRES ': (27644, 2)}}