# Computing Nearest Neighbors Model for Songwriter Dataset

In [1]:
from collections import Counter
import json
import pickle

import dask
import dask.dataframe as dd
import numpy as np
import pandas as pd

from sklearn.neighbors import NearestNeighbors, kneighbors_graph

## Table of Contents

1. Loading Data
2. Computing Nearest Neighbors
3. Instituting Voting

## 1. Loading Data

In [2]:
# Load the modelling table; the first CSV column is used as the row index.
# NOTE(review): the date stamp in the filename has nine digits
# ("201901516") -- confirm it matches the file actually on disk.
df = pd.read_csv(
    '../data/interim/scaled_ddf_201901516.csv',
    index_col=0,
)

## 2. Computing Nearest Neighbors

In [3]:
# Unsupervised neighbour search: 10 neighbours per query, Euclidean metric.
nn = NearestNeighbors(metric='euclidean', n_neighbors=10)

In [4]:
# Fit the neighbour index on every row of df. Per the scikit-learn docs,
# fit() returns the estimator itself, so `fit_df` is the same object as
# `nn`, not a DataFrame.
# NOTE(review): later cells select text columns ('Song Title',
# 'artist_name', ...) from `df`; confirm the frame fitted here is
# numeric-only, otherwise fit() cannot convert it to a float array.
fit_df = nn.fit(df)

In [None]:
# Query the fitted index with the same rows it was trained on; returns a
# (distances, indices) pair with one row per query row.
# NOTE(review): per the scikit-learn docs, when X is passed explicitly the
# query point is not excluded, so each row will normally list itself as its
# own nearest neighbour (distance 0). Call kneighbors() with no argument if
# self-matches should not take part in the vote -- confirm intent.
euclidean_distances, euclidean_indices = nn.kneighbors(df)

In [None]:
# Persist both arrays as raw binary dumps. ndarray.tofile stores only the
# element bytes (no dtype, no shape), so whoever reads these files back has
# to supply both.
# The target directory must already exist; the cell that creates
# 'final_model_test' appears further down in this notebook.
euclidean_distances.tofile('../data/final_model_test/distances_20190428.fid')
euclidean_indices.tofile('../data/final_model_test/indices_20190428.fid')

In [None]:
import os

os.getcwd()

In [None]:
os.chdir('../data/')

In [None]:
# Create the output folder relative to the current working directory.
# exist_ok=True makes the cell safe to re-run: a plain os.mkdir raises
# FileExistsError the second time it is executed.
os.makedirs('final_model_test', exist_ok=True)

In [None]:
os.chdir('../notebooks')

In [None]:
# Raw binary dump of an earlier (20190424) distance/index pair.
# NOTE(review): `distances` and `indices` are not assigned anywhere above
# this cell; the only assignment in this notebook is further down, where
# they are read back from these same two files. Run top-to-bottom in a
# fresh kernel, this cell raises NameError -- it looks like a leftover from
# an earlier session.
distances.tofile('../data/final_model_test/distances_20190424.fid')
indices.tofile('../data/final_model_test/indices_20190424.fid')

### 2a. Merging Observations with Indices and Distances

In [None]:
# Read the raw dumps back as flat 1-D arrays. tofile() stored no dtype, and
# np.fromfile defaults to float64, so the integer neighbour indices must be
# read with an explicit integer dtype or their bytes are reinterpreted as
# meaningless floats.
# NOTE(review): np.intp is the index dtype scikit-learn's kneighbors
# produces (int64 on 64-bit platforms) -- confirm it matches the machine
# that wrote the file.
euclidean_distances = np.fromfile('../data/final_model_test/distances_20190428.fid',
                                  dtype=np.float64)
euclidean_indices = np.fromfile('../data/final_model_test/indices_20190428.fid',
                                dtype=np.intp)

In [None]:
# Peek at the first 900 stored values, cast to 16-bit integers for display.
# NOTE(review): int16 tops out at 32767, while the reshape below assumes
# 462965 rows, so larger row indices cannot be represented after this cast.
# A wider integer type would be needed to inspect them faithfully.
euclidean_indices.astype(np.int16)[:900]

In [None]:
euclidean_distances.astype(np.int16)[:60]

In [None]:
# Read the 20190424 dumps back as flat 1-D arrays. The files carry no dtype
# information and np.fromfile defaults to float64, so the index file needs
# an explicit integer dtype to avoid reinterpreting its bytes as floats.
# NOTE(review): assumes the indices were written as platform-sized integers
# (np.intp), as scikit-learn's kneighbors returns -- confirm against the
# session that produced the files.
distances = np.fromfile('../data/final_model_test/distances_20190424.fid',
                        dtype=np.float64)
indices = np.fromfile('../data/final_model_test/indices_20190424.fid',
                      dtype=np.intp)

In [None]:
distances.shape

In [None]:
# Fold each flat buffer back into one row per observation (462965 rows, as
# many columns as the buffer length allows). A single reshape produces the
# same (462965, k) layout as splitting into 462965 chunks and re-stacking
# them, without creating 462965 temporary arrays; like np.split, it raises
# ValueError if the buffer length is not a multiple of 462965.
df_distances = pd.DataFrame(euclidean_distances.reshape(462965, -1))
df_indices = pd.DataFrame(euclidean_indices.reshape(462965, -1))

In [None]:
df_distances.shape

In [None]:
df_indices.shape

In [None]:
# Put the identifying columns next to the per-row neighbour distances.
# concat(axis=1) lines rows up by index label, not by position.
# NOTE(review): df_distances carries a default 0..n-1 index; confirm df's
# index (taken from the CSV's first column) uses the same labels.
X_distances = pd.concat(
    [
        df.loc[:, ['track_id', 'Song Title', 'artist_name', 'Writer Name', 'WID']],
        df_distances,
    ],
    axis=1,
)

In [None]:
# Put the identifying columns next to the per-row neighbour indices.
# concat(axis=1) lines rows up by index label, not by position.
# NOTE(review): df_indices carries a default 0..n-1 index; confirm df's
# index (taken from the CSV's first column) uses the same labels.
X_indices = pd.concat(
    [
        df.loc[:, ['track_id', 'Song Title', 'artist_name', 'Writer Name', 'WID']],
        df_indices,
    ],
    axis=1,
)

In [None]:
X_distances.head()

In [None]:
X_indices.head()

In [None]:
X_indices.iloc[146]

### 3. Determining Voting Mechanism

In [None]:
# Keep only the writer id plus the ten neighbour columns (labelled 0-9).
X_voters = X_indices[['WID'] + list(range(10))]

In [None]:
X_voters.columns

In [None]:
X_voters_dict = X_voters.to_dict(orient = 'index')

In [None]:
X_voters_dict[0]

In [None]:
index_wid_dct = df['WID'].to_dict()

In [None]:
(355, 7280) in index_wid_dct.items()

In [None]:
len(index_wid_dct)

In [None]:
def index_to_wid_mapping(df):
    """Translate a frame of neighbour row labels into writer ids (WIDs).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'WID' column. Every other column is treated as a
        neighbour reference whose values are row labels of ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df``. Column 'WID' holds each row's own WID. Integer
        columns ``0 .. k`` (``k`` = number of neighbour columns) hold the WID
        looked up for, in order, the row's own index label and then each
        neighbour column -- so 10 neighbour columns give columns 0-10.
        A reference that matches no row label is left as a missing value in
        its own column instead of shifting the later values to the left.
    """
    index_wid_dict = df['WID'].to_dict()
    just_voters_df = df.drop('WID', axis=1)

    # Each row tuple starts with the row's index label, so the output has one
    # more column than there are neighbour columns. Deriving the width here
    # (rather than hard-coding 11) lets the function work for any k.
    n_columns = just_voters_df.shape[1] + 1

    # Key rows by their real index label (not a running counter) so the
    # concat below lines up even when df does not use a 0..n-1 index.
    new_voters_dict = {
        entry[0]: [index_wid_dict.get(item) for item in entry]
        for entry in just_voters_df.itertuples(index=True, name=None)
    }

    new_voters_df = pd.DataFrame.from_dict(new_voters_dict,
                                           orient='index',
                                           columns=list(range(n_columns)))
    own_wid_df = pd.DataFrame.from_dict(index_wid_dict,
                                        orient='index',
                                        columns=['WID'])

    return pd.concat([own_wid_df, new_voters_df], axis=1)

In [None]:
X_voters_2 = index_to_wid_mapping(X_voters)

In [None]:
df[df['WID'] == 35426]

In [None]:
X_voters[X_voters['WID'] == 105444]

In [None]:
X_voters_2[X_voters_2['WID'] == 105444]

In [None]:
def count_vals(df):
    """Tally, per writer id, how often each value occurs in that writer's rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'WID' column; every other column holds "votes".

    Returns
    -------
    dict
        Maps each distinct WID to a ``collections.Counter`` accumulated over
        the non-'WID' values of all rows carrying that WID.
    """
    id_dict = {wid: Counter() for wid in df['WID'].unique()}

    # Select the WID by name rather than by tuple position, so the result
    # does not depend on 'WID' happening to be the first column.
    vote_rows = df.drop('WID', axis=1).itertuples(index=False, name=None)
    for wid, votes in zip(df['WID'], vote_rows):
        id_dict[wid].update(votes)
    return id_dict

In [None]:
# with just index values

count_df = count_vals(X_voters)

In [None]:
# with WID values instead of index values

count_df_2 = count_vals(X_voters_2)

In [None]:
count_df[7280].most_common(10)

In [None]:
count_df_2[1940].most_common(10)

In [None]:
count_df_2.keys()

##### Saving Writer Count Dict to File

In [None]:
# Serialise the WID-keyed vote counters (count_df_2) with pickle; the file
# is opened in binary mode and closed automatically by the `with` block.
with open('../data/final_model_test/writer_count_dict_20190504.pkl', 'wb') as f:
    pickle.dump(count_df_2, f)

#### 3a. Mapping Votes back Against Writers

In [None]:
# Will need to update this function as I figure out the logic for this algo

def combine_writer_counts_and_info(count_dict, writer_df, top_n=12):
    """Attach writer details to each writer's most common vote results.

    Parameters
    ----------
    count_dict : dict
        Maps a writer id (WID) to a ``collections.Counter`` of votes.
    writer_df : pandas.DataFrame
        Must contain the columns 'WID', 'Writer Name' and 'IPI'.
    top_n : int, optional
        How many of the most common entries to keep per writer (default 12).

    Returns
    -------
    dict
        ``{wid: {'writer_name': ..., 'ipi': ..., 'top_matches': [...]}}``
        where 'top_matches' is ``Counter.most_common(top_n)``. Writers that
        do not appear in ``writer_df['WID']`` keep '' for name and IPI.
    """
    # Build one WID -> (name, IPI) lookup up front. The membership test has
    # to run against the column's *values*: `wid in writer_df['WID']` checks
    # the Series index instead. setdefault keeps the first row for a repeated
    # WID, matching the former `.values[0]` selection.
    writer_lookup = {}
    for wid, name, ipi in zip(writer_df['WID'].values,
                              writer_df['Writer Name'].values,
                              writer_df['IPI'].values):
        writer_lookup.setdefault(wid, (name, ipi))

    top_12_dict = {}
    for wid, votes in count_dict.items():
        name, ipi = writer_lookup.get(wid, ('', ''))
        top_12_dict[wid] = {'writer_name': name,
                            'ipi': ipi,
                            'top_matches': votes.most_common(top_n)}

    return top_12_dict

In [None]:
# Build the per-writer summary from the WID-based vote counts.
# NOTE(review): `writers` is only loaded in the last cell of this notebook;
# that cell has to be executed before this one or this raises NameError.
top_12s = combine_writer_counts_and_info(count_df_2, writers)

In [None]:
top_12s[4692]

In [None]:
# Load the writer reference table; the CSV's first column becomes the index.
# NOTE(review): an earlier cell already passes `writers` to
# combine_writer_counts_and_info, so this load needs to run before it.
writers = pd.read_csv('../data/writers.csv', index_col = 0)