# Preprocessing the Second Batch of Songs

In [68]:
import json
import pickle
import os
import re

import numpy as np
import pandas as pd

from spotipy import Spotify
from spotipy.oauth2 import SpotifyClientCredentials

from library import get_sec_ss

## Table of Contents

## 1. Loading in Tables

In [79]:
# all songs I attempted to pull
# (JSON object keyed by track id; each value holds 'Song Title' and 'Artist')
with open('../data/new_tracks_20190103.json', 'r') as f:
    new_tracks = json.load(f)


# supporting tables; the first CSV column of each file is used as the index
matched_songs = pd.read_csv('../data/matched_songs_20190209.csv', index_col = 0)
main_wfeats = pd.read_csv('../data/main_wfeats.csv', index_col = 0)
genres = pd.read_csv('../data/genres.csv', index_col = 0)
gs_lookup = pd.read_csv('../data/gsong_lookup.csv', index_col = 0)

In [56]:
# artist lookup table; the first CSV column becomes the index
artists = pd.read_csv('../data/artist_list_v2.csv', index_col=0)

### 1a. Grabbing Mean & Variance Segment Values

In [3]:
def combine_section_mean_var(fp):
    '''
    Given a directory path, loads every section-mean and section-variance
    JSON file found there and concatenates each group into one list.

    A file is treated as a "means" file when its name contains
    'section_mean_', otherwise as a "vars" file when its name contains
    'section_var_'. Each file is expected to hold a JSON list; the lists
    are joined in sorted-filename order so the result is identical on
    every run (os.listdir makes no ordering guarantee).

    Parameters
    ----------
    fp : str
        Directory to scan (not searched recursively).

    Returns
    -------
    (list, list)
        Combined mean records, combined variance records. A group with
        no matching files yields an empty list instead of raising.
    '''
    directory = {
        'means' : [],
        'vars' : [],
    }

    # grabbing all files; sorted() pins the concatenation order
    for file in sorted(os.listdir(fp)):
        if 'section_mean_' in file:
            directory['means'].append(file)
        elif 'section_var_' in file:
            directory['vars'].append(file)

    def _load_and_concat(files):
        # Read each JSON file in order and append its records to one list.
        combined = []
        for file in files:
            with open(os.path.join(fp, file), 'r') as f:
                combined.extend(json.load(f))
        return combined

    mean_records = _load_and_concat(directory['means'])
    var_records = _load_and_concat(directory['vars'])

    return mean_records, var_records

In [4]:
# combine every section_mean_* / section_var_* JSON file found in ../data
means, variances = combine_section_mean_var('../data')

In [32]:
len(variances)

651452

In [34]:
len(new_tracks)

651517

In [35]:
len(matched_songs)

159438

In [25]:
[value for value in means[0].values()][0]

{'confidence': 0.601625,
 'duration': 26.5366675,
 'loudness': -14.359750000000002,
 'mode': 0.875,
 'mode_confidence': 0.3652500000000001,
 'tempo': 121.35325,
 'tempo_confidence': 0.4575}

In [31]:
def unpack_lists(lst):
    '''
    Flattens a list of dictionaries into one dictionary.

    Only the first key and the first value of each entry are used;
    when the same key shows up more than once, the last entry wins.
    '''
    flattened = {}
    for entry in lst:
        first_value = list(entry.values())[0]
        first_key = list(entry.keys())[0]
        flattened[first_key] = first_value
    return flattened

In [34]:
# flatten each list of dicts into one dict (first key of every entry -> its value)
mean_dict = unpack_lists(means)
var_dict = unpack_lists(variances)

In [48]:
def find_errors(dct):
    '''
    Splits string-valued entries (error messages) out of a mean or var dict.

    The entries are removed from ``dct`` itself (the input is modified in
    place) and collected into a second dictionary.

    Returns the same ``dct`` object, now without the string entries,
    followed by the dictionary of removed key -> message pairs.
    '''
    # decide what to remove first so the dict is not resized while iterating
    flagged_keys = [key for key, value in dct.items() if isinstance(value, str)]
    error_dct = {key: dct.pop(key) for key in flagged_keys}
    return dct, error_dct

In [49]:
# separate string (error) entries from the means; find_errors also strips them from the dict passed in
mean_dict, mean_errors = find_errors(mean_dict)

In [50]:
# same split for the variances; var_dict is modified in place as well
var_dict, var_errors = find_errors(var_dict)

In [51]:
len(mean_dict)

650263

In [45]:
len(mean_errors)

1189

In [52]:
# one row per key of mean_dict; columns come from the fields of each inner dict
mean_df = pd.DataFrame.from_dict(mean_dict, orient='index')

In [55]:
# one row per key of var_dict; columns come from the fields of each inner dict
var_df = pd.DataFrame.from_dict(var_dict, orient='index')

In [57]:
mean_df.head()

Unnamed: 0,confidence,duration,loudness,mode,mode_confidence,tempo,tempo_confidence
0007aPK8VmXN4ycL2OcBFa,0.552778,32.807407,-7.355333,0.888889,0.354889,114.153,0.286667
0008G8TW7eiVfwlRRsKlgW,0.549364,19.008486,-11.250909,0.909091,0.629455,133.749182,0.371545
000BqzNd7gRYnK6umzTNZX,0.657167,20.035555,-5.852,0.833333,0.285667,145.424167,0.365333
000CSIqE1KcjAiZYYWXV18,0.477333,24.455557,-12.882333,0.166667,0.552833,124.057583,0.438833
000G1xMMuwxNHmwVsBdtj1,0.489167,15.195622,-6.957667,0.833333,0.262083,190.880667,0.272


In [58]:
new_tracks['0008G8TW7eiVfwlRRsKlgW']

{'Song Title': 'Don`t Go', 'Artist': 'Stevie B'}

In [80]:
# one row per track id, with 'Song Title' and 'Artist' columns taken from each entry
new_tracks_df = pd.DataFrame.from_dict(new_tracks, orient='index')

In [60]:
artists.head()

Unnamed: 0_level_0,name,popularity,followers
s_artist_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,71,747136.0
1bDWGdIC2hardyt55nlQgG,"""Weird Al"" Yankovic",59,337751.0
0REMf7H0VP6DwfZ9MbuWph,10 Years,60,243895.0
0MBIKH9DjtBkv8O3nS6szj,"10,000 Maniacs",52,108829.0
7urq0VfqxEYEEiZUkebXT4,112,68,455231.0


In [61]:
new_tracks_df.head()

Unnamed: 0,Song Title,Artist
0007aPK8VmXN4ycL2OcBFa,Bodhisattva - Live,Toto
0008G8TW7eiVfwlRRsKlgW,Don`t Go,Stevie B
000BqzNd7gRYnK6umzTNZX,You Still Want Me - 2014 Remastered Version,The Kinks
000CSIqE1KcjAiZYYWXV18,Under The Sun (Ecclesiastes),Michael Card
000G1xMMuwxNHmwVsBdtj1,Will Anything Happen,Blondie


In [81]:
# move the track-id index into a regular column (renamed to 'track_id' in a later cell)
# NOTE(review): in place, so re-running this cell resets the index a second time
new_tracks_df.reset_index(inplace=True)

In [90]:
# give the former index column a descriptive name for the merges on track id
new_tracks_df.rename(columns={'index': 'track_id'}, inplace=True)

In [82]:
# move the artist-id index into a regular column
# NOTE(review): in place, so re-running this cell resets the index a second time
artists.reset_index(inplace=True)

In [84]:
# A stray 'index' column only exists if reset_index() was run more than once
# on `artists`; on a fresh top-to-bottom run there is nothing to drop, so
# errors='ignore' keeps this cell from raising KeyError in that case.
artists.drop(columns=['index'], errors='ignore', inplace=True)

In [74]:
len(artists)

2426

#### Adding in Artist IDs (and Subsequently Genres) to New Tracks

It appears as though I'm losing ~73k tracks when completing the merge (651,517 tracks before, 578,062 after). My guess is that these songs have multiple artists tied to them, and I did not pull the artist that matches the listing I currently have.

In [91]:
# Inner join on the artist name: a track is kept only when its 'Artist'
# value exactly matches a 'name' in the artists table.
df = pd.merge(
    new_tracks_df,
    artists,
    left_on='Artist',
    right_on='name',
)

In [86]:
len(new_tracks_df)

651517

In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 578062 entries, 0 to 578061
Data columns (total 7 columns):
track_id       578062 non-null object
Song Title     578062 non-null object
Artist         578062 non-null object
s_artist_id    578062 non-null object
name           578062 non-null object
popularity     578062 non-null int64
followers      578062 non-null float64
dtypes: float64(1), int64(1), object(5)
memory usage: 35.3+ MB


In [72]:
df.head()

Unnamed: 0,Song Title,Artist,s_artist_id,name,popularity,followers
0,Bodhisattva - Live,Toto,0PFtn5NtBbbUNbU9EAmIWF,Toto,76,753405.0
1,Don't Chain My Heart - Live,Toto,0PFtn5NtBbbUNbU9EAmIWF,Toto,76,753405.0
2,Taint Your World (Live),Toto,0PFtn5NtBbbUNbU9EAmIWF,Toto,76,753405.0
3,Caught In the Balance - Live Version,Toto,0PFtn5NtBbbUNbU9EAmIWF,Toto,76,753405.0
4,Girl Goodbye - Live,Toto,0PFtn5NtBbbUNbU9EAmIWF,Toto,76,753405.0


In [73]:
df['name'].nunique()

2330

#### Looking at Matched Songs

In [76]:
matched_songs.head()

Unnamed: 0,artist_name,song_id,song_title,CID,PID,Title,Performer Name
0,robyn,6SluaPiV04KOaRTOIScoff,show me love,260151.0,14428.0,show me love,robyn
3,lukas graham,5kqIPrATaCc2LqxVWzQGbk,7 years,43137.0,53876.0,7 years,lukas graham
10,the killers,3aVyHFxRkf8lSjhWdJ68AW,just another girl,76427.0,39793.0,just another girl,the killers
11,tamia,0zIyxS6QxZogHOpGkI6IZH,deeper,8897.0,14338.0,deeper,tamia
14,kanye west,12D0n7hKpPcjuUpcbAKjjr,dont like.1,68936.0,2118.0,dont like.1,kanye west


In [77]:
len(matched_songs)

159438

#### Comparing Titles Against Matched Tracks with Artist IDs

In [93]:
# Inner join on the track id: keeps only pulled tracks that also appear
# in matched_songs (track_id == song_id).
master_df = pd.merge(
    new_tracks_df,
    matched_songs,
    left_on='track_id',
    right_on='song_id',
)

In [96]:
master_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156000 entries, 0 to 155999
Data columns (total 10 columns):
track_id          156000 non-null object
Song Title        156000 non-null object
Artist            156000 non-null object
artist_name       156000 non-null object
song_id           156000 non-null object
song_title        156000 non-null object
CID               156000 non-null float64
PID               156000 non-null float64
Title             156000 non-null object
Performer Name    156000 non-null object
dtypes: float64(2), object(8)
memory usage: 13.1+ MB
