# Data Processing
In this notebook, I will be loading the data necessary for this project, combining the data sources and performing initial data processing steps. I will also be performing some exploratory data analysis for the purposes of identifying missing values and outliers, which will then be followed up with the appropriate processing steps.

In [190]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import os
import re
import pandas as pd
import pdpipe as pdp
import numpy as np
import matplotlib.pyplot as plt
import random
import seaborn as sns
import string
import pickle
from functools import partial
import nltk
from tqdm import tqdm
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from functools import partial
tqdm.pandas()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [61]:
from data.util.paths import DATA_PATH
from data.scripts.project_data import DataLoader
from data.util.environment_variables import COUNTRIES, SUPERREGIONS, REGIONS

from lib.processing import load_geoscheme_df, get_country_to_dict_mapping, encode_country_column, encode_genre_column, encode_style_column, clean_artist_column, clean_label_column, save_to_pkl, load_from_pkl, artist_ngrams, cossine_similarity, get_matches_df, label_ngrams, get_cosine_similarity_matches, get_ngrams, create_match_lookup, clean_format_text

# Extracted Data
## Loading and Cleaning

In [3]:
data_loader = DataLoader()

In [4]:
extracted_data = data_loader.load_extracted_data()

In [5]:
extracted_data.head()
extracted_data.drop_duplicates('release_id',inplace=True)

In [6]:
def make_market_value_col(median_col,market_price_col):
    """Build the market value column from the median and market price columns.

    The median is used wherever it is present; rows whose median is null
    fall back to the market price found under the same index label.

    Parameters
    ----------
    median_col : pd.Series
        Median price per row; may contain nulls.
    market_price_col : pd.Series
        Market price per row, used only where ``median_col`` is null.

    Returns
    -------
    pd.Series
        A new series (the inputs are not modified). Rows that are null in
        both inputs remain null.
    """
    # fillna aligns on the index and returns a new Series, so no defensive
    # copies or manual index bookkeeping are required
    return median_col.fillna(market_price_col)

In [7]:
extracted_data['market_value'] = make_market_value_col(extracted_data['median'],extracted_data['market_price'])

In [8]:
extracted_data.drop('id',inplace=True,axis=1)

In [9]:
ids_to_drop = extracted_data[extracted_data['market_value'].isnull()]['release_id']

In [10]:
extracted_data.drop(ids_to_drop.index,inplace=True)

# API Data
## Loading and Cleaning

In [11]:
api_df = data_loader.load_api_data()

In [12]:
api_df = pd.concat([api_df,api_df['title'].str.split(' - ',n=1,expand=True)],axis=1)

In [13]:
geoscheme_df = load_geoscheme_df()

In [14]:
country_to_dict_mapping = get_country_to_dict_mapping()

In [15]:
pipeline = pdp.DropDuplicates('release_id')
pipeline += pdp.ColDrop(['title','id'])
pipeline += pdp.ColRename({0: 'artist', 1: 'title'})
pipeline += pdp.ApplyByCols(['genre','style','label','formats'],pickle.loads)
pipeline += pdp.MapColVals('country',country_to_dict_mapping)

In [16]:
api_df = pipeline.apply(api_df)

In [17]:
api_df = extracted_data.merge(api_df,how='left',on='release_id')

## Encoding Country

In [None]:
encoded_country_df = encode_country_column(api_df['country'])

In [None]:
save_to_pkl(encoded_country_df,'country')

## Encoding Genre

In [None]:
encoded_genre_df = encode_genre_column(api_df['genre'])

In [None]:
save_to_pkl(encoded_genre_df,'genre')

## Encoding Style

In [None]:
encoded_style_df = encode_style_column(api_df['style'])

In [None]:
save_to_pkl(encoded_style_df.astype(np.uint8),'style')

## Cleaning Label and Artist for Later Encoding

In [18]:
api_df['artist_clean'] = api_df['artist'].apply(clean_artist_column)
api_df['label_clean'] = api_df['label'].apply(lambda x: x[0]).apply(clean_label_column)

### Artists Cosine Similarity Matching

In [19]:
artist_matches_df = get_cosine_similarity_matches(api_df['artist_clean'],get_ngrams)
artist_matches_df

Unnamed: 0,left_side,right_side,similarity
4,benny goodman benny goodman quartet benny goodman,benny goodman,0.941538
5,benny goodman benny goodman quartet benny goodman,benny goodman benny goodman sextet,0.921802
6,benny goodman benny goodman quartet benny goodman,benny goodman quartet,0.902357
9,benny goodman,benny goodman benny goodman quartet benny goodman,0.941538
10,benny goodman,benny goodman benny goodman sextet,0.921013
...,...,...,...
9894,wynton marsalis branford marsalis ellis marsalis,ellis marsalis branford marsalis,0.939691
9895,wynton marsalis branford marsalis ellis marsalis,wynton marsalis ellis marsalis,0.931852
9930,abdullah ibrahim,abdullah ibrahim trio,0.905000
9955,masahiko togashi richard beirach,richard beirach masahiko togashi,0.906908


In [28]:
artist_match_lookup = create_match_lookup(artist_matches_df)

# Sanity check: whenever a lookup target is itself a key of the lookup,
# it must map to itself (i.e. every target is a final node)
for target in artist_match_lookup.values():
    if target in artist_match_lookup:
        assert artist_match_lookup[target] == target

# Replace each cleaned artist name with its matched value when the lookup has one
api_df['artist_clean'] = api_df['artist_clean'].apply(lambda name: artist_match_lookup.get(name) or name)

### Label Cosine Similarity Matching

In [29]:
label_matches_df = get_cosine_similarity_matches(api_df['label_clean'],get_ngrams)
label_matches_df

Unnamed: 0,left_side,right_side,similarity
8,general,generali,0.921196
32,signature,signatur,0.941035
64,atlantic,atlantica,0.950156
65,atlantic,atlanti,0.929364
72,custom fidelity,custom fidelity co,0.914510
...,...,...,...
9613,digital,digitali,0.923328
9622,boriginal,original,0.910276
9855,apollo,apollon,0.901117
9920,hush,hushush,0.935384


In [30]:
label_match_lookup = create_match_lookup(label_matches_df)
label_match_lookup

{'generali': 'general',
 'atlantica': 'atlanti',
 'custom fidelity co': 'custom fidelity',
 'summitt': 'summit',
 'pp polskie nagrania muza': 'polskie nagrania muza',
 'pantone': 'panton',
 'marista': 'arista',
 'gruppo editoriale lespresso spa': 'gruppo editoriale lespresso',
 'membrane': 'membran',
 'polskie radio sa': 'polskie radio',
 'emivalentim de carvalho': 'valentim de carvalho',
 'universal international bv': 'universal international',
 'americana': 'america',
 'genuine': 'genuin',
 'tokuma japan co': 'tokuma japan',
 'avanguard': 'vanguard',
 'norman': 'norma',
 'bel aire': 'bel air',
 'mca special product': 'rca special product',
 'higher octave jazz': 'higher octave',
 'beggars banquet u': 'beggars banquet',
 'hermitage': 'ermitage',
 'wydawnictwo a': 'wydawnictwo',
 'unisono': 'unison',
 'outsider': 'outside',
 'dover': 'dove',
 'ionyx': 'onyx',
 'signature': 'signatur',
 'royale': 'royal',
 'spirale': 'spiral',
 'varèse sarabande jazz': 'varèse sarabande',
 'universal po

As there are distinct labels with almost identical names, only those matches will be kept for which the two names differ in length by more than 2 characters. This avoids falsely grouping different labels together while still reducing the variation in names associated with a specific label.

In [31]:
# Keep only the matches whose two names differ in length by more than 2 characters
label_match_lookup = {
    matched_name: target_name
    for matched_name, target_name in label_match_lookup.items()
    if abs(len(matched_name) - len(target_name)) > 2
}
# Convert label names to their matched value according to the match lookup table
api_df['label_clean'] = api_df['label_clean'].apply(lambda name: label_match_lookup.get(name) or name)

## Encoding Formats

In [32]:
from lib.processing import make_format_description_column, make_format_name_column, make_format_quantity_column, make_format_text_column, expand_format_description_column

In [33]:
format_columns = ['format_{}'.format(column) for column in ('description','name','quantity','text')]
format_functions = (make_format_description_column, make_format_name_column, make_format_quantity_column, make_format_text_column)

for column, function in zip(format_columns,format_functions):
    api_df[column] = api_df['formats'].apply(function)

**Format Name**

Prime candidate for one-hot encoding/hash encoding

In [34]:
api_df['format_name'].unique()

array(['Shellac', 'Vinyl', 'Box Set', 'CD', 'Reel-To-Reel', 'Cassette',
       'SACD', 'CDr', '8-Track Cartridge', 'All Media', 'Minidisc', 'DVD',
       'Hybrid', '4-Track Cartridge', 'Laserdisc', 'Blu-ray', 'Acetate',
       'DVDr', 'VHS', 'Memory Stick', 'File', 'PlayTape', 'DAT',
       'Flexi-disc', 'HD DVD', 'DCC', 'Lathe Cut', 'RCA Tape Cartridge',
       'Floppy Disk'], dtype=object)

In [35]:
format_name_dummies = pd.get_dummies(api_df['format_name'],drop_first=True)
api_df = pd.concat([api_df,format_name_dummies],axis=1).drop('format_name',axis=1)

**Format Quantity**

Can be left as is in integer format, after some cleaning of values

In [36]:
api_df['format_quantity'].unique()

array([      6,       4,       5,       3,       2,       1,      25,
             7,      16,      10,       8,       9,      26,      14,
            13,      24,      28,      20,      55,      11,      50,
             0,      37,      12, 1010201,      15,      30,     500,
           300,      22,      34,      29,      52])

**Format Text**
Might be eligible for tf-idf vectorizing & label encoding

In [37]:
api_df['format_text'].unique()

array([None, 'Indianapolis pressing', 'Hollywood', ...,
       'Gatefold, 180 Grams Vinyl', 'dolby HX PRO', 'Papersleve, K2HD'],
      dtype=object)

In [38]:
len(api_df['format_text'].unique())

5552

In [92]:
api_df['format_text_clean'] = api_df['format_text'].apply(clean_format_text)

In [93]:
format_text_match_df = get_cosine_similarity_matches(api_df['format_text_clean'],get_ngrams)

In [94]:
format_text_lookup = create_match_lookup(format_text_match_df)

In [97]:
api_df['format_text'] = api_df['format_text_clean'].apply(lambda x: format_text_lookup[x] if format_text_lookup.get(x) else x)
len(api_df['format_text'].unique())

4352

**Format Description**

As each entry in the ``format_description`` column is stored as a list, the column will be dummy encoded so that it can be used as a set of features.

In [39]:
api_df['format_description']

0                               [10", 78 RPM, Album]
1                      [10", 78 RPM, Album, Reissue]
2         [10", 78 RPM, Album, Compilation, Repress]
3                               [10", 78 RPM, Album]
4                               [10", 78 RPM, Album]
                             ...                    
297541                          [LP, Album, Reissue]
297542                                       [Album]
297543                                       [Album]
297544                          [LP, Album, Reissue]
297545                  [12", 33 ⅓ RPM, Album, Mono]
Name: format_description, Length: 297546, dtype: object

In [40]:
format_description_df = expand_format_description_column(api_df)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




SystemError: <built-in function is_scalar> returned a result with an error set

In [None]:
save_to_pkl(format_description_df,'format_description')

## Encoded DF

## Feature Engineering
## Eras

In [98]:
def make_period_column(year,start,end):
    """Return 1 if ``year`` lies in the inclusive range ``[start, end]``, else 0."""
    return int(start <= year <= end)

def make_big_band_period_column(year):
    """Flag years 1930-1950 (inclusive)."""
    return make_period_column(year,start=1930,end=1950)

def make_bebop_period_column(year):
    """Flag years 1940-1955 (inclusive)."""
    return make_period_column(year,start=1940,end=1955)

def make_cool_jazz_period_column(year):
    """Flag years 1950-1970 (inclusive)."""
    return make_period_column(year,start=1950,end=1970)

def make_jazz_fusion_period_column(year):
    """Flag years 1970-2020 (inclusive)."""
    return make_period_column(year,start=1970,end=2020)

def make_swing_era_column(year):
    """Flag years 1925-1945 (inclusive)."""
    return make_period_column(year,start=1925,end=1945)

def make_modern_jazz_era_column(year):
    """Flag years 1940-1970 (inclusive)."""
    return make_period_column(year,start=1940,end=1970)

In [99]:
period_columns = ('big_band_period','bebop_period','cool_jazz_period','jazz_fusion_period','swing_era','modern_jazz_era')
period_functions = (make_big_band_period_column, make_bebop_period_column, make_cool_jazz_period_column, make_jazz_fusion_period_column, make_swing_era_column, make_modern_jazz_era_column)

for column, func in zip(period_columns,period_functions):
    api_df[column] = api_df['year'].apply(func)

## Jazz Standard Count

In [100]:
with open(os.path.join(DATA_PATH,'standards.pkl'),'rb') as f:
    standards = pickle.load(f)
    
lowercase_no_punctuation = lambda x: x.lower().translate(str.maketrans('','',string.punctuation))

In [101]:
standards_lookup = {lowercase_no_punctuation(standard):0 for standard in standards}

In [189]:
def count_jazz_standards(standards_lookup, title_list):
    """Count how many titles in ``title_list`` appear in ``standards_lookup``.

    Each title is lower-cased and stripped of ``string.punctuation``
    characters before the membership test, so the keys of
    ``standards_lookup`` must be normalised the same way.

    Parameters
    ----------
    standards_lookup : container of str
        Normalised titles; only membership (``in``) is used.
    title_list : iterable of str, or None
        Titles to check. ``None`` is treated as an empty list.

    Returns
    -------
    int
        Number of titles found in ``standards_lookup`` (repeated titles are
        counted each time they occur).
    """
    if title_list is None:
        return 0
    # Build the translation table once rather than once per title
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return sum(
        1
        for title in title_list
        # Skip missing / non-text titles instead of failing on .lower()
        if isinstance(title, str)
        and title.lower().translate(strip_punctuation) in standards_lookup
    )

In [111]:
from lib.processing import match_track_titles_to_standards

In [173]:
matched_track_titles = match_track_titles_to_standards(standards,api_df['track_titles'])


  0%|          | 0/297546 [00:00<?, ?it/s][A
  0%|          | 1/297546 [00:00<12:45:07,  6.48it/s][A
  1%|          | 2641/297546 [00:00<8:50:53,  9.26it/s][A
  2%|▏         | 5419/297546 [00:00<6:08:10, 13.22it/s][A
  3%|▎         | 8256/297546 [00:00<4:15:16, 18.89it/s][A
  4%|▍         | 11253/297546 [00:00<2:56:53, 26.98it/s][A
  5%|▍         | 14195/297546 [00:00<2:02:35, 38.52it/s][A
  6%|▌         | 17036/297546 [00:00<1:25:00, 55.00it/s][A
  7%|▋         | 19985/297546 [00:00<58:55, 78.51it/s]  [A
  8%|▊         | 22952/297546 [00:00<40:51, 112.02it/s][A
  9%|▊         | 26008/297546 [00:01<28:19, 159.78it/s][A
 10%|▉         | 29057/297546 [00:01<19:38, 227.75it/s][A
 11%|█         | 32137/297546 [00:01<13:38, 324.33it/s][A
 12%|█▏        | 35177/297546 [00:01<09:28, 461.21it/s][A
 13%|█▎        | 38192/297546 [00:01<06:36, 654.59it/s][A
 14%|█▍        | 41183/297546 [00:01<04:36, 926.09it/s][A
 15%|█▍        | 44218/297546 [00:01<03:13, 1305.91it/s][A
 16%|█

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




After some testing of match confidence values, the cutoff for matches will be set at 0.7: only titles with a match confidence below 0.7 are kept, leaving 10427 titles which will be matched to the list of jazz standards. Beyond this threshold, the accuracy of the matches degrades to such an extent that it would introduce too much noise into the ``standards_count`` feature.

In [180]:
# Filter first, then sort: applying the boolean mask to the already-sorted
# frame forces pandas to reindex the mask and emits a
# "Boolean Series key will be reindexed" warning
matched_track_titles[matched_track_titles['Match Confidence'] < 0.7].sort_values('Match Confidence',ascending=False)

  """Entry point for launching an IPython kernel.


Unnamed: 0,Original Name,Matched Name,Match Confidence
842357,calma trânsito i remember you,i remember you,0.69
360915,donna ドナ,donna lee,0.69
718057,doing the boogie woogie,pinetops boogie woogie,0.69
139754,c jam blues tv audio,c jam blues,0.69
137736,take the a train parts i ii,take the a train,0.69
...,...,...,...
765060,new orleans\t,new orleans,0.00
808681,pinetops boogie woogie,pinetops boogie woogie,0.00
74157,youd be so nice to come home to,youd be so nice to come home to,0.00
793572,back water blues,back water blues,0.00


In [185]:
# Keep only the matches below the 0.7 confidence cutoff
match_title_to_standards_df = matched_track_titles[matched_track_titles['Match Confidence'] < 0.7]
# Map each original title to its matched name, leaving out titles that are
# already keys of standards_lookup
tfidf_lookup = {
    original_name: matched_name
    for original_name, matched_name in zip(
        match_title_to_standards_df['Original Name'],
        match_title_to_standards_df['Matched Name'],
    )
    if original_name not in standards_lookup
}

In [195]:
standards_lookup_ = dict(**standards_lookup,**tfidf_lookup)
count_jazz_standards_ = partial(count_jazz_standards,standards_lookup_)

In [196]:
api_df['standards_count'] = api_df['track_titles'].apply(pickle.loads).apply(count_jazz_standards_)

In [197]:
api_df['standards_count'].describe()

count    297546.00000
mean          0.87851
std           1.66438
min           0.00000
25%           0.00000
50%           0.00000
75%           1.00000
max          84.00000
Name: standards_count, dtype: float64

## last_sold

In [199]:
max_last_sold_value = api_df['last_sold'].max()

In [200]:
api_df['no_of_days_since_last_sale'] = api_df['last_sold'].apply(lambda x: (max_last_sold_value - x).days)

## Saving api_df

In [201]:
save_to_pkl(api_df,'api')

## Joining dfs and Saving results to hdf

In [202]:
# For every intermediate frame that is not defined in the current namespace,
# load it from its pickle; frames that already exist are left untouched
for _frame_name, _pkl_name in (
    ('api_df', 'api'),
    ('encoded_country_df', 'country'),
    ('encoded_genre_df', 'genre'),
    ('encoded_style_df', 'style'),
    ('format_description_df', 'format_description'),
):
    if _frame_name not in globals():
        globals()[_frame_name] = load_from_pkl(_pkl_name)

In [203]:
concat_df = pd.concat([
    api_df,
    encoded_country_df,
    encoded_genre_df,
    encoded_style_df,
    format_description_df
],axis=1)

In [204]:
save_to_pkl(concat_df,'concat')

In [None]:
try:
    concat_df
except NameError:
    concat_df = load_from_pkl('concat')

# High Level Features
## Loading and Cleaning

In [None]:
high_level_features = data_loader.load_high_level_features()
# Collect every chunk and concatenate once: growing the frame with concat
# inside the loop re-copies all accumulated rows on each iteration.
# ignore_index=True yields the same fresh 0..n-1 index as reset_index(drop=True).
high_level_feature_df = pd.concat(list(tqdm(high_level_features)),axis=0,ignore_index=True)
# After the index reset there is no row labelled 'index', so the previous
# `.drop('index')` (axis=0) raised KeyError. Presumably the target was a
# leftover 'index' column - TODO confirm; errors='ignore' keeps the cell
# working whether or not that column is present.
high_level_feature_df = high_level_feature_df.drop(columns='index',errors='ignore').astype({'release_id':np.uint32,'bitmap':np.uint8})