# Data Processing
In this notebook, I will be loading the data necessary for this project, combining the data sources and performing initial data processing steps. I will also be performing some exploratory data analysis for the purposes of identifying missing values and outliers, which will then be followed up with the appropriate processing steps.

In [1]:
%load_ext autoreload
%autoreload 2
import nltk
import numpy as np
import os
import pandas as pd
import pdpipe as pdp
import pickle
import random
import re
import string
from collections import Counter
from functools import partial
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
tqdm.pandas()

In [2]:
from data.util.paths import DATA_PATH
from data.scripts.project_data import DataLoader
from data.util.environment_variables import COUNTRIES, SUPERREGIONS, REGIONS

from lib.processing import load_geoscheme_df, get_country_to_dict_mapping, encode_country_column, encode_genre_column, encode_style_column,clean_artist_column, clean_label_column, save_to_pkl, load_from_pkl, artist_ngrams, cossine_similarity, get_matches_df, label_ngrams, get_cosine_similarity_matches, get_ngrams, create_match_lookup, clean_format_text, make_market_value_col,make_format_description_column, make_format_name_column, make_format_quantity_column, make_format_text_column, expand_format_description_column, match_track_titles_to_standards 

# Extracted Data
## Loading and Cleaning

In [3]:
# Create the project's DataLoader; it is used below to load the extracted data and the API data.
data_loader = DataLoader()

In [4]:
# Load the extracted data. Later cells rely on its `release_id`, `median`,
# `market_price` and `id` columns.
extracted_data = data_loader.load_extracted_data()

In [5]:
# Keep a single row per release, then preview the result. The preview has to be the
# last expression of the cell, otherwise the notebook does not display it.
extracted_data = extracted_data.drop_duplicates('release_id')
extracted_data.head()

In [6]:
# Derive `market_value` from the `median` and `market_price` columns; the rule for
# combining them is implemented in `make_market_value_col`.
extracted_data['market_value'] = make_market_value_col(extracted_data['median'],extracted_data['market_price'])

In [7]:
# Drop the `id` column. Rebinding the name and passing errors='ignore' keeps the cell
# safe to re-run: an in-place drop raises a KeyError once the column is already gone.
extracted_data = extracted_data.drop(columns='id', errors='ignore')

In [8]:
# Select the `release_id` of every row that has no `market_value`.
# The resulting Series keeps the row index of `extracted_data`.
ids_to_drop = extracted_data[extracted_data['market_value'].isnull()]['release_id']

In [9]:
# Remove the rows that have no market value. Filtering on the column itself (instead of
# dropping the index labels collected in `ids_to_drop`) gives the same rows on a unique
# index and keeps the cell safe to re-run: a second `drop` of labels that are already
# gone raises a KeyError.
extracted_data = extracted_data.dropna(subset=['market_value'])

# API Data
## Loading and Cleaning

In [10]:
# Load the API data; later cells rely on its `title`, `release_id` and `id` columns.
api_df = data_loader.load_api_data()

In [11]:
# Split `title` on the first ' - ' into two new columns (labelled 0 and 1) and append
# them to api_df. A title without ' - ' gets a missing value in column 1.
# NOTE: api_df is rebuilt from itself, so running this cell twice appends the split
# columns a second time.
api_df = pd.concat([api_df,api_df['title'].str.split(' - ',n=1,expand=True)],axis=1)

In [12]:
# Load the geoscheme frame (it is not referenced again in the cells of this notebook section).
geoscheme_df = load_geoscheme_df()

In [None]:
# Mapping used by the cleaning pipeline to translate the values of the `country` column.
country_to_dict_mapping = get_country_to_dict_mapping()

In [None]:
# Cleaning pipeline for the API data; the stages run in the order they are added.
# 1. Drop duplicate rows, judged on `release_id`.
pipeline = pdp.DropDuplicates('release_id')
# 2. Remove the combined `title` column and the `id` column.
pipeline += pdp.ColDrop(['title','id'])
# 3. Name the two columns produced by splitting the title (labelled 0 and 1).
pipeline += pdp.ColRename({0: 'artist', 1: 'title'})
# 4. Unpickle the serialized columns.
# NOTE(review): pickle.loads executes arbitrary code when given untrusted bytes —
# confirm these columns only ever hold data pickled by this project.
pipeline += pdp.ApplyByCols(['genre','style','label','formats'],pickle.loads)
# 5. Translate the `country` values through the country mapping.
pipeline += pdp.MapColVals('country',country_to_dict_mapping)

In [None]:
# Run the cleaning pipeline. api_df is rebound to the result, so the raw frame is no
# longer available afterwards and the cell cannot simply be run a second time.
api_df = pipeline.apply(api_df)

In [None]:
# Left join on `release_id`: every row of extracted_data is kept, and rows without a
# matching API record get missing values in the API columns. From here on `api_df`
# names the merged frame.
api_df = extracted_data.merge(api_df,how='left',on='release_id')

## Encoding Country

In [None]:
# Encode the `country` column into its own frame; the encoding is implemented in `encode_country_column`.
encoded_country_df = encode_country_column(api_df['country'])

In [None]:
# Persist the encoded frame under the key 'country' so it can be reloaded with load_from_pkl('country').
save_to_pkl(encoded_country_df,'country')

## Encoding Genre

In [None]:
# Encode the `genre` column into its own frame; the encoding is implemented in `encode_genre_column`.
encoded_genre_df = encode_genre_column(api_df['genre'])

In [None]:
# Persist the encoded frame under the key 'genre' so it can be reloaded with load_from_pkl('genre').
save_to_pkl(encoded_genre_df,'genre')

## Encoding Style

In [None]:
# Encode the `style` column into its own frame; the encoding is implemented in `encode_style_column`.
encoded_style_df = encode_style_column(api_df['style'])

In [None]:
# Persist the encoded frame under the key 'style' so it can be reloaded with load_from_pkl('style').
save_to_pkl(encoded_style_df,'style')

## Cleaning Label and Artist for Later Encoding

In [None]:
# Clean every artist name element-wise.
api_df['artist_clean'] = api_df['artist'].apply(clean_artist_column)
# Take the first element of each `label` entry and clean it.
# NOTE(review): `x[0]` assumes every entry is a non-empty sequence; rows without an API
# match (left merge) may hold a missing value here — confirm.
api_df['label_clean'] = api_df['label'].apply(lambda x: x[0]).apply(clean_label_column)

### Artists Cosine Similarity Matching

In [None]:
# Find similar cleaned artist names. `get_ngrams` is handed to the helper as its n-gram
# function (presumably used as the text analyzer — confirm in lib.processing).
artist_matches_df = get_cosine_similarity_matches(api_df['artist_clean'],get_ngrams)
# Display the matches for inspection
artist_matches_df

In [None]:
artist_match_lookup = create_match_lookup(artist_matches_df)

# Sanity check: a lookup target that is itself a key must map to itself,
# i.e. no match chain is left only partially resolved
for target in artist_match_lookup.values():
    if target in artist_match_lookup:
        assert artist_match_lookup[target] == target

# Replace each cleaned artist name with its matched value; names without a
# (non-empty) lookup entry are kept unchanged
api_df['artist_clean'] = api_df['artist_clean'].apply(lambda name: artist_match_lookup.get(name) or name)

### Label Cosine Similarity Matching

In [None]:
# Find similar cleaned label names, using the same helper and n-gram function as for artists.
label_matches_df = get_cosine_similarity_matches(api_df['label_clean'],get_ngrams)
# Display the matches for inspection
label_matches_df

In [None]:
# Build the lookup from the label matches (used below as a mapping of name -> matched name).
label_match_lookup = create_match_lookup(label_matches_df)
# Display the lookup for inspection
label_match_lookup

As there are many distinct labels with almost identical names, only those matches whose two names differ in length by more than 2 characters will be kept, in order to avoid falsely grouping entries together while still reducing the number of name variants associated with specific labels

In [None]:
# Keep only the matches whose two names differ in length by more than 2 characters
label_match_lookup = {
    original: matched
    for original, matched in label_match_lookup.items()
    if abs(len(original) - len(matched)) > 2
}
# Convert label names to the best compressed value according to the match lookup table;
# names without a (non-empty) lookup entry are kept unchanged
api_df['label_clean'] = api_df['label_clean'].apply(lambda name: label_match_lookup.get(name) or name)

## Encoding Formats

In [None]:
# Names of the derived columns, paired position by position with the helper that builds each one
format_columns = ['format_description', 'format_name', 'format_quantity', 'format_text']
format_functions = (
    make_format_description_column,
    make_format_name_column,
    make_format_quantity_column,
    make_format_text_column,
)

# Each helper is applied element-wise to the `formats` column
for function, column in zip(format_functions, format_columns):
    api_df[column] = api_df['formats'].apply(function)

**Format Name**

Prime candidate for one-hot encoding/hash encoding

In [None]:
# Inspect the distinct format names
api_df['format_name'].unique()

In [None]:
# One-hot encode `format_name` (drop_first=True omits the column of the first level)
# and prefix every dummy column with 'format_name_'
format_name_dummies = pd.get_dummies(api_df['format_name'], drop_first=True).add_prefix('format_name_')
# Append the dummies and remove the original categorical column
api_df = pd.concat([api_df, format_name_dummies], axis=1).drop(columns='format_name')

**Format Quantity**

Can be left as is in integer format, after some cleaning of values

In [None]:
# Inspect the distinct format quantities
api_df['format_quantity'].unique()

**Format Text**
Might be eligible for tf-idf vectorizing & label encoding

In [None]:
# Inspect the distinct raw format texts
api_df['format_text'].unique()

In [None]:
# Number of distinct raw format texts
len(api_df['format_text'].unique())

In [None]:
# Clean every format text element-wise; the cleaning rules live in `clean_format_text`.
api_df['format_text_clean'] = api_df['format_text'].apply(clean_format_text)

In [None]:
# Find similar cleaned format texts, using the same helper and n-gram function as for artists and labels.
format_text_match_df = get_cosine_similarity_matches(api_df['format_text_clean'],get_ngrams)

In [None]:
# Build the lookup from the format text matches (used below as a mapping of text -> matched text).
format_text_lookup = create_match_lookup(format_text_match_df)

In [None]:
# Overwrite `format_text` with the matched value of each cleaned text; texts without a
# (non-empty) lookup entry keep their cleaned form
api_df['format_text'] = api_df['format_text_clean'].apply(lambda text: format_text_lookup.get(text) or text)
# Number of distinct format texts left after matching
len(api_df['format_text'].unique())

**Format Description**

As the entries in the ``format_description`` column are saved as lists, the entries will be dummy encoded for applicability

In [None]:
# Preview the `format_description` column
api_df['format_description']

In [None]:
# Expand the format descriptions into their own frame; the helper receives the whole api_df.
format_description_df = expand_format_description_column(api_df)

In [None]:
# Persist the frame under the key 'format_description' so it can be reloaded with load_from_pkl('format_description').
save_to_pkl(format_description_df,'format_description')

## Encoded DF

## Feature Engineering
## Eras

In [None]:
def make_period_column(year,start,end):
    """Return 1 if ``start <= year <= end`` (both bounds inclusive), otherwise 0."""
    return int(start <= year <= end)

def make_big_band_period_column(year):
    """Indicator for the big band period, 1930-1950."""
    return make_period_column(year, start=1930, end=1950)

def make_bebop_period_column(year):
    """Indicator for the bebop period, 1940-1955."""
    return make_period_column(year, start=1940, end=1955)

def make_cool_jazz_period_column(year):
    """Indicator for the cool jazz period, 1950-1970."""
    return make_period_column(year, start=1950, end=1970)

def make_jazz_fusion_period_column(year):
    """Indicator for the jazz fusion period, 1970-2020."""
    return make_period_column(year, start=1970, end=2020)

def make_swing_era_column(year):
    """Indicator for the swing era, 1925-1945."""
    return make_period_column(year, start=1925, end=1945)

def make_modern_jazz_era_column(year):
    """Indicator for the modern jazz era, 1940-1970."""
    return make_period_column(year, start=1940, end=1970)

In [None]:
# Column names, paired position by position with the function that computes each one
period_columns = (
    'big_band_period',
    'bebop_period',
    'cool_jazz_period',
    'jazz_fusion_period',
    'swing_era',
    'modern_jazz_era',
)
period_functions = (
    make_big_band_period_column,
    make_bebop_period_column,
    make_cool_jazz_period_column,
    make_jazz_fusion_period_column,
    make_swing_era_column,
    make_modern_jazz_era_column,
)

# Add one column per period, computed element-wise from the release `year`
for func, column in zip(period_functions, period_columns):
    api_df[column] = api_df['year'].apply(func)

## Jazz Standard Count

In [None]:
# Load the pickled collection of jazz standard titles.
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
with open(os.path.join(DATA_PATH,'standards.pkl'),'rb') as f:
    standards = pickle.load(f)

# Build the punctuation-stripping table once instead of on every call
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def lowercase_no_punctuation(x):
    """Return ``x`` lower-cased with every character of ``string.punctuation`` removed."""
    return x.lower().translate(_PUNCTUATION_TABLE)

In [None]:
# Normalised (lower-case, punctuation-free) standard titles as dictionary keys for
# membership tests; every value is the placeholder 0.
standards_lookup = {lowercase_no_punctuation(standard):0 for standard in standards}

In [None]:
def count_jazz_standards(standards_lookup, title_list):
    """Count the titles in ``title_list`` that are keys of ``standards_lookup``.

    Each title is lower-cased and stripped of the characters in
    ``string.punctuation`` before the membership test, so the lookup keys are
    expected in that normalised form.

    Parameters
    ----------
    standards_lookup : container supporting ``in`` (e.g. a dict keyed by title)
    title_list : iterable of str

    Returns
    -------
    int
        Number of titles found in the lookup; duplicates in ``title_list``
        are counted each time they occur.
    """
    # The translation table is the same for every title, so build it once
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return sum(
        title.lower().translate(strip_punctuation) in standards_lookup
        for title in title_list
    )

In [None]:
# Match the track titles against the jazz standards. Later cells read the columns
# 'Original Name', 'Matched Name' and 'Match Confidence' from the result.
matched_track_titles = match_track_titles_to_standards(standards,api_df['track_titles'])

After some testing of match confidence values, the cutoff for matches will be set at 0.7: only matches with a `Match Confidence` value below 0.7 are kept, leaving 10427 titles which will be matched to the list of jazz standards. Beyond this threshold, the accuracy of the matches degrades to an extent that it would introduce too much bias into the ``standards_count`` feature.

In [None]:
# Show the matches below the 0.7 cutoff, highest `Match Confidence` value first.
# Filter before sorting: applying the boolean mask to an already re-ordered frame makes
# pandas re-align the mask to the new row order (it warns about this, and it fails
# outright on a non-unique index).
matched_track_titles.loc[matched_track_titles['Match Confidence'] < 0.7].sort_values('Match Confidence', ascending=False)

In [None]:
# Matches whose `Match Confidence` value is below the 0.7 cutoff
match_title_to_standards_df = matched_track_titles.loc[matched_track_titles['Match Confidence'] < 0.7]
# Map each original title to its matched standard, skipping titles that are already
# keys of the exact-match lookup
tfidf_lookup = {
    original: matched
    for original, matched in zip(
        match_title_to_standards_df['Original Name'],
        match_title_to_standards_df['Matched Name'],
    )
    if original not in standards_lookup
}

In [None]:
# Merge the exact-title lookup with the fuzzy-match lookup. `dict(**a, **b)` requires
# string keys and raises a TypeError for a key present in both mappings.
standards_lookup_ = dict(**standards_lookup,**tfidf_lookup)
# Bind the merged lookup so the counter can be applied to a title list alone
count_jazz_standards_ = partial(count_jazz_standards,standards_lookup_)

In [None]:
# Unpickle each row's track titles, then count how many of them are in the merged lookup.
# NOTE(review): pickle.loads executes arbitrary code on untrusted bytes, and it will fail
# on rows where `track_titles` is missing (possible after the left merge) — confirm.
api_df['standards_count'] = api_df['track_titles'].apply(pickle.loads).apply(count_jazz_standards_)

In [None]:
# Summary statistics of the new feature
api_df['standards_count'].describe()

## last_sold

In [None]:
# Most recent `last_sold` value in the data
max_last_sold_value = api_df['last_sold'].max()

In [None]:
# Days between each row's `last_sold` and `max_last_sold_value`, i.e. the age of the sale
# is measured against that reference value, not against today's date.
# Assumes the subtraction yields an object with a `.days` attribute (timedelta-like) — TODO confirm dtype.
api_df['no_of_days_since_last_sale'] = api_df['last_sold'].apply(lambda x: (max_last_sold_value - x).days)

## Saving api_df

In [None]:
# Persist the processed frame under the key 'api' so it can be reloaded with load_from_pkl('api').
save_to_pkl(api_df,'api')

## Joining dfs and Saving results to hdf

In [None]:
# For every frame needed by the join, fall back to its pickled copy when the variable
# is not defined in the current session (e.g. after a kernel restart)
for _variable_name, _pickle_key in (
    ('api_df', 'api'),
    ('encoded_country_df', 'country'),
    ('encoded_genre_df', 'genre'),
    ('encoded_style_df', 'style'),
    ('format_description_df', 'format_description'),
):
    if _variable_name not in globals():
        globals()[_variable_name] = load_from_pkl(_pickle_key)

In [None]:
# Column-wise concatenation of the processed frame and the encoded frames.
# pandas aligns the inputs on their index.
# NOTE(review): confirm the encoded frames share api_df's index; otherwise rows are
# misaligned or padded with missing values.
concat_df = pd.concat([
    api_df,
    encoded_country_df,
    encoded_genre_df,
    encoded_style_df,
    format_description_df
],axis=1)

In [None]:
# Persist the joined frame under the key 'concat'.
save_to_pkl(concat_df,'concat')