# Data Processing
In this notebook, I will be loading the data necessary for this project, combining the data sources and performing initial data processing steps. I will also be performing some exploratory data analysis for the purposes of identifying missing values and outliers, which will then be followed up with the appropriate processing steps.

In [2]:
%load_ext autoreload
%autoreload 2
import nltk
import numpy as np
import os
import pandas as pd
import pdpipe as pdp
import pickle
import random
import re
import string
from collections import Counter
from functools import partial
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from tqdm import tqdm
tqdm.pandas()

In [3]:
from data.util.paths import DATA_PATH
from data.scripts.project_data import DataLoader
from data.util.environment_variables import COUNTRIES, SUPERREGIONS, REGIONS
from lib.transformers import *

from lib.processing import load_geoscheme_df, encode_country_column, encode_style_column,clean_artist_column, clean_label_column, save_to_pkl, load_from_pkl, artist_ngrams, cossine_similarity, get_matches_df, label_ngrams, get_cosine_similarity_matches, get_ngrams, create_match_lookup, clean_format_text, make_market_value_col,make_format_description_column, make_format_name_column, make_format_quantity_column, make_format_text_column, expand_format_description_column, match_track_titles_to_standards 

# Extracted Data
## Loading and Cleaning

In [4]:
# Load the extracted data through the project's DataLoader.
data_loader = DataLoader()
extracted_data = data_loader.load_extracted_data()

In [5]:
# Cleaning steps for the extracted data; scikit-learn runs them in the order listed.
# NOTE(review): no visible cell calls fit/transform on this pipeline — confirm
# whether it is meant to be applied to `extracted_data` before the merge below.
extracted_data_pipe = Pipeline(
    steps=[
        ('remove_id', ColumnRemover('_id')),
        ('unpickle', Unpickler(['track_titles'])),
        ('make_market_value', ColumnCombiner('median', 'market_price', 'market_value')),
        ('remove_duplicates', DuplicateRemover('release_id')),
        ('remove_nulls', NullRemover('market_value')),
    ]
)

# API Data
## Loading and Cleaning

In [None]:
# Text clean-up sub-pipeline: the label cleaner runs first, then the artist cleaner.
clean_text_pipe = Pipeline(
    steps=[
        ('label', LabelCleaner()),
        ('artist', ArtistCleaner()),
    ]
)

In [None]:
# Encoding sub-pipeline for the country, genre and style columns, run in that order.
column_encoding_pipe = Pipeline(
    steps=[
        ('country', CountryEncoder()),
        ('genre', GenreEncoder()),
        ('style', MultiValueCategoricalEncoder(feature='style')),
    ]
)

In [None]:
# Format sub-pipeline: FormatEncoder, then the 'format_description' encoder,
# then the format text cleaner.
format_pipe = Pipeline(
    steps=[
        ('make_columns', FormatEncoder()),
        ('encode_descriptions', MultiValueCategoricalEncoder('format_description')),
        ('clean_format_text', FormatTextCleaner()),
    ]
)

In [None]:
# End-to-end cleaning and encoding pipeline for the API data.
# Steps run in the order listed; the nested pipelines are the ones defined in
# the preceding cells.
api_pipe = Pipeline([
    ('remove_columns', ColumnRemover(['id'])),
    ('split_title', TitleSplitter()),
    ('unpickle', Unpickler(['genre', 'style', 'label', 'formats'])),
    ('clean_text', clean_text_pipe),
    ('remove_duplicates', DuplicateRemover('release_id')),
    ('encode_columns', column_encoding_pipe),
    # The trailing comma matters: without it Python tries to *call* this tuple
    # with the next one and raises "TypeError: 'tuple' object is not callable".
    ('format_columns', format_pipe),
    # Pipeline steps must be estimator instances, not classes.
    # NOTE(review): assumes TimePeriodEncoder needs no constructor arguments,
    # like the other transformers here — confirm.
    ('encode_time_periods', TimePeriodEncoder()),
])

In [None]:
# Load the raw API data through the same DataLoader instance.
api_df = data_loader.load_api_data()

In [None]:
# Run the full API pipeline on the loaded API data.
# NOTE(review): `api_df` is rebound twice in this cell, so re-running it without
# re-running the load cell feeds already-processed data back into the pipeline.
api_df = api_pipe.fit_transform(api_df)

# Left-join the processed API data onto the extracted data by release_id, so
# every row of `extracted_data` is kept.
# NOTE(review): no visible cell passes `extracted_data` through
# `extracted_data_pipe` before this merge — confirm that is intended.
api_df = extracted_data.merge(api_df,how='left',on='release_id')

## Jazz Standard Count

In [6]:
from lib.transformers import StandardCountEncoder

In [7]:
# Preview the unpickled track titles; the result is only displayed, not stored.
extracted_data['track_titles'].apply(pickle.loads)

0         [With Every Breath I Take, It's Easy To Rememb...
1         [With Every Breath I Take, It's Easy To Rememb...
2         [The Music Goes 'Round And 'Round, Sailing At ...
3         [Lonely Melody, Mississippi Mud, San, When, Su...
4                              [For Dancers Only, Coquette]
                                ...                        
350580    [My Funny Valentine, Blues By Five, Airegin, T...
350581    [Mauvais Garcon, Petite Fleur, Bei Mir Bist Du...
350582    [Let Me Love Tonight, To See You, Let's Just K...
350583    [Köhntark  (Part 1), Köhntark  (Part 2), Kobah...
350584    [Cambridge, Andrew Dreaming, Boutique Music, S...
Name: track_titles, Length: 350585, dtype: object

In [None]:
# Try out StandardCountEncoder (configured with 'track_titles') on the extracted
# data; the result is only displayed, not assigned.
StandardCountEncoder('track_titles').fit_transform(extracted_data)

100%|██████████| 350585/350585 [00:12<00:00, 27932.11it/s]


In [None]:
# Load the collection of jazz standard titles stored alongside the project data.
# NOTE: pickle.load can execute arbitrary code from the file — only load
# pickles produced by this project.
with open(os.path.join(DATA_PATH,'standards.pkl'),'rb') as f:
    standards = pickle.load(f)


def lowercase_no_punctuation(x):
    """Return ``x`` lower-cased with every ``string.punctuation`` character removed.

    Defined with ``def`` rather than as a lambda bound to a name, so the
    function carries a real ``__name__`` and a docstring.
    """
    return x.lower().translate(str.maketrans('', '', string.punctuation))

In [None]:
# Show the exact set of characters that the title normalization strips.
string.punctuation

In [None]:
from collections import defaultdict

In [None]:
# NOTE(review): this defaultdict is replaced by a plain dict in the next cell,
# so this assignment has no lasting effect.
standards_lookup = defaultdict(int)

In [None]:
# Key the lookup by normalized standard title; every value is 0 because only
# key membership is tested when counting.
standards_lookup = dict.fromkeys(map(lowercase_no_punctuation, standards), 0)

In [None]:
def count_jazz_standards(standards_lookup, title_list):
    """Count how many titles in ``title_list`` are present in ``standards_lookup``.

    Each title is lower-cased and stripped of ``string.punctuation`` characters
    before the membership test, so the lookup keys must be normalized the same
    way to match.

    Parameters
    ----------
    standards_lookup : container supporting ``in``
        Normalized standard titles (for a dict, the keys are tested).
    title_list : iterable of str
        Track titles of one release.

    Returns
    -------
    int
        Number of titles whose normalized form is in ``standards_lookup``;
        0 for an empty ``title_list``.
    """
    # Build the translation table once instead of once per title.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return sum(
        1
        for title in title_list
        if title.lower().translate(strip_punctuation) in standards_lookup
    )

In [None]:
# Match track titles against the standards; the result is used below as a
# DataFrame with 'Original Name', 'Matched Name' and 'Match Confidence' columns.
# NOTE(review): `track_titles` is passed here as stored, while a later cell
# applies pickle.loads to the same column before counting — confirm the matcher
# expects this form.
matched_track_titles = match_track_titles_to_standards(standards,api_df['track_titles'])

After some testing of match confidence values, the cutoff for matches is set at 0.7, which leaves 10,427 titles to be matched to the list of jazz standards. Beyond this cutoff, the accuracy of the matches degrades to the point that it would introduce too much bias into the ``no_of_jazz_standards`` feature.

In [None]:
# Inspect the matches on the `< 0.7` side of the cutoff, highest confidence first.
# Filter first, then sort: indexing the already-sorted frame with a mask built
# from the unsorted frame makes pandas reindex the mask and emit
# "Boolean Series key will be reindexed to match DataFrame index".
matched_track_titles[matched_track_titles['Match Confidence'] < 0.7].sort_values('Match Confidence',ascending=False)

In [None]:
# Keep the matches on the `< 0.7` side of the cutoff.
# NOTE(review): the markdown above calls 0.7 the cutoff for accepted matches;
# confirm that `<` selects the accepted side (i.e. that a lower
# 'Match Confidence' means a closer match).
match_title_to_standards_df = matched_track_titles[matched_track_titles['Match Confidence'] < 0.7]

# Map each matched original title to its standard, skipping titles that are
# already keys of the exact-match lookup. Zipping the two columns avoids
# building a Series per row, which is what iterrows does.
# NOTE(review): count_jazz_standards normalizes titles before the lookup, so
# these keys only match if 'Original Name' values are normalized the same way
# — confirm.
tfidf_lookup = {
    original_name: matched_name
    for original_name, matched_name in zip(
        match_title_to_standards_df['Original Name'],
        match_title_to_standards_df['Matched Name'],
    )
    if original_name not in standards_lookup
}

In [None]:
# Combine the exact-match and fuzzy-match lookups. Unpacking inside a dict
# literal accepts any hashable key, whereas dict(**a, **b) raises TypeError as
# soon as a key is not a string.
standards_lookup_ = {**standards_lookup, **tfidf_lookup}

# Pre-bind the combined lookup so the counter only needs a list of titles.
count_jazz_standards_ = partial(count_jazz_standards, standards_lookup_)

In [None]:
# Unpickle each release's track titles and count the jazz standards among them.
api_df['standards_count'] = api_df['track_titles'].apply(
    lambda pickled_titles: count_jazz_standards_(pickle.loads(pickled_titles))
)

In [None]:
# Summary statistics of the new `standards_count` column as a sanity check.
api_df['standards_count'].describe()

## last_sold

In [None]:
# Latest `last_sold` value in the data; the next cell uses it as the reference
# point for measuring recency.
max_last_sold_value = api_df['last_sold'].max()

In [None]:
# Whole days between each release's last sale and the latest sale in the data.
# assumes `last_sold` holds datetime-like values whose difference exposes `.days` — TODO confirm
api_df['no_of_days_since_last_sale'] = api_df['last_sold'].apply(lambda x: (max_last_sold_value - x).days)

## Saving api_df

In [None]:
# Persist the processed frame under the key 'api'; the next section reloads it
# with load_from_pkl('api') when `api_df` is not in the namespace.
save_to_pkl(api_df,'api')

## Joining dfs and Saving results to pkl

In [None]:
# Reuse any frame that already exists in the kernel namespace and load the
# rest from their pickles, so this section can run without re-running the
# processing cells above.
for _frame_name, _pkl_key in (
    ('api_df', 'api'),
    ('encoded_country_df', 'country'),
    ('encoded_genre_df', 'genre'),
    ('encoded_style_df', 'style'),
    ('format_description_df', 'format_description'),
):
    if _frame_name not in globals():
        globals()[_frame_name] = load_from_pkl(_pkl_key)
del _frame_name, _pkl_key

In [None]:
# Column-wise concatenation of the processed frame with the encoded frames;
# pandas aligns the rows on the index.
# NOTE(review): assumes all five frames share the same index — confirm.
concat_df = pd.concat(
    [api_df, encoded_country_df, encoded_genre_df, encoded_style_df, format_description_df],
    axis='columns',
)

In [None]:
# Persist the joined frame under the key 'concat'.
save_to_pkl(concat_df,'concat')

In [None]:
# Sanity-check the `format_text_clean` column of the joined frame.
concat_df['format_text_clean'].describe()