# Data Processing
In this notebook, I will be loading the data necessary for this project, combining the data sources and performing initial data processing steps. I will also be performing some exploratory data analysis for the purposes of identifying missing values and outliers, which will then be followed up with the appropriate processing steps.

In [9]:
%load_ext autoreload
%autoreload 2
import nltk
import numpy as np
import os
import pandas as pd
import pickle
import random
import re
import string
from collections import Counter
from functools import partial
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from tqdm import tqdm
tqdm.pandas()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
from data.util.paths import DATA_PATH
from data.scripts.project_data import DataLoader
from data.util.environment_variables import COUNTRIES, SUPERREGIONS, REGIONS
from lib.transformers import *

from lib.processing import load_geoscheme_df, encode_country_column, encode_style_column,clean_artist_column, clean_label_column, save_to_pkl, load_from_pkl, artist_ngrams, cossine_similarity, get_matches_df, label_ngrams, get_cosine_similarity_matches, get_ngrams, create_match_lookup, clean_format_text, make_market_value_col,make_format_description_column, make_format_name_column, make_format_quantity_column, make_format_text_column, expand_format_description_column, match_track_titles_to_standards 

# Extracted Data
## Loading and Cleaning

In [11]:
data_loader = DataLoader(db_name='jazz_album',db_path=DATA_PATH,db_dialect='sqlite')
extracted_data = data_loader.load_extracted_data()

In [12]:
extracted_data_pipe = Pipeline([
    ('remove_id', ColumnRemover('_id')),
    ('unpickle', Unpickler(['track_titles'])),
    ('make_market_value', ColumnCombiner('median','market_price','market_value')),
    ('remove_duplicates', DuplicateRemover('release_id')),
    ('remove_nulls',NullRemover('market_value'))
])

# API Data
## Loading and Cleaning

In [None]:
clean_text_pipe = Pipeline([
    ('label', LabelCleaner()),
    ('artist', ArtistCleaner())
])

In [None]:
column_encoding_pipe = Pipeline([
    ('country',CountryEncoder()),
    ('genre',GenreEncoder()),
    ('style', MultiValueCategoricalEncoder(feature='style'))
])

In [None]:
format_pipe = Pipeline([
    ('make_columns', FormatEncoder()),
    ('encode_descriptions',MultiValueCategoricalEncoder('format_description')),
    ('clean_format_text',FormatTextCleaner())
])

In [None]:
api_pipe = Pipeline([
    ('remove_columns', ColumnRemover(['id'])),
    ('split_title', TitleSplitter()),
    ('unpickle', Unpickler(['genre','style','label','formats'])),
    ('clean_text',clean_text_pipe),
    ('remove_duplicates', DuplicateRemover('release_id')),
    ('encode_columns',column_encoding_pipe),
    ('format_columns', format_pipe)
    ('encode_time_periods', TimePeriodEncoder)
])

In [None]:
api_df = data_loader.load_api_data()

In [None]:
api_df = api_pipe.fit_transform(api_df)

api_df = extracted_data.merge(api_df,how='left',on='release_id')

## Jazz Standard Count

In [13]:
from lib.transformers import StandardCountEncoder

In [14]:
extracted_data['track_titles'].apply(pickle.loads)

0         [With Every Breath I Take, It's Easy To Rememb...
1         [With Every Breath I Take, It's Easy To Rememb...
2         [The Music Goes 'Round And 'Round, Sailing At ...
3         [Lonely Melody, Mississippi Mud, San, When, Su...
4                              [For Dancers Only, Coquette]
                                ...                        
350580    [My Funny Valentine, Blues By Five, Airegin, T...
350581    [Mauvais Garcon, Petite Fleur, Bei Mir Bist Du...
350582    [Let Me Love Tonight, To See You, Let's Just K...
350583    [Köhntark  (Part 1), Köhntark  (Part 2), Kobah...
350584    [Cambridge, Andrew Dreaming, Boutique Music, S...
Name: track_titles, Length: 350585, dtype: object

In [23]:
StandardCountEncoder('track_titles').fit_transform(extracted_data)



  0%|          | 0/350585 [00:00<?, ?it/s][A[A

  1%|          | 2833/350585 [00:00<00:12, 28328.07it/s][A[A

  2%|▏         | 5737/350585 [00:00<00:12, 28535.36it/s][A[A

  2%|▏         | 8665/350585 [00:00<00:11, 28752.61it/s][A[A

  3%|▎         | 11600/350585 [00:00<00:11, 28928.49it/s][A[A

  4%|▍         | 14481/350585 [00:00<00:11, 28892.75it/s][A[A

  5%|▍         | 17399/350585 [00:00<00:11, 28972.20it/s][A[A

  6%|▌         | 19964/350585 [00:00<00:12, 25817.14it/s][A[A

  7%|▋         | 22872/350585 [00:00<00:12, 26685.63it/s][A[A

  7%|▋         | 25709/350585 [00:00<00:11, 27169.16it/s][A[A

  8%|▊         | 28489/350585 [00:01<00:11, 27354.66it/s][A[A

  9%|▉         | 31371/350585 [00:01<00:11, 27777.41it/s][A[A

 10%|▉         | 34206/350585 [00:01<00:11, 27945.74it/s][A[A

 11%|█         | 37179/350585 [00:01<00:11, 28457.14it/s][A[A

 11%|█▏        | 40240/350585 [00:01<00:10, 29069.95it/s][A[A

 12%|█▏        | 43140/350585 [00:01<00:1

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Unnamed: 0,id,release_id,market_price,units_for_sale,have,want,average_rating,rating_count,last_sold,number_of_tracks,running_time,lowest,median,highest,track_titles,track_titles_count
0,1,11918321,,,1.0,2.0,,0.0,NaT,12.0,,,,,"b""\x80\x04\x955\x01\x00\x00\x00\x00\x00\x00]\x...",0
1,2,10550056,3.99,3.0,4.0,4.0,4.00,1.0,NaT,12.0,,,,,"b""\x80\x04\x955\x01\x00\x00\x00\x00\x00\x00]\x...",0
2,3,6910984,7.21,3.0,4.0,2.0,,0.0,2019-03-04,8.0,,5.99,5.99,5.99,"b""\x80\x04\x95\xc1\x00\x00\x00\x00\x00\x00\x00...",0
3,4,12959431,,,3.0,3.0,,0.0,NaT,12.0,,,,,"b""\x80\x04\x95\xc1\x00\x00\x00\x00\x00\x00\x00...",0
4,5,4453491,,,5.0,2.0,,0.0,NaT,2.0,,,,,b'\x80\x04\x95#\x00\x00\x00\x00\x00\x00\x00]\x...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350580,350581,7340022,11.85,10.0,182.0,89.0,4.42,19.0,2019-12-29,5.0,,10.13,12.61,20.99,b'\x80\x03]q\x00(X\x12\x00\x00\x00My Funny Val...,0
350581,350582,12058618,5.61,5.0,2.0,2.0,4.00,1.0,NaT,24.0,,,,,"b""\x80\x03]q\x00(X\x0e\x00\x00\x00Mauvais Garc...",0
350582,350583,11534331,1.32,3.0,13.0,1.0,,0.0,NaT,10.0,75.383333,,,,"b""\x80\x03]q\x00(X\x13\x00\x00\x00Let Me Love ...",0
350583,350584,1572860,17.97,9.0,42.0,36.0,4.50,6.0,2019-03-28,6.0,72.200000,19.10,22.88,30.34,b'\x80\x03]q\x00(X\x13\x00\x00\x00K\xc3\xb6hnt...,0


## last_sold

In [None]:
max_last_sold_value = api_df['last_sold'].max()

In [None]:
api_df['no_of_days_since_last_sale'] = api_df['last_sold'].apply(lambda x: (max_last_sold_value - x).days)

## Saving api_df

In [None]:
save_to_pkl(api_df,'api')

## Joining dfs and Saving results to hdf

In [None]:
try:
    api_df
except NameError:
    api_df = load_from_pkl('api')
    
try:
    encoded_country_df
except NameError:
    encoded_country_df = load_from_pkl('country')

try:
    encoded_genre_df
except NameError:
    encoded_genre_df = load_from_pkl('genre')

try:
    encoded_style_df
except NameError:
    encoded_style_df = load_from_pkl('style')

try:
    format_description_df
except NameError:
    format_description_df = load_from_pkl('format_description')

In [None]:
concat_df = pd.concat([
    api_df,
    encoded_country_df,
    encoded_genre_df,
    encoded_style_df,
    format_description_df
],axis=1)

In [None]:
save_to_pkl(concat_df,'concat')

In [None]:
concat_df['format_text_clean'].describe()