# Data Processing
In this notebook, I load the data required for this project, combine the data sources, and perform the initial data processing steps. I also carry out some exploratory data analysis to identify missing values and outliers, and then apply the appropriate processing steps to handle them.

In [51]:
%load_ext autoreload
%autoreload 2
import nltk
import numpy as np
import os
import pandas as pd
import pickle
import random
import re
import string
from collections import Counter
from functools import partial
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from tqdm import tqdm
tqdm.pandas()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
from data.util.paths import DATA_PATH
from data.scripts.project_data import DataLoader
from data.util.environment_variables import COUNTRIES, SUPERREGIONS, REGIONS
from lib.transformers import *

from lib.processing import load_geoscheme_df, encode_country_column, encode_style_column,clean_artist_column, clean_label_column, save_to_pkl, load_from_pkl, artist_ngrams, cossine_similarity, get_matches_df, label_ngrams, get_cosine_similarity_matches, get_ngrams, create_match_lookup, clean_format_text, make_market_value_col,make_format_description_column, make_format_name_column, make_format_quantity_column, make_format_text_column, expand_format_description_column, match_track_titles_to_standards 

# Extracted Data

In [34]:
# Preprocessing pipeline for the extracted data frame.
# Each entry is a (step name, transformer) pair and the steps run in the
# order listed. The transformer classes are not imported by name in this
# notebook — presumably they come from the `lib.transformers` star import.
extracted_pipe = Pipeline(
    steps=[
        ("remove_id", ColumnRemover("id")),
        ("unpickle", Unpickler(["track_titles"])),
        ("make_market_value", ColumnCombiner("median", "market_price", "market_value")),
        ("remove_duplicates", DuplicateRemover("release_id")),
        ("remove_nulls", NullRemover("market_value")),
        ("count_standards", StandardCountEncoder("track_titles")),
        (
            "count_days_since_last_sale",
            LastSoldEncoder(feature="last_sold", new_feature="days_since_last_sale"),
        ),
    ]
)

# API Data

In [8]:
# Sub-pipeline for text clean-up: runs LabelCleaner first, then ArtistCleaner.
# Both are constructed with their default arguments.
clean_text_pipe = Pipeline(
    steps=[
        ("label", LabelCleaner()),
        ("artist", ArtistCleaner()),
    ]
)

In [9]:
# Sub-pipeline for column encoding. Steps run in order: country, genre, then
# the 'style' feature via the generic multi-value categorical encoder.
column_encoding_pipe = Pipeline(
    steps=[
        ("country", CountryEncoder()),
        ("genre", GenreEncoder()),
        ("style", MultiValueCategoricalEncoder(feature="style")),
    ]
)

In [10]:
# Sub-pipeline for the format information. FormatEncoder runs first; the
# 'format_description' feature is then passed to the multi-value categorical
# encoder, and FormatTextCleaner runs last.
# NOTE(review): presumably FormatEncoder creates the 'format_description'
# column consumed by the second step — confirm in lib.transformers.
format_pipe = Pipeline(
    steps=[
        ("make_columns", FormatEncoder()),
        ("encode_descriptions", MultiValueCategoricalEncoder("format_description")),
        ("clean_format_text", FormatTextCleaner()),
    ]
)

In [12]:
# Full preprocessing pipeline for the API data frame. Steps run in the order
# listed; clean_text_pipe, column_encoding_pipe and format_pipe are the
# sub-pipelines defined in the cells above and are nested here as single steps.
api_pipe = Pipeline([
    ('remove_columns', ColumnRemover('id')),
    ('split_title', TitleSplitter()),
    ('unpickle', Unpickler(['genre','style','label','formats'])),
    ('clean_text', clean_text_pipe),
    ('remove_duplicates', DuplicateRemover('release_id')),
    ('encode_columns', column_encoding_pipe),
    ('format_columns', format_pipe),
    # Pipeline steps must be estimator *instances*. The original passed the
    # TimePeriodEncoder class itself, so fit/transform would have been called
    # on the class rather than on an object.
    # NOTE(review): instantiated with no arguments, like the other encoders in
    # this notebook — confirm TimePeriodEncoder has no required constructor
    # parameters.
    ('encode_time_periods', TimePeriodEncoder())
])

# Transforming

In [32]:
# Configure the project DataLoader for the 'jazz_album' database located at
# DATA_PATH, using the sqlite dialect, then load the two raw inputs that the
# extracted and API pipelines are applied to further down.
data_loader = DataLoader(
    db_name="jazz_album",
    db_path=DATA_PATH,
    db_dialect="sqlite",
)
extracted_df = data_loader.load_extracted_data()
api_df = data_loader.load_api_data()

In [47]:
# Fit and apply extracted_pipe to the raw extracted frame. The result is bound
# to a new name, so `extracted_df` keeps referring to the frame loaded earlier
# (whether the transformers modify that frame in place is not visible here).
extracted_data = extracted_pipe.fit_transform(extracted_df)


  0%|          | 0/297546 [00:00<?, ?it/s][A
  1%|          | 2012/297546 [00:00<00:14, 20118.44it/s][A
  1%|▏         | 4222/297546 [00:00<00:14, 20672.16it/s][A
  2%|▏         | 6353/297546 [00:00<00:13, 20857.28it/s][A
  3%|▎         | 8590/297546 [00:00<00:13, 21286.61it/s][A
  4%|▎         | 10793/297546 [00:00<00:13, 21504.18it/s][A
  4%|▍         | 13176/297546 [00:00<00:12, 22152.75it/s][A
  5%|▌         | 15465/297546 [00:00<00:12, 22366.58it/s][A
  6%|▌         | 17680/297546 [00:00<00:12, 22300.85it/s][A
  7%|▋         | 19861/297546 [00:00<00:12, 22148.11it/s][A
  7%|▋         | 21998/297546 [00:01<00:12, 21774.18it/s][A
  8%|▊         | 24213/297546 [00:01<00:12, 21885.28it/s][A
  9%|▉         | 26535/297546 [00:01<00:12, 22268.19it/s][A
 10%|▉         | 28738/297546 [00:01<00:12, 21857.80it/s][A
 10%|█         | 30909/297546 [00:01<00:12, 21352.61it/s][A
 11%|█         | 33098/297546 [00:01<00:12, 21510.83it/s][A
 12%|█▏        | 35428/297546 [00:01<00:11

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [None]:
# Fit and apply api_pipe to the API frame.
# NOTE(review): the result is assigned back to `api_df`, replacing the raw
# frame returned by data_loader.load_api_data(). Re-running this cell without
# reloading would pass already-transformed data through the pipeline again.
# The extracted-data cell avoids this by writing to a separate name
# (`extracted_data`); consider doing the same here once downstream usages of
# `api_df` have been checked.
api_df = api_pipe.fit_transform(api_df)