# Data Processing
In this notebook, I load the data needed for this project, combine the data sources, and perform the initial data processing steps. I also carry out some exploratory data analysis to identify missing values and outliers, which are then handled with the appropriate processing steps.

In [None]:
%load_ext autoreload
%autoreload 2
import nltk
import numpy as np
import os
import pandas as pd
import pickle
import random
import re
import string
from collections import Counter
from functools import partial
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from tqdm import tqdm
tqdm.pandas()

In [None]:
from data.util.paths import DATA_PATH
from data.scripts.project_data import DataLoader
from data.util.environment_variables import COUNTRIES, SUPERREGIONS, REGIONS
from lib.transformers import *

from lib.processing import load_geoscheme_df, encode_country_column, encode_style_column,clean_artist_column, clean_label_column, save_to_pkl, load_from_pkl, artist_ngrams, cossine_similarity, get_matches_df, label_ngrams, get_cosine_similarity_matches, get_ngrams, create_match_lookup, clean_format_text, make_market_value_col,make_format_description_column, make_format_name_column, make_format_quantity_column, make_format_text_column, expand_format_description_column, match_track_titles_to_standards 

# Extracted Data

In [None]:
# Pipeline applied to the extracted data frame; steps run in the order listed.
# NOTE(review): what each transformer does is only implied by its name and
# arguments here -- confirm against lib.transformers before relying on it.
extracted_pipe = Pipeline(
    steps=[
        ("remove_id", ColumnRemover("id")),
        ("unpickle", Unpickler(["track_titles"])),
        (
            "make_market_value",
            ColumnCombiner("median", "market_price", "market_value"),
        ),
        ("remove_duplicates", DuplicateRemover("release_id")),
        ("remove_nulls", NullRemover("market_value")),
        ("count_standards", StandardCountEncoder("track_titles")),
        (
            "count_days_since_last_sale",
            LastSoldEncoder(
                feature="last_sold",
                new_feature="days_since_last_sale",
            ),
        ),
    ]
)

# API Data

In [None]:
# Sub-pipeline for text columns; embedded in api_pipe as its 'clean_text' step.
clean_text_pipe = Pipeline(
    steps=[
        ("label", LabelCleaner()),
        ("artist", ArtistCleaner()),
    ]
)

In [None]:
# Sub-pipeline of column encoders; embedded in api_pipe as its
# 'encode_columns' step.
column_encoding_pipe = Pipeline(
    steps=[
        ("country", CountryEncoder()),
        ("genre", GenreEncoder()),
        ("style", MultiValueCategoricalEncoder(feature="style")),
    ]
)

In [None]:
# Sub-pipeline for the release-format columns; embedded in api_pipe as its
# 'format_columns' step.
format_pipe = Pipeline(
    steps=[
        ("make_columns", FormatEncoder()),
        (
            "encode_descriptions",
            MultiValueCategoricalEncoder("format_description"),
        ),
        ("clean_format_text", FormatTextCleaner()),
    ]
)

In [None]:
# Top-level pipeline for the API data frame. It chains the individual
# transformers with the three sub-pipelines defined above (clean_text_pipe,
# column_encoding_pipe, format_pipe); steps run in the order listed.
api_pipe = Pipeline(
    steps=[
        ("remove_columns", ColumnRemover("id")),
        ("split_title", TitleSplitter()),
        ("unpickle", Unpickler(["genre", "style", "label", "formats"])),
        ("clean_text", clean_text_pipe),
        ("remove_duplicates", DuplicateRemover("release_id")),
        ("encode_columns", column_encoding_pipe),
        ("format_columns", format_pipe),
        ("encode_time_periods", TimePeriodEncoder()),
    ]
)

# Transforming

In [None]:
# Build the loader from the database name, location and dialect, then fetch
# the two data sources used in this notebook.
data_loader = DataLoader(
    db_name="jazz_album",
    db_path=DATA_PATH,
    db_dialect="sqlite",
)
extracted_df = data_loader.load_extracted_data()
api_df = data_loader.load_api_data()

In [None]:
# Fit the extracted-data pipeline and replace extracted_df with its output.
# NOTE(review): the input name is rebound, so re-running this cell without
# re-running the load cell passes the already-transformed frame through the
# pipeline a second time.
extracted_df = extracted_pipe.fit_transform(extracted_df)

In [None]:
# Fit the API-data pipeline and replace api_df with its output.
# NOTE(review): the input name is rebound, so re-running this cell without
# re-running the load cell passes the already-transformed frame through the
# pipeline a second time.
api_df = api_pipe.fit_transform(api_df)

## Pickling Pipelines

In [None]:
from lib.util.paths import PIPELINE_PATH

In [None]:
# One entry per pipeline to persist:
#   (pipeline object, name handed to save_to_pkl, sub-folder of PIPELINE_PATH)
# The three API sub-pipelines are saved individually as well as inside api_pipe.
pipeline_pickle_config = (
    (extracted_pipe, "extracted_pipe", "extracted"),
    (clean_text_pipe, "clean_text_pipe", "api"),
    (column_encoding_pipe, "column_encoding_pipe", "api"),
    (format_pipe, "format_pipe", "api"),
    (api_pipe, "api_pipe", "api"),
)

for pipe, pipe_name, folder in pipeline_pickle_config:
    # Each pipeline is written under its own sub-folder of PIPELINE_PATH.
    target_dir = os.path.join(PIPELINE_PATH, folder)
    save_to_pkl(pipe, pipe_name, target_dir)