# Data Processing
In this notebook, I will be loading the data necessary for this project, combining the data sources and performing initial data processing steps. I will also be performing some exploratory data analysis for the purposes of identifying missing values and outliers, which will then be followed up with the appropriate processing steps.

In [58]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import os
import re
import pandas as pd
import pdpipe as pdp
import numpy as np
import matplotlib.pyplot as plt
import random
import seaborn as sns
import string
import vaex
import pickle
from functools import partial
import nltk
from tqdm import tqdm
from collections import defaultdict
tqdm.pandas()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from data.util.paths import DATA_PATH
from data.scripts.project_data import DataLoader
from data.util.environment_variables import COUNTRIES, SUPERREGIONS, REGIONS

from lib.processing import load_geoscheme_df, get_country_to_dict_mapping, encode_country_column, encode_genre_column, encode_style_column, clean_artist_column, clean_label_column

# Extracted Data
## Loading and Cleaning

In [3]:
# One DataLoader instance is shared by every load_* call in this notebook
# (extracted data here, API data and high-level features further down).
data_loader = DataLoader()
# Frame that carries 'release_id', 'median' and 'market_price', which the cells below rely on.
extracted_data = data_loader.load_extracted_data()

In [4]:
# Keep a single row per release; 'release_id' is the merge key used later on.
extracted_data.drop_duplicates('release_id', inplace=True)
# The preview must be the last expression of the cell, otherwise Jupyter does not render it.
extracted_data.head()

In [5]:
def make_market_value_col(median_col,market_price_col):
    """Combine two price series into one market-value series.

    The median price is used wherever it is present; rows where it is null
    are filled from ``market_price_col`` (matched by index label).

    Parameters
    ----------
    median_col : pandas.Series
        Preferred value for each row.
    market_price_col : pandas.Series
        Fallback value for rows where ``median_col`` is null.

    Returns
    -------
    pandas.Series
        A new series with the index and name of ``median_col``. Rows that are
        null in both inputs stay null. Neither input is modified.
    """
    # fillna with a Series aligns on index labels and returns a new object,
    # so no defensive copies or manual index bookkeeping are needed.
    return median_col.fillna(market_price_col)

In [6]:
# Target column: median price, falling back to the market price where the median is missing.
median_prices = extracted_data['median']
fallback_prices = extracted_data['market_price']
extracted_data['market_value'] = make_market_value_col(median_prices, fallback_prices)

In [7]:
# Remove the 'id' column in place.
# Re-running this cell on the same frame raises KeyError because the column is already gone.
extracted_data.drop(columns=['id'], inplace=True)

In [8]:
# Rows that still have no market value after the fallback.
# The Series holds their release ids; the next cell uses its index to remove them.
missing_market_value = extracted_data['market_value'].isna()
ids_to_drop = extracted_data.loc[missing_market_value, 'release_id']

In [9]:
# Drop by index label (not by release id): ids_to_drop shares its index with extracted_data.
extracted_data.drop(index=ids_to_drop.index, inplace=True)

# API Data
## Loading and Cleaning

In [10]:
# Load the API data with the shared DataLoader; its 'title' column is split in the next cell.
api_df = data_loader.load_api_data()

In [11]:
# Split 'title' once on the first ' - '. expand=True yields integer-named columns 0 and 1,
# which the cleaning pipeline renames to 'artist' and 'title'.
# Not idempotent: running this cell twice appends a second pair of 0/1 columns.
artist_and_title = api_df['title'].str.split(' - ', n=1, expand=True)
api_df = pd.concat([api_df, artist_and_title], axis=1)

In [12]:
# Loaded through the lib.processing helper; not referenced again in the visible cells of this notebook.
geoscheme_df = load_geoscheme_df()

In [13]:
# Mapping that the MapColVals stage of the cleaning pipeline applies to the 'country' column.
country_to_dict_mapping = get_country_to_dict_mapping()

In [14]:
# Cleaning pipeline for the API data; stages run in the order listed.
pipeline = pdp.PdPipeline([
    # One row per release.
    pdp.DropDuplicates('release_id'),
    # The combined 'title' is replaced by its split parts; 'id' is not needed.
    pdp.ColDrop(['title','id']),
    # Columns 0 / 1 come from the str.split(expand=True) applied to 'title' earlier.
    pdp.ColRename({0: 'artist', 1: 'title'}),
    # These columns are unpickled value by value.
    # SECURITY: pickle.loads can execute arbitrary code; only use on data from a trusted source.
    pdp.ApplyByCols(['genre','style','label','formats','track_titles'],pickle.loads),
    # Replace raw country values using the mapping loaded above.
    pdp.MapColVals('country',country_to_dict_mapping),
])

In [15]:
# Run the cleaning pipeline. Not idempotent: the pipeline drops 'id' and renames columns 0/1,
# so applying it a second time to its own output is not expected to work.
api_df = pipeline.apply(api_df)

In [16]:
# Left join: every extracted row is kept, API columns are null where no release matches.
# Note that `api_df` is rebound to the merged frame from here on.
api_df = pd.merge(extracted_data, api_df, how='left', on='release_id')

## Encoding Country

In [17]:
# Encode the 'country' column with the lib.processing helper; the result is joined into concat_df later.
encoded_country_df = encode_country_column(api_df['country'])

100%|██████████| 28/28 [00:00<00:00, 185.89it/s]


## Encoding Genre

In [18]:
# Encode the 'genre' column with the lib.processing helper (about 12.5 minutes in the recorded run).
# NOTE(review): the recorded output shows pandas' SettingWithCopyWarning for chained assignments
# inside the helper (`df[list_][idx] = 1`), and the preview in the next cell shows only zeros.
# The writes may be landing on a copy; verify encode_genre_column before trusting this frame.
encoded_genre_df = encode_genre_column(api_df['genre'])

100%|██████████| 297546/297546 [00:00<00:00, 829454.70it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[list_][idx] = 1
  0%|          | 67/297546 [00:00<15:53, 311.92it/s]

0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[[element.replace("'","") for element in list_]][idx] = 1
 10%|█         | 30063/297546 [01:21<11:24, 390.70it/s]

30000


 20%|██        | 60036/297546 [02:42<11:03, 357.92it/s]

60000


 30%|███       | 90061/297546 [04:04<09:50, 351.30it/s]

90000


 40%|████      | 120081/297546 [05:26<07:36, 388.84it/s]

120000


 50%|█████     | 150061/297546 [06:47<05:37, 437.44it/s]

150000


 61%|██████    | 180050/297546 [08:00<05:26, 359.75it/s]

180000


 71%|███████   | 210063/297546 [09:14<03:24, 428.30it/s]

210000


 81%|████████  | 240073/297546 [10:19<02:05, 457.84it/s]

240000


 91%|█████████ | 270025/297546 [11:24<01:38, 279.79it/s]

270000


100%|██████████| 297546/297546 [12:37<00:00, 392.60it/s]


In [19]:
# Display-only: the int32 cast is not assigned, so encoded_genre_df keeps its original dtype.
encoded_genre_df.astype(np.int32)

Unnamed: 0,Hip Hop,Childrens,Classical,Jazz,Blues,Latin,"Folk, World, & Country",Rock,Funk / Soul,Electronic,Stage & Screen,Reggae,Non-Music,Pop,Brass & Military
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297541,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
297542,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
297543,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
297544,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Encoding Style

In [20]:
# Encode the 'style' column with the lib.processing helper (about 15.5 minutes in the recorded run).
# NOTE(review): the recorded output shows a SettingWithCopyWarning for `df[element][idx] = 1`
# inside the helper; confirm the assignments actually reach the returned frame.
encoded_style_df = encode_style_column(api_df['style'])

100%|██████████| 297546/297546 [00:00<00:00, 895421.63it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[element][idx] = 1
  0%|          | 36/297546 [00:00<13:56, 355.63it/s]

0


 10%|█         | 30071/297546 [01:34<12:41, 351.29it/s]

30000


 20%|██        | 60062/297546 [03:08<12:35, 314.18it/s]

60000


 30%|███       | 90049/297546 [04:41<11:06, 311.45it/s]

90000


 40%|████      | 120070/297546 [06:15<08:47, 336.72it/s]

120000


 50%|█████     | 150035/297546 [07:50<07:31, 326.61it/s]

150000


 61%|██████    | 180065/297546 [09:24<05:52, 333.66it/s]

180000


 71%|███████   | 210063/297546 [10:58<04:31, 322.43it/s]

210000


 81%|████████  | 240070/297546 [12:33<02:54, 328.83it/s]

240000


 91%|█████████ | 270065/297546 [14:07<01:23, 330.15it/s]

270000


100%|██████████| 297546/297546 [15:33<00:00, 318.64it/s]


## Cleaning Label and Artist for Later Encoding

In [21]:
# Element-wise cleaning with the lib.processing helpers. Each result is a Series derived from
# api_df['artist'] / api_df['label'] via Series.apply; api_df itself is left unchanged.
cleaned_artist_col = api_df['artist'].apply(clean_artist_column)
cleaned_label_col = api_df['label'].apply(clean_label_column)

## Encoding Formats

In [78]:
# Inspect the raw 'formats' column. In the recorded output each entry is a list of dicts
# with keys such as 'qty', 'descriptions' and 'name'.
api_df['formats']

0         [{'qty': '6', 'descriptions': ['10"', '78 RPM'...
1         [{'descriptions': ['10"', '78 RPM', 'Album', '...
2         [{'descriptions': ['10"', '78 RPM', 'Album', '...
3         [{'descriptions': ['10"', '78 RPM', 'Album'], ...
4         [{'descriptions': ['10"', '78 RPM', 'Album'], ...
                                ...                        
297541    [{'descriptions': ['LP', 'Album', 'Reissue'], ...
297542    [{'descriptions': ['Album'], 'name': 'CD', 'qt...
297543    [{'descriptions': ['Album'], 'name': 'CD', 'qt...
297544    [{'descriptions': ['LP', 'Album', 'Reissue'], ...
297545    [{'qty': '1', 'descriptions': ['12"', '33 ⅓ RP...
Name: formats, Length: 297546, dtype: object

In [186]:
from lib.processing import make_format_description_column, make_format_name_column, make_format_quantity_column, make_format_text_column

In [195]:
# Target column names: format_description, format_name, format_quantity, format_text.
format_columns = ['format_' + suffix for suffix in ('description','name','quantity','text')]
# Extractors from lib.processing, listed in the same order as format_columns.
format_functions = (
    make_format_description_column,
    make_format_name_column,
    make_format_quantity_column,
    make_format_text_column,
)

# Derive every format_* column from the raw 'formats' entries.
for position, target_column in enumerate(format_columns):
    extractor = format_functions[position]
    api_df[target_column] = api_df['formats'].apply(extractor)

In [196]:
# Distinct quantities found. The recorded output includes very large values (1010201, 500, 300).
# NOTE(review): these look like outliers or parsing artefacts; confirm before modelling.
api_df['format_quantity'].unique()

array([      6,       4,       5,       3,       2,       1,      25,
             7,      16,      10,       8,       9,      26,      14,
            13,      24,      28,      20,      55,      11,      50,
             0,      37,      12, 1010201,      15,      30,     500,
           300,      22,      34,      29,      52])

In [None]:
# Expand each 'format_description' entry into its own row of columns via pd.Series, with a
# tqdm progress bar. The result is only displayed, not assigned, and the recorded output
# of this cell is truncated.
api_df['format_description'].progress_apply(pd.Series)






  0%|          | 0/297546 [00:00<?, ?it/s][A[A[A[A[A




  0%|          | 366/297546 [00:00<01:21, 3653.50it/s][A[A[A[A[A




  0%|          | 892/297546 [00:00<01:13, 4021.65it/s][A[A[A[A[A




  0%|          | 1417/297546 [00:00<01:08, 4324.37it/s][A[A[A[A[A




  1%|          | 1997/297546 [00:00<01:03, 4681.01it/s][A[A[A[A[A




  1%|          | 2594/297546 [00:00<00:58, 5004.92it/s][A[A[A[A[A




  1%|          | 3185/297546 [00:00<00:56, 5244.79it/s][A[A[A[A[A




  1%|▏         | 3772/297546 [00:00<00:54, 5415.99it/s][A[A[A[A[A




  1%|▏         | 4354/297546 [00:00<00:53, 5529.23it/s][A[A[A[A[A




  2%|▏         | 4961/297546 [00:00<00:51, 5678.91it/s][A[A[A[A[A




  2%|▏         | 5524/297546 [00:01<00:51, 5663.40it/s][A[A[A[A[A




  2%|▏         | 6090/297546 [00:01<00:51, 5659.86it/s][A[A[A[A[A




  2%|▏         | 6677/297546 [00:01<00:50, 5720.74it/s][A[A[A[A[A




  2%|▏         | 7247/297546 [00:01

In [None]:
# Display api_df, which at this point also carries the format_* columns added above.
api_df

## Encoding last_sold

In [103]:
# Latest value in 'last_sold'; the next cell measures every sale date against it.
max_last_sold_value = api_df['last_sold'].max()

In [104]:
def _days_before_latest_sale(sold_on):
    """Whole days between ``sold_on`` and the latest 'last_sold' value in the data."""
    elapsed = max_last_sold_value - sold_on
    return elapsed.days


# Recency feature: 0 for the most recently sold release, larger for older sales.
api_df['no_of_days_since_last_sale'] = api_df['last_sold'].apply(_days_before_latest_sale)

## Joining DataFrames and Saving Results to Pickle

In [22]:
# Column-wise join of the base frame with every encoded / cleaned piece (aligned on the index).
# cleaned_artist_col and cleaned_label_col are produced by Series.apply on api_df['artist'] and
# api_df['label'], so they keep the names 'artist' and 'label'. The raw versions are dropped
# first; otherwise concat_df would contain two 'artist' and two 'label' columns and
# concat_df['artist'] would return a DataFrame instead of a Series.
concat_df = pd.concat([
    api_df.drop(columns=['artist', 'label']),
    encoded_country_df,
    encoded_genre_df,
    encoded_style_df,
    cleaned_artist_col,
    cleaned_label_col
],axis=1)

In [25]:
# Write the joined frame to DATA_PATH/concat.pkl only when no such file exists yet.
# An existing file is never overwritten, so a stale cache has to be deleted by hand.
existing_files = os.listdir(DATA_PATH)
if 'concat.pkl' not in existing_files:
    concat_pickle_path = os.path.join(DATA_PATH,'concat.pkl')
    with open(concat_pickle_path,'wb') as pickle_file:
        pickle.dump(concat_df,pickle_file)

In [None]:
# Replace the in-memory concat_df with whatever is stored in DATA_PATH/concat.pkl.
# pickle.load can execute arbitrary code; only load files this project wrote itself.
concat_pickle_path = os.path.join(DATA_PATH,'concat.pkl')
with open(concat_pickle_path,'rb') as pickle_file:
    concat_df = pickle.load(pickle_file)

In [None]:
# Display the joined frame as loaded from the pickle in the previous cell.
concat_df

## Encoded DF

## Feature Engineering
## Eras

In [26]:
def make_period_column(year,start,end):
    """Flag whether ``year`` lies in the closed interval ``[start, end]``.

    Returns 1 when ``start <= year <= end`` and 0 otherwise.
    """
    return 1 if start <= year <= end else 0

def make_big_band_period_column(year):
    """1 if ``year`` is within 1930-1950 (both inclusive), else 0."""
    return make_period_column(year,start=1930,end=1950)

def make_bebop_period_column(year):
    """1 if ``year`` is within 1940-1955 (both inclusive), else 0."""
    return make_period_column(year,start=1940,end=1955)

def make_cool_jazz_period_column(year):
    """1 if ``year`` is within 1950-1970 (both inclusive), else 0."""
    return make_period_column(year,start=1950,end=1970)

def make_jazz_fusion_period_column(year):
    """1 if ``year`` is within 1970-2020 (both inclusive), else 0."""
    return make_period_column(year,start=1970,end=2020)

def make_swing_era_column(year):
    """1 if ``year`` is within 1925-1945 (both inclusive), else 0."""
    return make_period_column(year,start=1925,end=1945)

def make_modern_jazz_era_column(year):
    """1 if ``year`` is within 1940-1970 (both inclusive), else 0."""
    return make_period_column(year,start=1940,end=1970)

In [42]:
# Indicator columns to create, and the function that computes each one (same order).
period_columns = (
    'big_band_period',
    'bebop_period',
    'cool_jazz_period',
    'jazz_fusion_period',
    'swing_era',
    'modern_jazz_era',
)
period_functions = (
    make_big_band_period_column,
    make_bebop_period_column,
    make_cool_jazz_period_column,
    make_jazz_fusion_period_column,
    make_swing_era_column,
    make_modern_jazz_era_column,
)

# Each indicator is derived element-wise from the release 'year'.
for position, target_column in enumerate(period_columns):
    period_flag = period_functions[position]
    concat_df[target_column] = concat_df['year'].apply(period_flag)

## Jazz Standard Count

In [56]:
# Load the collection of jazz-standard titles from DATA_PATH/standards.pkl.
# pickle.load can execute arbitrary code; only load this file from a trusted source.
with open(os.path.join(DATA_PATH,'standards.pkl'),'rb') as f:
    standards = pickle.load(f)
    
def lowercase_no_punctuation(x):
    """Lower-case ``x`` and remove every character contained in ``string.punctuation``."""
    punctuation_remover = str.maketrans('','',string.punctuation)
    lowered = x.lower()
    return lowered.translate(punctuation_remover)

In [60]:
# Normalised standard titles as dict keys for O(1) membership tests; the 0 values are placeholders.
normalised_standards = (lowercase_no_punctuation(title) for title in standards)
standards_lookup = dict.fromkeys(normalised_standards, 0)

In [61]:
# Built once instead of once per track title.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def count_jazz_standards(standards_list, lookup=None):
    """Count how many titles in ``standards_list`` are known jazz standards.

    Each title is lower-cased and stripped of ``string.punctuation`` before
    it is looked up, mirroring how the lookup keys are normalised.

    Parameters
    ----------
    standards_list : iterable of str, None or float NaN
        Track titles of one release. ``None`` / NaN (rows without API data
        after the left merge) count as "no titles".
    lookup : container of str, optional
        Normalised standard titles to test against. Defaults to the
        notebook-level ``standards_lookup``.

    Returns
    -------
    int
        Number of titles found in ``lookup``; 0 for missing or empty input.
    """
    if lookup is None:
        lookup = standards_lookup
    # pandas represents a missing list as None or float NaN; neither is iterable.
    if standards_list is None or isinstance(standards_list, float):
        return 0
    return sum(
        1
        for title in standards_list
        # Non-string entries cannot be normalised and can never match.
        if isinstance(title, str)
        and title.lower().translate(_PUNCTUATION_TABLE) in lookup
    )

In [62]:
# Per release: number of entries in 'track_titles' that match a known standard after normalisation.
concat_df['standards_count'] = concat_df['track_titles'].apply(count_jazz_standards)

# High Level Features
## Loading and Cleaning

In [None]:
# Third data source served by the shared DataLoader; only previewed in the visible cells.
high_level_features = data_loader.load_high_level_features()

In [None]:
# Preview the first rows of the high-level features.
high_level_features.head()

# Test Estimation

In [None]:
# List the api_df columns whose name does not contain 'year_'.
[column for column in api_df.columns if 'year_' not in column]

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [None]:
# Keep only rows that have no missing value in any column of concat_df.
testing_df = concat_df.dropna()

In [None]:
# Target plus every other column that is excluded from the feature matrix.
non_feature_columns = [
    'market_value','lowest','median','highest','want','have','market_price',
    'last_sold','units_for_sale','average_rating','rating_count','track_titles',
    'community_have','community_want','title','formats','style','genre',
    'master_id','thumb_url','release_url','release_id','country',
]

# A fixed random_state makes the split (and everything fitted on it) reproducible across runs.
X_tr,X_te,y_tr,y_te = train_test_split(
    testing_df.drop(non_feature_columns,axis=1),
    testing_df['market_value'],
    random_state=42,
)

In [None]:
# NOTE(review): LeaveOneOutEncoder is not imported in any visible cell, so this raises NameError
# on a fresh kernel. Presumably category_encoders.LeaveOneOutEncoder; confirm and add the import.
loo_encoder = LeaveOneOutEncoder(cols=['artist','label'],sigma=0.25)

In [None]:
# Fit the encoder on the training split only, so the test split does not influence the encoding.
loo_encoder.fit(X_tr,y_tr)

In [None]:
# Replace 'artist' / 'label' in both splits using the fitted encoder.
# NOTE(review): transform() is called without y for the training split as well; check the
# encoder's documentation on whether leave-one-out / sigma noise is applied in that case.
X_tr = loo_encoder.transform(X_tr)
X_te = loo_encoder.transform(X_te)

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
# Despite the variable name, this is an SGDRegressor with Huber loss; no penalty argument is
# passed, so scikit-learn's default regularisation applies.
ridge = SGDRegressor(
    loss='huber',
    max_iter=100,
    early_stopping=True,
    verbose=100,
)
# Ten log-spaced regularisation strengths between 1e-10 and 1e-5.
ridge_search_params = {'alpha': np.logspace(-10,-5,10)}
ridge_search = GridSearchCV(estimator=ridge,param_grid=ridge_search_params)

In [None]:
# Run the grid search over alpha on the training split.
ridge_search.fit(X_tr,y_tr)

In [None]:
# Show the alpha selected by the grid search.
ridge_search.best_params_

In [None]:
# Scratch cell: displays a different alpha grid (1e-5 to 1) from the one searched above
# (1e-10 to 1e-5); the result is not used anywhere.
np.logspace(-5,0,10)