In [18]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import pandas as pd
import geopandas as gpd
import numpy as np
import os
import pickle
import random
import umap
from tqdm import tqdm
from data.util.paths import DATA_PATH
from lib.util.paths import PIPELINE_PATH
from mpl_toolkits.axes_grid1 import make_axes_locatable
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from lib.transformers import *
from sklearn.pipeline import Pipeline
tqdm.pandas()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
from lib.processing import *
from data.scripts.project_data import DataLoader
from data.util.environment_variables import COUNTRY_CODES, M49_TO_ISO3

## Setting up Dataset

In [12]:
# Load the two preprocessing pipelines stored under PIPELINE_PATH (one sub-folder each).
api_pipe = load_from_pkl('api_pipe', path=os.path.join(PIPELINE_PATH, 'api'))
extracted_pipe = load_from_pkl('extracted_pipe', path=os.path.join(PIPELINE_PATH, 'extracted'))

In [26]:
# Build the loader for the 'jazz_album' sqlite database and pull both raw tables.
data_loader = DataLoader(db_name='jazz_album',db_path=DATA_PATH,db_dialect='sqlite')
extracted_df, api_df = data_loader.load_extracted_data(), data_loader.load_api_data()
# NOTE(review): fit_transform re-fits the pipelines that were loaded from disk;
# if they were saved already fitted, transform() may be what is intended — confirm.
extracted_df = extracted_pipe.fit_transform(extracted_df)
api_df = api_pipe.fit_transform(api_df)

HBox(children=(FloatProgress(value=0.0, max=297546.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [None]:
df = pd.merge(extracted_df,api_df,on='release_id')

In [None]:
del extracted_df, api_df

## Create Column Store

In [None]:
# Column groups handed to ColumnStore.fit. Later cells read the groups back as
# attributes such as column_store._genre or column_store._timeperiod_period.
# Most values end in '_' and look like column-name prefixes — presumably matched
# against df.columns; confirm against the ColumnStore implementation.
col_set = {
    'format': {
        'description': 'format_description_', 
        'name': 'format_name_', 
        # NOTE(review): the parentheses below do not create tuples (no trailing
        # comma), so these two values are plain strings like the others.
        'text': ('format_text_clean'),
        'quantity': ('format_quantity')
    },
    'geography': {
        'superregion': 'superregion_',
        'region': 'region_',
        'country': 'country_'
    },
    'timeperiod': {
        'period': 'period_',
        'era': 'era_'
    },
    'genre': 'genre_',
    'style': 'style_',
    'null': None,
    # Callable rule: true for a column whose maximum is 1 and minimum is 0.
    'indicator': lambda x: x.max() == 1 and x.min() == 0
}

In [None]:
column_store = ColumnStore()
column_store.fit(df,col_set)

## Overview of Number of Records over Time

In [None]:
# Year ranges for the period columns and the era columns, then merged into one lookup.
period_year_range = make_year_range_dict(column_store._timeperiod_period, df=df)
era_year_range = make_year_range_dict(column_store._timeperiod_era, df=df)
time_period_year_range = dict(**period_year_range, **era_year_range)

In [None]:
# Albums per release year (count of non-null market_value).
year_count_series = df.groupby(by='year')['market_value'].count()
# Fill every year without releases with an explicit 0 instead of hand-patching a
# single year: the previous `[1936] = 0` would overwrite a real count if one
# existed and left any other gap year missing from the index. 1936 is kept inside
# the range so the index always contains it, as before.
# Assumes `year` holds integer years — TODO confirm.
year_count_series = year_count_series.reindex(
    pd.Index(
        range(
            min(int(year_count_series.index.min()), 1936),
            max(int(year_count_series.index.max()), 1936) + 1,
        ),
        name=year_count_series.index.name,
    ),
    fill_value=0,
)

In [None]:
# Fill colour for each period column; keys are looked up by time-period name when plotting.
period_colors = {
    'period_big_band': 'red',
    'period_bebop': 'pink',
    'period_cool': 'blue',
    'period_fusion': 'orange',
}
# Fill colour for each era column.
era_colors = {
    'era_swing': 'purple',
    'era_modern': 'green',
    'era_contemporary': 'gold'
}
# Single lookup covering both periods and eras.
time_period_colors = dict(**period_colors,**era_colors)

In [None]:
def plot_variable_with_time_periods(year_series,time_periods,time_period_colors,**kwargs):
    """Plot a yearly series and shade each time period on top of it.

    Parameters
    ----------
    year_series : pandas.Series
        Values indexed by year; drawn as a line.
    time_periods : dict
        Maps a time-period name to the years it covers. The years are used both
        as x values of the shaded band and as labels into ``year_series``.
    time_period_colors : dict
        Maps each time-period name to a matplotlib colour.
    **kwargs
        Each key is called as ``plt.<key>(value)`` (e.g. ``xlabel='Year'``).
        A failing call is printed and does not abort the plot.

    Notes
    -----
    Names containing ``'period'`` are hatched and filled from 0 up to the series.
    All other names are drawn as a band of height ``dev_constant`` sitting on top
    of the series. A time period whose years cannot be looked up in
    ``year_series`` is printed and skipped; previously the code fell through to
    ``fill_between`` with unbound or stale bounds.
    """
    plt.figure(figsize=(20,10))

    #Setting up plotting helpers
    dev_constant = 200
    convert_to_label = lambda x: ' '.join(x.split('_')).title()

    #Plotting
    plt.plot(year_series)
    for time_period, year_range in time_periods.items():
        try:
            values = year_series.loc[year_range]
        except KeyError:
            # Nothing sensible to shade for this range: report it and move on.
            print(year_range)
            continue

        if 'period' in time_period:
            lower, upper, hatch = 0, values, '/'
        else:
            lower, upper, hatch = values, values+dev_constant, None

        plt.fill_between(
            year_range,
            upper,
            lower,
            alpha=0.25,
            color=time_period_colors[time_period],
            hatch=hatch,
            label=convert_to_label(time_period),
        )

    # Best-effort decoration: every keyword is forwarded to the pyplot function of the same name.
    for attr, value in kwargs.items():
        try:
            getattr(plt,attr)(value)
        except Exception as e:
            print(e)

    plt.legend()
    plt.show()
    

In [None]:
plot_variable_with_time_periods(year_count_series,time_period_year_range,time_period_colors,xlabel='Year',ylabel='Albums',title='Albums released per Year according to Dominant Jazz Time Period')

As we can see from the above, there was a massive explosion in the number of Jazz albums released per year in the 1950s, with a cyclical rise through the rest of the 20th century and into the 21st. In the Contemporary Jazz Era/Jazz Fusion Period, we see that the number of Jazz albums released is at its highest, most likely due to the massive influence of Jazz on other genres.

In [None]:
# Albums per year among rows where no additional genre indicator is set.
# The column is selected before counting so only one column is aggregated.
pure_jazz_album_count = (
    df[df[column_store._genre].sum(axis=1)==0]
    .groupby(by='year')['market_value']
    .count()
)
# Fill every year without releases with 0 instead of patching 1934 and 1936 by
# hand (which would overwrite real counts and miss other gaps). Both years stay
# inside the range so the index always contains them, as before.
# Assumes `year` holds integer years — TODO confirm.
pure_jazz_album_count = pure_jazz_album_count.reindex(
    pd.Index(
        range(
            min(int(pure_jazz_album_count.index.min()), 1934),
            max(int(pure_jazz_album_count.index.max()), 1936) + 1,
        ),
        name=pure_jazz_album_count.index.name,
    ),
    fill_value=0,
)

In [None]:
plot_variable_with_time_periods(pure_jazz_album_count,time_period_year_range,time_period_colors,xlabel='Year',ylabel='Albums',title='Albums released per Year according to Dominant Jazz Time Period')

As we can see, the figure above is effectively identical to the one which includes non 'Pure Jazz' albums, indicating that the rising trend in album releases is not primarily due to the incorporation of Jazz into other styles, but instead a growth of the music in its purest form over time.

## Market Value
### Distribution

In [None]:
sns.distplot(df['market_value'])
plt.ylabel('Distribution')
plt.show()

From the above we see that ``market_value`` is heavily right-skewed (most records are cheap, with a long tail of very expensive ones), which means we need to remove outliers

In [None]:
# Fit the remover once and keep its output; the previous version called
# fit_transform twice and threw the first result away.
outlier_remover = OutlierRemover('market_value')
df_without_outliers = outlier_remover.fit_transform(df)
print('OutlierRemover removed %s rows' % (len(df)-len(df_without_outliers)))

In [None]:
sns.distplot(outlier_remover.transform(df)['market_value']);

After removing entries with ``market_value`` values exceeding 3 standard deviations from the mean, we see that the distribution is less skewed, making it a prime candidate for a log transformation

In [None]:
sns.distplot(np.log(outlier_remover.transform(df)['market_value']));

## Evolution of Market Value over Time

In [None]:
# Group once and derive both statistics from the same grouping.
market_value_by_year = df.groupby(by='year')['market_value']
mean_value = market_value_by_year.mean()
# Despite its name this holds the per-year standard deviation (.std()).
std_error = market_value_by_year.std()

plt.figure(figsize=(20,10))
plt.xlabel('Year')
plt.ylabel('2020 Market Value in USD')
plt.plot(mean_value)
# Shaded band: mean plus/minus one standard deviation.
plt.fill_between(std_error.index, mean_value-std_error, mean_value+std_error, alpha=0.25)
plt.show()

From the above, we see that even after removing the most offending outliers, there is still tremendous variation in the price of Jazz records, which is particularly large in the mid 20th century, and slowly reduces over time, but never to a very small margin

## Correlations 

In [None]:
# Columns of column_store._rest with a float/int dtype, excluding the two id columns.
# NOTE(review): `dtype in (float, int)` appears to match only the default-width
# float/int dtypes; narrower numeric dtypes (e.g. float32, uint8) would be
# filtered out — confirm this is intended.
pairplot_columns = list(filter(lambda x: df[x].dtype in (float,int) and x not in ['master_id','release_id'],column_store._rest))

In [None]:
def plot_corr_subplots(df, var, var_list,nrows,ncols,figsize=(60,60)):
    """Draw a grid of scatter plots of ``var`` against each column in ``var_list``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``var`` and every column named in ``var_list``.
    var : str
        Column plotted on the y axis of every panel.
    var_list : sequence of str
        Columns plotted on the x axis, one per panel, filled row by row.
    nrows, ncols : int
        Shape of the subplot grid.
    figsize : tuple
        Figure size passed to ``plt.subplots``.

    Notes
    -----
    Panels beyond ``len(var_list)`` are hidden instead of raising ``IndexError``.
    Columns beyond ``nrows * ncols`` are not drawn, as before.
    """
    # squeeze=False keeps the axes array 2-D even for a single row or column.
    fig, axes = plt.subplots(nrows=nrows,ncols=ncols,figsize=figsize,squeeze=False)

    # ravel() walks the grid row by row, so the position is row*ncols + col.
    for list_idx, panel in enumerate(axes.ravel()):
        if list_idx >= len(var_list):
            panel.set_visible(False)
            continue
        col_column = var_list[list_idx]
        panel.set_title(col_column,fontdict={'fontsize':50})
        panel.scatter(df[col_column],df[var],edgecolor='white')
    plt.show()

In [None]:
plot_corr_subplots(df,'market_value',pairplot_columns,4,4)

As we can see from the plots above, there are not many clear relationships that can be identified between the ``market_value`` feature and the numerical features of the dataset. This implies that performance of the model will rely heavily on an efficient encoding of non-numerical data, such as genre, style, artist etc. in order to be able to predict the market value of a given record with any certainty.

In [None]:
def get_correlation_series(df,correlation_column,columns=None):
    """Correlate one column with a set of other columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is only read, never modified.
    correlation_column : str
        Column every other column is correlated against.
    columns : iterable of str, optional
        Columns to correlate with ``correlation_column``. ``None`` or an empty
        collection means all columns of ``df``. Lists, ``pandas.Index`` objects
        and arrays are all accepted.

    Returns
    -------
    pandas.Series
        Correlation coefficient (``np.corrcoef``) per column, indexed by column name.
    """
    # `not columns` raises for Index/ndarray arguments, so test explicitly.
    if columns is None or (hasattr(columns, '__len__') and len(columns) == 0):
        columns = df.columns

    # Nothing below writes to df, so no defensive copy of the frame is needed.
    reference = df[correlation_column].values
    return pd.Series({
        column: np.corrcoef(reference, df[column].values)[0][1]
        for column in columns
    })

In [None]:
market_value_indicator_correlations = get_correlation_series(df,'market_value',column_store._indicator)

In [None]:
market_value_indicator_correlations.describe()

From the above, it is clear that the indicator variables are not very highly correlated with ``market_value`` either. As such, we hope that by leveraging the multitude of categorical variables, we can improve the result of our price predictions

In [None]:
# Label each correlation by direction, then keep its magnitude next to that label.
# Note: this rebinds market_value_indicator_correlations from a Series to a DataFrame.
sign = pd.Series(
    np.where(market_value_indicator_correlations >= 0, 'positive', 'negative'),
    index=market_value_indicator_correlations.index,
)
market_value_indicator_correlations = pd.DataFrame(
    {
        'correlation': market_value_indicator_correlations.abs().values,
        'Sign': sign.values,
    },
    index=market_value_indicator_correlations.index,
)

In [None]:
market_value_indicator_correlations.sort_values(by='correlation',ascending=False).head(25)

As we can see by investigating the top correlations, albums from Japan in particular seem to be the most highly correlated with market value, followed by the formats of Vinyl and CD. Interestingly, there seems to be a positive correlation with price for Vinyl albums, and a negative correlation for CDs, which is what one would generally expect. We also observe that limited edition and reissued albums tend to be priced higher, which also makes sense. An interesting takeaway is that albums from Europe and North America tend to be negatively correlated with price, which may be linked to their dominance in the genre and the sheer volume of albums they release. Furthermore, we see that the Hard Bop and Modal jazz styles tend to be the most positively correlated with ``market_value``, which is also understandable, given that these were the dominant styles during the modern jazz era, which defines the genre as a whole.

## Genres

In [None]:
genre_sum = df[column_store._genre].sum().sort_values(ascending=False)

In [None]:
plt.figure(figsize=(20,10))
plt.xlabel('Additional Genre')
plt.ylabel('# of Albums')
plt.xticks(rotation=30)
plt.bar(genre_sum.index,genre_sum);
plt.show()

In [None]:
#How many 'pure' jazz albums
pure_jazz_albums = len(df[df[column_store._genre].sum(axis=1)==0])
print('There are {} albums which exclusively list Jazz as a genre, {}% of the dataset'.format(pure_jazz_albums,round(100*pure_jazz_albums/len(df),2)))

## Styles

In [None]:
style_sum = df[column_store._style].sum()

In [None]:
style_sum.describe()

In [None]:
style_sum_top_10 = style_sum[style_sum > style_sum.quantile(0.9)].sort_values(ascending=False)

In [None]:
plt.figure(figsize=(25,10))
plt.xlabel('Listed Styles')
plt.ylabel('# of Albums')
plt.xticks(rotation=90)
plt.bar(style_sum_top_10.index,style_sum_top_10)
plt.show()

As some of the estimators we will use as part of our estimation are sensitive to excessive dimensionality, we will implement a transformer which retains only those indicator variables which are positive for over a certain threshold of entries. This is introduced below as the IndicatorConsolidator transformer.

In [None]:
style_consolidator = IndicatorConsolidator(columns=column_store._style,output_column='style_Other',threshold=250,counter_name='counter_style')

In [None]:
df[column_store._style].sum().median()

In [None]:
style_consolidator.fit_transform(df)[['style_Other','counter_style']]

## Format Description

In [None]:
format_description_sum = df[column_store._format_description].sum().sort_values(ascending=False)
format_description_sum.describe()

In [None]:
format_description_sum.iloc[:25]

In [None]:
description_consolidator = IndicatorConsolidator(columns=column_store._format_description,output_column='format_description_Other',threshold=None,counter_name='counter_format_description')

In [None]:
description_consolidator.fit_transform(df)

## Format Name

In [None]:
format_name_sum = pd.get_dummies(df['format_name']).sum().sort_values(ascending=False)
format_name_sum.describe()

In [None]:
plt.figure(figsize=(25,10))
plt.xlabel('Format Names')
plt.ylabel('# of Albums')
plt.xticks(rotation=90)
plt.bar(format_name_sum.index,format_name_sum)
plt.show()

From the above, we can see that it would be wise to reduce the ``format_name`` indicator variable dimensionality, as there are really three non-trivially large format categories, namely ``CD``, ``Vinyl`` and ``Cassette``. The rest we will combine into an ``Other`` indicator. We will also combine ``CD`` and ``CDr``, as these formats are essentially equivalent

In [None]:
format_name_consolidator = IndicatorConsolidator(output_column='format_name_other',threshold=5000)
format_name_consolidator.fit_transform(pd.get_dummies(df['format_name']))

## Mapping Most Albums

In [None]:
map_df = gpd.read_file(os.path.join(DATA_PATH,'countries/ne_110m_admin_0_countries.shp'))

In [None]:
visualization_countries = list(filter(lambda x: x not in ['country_yugoslavia','country_ussr','country_taiwan'],column_store._geography_country))
country_album_count = df[visualization_countries].sum()

In [None]:
# Look up a code for each country column, keyed by the text after the last underscore.
# NOTE(review): split('_')[-1] keeps only the last '_'-separated token, so a column
# whose country name itself contains '_' would be reduced to its final word —
# confirm the column naming against the keys of COUNTRY_CODES.
country_codes = pd.Series([COUNTRY_CODES[country.split('_')[-1]] for country in visualization_countries],index=visualization_countries)

In [None]:
country_df = pd.DataFrame(country_album_count)

In [None]:
country_df['codes'] = country_codes

In [None]:
country_df['ISO_A3'] = country_df.loc[:,'codes'].map(M49_TO_ISO3) 

In [None]:
country_df.sort_values(0,ascending=False)

In [None]:
merge = map_df.set_index('ISO_A3').join(country_df.set_index('ISO_A3'))

In [None]:
merge.fillna(0,inplace=True)

In [None]:
# Colour-scale limits taken from the plotted column (vectorised min/max).
vmin, vmax = merge[0].min(), merge[0].max()
fig, ax = plt.subplots(1,figsize=(20,20))
merge.plot(column=0,cmap='gnuplot',linewidth=0.8,ax=ax,edgecolor='0.8')
ax.axis('off')
# Stand-alone mappable that only exists to drive the colour bar.
sm = plt.cm.ScalarMappable(cmap='gnuplot',norm=plt.Normalize(vmin=vmin,vmax=vmax))
# Public API instead of assigning the private `_a` attribute.
sm.set_array([])
# Put the colour bar in its own narrow axes to the right of the map.
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.05)
cbar = fig.colorbar(sm,cax=cax)
plt.show()

# High Level Features
## Loading and Cleaning

In [None]:
data_loader = DataLoader(db_name='jazz_album',db_path=DATA_PATH,db_dialect='sqlite')
# Presumably an iterable of DataFrame chunks — confirm against DataLoader.
image_embeddings = data_loader.load_high_level_features()
# Iterate over the object that was actually loaded (the loop previously referenced
# the undefined name `high_level_features`) and concatenate once at the end
# instead of growing the frame chunk by chunk.
feature_chunks = list(tqdm(image_embeddings))
image_embedding_df = pd.concat(feature_chunks,axis=0) if feature_chunks else pd.DataFrame()

In [None]:
image_embedding_df = image_embedding_df.reset_index(drop=True).drop('index',axis=1).astype({'release_id':np.uint32,'bitmap':np.uint8})

In [None]:
df = df.merge(image_embedding_df,on='release_id',how='inner')

In [None]:
del image_embedding_df

In [None]:
col_set['image_embedding'] = 'feature_'
column_store = ColumnStore()
column_store.fit(df,col_set)

In [None]:
# Scale the image-embedding columns, then project them to 2 components.
# Assuming ColumnTransformer is scikit-learn's (it is not imported explicitly in
# the visible cells): its entries run side by side on the selected columns and
# their outputs are concatenated, so listing the scaler and the PCA as separate
# entries would run PCA on unscaled data. Chaining them in a Pipeline applies
# them in sequence.
pca_2d = ColumnTransformer([
    (
        'scale_pca_2d',
        Pipeline([
            ('scale',StandardScaler()),
            ('pca_2d',PCA(n_components=2,random_state=0)),
        ]),
        list(column_store._image_embedding),
    )
])

In [None]:
# Scale the image-embedding columns, then embed them in 2 dimensions with UMAP.
# Assuming ColumnTransformer is scikit-learn's: separate entries run in parallel
# on the same columns, so the scaler and UMAP are chained inside one Pipeline to
# make UMAP see the scaled features.
umap_2d = ColumnTransformer([
    (
        'scale_umap',
        Pipeline([
            ('scale', StandardScaler()),
            ('umap', umap.UMAP(n_components=2,random_state=0,verbose=True)),
        ]),
        list(column_store._image_embedding),
    )
])

In [None]:
umap_output = umap_2d.fit_transform(df)

In [None]:
output = pca_2d.fit_transform(df)

In [None]:
output.shape

In [None]:
# Scale-then-PCA pipelines producing a 2-component and a 3-component embedding.
# Note: `pca_2d` is rebound here to a plain PCA estimator; an earlier cell used
# the same name for a ColumnTransformer.
scaler = StandardScaler()
pca_2d = PCA(n_components=2,random_state=0)
pca_3d = PCA(n_components=3,random_state=0)
# The same `scaler` instance is used as the first step of both pipelines.
scale_pca_2d = Pipeline([('scaler',scaler),('pca',pca_2d)])
scale_pca_3d = Pipeline([('scaler',scaler),('pca',pca_3d)])

In [None]:
# Fit each pipeline and project the input to 2 and 3 components.
# NOTE(review): `data` is not assigned in any cell visible here — presumably the
# image-embedding feature columns; confirm where it is defined before running.
embedding_2d = scale_pca_2d.fit_transform(data)
embedding_3d = scale_pca_3d.fit_transform(data)

In [None]:
def embedding_nd_columns(n):
    """Return the ``n`` column names used for an ``n``-dimensional embedding."""
    names = []
    for i in range(n):
        names.append('embedding_%sd_%s' % (n,i))
    return names


embedding_2d_columns = embedding_nd_columns(2)
embedding_3d_columns = embedding_nd_columns(3)

In [None]:
combined_df = pd.concat(
    [combined_df,
     pd.DataFrame(embedding_2d,columns=embedding_2d_columns),
     pd.DataFrame(embedding_3d,columns=embedding_3d_columns)
    ],
    axis=1)

In [None]:
def get_cmap(n, name='hsv'):
    """Return the colormap ``name`` resampled to ``n`` entries.

    Parameters
    ----------
    n : int
        Number of entries in the returned colormap's lookup table.
    name : str
        Name of a registered matplotlib colormap.

    Notes
    -----
    Uses ``plt.get_cmap`` rather than ``plt.cm.get_cmap``: the latter is
    deprecated and no longer available in newer matplotlib releases, while
    ``plt.get_cmap`` accepts the same ``(name, lut)`` arguments.
    """
    return plt.get_cmap(name, n)

In [None]:
def plot_indicator_3d(df, columns, embedding_columns=None,**kwargs):
    """Scatter a 3-D embedding, one colour per indicator column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the indicator columns and the embedding columns; it is
        only read, never modified.
    columns : sequence of str
        Indicator columns. Rows where a column equals 1 form one scatter group,
        labelled with the text after the column name's last underscore.
    embedding_columns : sequence of str, optional
        The three coordinate columns. ``None`` or an empty collection selects
        every column of ``df`` whose name contains ``'embedding_3d'``.
    **kwargs
        ``colors``: optional sequence with one colour per entry of ``columns``.
        When missing or empty, colours are taken from ``get_cmap(len(columns))``.
    """
    # Explicit checks so list-likes such as pandas.Index or arrays are accepted.
    if embedding_columns is None or len(embedding_columns) == 0:
        embedding_columns = [name for name in df.columns if 'embedding_3d' in name]
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(111, projection='3d')

    colors = kwargs.get('colors')
    if colors is None or len(colors) == 0:
        cmap = get_cmap(len(columns))
        colors = [cmap(index) for index in range(len(columns))]

    for index,column in enumerate(columns):
        # Coordinates of the rows flagged by this indicator.
        column_embedding = df.loc[df[column]==1, embedding_columns]
        ax.scatter(
            column_embedding.iloc[:,0],
            column_embedding.iloc[:,1],
            column_embedding.iloc[:,2],
            label=column.split('_')[-1],
            color=colors[index],
            alpha=0.75
        )

    ax.legend()

In [None]:
def plot_indicator_2d(df, columns,embedding_columns=None,**kwargs):
    """Scatter a 2-D embedding, one colour per indicator column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the indicator columns and the embedding columns; it is
        only read, never modified.
    columns : sequence of str
        Indicator columns. Rows where a column equals 1 form one scatter group,
        labelled with the text after the column name's last underscore.
    embedding_columns : sequence of str, optional
        The two coordinate columns. ``None`` or an empty collection selects
        every column of ``df`` whose name contains ``'embedding_2d'``.
    **kwargs
        ``colors``: optional sequence with one colour per entry of ``columns``.
        When missing or empty, colours are taken from ``get_cmap(len(columns))``.
    """
    # Explicit checks so list-likes such as pandas.Index or arrays are accepted.
    if embedding_columns is None or len(embedding_columns) == 0:
        embedding_columns = [name for name in df.columns if 'embedding_2d' in name]
    plt.figure(figsize=(10,10))

    # The colormap is only built when no colours were supplied.
    colors = kwargs.get('colors')
    if colors is None or len(colors) == 0:
        cmap = get_cmap(len(columns))
        colors = [cmap(index) for index in range(len(columns))]

    for index,column in enumerate(columns):
        # Coordinates of the rows flagged by this indicator.
        column_embedding = df.loc[df[column]==1, embedding_columns]
        plt.scatter(
            column_embedding.iloc[:,0],
            column_embedding.iloc[:,1],
            label=column.split('_')[-1],
            color=colors[index],
            alpha=0.25,
            edgecolor='white'
        )

    plt.legend()

In [None]:
plot_indicator_2d(combined_df,region_columns,embedding_2d_columns)

In [None]:
combined_df['market_value'].quantile([0.25,0.5,0.75,1])

In [None]:
def identify_quantile(x,lower,upper):
    """Return 1 when ``x`` lies in the closed interval [lower, upper], otherwise 0."""
    inside = lower <= x <= upper
    return 1 if inside else 0

In [None]:
# One 0/1 indicator column per quartile of market_value.
# Bins are half-open (lower, upper] so a value sitting exactly on a quartile
# boundary belongs to a single bin; with both bounds inclusive it was flagged in
# two neighbouring bins. The first bin also includes its lower bound so the
# minimum is not dropped.
market_value_series = combined_df['market_value']
for quantile in [0.25,0.5,0.75,1]:
    lower = market_value_series.quantile(quantile-0.25)
    upper = market_value_series.quantile(quantile)
    if quantile == 0.25:
        above_lower = market_value_series >= lower
    else:
        above_lower = market_value_series > lower
    # Vectorised comparison instead of a row-wise apply.
    combined_df['market_value_quantile_%s' % quantile] = (above_lower & (market_value_series <= upper)).astype(int)

In [None]:
market_value_quantiles = list(filter(lambda x: 'market_value_quantile' in x,combined_df.columns))
combined_df[market_value_quantiles].describe()

In [None]:
%matplotlib widget

In [None]:
# 5000 row positions drawn with replacement from a dedicated, seeded generator,
# so the sampled subset (and any plot built from it) is the same on every run.
_sample_rng = random.Random(0)
random_sample = [_sample_rng.randrange(len(combined_df)) for _ in range(5000)]

In [None]:
# random_sample holds row positions (drawn from range(len(combined_df))), so
# select by position; label-based .loc only matches when the index is 0..n-1.
test = combined_df.iloc[random_sample,:]

In [None]:
plot_indicator_3d(test,market_value_quantiles,embedding_3d_columns,colors=['blue','orange','red','green'])

In [None]:
# Correlation of market_value with each of the three embedding axes, computed
# through the notebook's get_correlation_series helper.
market_value_embedding_3d_correlations = get_correlation_series(
    combined_df,
    'market_value',
    ['embedding_3d_%s' % i for i in range(3)],
)

In [None]:
market_value_embedding_3d_correlations

In [None]:
pca_10d = PCA(n_components=10,random_state=0)
scale_pca_10d = Pipeline([('scaler',scaler),('pca',pca_10d)])

In [None]:
# Use the scale-then-PCA pipeline built for the 10-D case, as the 2-D and 3-D
# embeddings do; fitting the bare `pca_10d` skipped the scaling step.
# NOTE(review): `data` is not assigned in any cell visible here — confirm its source.
embedding_10d = scale_pca_10d.fit_transform(data)
embedding_10d_columns = embedding_nd_columns(10)

In [None]:
combined_df = pd.concat([
    combined_df,
    pd.DataFrame(embedding_10d,columns=embedding_10d_columns)
    ],axis=1
)

In [None]:
# The helper defined in this notebook is get_correlation_series; get_corr_series does not exist here.
get_correlation_series(combined_df,'market_value',embedding_10d_columns)

In [None]:
combined_df.drop(embedding_2d_columns+embedding_3d_columns,axis=1,inplace=True)

In [None]:
save_to_pkl(combined_df,'combined')

## Preparing for Machine Learning

## Exporting Data

In [None]:
# Put features and target side by side for the train and test splits.
# NOTE(review): X_tr, y_tr, X_te and y_te are not assigned in any cell visible
# here — the split presumably belongs under "Preparing for Machine Learning"; confirm.
train_df = pd.concat([X_tr,y_tr],axis=1)
test_df = pd.concat([X_te,y_te],axis=1)

In [None]:
save_to_pkl(train_df,'train')
save_to_pkl(test_df,'test')